diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..ddea2dab8d4d1849f69ac7a738481873fec7107a 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,49 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +data/epic_gif.gif filter=lfs diff=lfs merge=lfs -text +data/test_i2v/masked_videos/000000000005.2.013.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/masked_videos/000000000793.0.003.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/masked_videos/000000000896.2.005.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/masked_videos/000000001059.6.004.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/masked_videos/000000049282.4.005.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/masked_videos/000000049768.11.002.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/masked_videos/000002012796.6.001.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/masked_videos/000003031893.11.003.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/videos/000000000005.2.013.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/videos/000000000793.0.003.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/videos/000000000896.2.005.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/videos/000000001059.6.004.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/videos/000000049282.4.005.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/videos/000000049768.11.002.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/videos/000002012796.6.001.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_i2v/videos/000003031893.11.003.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/masked_videos/amalfi-coast_traj_loop2.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/masked_videos/art-museum_gradual_0_-30_0.0_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/masked_videos/dog_gradual_0_-30_-0.6_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/masked_videos/fish_gradual_25_0_0.0_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/masked_videos/grandma-birthday_gradual_0_0_0.2_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/masked_videos/happy-cat_gradual_-20_0_0.0_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/masked_videos/suv-in-the-dust_gradual_0_30_0.0_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/masked_videos/vlogger-corgi_traj_loop2.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/videos/amalfi-coast_traj_loop2.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/videos/art-museum_gradual_0_-30_0.0_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/videos/dog_gradual_0_-30_-0.6_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/videos/fish_gradual_25_0_0.0_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/videos/grandma-birthday_gradual_0_0_0.2_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/videos/happy-cat_gradual_-20_0_0.0_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/videos/suv-in-the-dust_gradual_0_30_0.0_0_0.mp4 filter=lfs diff=lfs merge=lfs -text +data/test_v2v/videos/vlogger-corgi_traj_loop2.mp4 filter=lfs diff=lfs merge=lfs -text +inference/v2v_data/test/videos/0-NNvgaTcVzAG0-r.mp4 filter=lfs diff=lfs merge=lfs -text +inference/v2v_data/test/videos/p7.mp4 filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/demo-frames/frame_0016.png filter=lfs diff=lfs 
merge=lfs -text +preprocess/RAFT/demo-frames/frame_0017.png filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/demo-frames/frame_0018.png filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/demo-frames/frame_0019.png filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/demo-frames/frame_0020.png filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/demo-frames/frame_0021.png filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/demo-frames/frame_0022.png filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/demo-frames/frame_0023.png filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/demo-frames/frame_0024.png filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/demo-frames/frame_0025.png filter=lfs diff=lfs merge=lfs -text +preprocess/RAFT/RAFT.png filter=lfs diff=lfs merge=lfs -text diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..857acfed727dc6680404eea0b0dd18d571321af1 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,44 @@ +FROM pytorch/pytorch:2.2.2-cuda12.1-cudnn8-runtime + +SHELL ["/bin/bash", "-c"] + +# Environment variables for Hugging Face cache +ENV HF_HOME=/app/hf_cache +ENV TRANSFORMERS_CACHE=/app/hf_cache +ENV HF_TOKEN=${HF_TOKEN} +ENV PATH=/opt/conda/bin:$PATH + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + git wget curl unzip ffmpeg libgl1-mesa-glx libglib2.0-0 && \ + apt-get clean + +# Set up working directory as /app +WORKDIR /app + +# Copy project into /app +COPY . /app + +# Fix permissions for all subdirectories +RUN mkdir -p /app/pretrained /app/hf_cache /.cache/gdown && \ + chmod -R 777 /app && \ + chmod -R 777 /.cache && \ + chmod -R 777 /root + +# Create conda environment and install dependencies +COPY requirements.txt /app/requirements.txt +RUN conda create -n epic python=3.10 -y && \ + conda run -n epic pip install --upgrade pip && \ + conda run -n epic pip install -r /app/requirements.txt + +RUN chmod -R 777 /app /workspace + +# # List contents (for debug) +RUN ls -la /app +RUN pip install gradio + +# Expose Gradio default port +EXPOSE 7860 + +# Start the Gradio app +CMD ["conda", "run", "--no-capture-output", "-n", "epic", "python", "gradio_app.py"] \ No newline at end of file diff --git a/README.md b/README.md index d519eaf67cc65861499c6bb74ce70fa524ff1960..a2b59a270f28702b9234b07d014f7bdcc5c92674 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,10 @@ --- -title: EPiC -emoji: 🐨 -colorFrom: red -colorTo: red +title: EPiC - Control +emoji: 📚 +colorFrom: indigo +colorTo: blue sdk: docker +app_file: Dockerfile pinned: false license: mit ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +--- \ No newline at end of file diff --git a/cogvideo_controlnet_pcd.py b/cogvideo_controlnet_pcd.py new file mode 100644 index 0000000000000000000000000000000000000000..481fa16525126ba716f33b45f9d8351d046c3492 --- /dev/null +++ b/cogvideo_controlnet_pcd.py @@ -0,0 +1,235 @@ +from typing import Any, Dict, Optional, Tuple, Union + +import torch +from torch import nn +from einops import rearrange +import torch.nn.functional as F +from diffusers.models.transformers.cogvideox_transformer_3d import Transformer2DModelOutput, CogVideoXBlock +from diffusers.utils import is_torch_version +from diffusers.loaders import PeftAdapterMixin +from diffusers.utils.torch_utils import maybe_allow_in_graph +from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed +from diffusers.models.modeling_utils import 
ModelMixin +from diffusers.models.attention import Attention, FeedForward +from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor2_0 +from diffusers.models.normalization import AdaLayerNorm, CogVideoXLayerNormZero, AdaLayerNormZeroSingle +from diffusers.configuration_utils import ConfigMixin, register_to_config + + +class CogVideoXControlnetPCD(ModelMixin, ConfigMixin, PeftAdapterMixin): + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + num_attention_heads: int = 30, + use_zero_conv: bool = False, + attention_head_dim: int = 64, + vae_channels: int = 16, + in_channels: int = 3, + downscale_coef: int = 8, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + time_embed_dim: int = 512, + num_layers: int = 8, + dropout: float = 0.0, + attention_bias: bool = True, + sample_width: int = 90, + sample_height: int = 60, + sample_frames: int = 49, + patch_size: int = 2, + temporal_compression_ratio: int = 4, + max_text_seq_length: int = 226, + activation_fn: str = "gelu-approximate", + timestep_activation_fn: str = "silu", + norm_elementwise_affine: bool = True, + norm_eps: float = 1e-5, + spatial_interpolation_scale: float = 1.875, + temporal_interpolation_scale: float = 1.0, + use_rotary_positional_embeddings: bool = False, + use_learned_positional_embeddings: bool = False, + out_proj_dim: int = None, + out_proj_dim_zero_init: bool = False, + ): + super().__init__() + inner_dim = num_attention_heads * attention_head_dim + + if not use_rotary_positional_embeddings and use_learned_positional_embeddings: + raise ValueError( + "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional " + "embeddings. If you're using a custom model and/or believe this should be supported, please open an " + "issue at https://github.com/huggingface/diffusers/issues." + ) + + self.vae_channels = vae_channels + start_channels = in_channels * (downscale_coef ** 2) + input_channels = [start_channels, start_channels // 2, start_channels // 4] + self.unshuffle = nn.PixelUnshuffle(downscale_coef) + self.use_zero_conv = use_zero_conv + + if use_zero_conv: + self.controlnet_encode_first = nn.Sequential( + nn.Conv2d(input_channels[0], input_channels[1], kernel_size=1, stride=1, padding=0), + nn.GroupNorm(2, input_channels[1]), + nn.ReLU(), + ) + + self.controlnet_encode_second = nn.Sequential( + nn.Conv2d(input_channels[1], input_channels[2], kernel_size=1, stride=1, padding=0), + nn.GroupNorm(2, input_channels[2]), + nn.ReLU(), + ) + patch_embed_in_channels = vae_channels + input_channels[2] + + else: + patch_embed_in_channels = vae_channels*2 + + # 1. Patch embedding + self.patch_embed = CogVideoXPatchEmbed( + patch_size=patch_size, + in_channels=patch_embed_in_channels, + embed_dim=inner_dim, + bias=True, + sample_width=sample_width, + sample_height=sample_height, + sample_frames=sample_frames, + temporal_compression_ratio=temporal_compression_ratio, + spatial_interpolation_scale=spatial_interpolation_scale, + temporal_interpolation_scale=temporal_interpolation_scale, + use_positional_embeddings=not use_rotary_positional_embeddings, + use_learned_positional_embeddings=use_learned_positional_embeddings, + ) + + self.embedding_dropout = nn.Dropout(dropout) + + # 2. Time embeddings + self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift) + self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn) + + # 3. 
Define spatio-temporal transformers blocks + self.transformer_blocks = nn.ModuleList( + [ + CogVideoXBlock( + dim=inner_dim, + num_attention_heads=num_attention_heads, + attention_head_dim=attention_head_dim, + time_embed_dim=time_embed_dim, + dropout=dropout, + activation_fn=activation_fn, + attention_bias=attention_bias, + norm_elementwise_affine=norm_elementwise_affine, + norm_eps=norm_eps, + ) + for _ in range(num_layers) + ] + ) + + self.out_projectors = None + if out_proj_dim is not None: + self.out_projectors = nn.ModuleList( + [nn.Linear(inner_dim, out_proj_dim) for _ in range(num_layers)] + ) + if out_proj_dim_zero_init: + for out_projector in self.out_projectors: + self.zeros_init_linear(out_projector) + + self.gradient_checkpointing = False + + def zeros_init_linear(self, linear: nn.Module): + if isinstance(linear, (nn.Linear, nn.Conv1d)): + if hasattr(linear, "weight"): + nn.init.zeros_(linear.weight) + if hasattr(linear, "bias"): + nn.init.zeros_(linear.bias) + + def _set_gradient_checkpointing(self, module, value=False): + self.gradient_checkpointing = value + + def compress_time(self, x, num_frames): + x = rearrange(x, '(b f) c h w -> b f c h w', f=num_frames) + batch_size, frames, channels, height, width = x.shape + x = rearrange(x, 'b f c h w -> (b h w) c f') + + if x.shape[-1] % 2 == 1: + x_first, x_rest = x[..., 0], x[..., 1:] + if x_rest.shape[-1] > 0: + x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2) + + x = torch.cat([x_first[..., None], x_rest], dim=-1) + else: + x = F.avg_pool1d(x, kernel_size=2, stride=2) + x = rearrange(x, '(b h w) c f -> (b f) c h w', b=batch_size, h=height, w=width) + return x + + def forward( + self, + hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor, + controlnet_states: Tuple[torch.Tensor, torch.Tensor], + timestep: Union[int, float, torch.LongTensor], + controlnet_output_mask: Optional[torch.Tensor] = None, + image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + timestep_cond: Optional[torch.Tensor] = None, + return_dict: bool = True, + ): + hidden_states = torch.cat([hidden_states, controlnet_states], dim=2) + + # controlnet_states = self.controlnext_encoder(controlnet_states, timestep=timestep) + # 1. Time embedding + timesteps = timestep + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=hidden_states.dtype) + emb = self.time_embedding(t_emb, timestep_cond) + + hidden_states = self.patch_embed(encoder_hidden_states, hidden_states) + hidden_states = self.embedding_dropout(hidden_states) + + + text_seq_length = encoder_hidden_states.shape[1] + encoder_hidden_states = hidden_states[:, :text_seq_length] + hidden_states = hidden_states[:, text_seq_length:] + + + controlnet_hidden_states = () + # 3. 
Transformer blocks
+        for i, block in enumerate(self.transformer_blocks):
+            if self.training and self.gradient_checkpointing:
+
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
+                    hidden_states,
+                    encoder_hidden_states,
+                    emb,
+                    image_rotary_emb,
+                    **ckpt_kwargs,
+                )
+            else:
+                hidden_states, encoder_hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    temb=emb,
+                    image_rotary_emb=image_rotary_emb,
+                )
+
+            if self.out_projectors is not None:
+                if controlnet_output_mask is not None:
+                    controlnet_hidden_states += (self.out_projectors[i](hidden_states) * controlnet_output_mask,)
+                else:
+                    controlnet_hidden_states += (self.out_projectors[i](hidden_states),)
+            else:
+                controlnet_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return (controlnet_hidden_states,)
+        return Transformer2DModelOutput(sample=controlnet_hidden_states)
\ No newline at end of file
diff --git a/cogvideo_transformer.py b/cogvideo_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a069e3e0a8b3cf207428fac7a341f52b5b75d5e
--- /dev/null
+++ b/cogvideo_transformer.py
@@ -0,0 +1,127 @@
+from typing import Any, Dict, Optional, Tuple, Union
+
+import torch
+import numpy as np
+from diffusers.utils import is_torch_version
+from diffusers.models.transformers.cogvideox_transformer_3d import CogVideoXTransformer3DModel, Transformer2DModelOutput
+
+
+class CustomCogVideoXTransformer3DModel(CogVideoXTransformer3DModel):
+    def set_learnable_parameters(self, unfrozen_layers: int = 16):
+        for param in self.patch_embed.parameters():
+            param.requires_grad = True
+
+        for i in range(unfrozen_layers):
+            block = self.transformer_blocks[i]
+            attn = block.attn1
+            for module in [block.norm2, block.ff]:
+                for param in module.parameters():
+                    param.requires_grad = True
+
+            for name in ['to_q', 'to_k', 'to_v', 'norm_q', 'norm_k']:
+                module = getattr(attn, name, None)
+                if module is not None:
+                    for param in module.parameters():
+                        param.requires_grad = True
+                else:
+                    print(f"[Warning] {name} not found in attn1 of block {i}")
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        timestep: Union[int, float, torch.LongTensor],
+        start_frame = None,
+        timestep_cond: Optional[torch.Tensor] = None,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        controlnet_states: torch.Tensor = None,
+        controlnet_weights: Optional[Union[float, int, list, np.ndarray, torch.FloatTensor]] = 1.0,
+        return_dict: bool = True,
+    ):
+        batch_size, num_frames, channels, height, width = hidden_states.shape
+
+        if start_frame is not None:
+            hidden_states = torch.cat([start_frame, hidden_states], dim=2)
+        # 1. Time embedding
+        timesteps = timestep
+        t_emb = self.time_proj(timesteps)
+
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=hidden_states.dtype)
+        emb = self.time_embedding(t_emb, timestep_cond)
+
+        # 2.
Patch embedding + hidden_states = self.patch_embed(encoder_hidden_states, hidden_states) + hidden_states = self.embedding_dropout(hidden_states) + + text_seq_length = encoder_hidden_states.shape[1] + encoder_hidden_states = hidden_states[:, :text_seq_length] + hidden_states = hidden_states[:, text_seq_length:] + + # 3. Transformer blocks + for i, block in enumerate(self.transformer_blocks): + if self.gradient_checkpointing: + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(block), + hidden_states, + encoder_hidden_states, + emb, + image_rotary_emb, + **ckpt_kwargs, + ) + else: + hidden_states, encoder_hidden_states = block( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + temb=emb, + image_rotary_emb=image_rotary_emb, + ) + + if (controlnet_states is not None) and (i < len(controlnet_states)): + controlnet_states_block = controlnet_states[i] + controlnet_block_weight = 1.0 + if isinstance(controlnet_weights, (list, np.ndarray)) or torch.is_tensor(controlnet_weights): + controlnet_block_weight = controlnet_weights[i] + elif isinstance(controlnet_weights, (float, int)): + controlnet_block_weight = controlnet_weights + + hidden_states = hidden_states + controlnet_states_block * controlnet_block_weight + + if not self.config.use_rotary_positional_embeddings: + # CogVideoX-2B + hidden_states = self.norm_final(hidden_states) + else: + # CogVideoX-5B + hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1) + hidden_states = self.norm_final(hidden_states) + hidden_states = hidden_states[:, text_seq_length:] + + # 4. Final block + hidden_states = self.norm_out(hidden_states, temb=emb) + hidden_states = self.proj_out(hidden_states) + + # 5. 
Unpatchify + p = self.config.patch_size + p_t = self.config.patch_size_t + + if p_t is None: + output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p) + output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4) + else: + output = hidden_states.reshape( + batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p + ) + output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2) + + if not return_dict: + return (output,) + return Transformer2DModelOutput(sample=output) \ No newline at end of file diff --git a/controlnet_pipeline.py b/controlnet_pipeline.py new file mode 100644 index 0000000000000000000000000000000000000000..13cf327390bd2e68aac4813c240d1cafa494b6dc --- /dev/null +++ b/controlnet_pipeline.py @@ -0,0 +1,807 @@ +import inspect +import math +from typing import Any, Callable, Dict, List, Optional, Tuple, Union + +import torch +import numpy as np +import PIL +from PIL import Image +from torchvision import transforms +from einops import rearrange, repeat +from transformers import T5EncoderModel, T5Tokenizer +from diffusers.video_processor import VideoProcessor +from diffusers.utils.torch_utils import randn_tensor +from diffusers.models.embeddings import get_3d_rotary_pos_embed +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel +from diffusers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler +from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback +from diffusers.pipelines.cogvideo.pipeline_cogvideox import CogVideoXPipelineOutput, CogVideoXLoraLoaderMixin +from diffusers.image_processor import PipelineImageInput + +from cogvideo_controlnet_pcd import CogVideoXControlnetPCD as CogVideoXControlnet + + +def resize_for_crop(image, crop_h, crop_w): + img_h, img_w = image.shape[-2:] + if img_h >= crop_h and img_w >= crop_w: + coef = max(crop_h / img_h, crop_w / img_w) + elif img_h <= crop_h and img_w <= crop_w: + coef = max(crop_h / img_h, crop_w / img_w) + else: + coef = crop_h / img_h if crop_h > img_h else crop_w / img_w + out_h, out_w = int(img_h * coef), int(img_w * coef) + resized_image = transforms.functional.resize(image, (out_h, out_w), antialias=True) + return resized_image + + +def prepare_frames(input_images, video_size, do_resize=True, do_crop=True): + input_images = np.stack([np.array(x) for x in input_images]) + images_tensor = torch.from_numpy(input_images).permute(0, 3, 1, 2) / 127.5 - 1 + if do_resize: + images_tensor = [resize_for_crop(x, crop_h=video_size[0], crop_w=video_size[1]) for x in images_tensor] + if do_crop: + images_tensor = [transforms.functional.center_crop(x, video_size) for x in images_tensor] + if isinstance(images_tensor, list): + images_tensor = torch.stack(images_tensor) + return images_tensor.unsqueeze(0) + + +def get_resize_crop_region_for_grid(src, tgt_width, tgt_height): + tw = tgt_width + th = tgt_height + h, w = src + r = h / w + if r > (th / tw): + resize_height = th + resize_width = int(round(th / h * w)) + else: + resize_width = tw + resize_height = int(round(tw / w * h)) + + crop_top = int(round((th - resize_height) / 2.0)) + crop_left = int(round((tw - resize_width) / 2.0)) + + return (crop_top, crop_left), (crop_top + resize_height, crop_left + resize_width) + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: 
Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + sigmas: Optional[List[float]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps` + must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed, + `num_inference_steps` and `sigmas` must be `None`. + sigmas (`List[float]`, *optional*): + Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed, + `num_inference_steps` and `timesteps` must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None and sigmas is not None: + raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values") + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + elif sigmas is not None: + accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accept_sigmas: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" sigmas schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents +def retrieve_latents( + encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample" +): + if hasattr(encoder_output, "latent_dist") and sample_mode == "sample": + return encoder_output.latent_dist.sample(generator) + elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax": + return encoder_output.latent_dist.mode() + elif hasattr(encoder_output, "latents"): + return encoder_output.latents + else: + raise AttributeError("Could not access latents of provided encoder_output") + +class ControlnetCogVideoXImageToVideoPCDPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin): + _optional_components = [] + model_cpu_offload_seq = "text_encoder->transformer->vae" + + _callback_tensor_inputs = [ + "latents", + "prompt_embeds", + "negative_prompt_embeds", + ] + + def __init__( + self, + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + vae: AutoencoderKLCogVideoX, + transformer: CogVideoXTransformer3DModel, + scheduler: Union[CogVideoXDDIMScheduler, CogVideoXDPMScheduler], + controlnet: CogVideoXControlnet, + ): + super().__init__() + + self.register_modules( + tokenizer=tokenizer, text_encoder=text_encoder, vae=vae, transformer=transformer, controlnet=controlnet, scheduler=scheduler + ) + self.vae_scale_factor_spatial = ( + 2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8 + ) + self.vae_scale_factor_temporal = ( + self.vae.config.temporal_compression_ratio if hasattr(self, "vae") and self.vae is not None else 4 + ) + self.vae_scaling_factor_image = ( + self.vae.config.scaling_factor if hasattr(self, "vae") and self.vae is not None else 0.7 + ) + + self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial) + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._get_t5_prompt_embeds + def _get_t5_prompt_embeds( + self, + prompt: Union[str, List[str]] = None, + num_videos_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + device = device or self._execution_device + dtype = dtype or self.text_encoder.dtype + + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1]) + logger.warning( + "The following part of your input was truncated because `max_sequence_length` is set to " + f" {max_sequence_length} tokens: {removed_text}" + ) + + prompt_embeds = self.text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + # duplicate text embeddings for each 
generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + return prompt_embeds + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.encode_prompt + def encode_prompt( + self, + prompt: Union[str, List[str]], + negative_prompt: Optional[Union[str, List[str]]] = None, + do_classifier_free_guidance: bool = True, + num_videos_per_prompt: int = 1, + prompt_embeds: Optional[torch.Tensor] = None, + negative_prompt_embeds: Optional[torch.Tensor] = None, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + do_classifier_free_guidance (`bool`, *optional*, defaults to `True`): + Whether to use classifier free guidance or not. + num_videos_per_prompt (`int`, *optional*, defaults to 1): + Number of videos that should be generated per prompt. torch device to place the resulting embeddings on + prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.Tensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + device: (`torch.device`, *optional*): + torch device + dtype: (`torch.dtype`, *optional*): + torch dtype + """ + device = device or self._execution_device + + prompt = [prompt] if isinstance(prompt, str) else prompt + if prompt is not None: + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + prompt_embeds = self._get_t5_prompt_embeds( + prompt=prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + if do_classifier_free_guidance and negative_prompt_embeds is None: + negative_prompt = negative_prompt or "" + negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt + + if prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." 
+ ) + + negative_prompt_embeds = self._get_t5_prompt_embeds( + prompt=negative_prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + + return prompt_embeds, negative_prompt_embeds + + def prepare_latents( + self, + image: torch.Tensor, + batch_size: int = 1, + num_channels_latents: int = 16, + num_frames: int = 13, + height: int = 60, + width: int = 90, + dtype: Optional[torch.dtype] = None, + device: Optional[torch.device] = None, + generator: Optional[torch.Generator] = None, + latents: Optional[torch.Tensor] = None, + ): + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + num_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1 + shape = ( + batch_size, + num_frames, + num_channels_latents, + height // self.vae_scale_factor_spatial, + width // self.vae_scale_factor_spatial, + ) + + # For CogVideoX1.5, the latent should add 1 for padding (Not use) + if self.transformer.config.patch_size_t is not None: + shape = shape[:1] + (shape[1] + shape[1] % self.transformer.config.patch_size_t,) + shape[2:] + + image = image.unsqueeze(2) # [B, C, F, H, W] + + if isinstance(generator, list): + image_latents = [ + retrieve_latents(self.vae.encode(image[i].unsqueeze(0)), generator[i]) for i in range(batch_size) + ] + else: + image_latents = [retrieve_latents(self.vae.encode(img.unsqueeze(0)), generator) for img in image] + + image_latents = torch.cat(image_latents, dim=0).to(dtype).permute(0, 2, 1, 3, 4) # [B, F, C, H, W] + + if not self.vae.config.invert_scale_latents: + image_latents = self.vae_scaling_factor_image * image_latents + else: + # This is awkward but required because the CogVideoX team forgot to multiply the + # scaling factor during training :) + image_latents = 1 / self.vae_scaling_factor_image * image_latents + + padding_shape = ( + batch_size, + num_frames - 1, + num_channels_latents, + height // self.vae_scale_factor_spatial, + width // self.vae_scale_factor_spatial, + ) + + latent_padding = torch.zeros(padding_shape, device=device, dtype=dtype) + image_latents = torch.cat([image_latents, latent_padding], dim=1) + + # Select the first frame along the second dimension + if self.transformer.config.patch_size_t is not None: + first_frame = image_latents[:, : image_latents.size(1) % self.transformer.config.patch_size_t, ...] 
+ image_latents = torch.cat([first_frame, image_latents], dim=1) + + if latents is None: + latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents, image_latents + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.decode_latents + def decode_latents(self, latents: torch.Tensor) -> torch.Tensor: + latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width] + latents = 1 / self.vae_scaling_factor_image * latents + + frames = self.vae.decode(latents).sample + return frames + + # Copied from diffusers.pipelines.animatediff.pipeline_animatediff_video2video.AnimateDiffVideoToVideoPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, timesteps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + image, + prompt, + height, + width, + negative_prompt, + callback_on_step_end_tensor_inputs, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if ( + not isinstance(image, torch.Tensor) + and not isinstance(image, PIL.Image.Image) + and not isinstance(image, list) + ): + raise ValueError( + "`image` has to be of type `torch.Tensor` or `PIL.Image.Image` or `List[PIL.Image.Image]` but is" + f" {type(image)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.") + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." 
+ ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.fuse_qkv_projections + def fuse_qkv_projections(self) -> None: + r"""Enables fused QKV projections.""" + self.fusing_transformer = True + self.transformer.fuse_qkv_projections() + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline.unfuse_qkv_projections + def unfuse_qkv_projections(self) -> None: + r"""Disable QKV projection fusion if enabled.""" + if not self.fusing_transformer: + logger.warning("The Transformer was not initially fused for QKV projections. Doing nothing.") + else: + self.transformer.unfuse_qkv_projections() + self.fusing_transformer = False + + # Copied from diffusers.pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipeline._prepare_rotary_positional_embeddings + def _prepare_rotary_positional_embeddings( + self, + height: int, + width: int, + num_frames: int, + device: torch.device, + ) -> Tuple[torch.Tensor, torch.Tensor]: + grid_height = height // (self.vae_scale_factor_spatial * self.transformer.config.patch_size) + grid_width = width // (self.vae_scale_factor_spatial * self.transformer.config.patch_size) + + p = self.transformer.config.patch_size + p_t = self.transformer.config.patch_size_t + + base_size_width = self.transformer.config.sample_width // p + base_size_height = self.transformer.config.sample_height // p + + if p_t is None: + # CogVideoX 1.0 + grid_crops_coords = get_resize_crop_region_for_grid( + (grid_height, grid_width), base_size_width, base_size_height + ) + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=self.transformer.config.attention_head_dim, + crops_coords=grid_crops_coords, + grid_size=(grid_height, grid_width), + temporal_size=num_frames, + device=device, + ) + else: + # CogVideoX 1.5 + base_num_frames = (num_frames + p_t - 1) // p_t + + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=self.transformer.config.attention_head_dim, + crops_coords=None, + grid_size=(grid_height, grid_width), + temporal_size=base_num_frames, + grid_type="slice", + max_size=(base_size_height, base_size_width), + device=device, + ) + + return freqs_cos, freqs_sin + + def encode_video(self, video): + video = video.to(self.device, dtype=self.vae.dtype) + video = video.permute(0, 2, 1, 3, 4) # [B, C, F, H, W] + latent_dist = self.vae.encode(video).latent_dist.sample() * self.vae.config.scaling_factor + return latent_dist.permute(0, 2, 1, 3, 
4).to(memory_format=torch.contiguous_format) + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def num_timesteps(self): + return self._num_timesteps + + @property + def attention_kwargs(self): + return self._attention_kwargs + + @property + def interrupt(self): + return self._interrupt + + @torch.no_grad() + def __call__( + self, + image: PipelineImageInput, + anchor_video: None, + controlnet_output_mask: None, + prompt: Optional[Union[str, List[str]]] = None, + negative_prompt: Optional[Union[str, List[str]]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 49, + num_inference_steps: int = 50, + timesteps: Optional[List[int]] = None, + guidance_scale: float = 6, + use_dynamic_cfg: bool = False, + num_videos_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: str = "pil", + return_dict: bool = True, + attention_kwargs: Optional[Dict[str, Any]] = None, + callback_on_step_end: Optional[ + Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks] + ] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + max_sequence_length: int = 226, + controlnet_weights: Optional[Union[float, list, np.ndarray, torch.FloatTensor]] = 1.0, + controlnet_guidance_start: float = 0.0, + controlnet_guidance_end: float = 1.0, + ) -> Union[CogVideoXPipelineOutput, Tuple]: + if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)): + callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs + + height = height or self.transformer.config.sample_height * self.vae_scale_factor_spatial + width = width or self.transformer.config.sample_width * self.vae_scale_factor_spatial + num_frames = num_frames or self.transformer.config.sample_frames + + num_videos_per_prompt = 1 + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + image=image, + prompt=prompt, + height=height, + width=width, + negative_prompt=negative_prompt, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + ) + self._guidance_scale = guidance_scale + self._attention_kwargs = attention_kwargs + self._interrupt = False + + # 2. Default call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt=prompt, + negative_prompt=negative_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + num_videos_per_prompt=num_videos_per_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + max_sequence_length=max_sequence_length, + device=device, + ) + if do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0) + + # 4. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self._num_timesteps = len(timesteps) + + # 5. Prepare latents + latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1 + + # For CogVideoX 1.5, the latent frames should be padded to make it divisible by patch_size_t + patch_size_t = self.transformer.config.patch_size_t + additional_frames = 0 + if patch_size_t is not None and latent_frames % patch_size_t != 0: + additional_frames = patch_size_t - latent_frames % patch_size_t + num_frames += additional_frames * self.vae_scale_factor_temporal + + image = self.video_processor.preprocess(image, height=height, width=width).to( + device, dtype=prompt_embeds.dtype + ) + + latent_channels = self.transformer.config.in_channels // 2 + latents, image_latents = self.prepare_latents( + image, + batch_size * num_videos_per_prompt, + latent_channels, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6 encoding anchor videos + + anchor_states = self.encode_video(anchor_video[None]).to(device) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 8. Create rotary embeds if required + image_rotary_emb = ( + self._prepare_rotary_positional_embeddings(height, width, latents.size(1), device) + if self.transformer.config.use_rotary_positional_embeddings + else None + ) + + # 8. Create ofs embeds if required + ofs_emb = None if self.transformer.config.ofs_embed_dim is None else latents.new_full((1,), fill_value=2.0) + + # 9. 
Denoising loop + num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0) + + if do_classifier_free_guidance: + anchor_states = torch.cat([anchor_states] * 2) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + # for DPM-solver++ + old_pred_original_sample = None + for i, t in enumerate(timesteps): + if self.interrupt: + continue + + latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + latent_image_input = torch.cat([image_latents] * 2) if do_classifier_free_guidance else image_latents + + latent_and_image_input = torch.cat([latent_model_input, latent_image_input], dim=2) + + + # latent_and_image_input = torch.cat([latent_model_input], dim=2) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timestep = t.expand(latent_model_input.shape[0]) + + current_sampling_percent = i / len(timesteps) + + latent_model_input = latent_model_input.to(dtype=self.transformer.dtype) + prompt_embeds = prompt_embeds.to(dtype=self.transformer.dtype) + + controlnet_states = None + + input_controlnet_states = anchor_states + if (controlnet_guidance_start <= current_sampling_percent <= controlnet_guidance_end): + controlnet_states = self.controlnet( + hidden_states=latent_model_input, + encoder_hidden_states=prompt_embeds, + image_rotary_emb=image_rotary_emb, + controlnet_states=input_controlnet_states, + controlnet_output_mask = controlnet_output_mask, + timestep=timestep, + return_dict=False, + )[0] + if isinstance(controlnet_states, (tuple, list)): + controlnet_states = [x.to(dtype=self.transformer.dtype) for x in controlnet_states] + else: + controlnet_states = controlnet_states.to(dtype=self.transformer.dtype) + + # predict noise model_output + noise_pred = self.transformer( + hidden_states=latent_and_image_input, + encoder_hidden_states=prompt_embeds, + timestep=timestep, + image_rotary_emb=image_rotary_emb, + controlnet_states=controlnet_states, + controlnet_weights=controlnet_weights, + return_dict=False, + )[0] + + noise_pred = noise_pred.float() + + # perform guidance + if use_dynamic_cfg: + self._guidance_scale = 1 + guidance_scale * ( + (1 - math.cos(math.pi * ((num_inference_steps - t.item()) / num_inference_steps) ** 5.0)) / 2 + ) + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + if not isinstance(self.scheduler, CogVideoXDPMScheduler): + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + else: + latents, old_pred_original_sample = self.scheduler.step( + noise_pred, + old_pred_original_sample, + t, + timesteps[i - 1] if i > 0 else None, + latents, + **extra_step_kwargs, + return_dict=False, + ) + latents = latents.to(prompt_embeds.dtype) + + # call the callback, if provided + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps 
and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + + if not output_type == "latent": + # Discard any padding frames that were added for CogVideoX 1.5 + latents = latents[:, additional_frames:] + video = self.decode_latents(latents) + video = self.video_processor.postprocess_video(video=video, output_type=output_type) + else: + video = latents + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (video,) + + return CogVideoXPipelineOutput(frames=video) + + diff --git a/data/epic_gif.gif b/data/epic_gif.gif new file mode 100644 index 0000000000000000000000000000000000000000..ec5ae233c91e09a333d592235b4dd6fdd6e982c4 --- /dev/null +++ b/data/epic_gif.gif @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03703e31f7964e5cbc1c8a10a16936a2bdab6e81070a62730b416e5108526459 +size 10707767 diff --git a/data/test_i2v/captions/000000000005.2.013.txt b/data/test_i2v/captions/000000000005.2.013.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fcfde331ecd8da2ddab0ca41211101baac695a5 --- /dev/null +++ b/data/test_i2v/captions/000000000005.2.013.txt @@ -0,0 +1 @@ +Inside a dimly lit subway car, a first-person perspective shows a gloved hand holding a handgun aimed at the camera, with the barrel pointing directly at the viewer. the scene is tense, with the hand's movement suggesting an imminent threat. the background is blurred, indicating motion, and the interior is adorned with teal and blue seats, silver poles, and red and white advertisements. two seconds later, the perspective shifts to a gloved hand holding a black handgun aimed at the camera, with the barrel pointing directly at the viewer. the interior is quiet, with teal and blue seats, silver poles, and red and white advertisements, including one for 'the lion king'. the atmosphere is tense, with the hand's movement suggesting an imminent threat. \ No newline at end of file diff --git a/data/test_i2v/captions/000000000793.0.003.txt b/data/test_i2v/captions/000000000793.0.003.txt new file mode 100644 index 0000000000000000000000000000000000000000..816a898afc3157df2827f06f2b5fd7b0b045774b --- /dev/null +++ b/data/test_i2v/captions/000000000793.0.003.txt @@ -0,0 +1 @@ +A purple porsche suv drives on a dirt road through a mountainous landscape, with a helicopter parked on a rocky outcrop in the distance, under a sky with scattered clouds. the scene is part of a video game, indicated by the 'gta 5' logo. two seconds later, the same purple suv, now identified with a 'cayenne' badge, continues its journey on the dirt road, kicking up dust. the background features a rugged mountainous terrain, a clear blue sky with scattered clouds, and a distant helicopter on a rocky outcrop, suggesting an adventurous setting. \ No newline at end of file diff --git a/data/test_i2v/captions/000000000896.2.005.txt b/data/test_i2v/captions/000000000896.2.005.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f8adb655dfe8a22e549859b08ed6955f436e389 --- /dev/null +++ b/data/test_i2v/captions/000000000896.2.005.txt @@ -0,0 +1 @@ +A character dressed in dark, medieval attire with white hair and yellow eyes rides a reddish-brown horse through a lush, green forest. the scene is set in a video game environment, with a user interface visible in the upper left corner, indicating gameplay mechanics. 
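The three new modules above (cogvideo_controlnet_pcd.py, cogvideo_transformer.py, controlnet_pipeline.py) land without usage notes. Below is a minimal wiring sketch, under stated assumptions: the base checkpoint id, dtype, and ControlNet hyperparameters (48 heads × 64 dims for a 5B backbone, 8 ControlNet blocks projected back to the transformer width) are placeholders inferred from the defaults in the diff, not confirmed EPiC settings, and the ControlNet weight path is hypothetical.

```python
# Hedged wiring sketch (not part of the diff). Checkpoint ids and paths are placeholders.
import torch
from diffusers import AutoencoderKLCogVideoX, CogVideoXDDIMScheduler
from transformers import T5EncoderModel, T5Tokenizer

from cogvideo_controlnet_pcd import CogVideoXControlnetPCD
from cogvideo_transformer import CustomCogVideoXTransformer3DModel
from controlnet_pipeline import ControlnetCogVideoXImageToVideoPCDPipeline

base = "THUDM/CogVideoX-5b-I2V"  # assumed base checkpoint
dtype = torch.bfloat16

tokenizer = T5Tokenizer.from_pretrained(base, subfolder="tokenizer")
text_encoder = T5EncoderModel.from_pretrained(base, subfolder="text_encoder", torch_dtype=dtype)
vae = AutoencoderKLCogVideoX.from_pretrained(base, subfolder="vae", torch_dtype=dtype)
transformer = CustomCogVideoXTransformer3DModel.from_pretrained(base, subfolder="transformer", torch_dtype=dtype)
scheduler = CogVideoXDDIMScheduler.from_pretrained(base, subfolder="scheduler")

# ControlNet branch: its per-block outputs are projected to the transformer width
# (num_attention_heads * attention_head_dim) and added to the matching transformer
# blocks, scaled by `controlnet_weights` inside CustomCogVideoXTransformer3DModel.
controlnet = CogVideoXControlnetPCD(
    num_attention_heads=48,          # assumed to match the 5B backbone
    attention_head_dim=64,
    num_layers=8,
    vae_channels=16,
    use_rotary_positional_embeddings=True,
    out_proj_dim=48 * 64,
)
# controlnet.load_state_dict(torch.load("pretrained/epic_controlnet.pt"))  # placeholder path

pipe = ControlnetCogVideoXImageToVideoPCDPipeline(
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    vae=vae,
    transformer=transformer,
    scheduler=scheduler,
    controlnet=controlnet,
).to("cuda")
```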
as the character continues, they traverse a grassy path surrounded by dense coniferous trees, with the game's interface showing health and stamina bars, a mini-map, and inventory icons. the player's progress is tracked by level indicators and currency, suggesting an immersive role-playing experience. the character's yellow eyes and white hair are consistent throughout the journey. \ No newline at end of file diff --git a/data/test_i2v/captions/000000001059.6.004.txt b/data/test_i2v/captions/000000001059.6.004.txt new file mode 100644 index 0000000000000000000000000000000000000000..866121b51e5e1e5792bfac5d92b1be71ff35499b --- /dev/null +++ b/data/test_i2v/captions/000000001059.6.004.txt @@ -0,0 +1 @@ +A lone explorer in a detailed exoskeleton suit with a helmet and visor is seen traversing a desolate, rocky terrain under an overcast sky, suggesting a science fiction setting. the explorer's journey is marked by solitude and the stark beauty of an extraterrestrial environment, with no other life or human activity visible. the scene is set against a backdrop of reddish-brown soil and dark volcanic rocks, with the mood of isolation and the allure of discovery emphasized by the subdued lighting and the absence of any other life or human activity. \ No newline at end of file diff --git a/data/test_i2v/captions/000000049282.4.005.txt b/data/test_i2v/captions/000000049282.4.005.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c27dd575c5e5aec27433ddedd8e65279f3742ae --- /dev/null +++ b/data/test_i2v/captions/000000049282.4.005.txt @@ -0,0 +1 @@ +A bustling street scene in rome, italy, with pedestrians walking by shops like 'superman' and 'salvatore ferragamo'. the architecture is european, with buildings painted in warm yellows and oranges. a man in a white t-shirt and backpack looks over his shoulder, while a woman in a red coat and sunglasses walks forward. two seconds later, the scene shifts to a narrow street lined with yellow buildings, where people stroll under an awning labeled 'ristorante'. the atmosphere is vibrant, with a mix of casual and smart-casual attire, and a 'sale' sign on a storefront suggests commercial activity. \ No newline at end of file diff --git a/data/test_i2v/captions/000000049768.11.002.txt b/data/test_i2v/captions/000000049768.11.002.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fda7ba6d71ec4a288245f2a494ca1b4dfdeaf46 --- /dev/null +++ b/data/test_i2v/captions/000000049768.11.002.txt @@ -0,0 +1 @@ +A serene coastal town with a pastel-colored building adorned with green shutters and a striped awning. a clothesline with laundry adds a domestic touch to the tranquil scene. the town is set against a backdrop of a rugged cliff and a calm blue sea, with a clear sky overhead. as the video continues, the same building is shown with a green door and a yellow and white striped awning, maintaining the peaceful atmosphere. contrasting with the detailed coastal setting, the video caption suggests a group of people sitting at a table with food and drinks, which does not align with the visual content described in the frames. 
\ No newline at end of file diff --git a/data/test_i2v/captions/000002012796.6.001.txt b/data/test_i2v/captions/000002012796.6.001.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3c2c7536bc27555580aa9f1d027e396a4010aec --- /dev/null +++ b/data/test_i2v/captions/000002012796.6.001.txt @@ -0,0 +1 @@ +A whimsical, nautical-themed interior with wooden cabinetry and a central mural of a pirate scene. initially, the room is adorned with a ship's wheel, a compass rose, and a framed picture of a ship, all under a warm, inviting light. at the two-second mark, the scene shifts to a corridor with wooden paneling, round portholes, and a framed picture of a shipwreck, with a solitary figure standing in the distance. by the fourth second, the setting changes to a warmly lit bar area with wooden tables, red stools, and a maritime-themed mural, creating a cozy atmosphere. \ No newline at end of file diff --git a/data/test_i2v/captions/000003031893.11.003.txt b/data/test_i2v/captions/000003031893.11.003.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c376f8e0106c19a319bb123184968d1a79ebbe6 --- /dev/null +++ b/data/test_i2v/captions/000003031893.11.003.txt @@ -0,0 +1 @@ +A muscular, tattooed man with a black cap and red shorts is seen performing bicep curls with a white dumbbell at a gym, his reflection visible in a large mirror. the gym is equipped with various weights and exercise machines, set against wooden walls and flooring, creating a warm atmosphere. two seconds later, the same man, now in a black tank top, continues his workout with a barbell on a bench, surrounded by a mirror reflecting the gym's interior, including a treadmill and weight racks, all under warm lighting. contrary to the detailed scenes, the overall description inaccurately mentions a shirtless man standing in front of a refrigerator. 
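The LFS pointers above and below add paired test assets under data/test_i2v (captions, ground-truth videos, masked anchor videos, and per-sample .npz masks). Continuing the wiring sketch above, here is a hedged example of feeding one sample to the pipeline; the file layout comes from the diff, but the frame count, normalization, .npz key, and mask handling are assumptions (the repo's own preprocessing should be preferred).

```python
# Hypothetical data-loading sketch (not part of the diff); `pipe` is the pipeline built above.
import numpy as np
import torch
from PIL import Image
from torchvision.io import read_video
from diffusers.utils import export_to_video

name = "000000000005.2.013"
prompt = open(f"data/test_i2v/captions/{name}.txt").read().strip()

# Anchor video: the masked/point-cloud-rendered clip that conditions the ControlNet.
# Assumes the clip is already at the generation resolution and at least 49 frames long.
frames, _, _ = read_video(f"data/test_i2v/masked_videos/{name}.mp4", pts_unit="sec", output_format="TCHW")
anchor_video = frames[:49].float() / 127.5 - 1.0  # [F, C, H, W] in [-1, 1] (assumed normalization)

# First frame of the ground-truth clip serves as the image condition for the I2V pipeline.
gt, _, _ = read_video(f"data/test_i2v/videos/{name}.mp4", pts_unit="sec", output_format="TCHW")
image = Image.fromarray(gt[0].permute(1, 2, 0).numpy())

# Occlusion mask; the key name is unknown, so take the first array in the archive.
npz = np.load(f"data/test_i2v/masks/{name}.npz")
mask = torch.from_numpy(npz[npz.files[0]])
# The ControlNet multiplies its per-block outputs ([B, seq_len, dim]) by this mask, so it
# presumably must be downsampled/flattened to one value per latent token first; that step
# is omitted here, and passing None disables the masking.
controlnet_output_mask = None

video = pipe(
    image=image,
    anchor_video=anchor_video.to("cuda", dtype=torch.bfloat16),
    controlnet_output_mask=controlnet_output_mask,
    prompt=prompt,
    num_frames=49,
    num_inference_steps=50,
    guidance_scale=6.0,
).frames[0]
export_to_video(video, f"{name}_out.mp4", fps=8)
```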
\ No newline at end of file diff --git a/data/test_i2v/masked_videos/000000000005.2.013.mp4 b/data/test_i2v/masked_videos/000000000005.2.013.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5706c8a69f60ba96e3ff5fa2e4985ed0df47d91a --- /dev/null +++ b/data/test_i2v/masked_videos/000000000005.2.013.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e26c4e1563e7e7963a180408468e377f48cf6a0c9639c60abe7d97d794798304 +size 494333 diff --git a/data/test_i2v/masked_videos/000000000793.0.003.mp4 b/data/test_i2v/masked_videos/000000000793.0.003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..031dd2233935447cd7fc25acbeff1afa78e35e8b --- /dev/null +++ b/data/test_i2v/masked_videos/000000000793.0.003.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1010d8cd07b77aaf14ee10a86f75916d8cc501aaf8fad6d4cf0b196dfb0f083 +size 321633 diff --git a/data/test_i2v/masked_videos/000000000896.2.005.mp4 b/data/test_i2v/masked_videos/000000000896.2.005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4e2581620f048f96e0f817fdc8633280b2a73da0 --- /dev/null +++ b/data/test_i2v/masked_videos/000000000896.2.005.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebe76f05ad108e42d9a668cf3f08f0965f10dbf5f7f755b9dc4b13b4ccc9cd03 +size 1168624 diff --git a/data/test_i2v/masked_videos/000000001059.6.004.mp4 b/data/test_i2v/masked_videos/000000001059.6.004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a77c13d488032a928761c161ef3e7f027fbe49f0 --- /dev/null +++ b/data/test_i2v/masked_videos/000000001059.6.004.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d37c98497b103552cde6355e186bf7be0d9fbd18029d512af76c6df3a54f2c9c +size 304053 diff --git a/data/test_i2v/masked_videos/000000049282.4.005.mp4 b/data/test_i2v/masked_videos/000000049282.4.005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..a5038f0da034c427d34546df8180c8156bb41907 --- /dev/null +++ b/data/test_i2v/masked_videos/000000049282.4.005.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966ba1cf67689feafc6e3964041c7282957ee3bb8032116b9e58e3d11a52804f +size 402082 diff --git a/data/test_i2v/masked_videos/000000049768.11.002.mp4 b/data/test_i2v/masked_videos/000000049768.11.002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..2cd18621edff153bdba48b9003f4332aaedc3377 --- /dev/null +++ b/data/test_i2v/masked_videos/000000049768.11.002.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7c8d5ae12acbe72000b14741a645df14b3309f4a16854061e1b68cb2f30ae50 +size 1076475 diff --git a/data/test_i2v/masked_videos/000002012796.6.001.mp4 b/data/test_i2v/masked_videos/000002012796.6.001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bc28c82a84a62b0d7f492b78aec3853614573254 --- /dev/null +++ b/data/test_i2v/masked_videos/000002012796.6.001.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab4d4bee407c6f86a5d4188c3679bee832f070388a251e2d4629fd6d7e6250d +size 157999 diff --git a/data/test_i2v/masked_videos/000003031893.11.003.mp4 b/data/test_i2v/masked_videos/000003031893.11.003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..cc0f54ea6e27bbdbcd799675015cbbd0418ecede --- /dev/null +++ b/data/test_i2v/masked_videos/000003031893.11.003.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:bb7d9beb34ff81afd97834b3322a90ec6a28ac351be4211b78636b7b74300f7e +size 171900 diff --git a/data/test_i2v/masks/000000000005.2.013.npz b/data/test_i2v/masks/000000000005.2.013.npz new file mode 100644 index 0000000000000000000000000000000000000000..e38ccdf316a0987c3b92fd1cd22dec5779c7cede --- /dev/null +++ b/data/test_i2v/masks/000000000005.2.013.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f088f95b36218a7cc33406b77783b7913afed9213b2ee73d80c23caab5d464a6 +size 441136 diff --git a/data/test_i2v/masks/000000000793.0.003.npz b/data/test_i2v/masks/000000000793.0.003.npz new file mode 100644 index 0000000000000000000000000000000000000000..d7531a1d8266b94bf1fb31e91695c28b1ab3ee50 --- /dev/null +++ b/data/test_i2v/masks/000000000793.0.003.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b37b20f2b1e3fe3843bb145c3279aa414018bb167ba28fc54e1a9bc5eed975b +size 205096 diff --git a/data/test_i2v/masks/000000000896.2.005.npz b/data/test_i2v/masks/000000000896.2.005.npz new file mode 100644 index 0000000000000000000000000000000000000000..928004be29069f840578110b127fb6b599130270 --- /dev/null +++ b/data/test_i2v/masks/000000000896.2.005.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf449ba298632f7d2a811e15e5dbabbbaec3c2301e7ba70021afd42ad90831e3 +size 691623 diff --git a/data/test_i2v/masks/000000001059.6.004.npz b/data/test_i2v/masks/000000001059.6.004.npz new file mode 100644 index 0000000000000000000000000000000000000000..ecb522e42e37be4334b6b09f1bcaa382a0ff9f61 --- /dev/null +++ b/data/test_i2v/masks/000000001059.6.004.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d03d4c6877ffbc1a6fb49fb59d9f5f1098d96bc1f7d73829768760fac3a7bf99 +size 148122 diff --git a/data/test_i2v/masks/000000049282.4.005.npz b/data/test_i2v/masks/000000049282.4.005.npz new file mode 100644 index 0000000000000000000000000000000000000000..526cde037434ab97998a8ae1b7deee30e02ef7f9 --- /dev/null +++ b/data/test_i2v/masks/000000049282.4.005.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ded019f277286353c850f08c49a94b70887e0ea71607c6eb8959b79739ecb924 +size 214608 diff --git a/data/test_i2v/masks/000000049768.11.002.npz b/data/test_i2v/masks/000000049768.11.002.npz new file mode 100644 index 0000000000000000000000000000000000000000..7345ca049dfee20fe6da24b09b4627b3715046d5 --- /dev/null +++ b/data/test_i2v/masks/000000049768.11.002.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe0878aee08cc545f6b717515f142cb9299813ce3d89b4e64c5b2c7a6a2a708 +size 590769 diff --git a/data/test_i2v/masks/000002012796.6.001.npz b/data/test_i2v/masks/000002012796.6.001.npz new file mode 100644 index 0000000000000000000000000000000000000000..d88e06a45e3da73b6055ee159ec3db18768b802f --- /dev/null +++ b/data/test_i2v/masks/000002012796.6.001.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1951d5c5fdec649a59e569e3309398fb311745ed72930e8e720a5699208c4c4 +size 169341 diff --git a/data/test_i2v/masks/000003031893.11.003.npz b/data/test_i2v/masks/000003031893.11.003.npz new file mode 100644 index 0000000000000000000000000000000000000000..c119c95af9fed4eac695263d7358e8dee4851c5b --- /dev/null +++ b/data/test_i2v/masks/000003031893.11.003.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d79e7b6e102b7adcc5ddaafafa0c66e67c4efe81a0f7f9f1270cd17b66a193f +size 158881 diff --git a/data/test_i2v/videos/000000000005.2.013.mp4 
b/data/test_i2v/videos/000000000005.2.013.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ec384e5018610e0a0ec4a7857d1fa12ae6caa3e1 --- /dev/null +++ b/data/test_i2v/videos/000000000005.2.013.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a385d5f2066dacaf38a6819cb8594e1b44ca25574c23cc5e67d7a039b9309ef2 +size 575154 diff --git a/data/test_i2v/videos/000000000793.0.003.mp4 b/data/test_i2v/videos/000000000793.0.003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..b4ce75ccab13c39c3d5ef430ae9245182bdbbd5d --- /dev/null +++ b/data/test_i2v/videos/000000000793.0.003.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e51cb421efeb29dfc0ddf93a01a7e8dea822b6ed1ae3ee53e94120170f4fad0 +size 757391 diff --git a/data/test_i2v/videos/000000000896.2.005.mp4 b/data/test_i2v/videos/000000000896.2.005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..db8a0d70c8565b8dad25311ea31ff3442a306501 --- /dev/null +++ b/data/test_i2v/videos/000000000896.2.005.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32bd466321a25326e6c01c22d4342fbc9e2180609092d5ddc5f6b91b07929dda +size 1349710 diff --git a/data/test_i2v/videos/000000001059.6.004.mp4 b/data/test_i2v/videos/000000001059.6.004.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ceccbcbf39aa47f6c77cf9c73d8691d5451c1674 --- /dev/null +++ b/data/test_i2v/videos/000000001059.6.004.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f660782c05b937e2ec08834e76778818049a04d30c0617e538018528ab49bc73 +size 797266 diff --git a/data/test_i2v/videos/000000049282.4.005.mp4 b/data/test_i2v/videos/000000049282.4.005.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..671238be7f3a448ae560b34e2c001d61665ae223 --- /dev/null +++ b/data/test_i2v/videos/000000049282.4.005.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8f7c03191429649a4a23a503db089c6a864b2e4ae0418c8431ea9159954e460 +size 1177012 diff --git a/data/test_i2v/videos/000000049768.11.002.mp4 b/data/test_i2v/videos/000000049768.11.002.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..54aef3ae5e54af0dfeb4ae786b1b25f72da29760 --- /dev/null +++ b/data/test_i2v/videos/000000049768.11.002.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b66d0e0729d314bb275297d27a2b6e073962d70cd6cf1d5caac20bf10adc254 +size 909919 diff --git a/data/test_i2v/videos/000002012796.6.001.mp4 b/data/test_i2v/videos/000002012796.6.001.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..bb2e473f0832f0b45a6366b419a625807b1abee9 --- /dev/null +++ b/data/test_i2v/videos/000002012796.6.001.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7e8b1ab3ca836e70e3386ca26658dcd74df503952cace96ae1a9743b3af44f4 +size 446478 diff --git a/data/test_i2v/videos/000003031893.11.003.mp4 b/data/test_i2v/videos/000003031893.11.003.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..20cc3c33c5ae728d2920c0ad54318ea8e1aacb2d --- /dev/null +++ b/data/test_i2v/videos/000003031893.11.003.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26e186da08e186ad578a68da4bb0b376049e51f0039a302b63e0b2f4c9a25685 +size 506332 diff --git a/data/test_v2v/captions/amalfi-coast_traj_loop2.txt b/data/test_v2v/captions/amalfi-coast_traj_loop2.txt new file mode 100644 index 
0000000000000000000000000000000000000000..eb4ec9af96c48cfb73a389f41ed8d9f30dbb4a3f --- /dev/null +++ b/data/test_v2v/captions/amalfi-coast_traj_loop2.txt @@ -0,0 +1 @@ +The video showcases an aerial view of a historic coastal structure perched on a cliff overlooking the sea. The architecture features a mix of stone and brick, with arched doorways and windows, and a prominent dome on one side. The surrounding area includes a large open plaza where people are gathered, some walking and others sitting, enjoying the scenic view. The coastline is rugged, with rocky outcrops jutting into the deep blue water below. In the background, there are more buildings and structures, indicating a nearby town or village. The overall scene is bathed in warm sunlight, highlighting the textures of the stone and the vibrant colors of. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic. \ No newline at end of file diff --git a/data/test_v2v/captions/art-museum_gradual_0_-30_0.0_0_0.txt b/data/test_v2v/captions/art-museum_gradual_0_-30_0.0_0_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..a89a0f6a20d6f2499c6fd4ac9320aa2987a8e7c9 --- /dev/null +++ b/data/test_v2v/captions/art-museum_gradual_0_-30_0.0_0_0.txt @@ -0,0 +1 @@ +The video showcases an art gallery with a spacious, well-lit interior featuring polished wooden floors arranged in a herringbone pattern. The walls are adorned with a variety of framed paintings, each with distinct styles and subjects. On the left side, there are landscapes and abstract artworks, while the right side displays more traditional portraits and historical scenes. The ceiling is ornate, with intricate designs and recessed lighting that highlights the artwork. The gallery appears to be empty, allowing the viewer to focus solely on the art pieces and the elegant architecture of the space.. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic. \ No newline at end of file diff --git a/data/test_v2v/captions/dog_gradual_0_-30_-0.6_0_0.txt b/data/test_v2v/captions/dog_gradual_0_-30_-0.6_0_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..0587346750730874a274cade384305d93a2af6ae --- /dev/null +++ b/data/test_v2v/captions/dog_gradual_0_-30_-0.6_0_0.txt @@ -0,0 +1 @@ +A small, light brown puppy with floppy ears sits on a speckled carpet in front of a window with sheer curtains. The puppy looks around curiously, its head tilting slightly as it takes in its surroundings. The soft, natural light from the window illuminates the scene, creating a warm and cozy atmosphere. The puppy's expression is one of innocent curiosity, adding to the charm of the moment.. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic. \ No newline at end of file diff --git a/data/test_v2v/captions/fish_gradual_25_0_0.0_0_0.txt b/data/test_v2v/captions/fish_gradual_25_0_0.0_0_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..792d7ab3df049cb2b28256f07325ec37ca25961e --- /dev/null +++ b/data/test_v2v/captions/fish_gradual_25_0_0.0_0_0.txt @@ -0,0 +1 @@ +A vibrant blue and yellow striped fish, resembling an angelfish, swims gracefully among rocky formations in what appears to be an aquarium setting. 
The fish's flowing fins and the intricate patterns on its body are highlighted by the underwater lighting, creating a serene and captivating scene. The background consists of large, rugged rocks that add texture and depth to the environment, enhancing the naturalistic feel of the habitat.. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic. \ No newline at end of file diff --git a/data/test_v2v/captions/grandma-birthday_gradual_0_0_0.2_0_0.txt b/data/test_v2v/captions/grandma-birthday_gradual_0_0_0.2_0_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc7b7b205f859248509a69832d5360ebb681d763 --- /dev/null +++ b/data/test_v2v/captions/grandma-birthday_gradual_0_0_0.2_0_0.txt @@ -0,0 +1 @@ +A joyful celebration unfolds as an elderly woman stands at a table, her face lit up with excitement and anticipation. In front of her is a beautifully decorated birthday cake adorned with numerous candles in shades of pink and yellow. The woman leans forward, her eyes sparkling with delight, as she prepares to blow out the candles. Surrounding her are several people, including family members and friends, who are clapping and cheering enthusiastically. The atmosphere is warm and festive, filled with laughter and happiness as everyone shares in the moment of joy.. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic. \ No newline at end of file diff --git a/data/test_v2v/captions/happy-cat_gradual_-20_0_0.0_0_0.txt b/data/test_v2v/captions/happy-cat_gradual_-20_0_0.0_0_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..63b0a69b7363faf3edb22692d7f14995b750e0af --- /dev/null +++ b/data/test_v2v/captions/happy-cat_gradual_-20_0_0.0_0_0.txt @@ -0,0 +1 @@ +A fluffy orange and white cat with striking green eyes is seen walking through a lush garden. The path is lined with vibrant green foliage and dotted with small yellow flowers. The cat moves gracefully, its paws padding softly on the earthy ground. As it walks, the camera follows closely behind, capturing the serene beauty of the garden and the curious nature of the feline. The sunlight filters through the leaves, casting dappled shadows that dance across the scene, adding to the tranquil atmosphere.. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic. \ No newline at end of file diff --git a/data/test_v2v/captions/suv-in-the-dust_gradual_0_30_0.0_0_0.txt b/data/test_v2v/captions/suv-in-the-dust_gradual_0_30_0.0_0_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..909bdc3af0668ebf50879ab83f7f24605cc6c594 --- /dev/null +++ b/data/test_v2v/captions/suv-in-the-dust_gradual_0_30_0.0_0_0.txt @@ -0,0 +1 @@ +A white off-road vehicle is seen driving along a winding dirt road in a mountainous, forested area. The terrain is rugged and uneven, with the vehicle kicking up dust as it maneuvers through the curves. The surrounding landscape features dense trees and shrubs, with patches of exposed earth and rocks. The sky above is clear and blue, indicating a sunny day. The vehicle's shadow stretches long on the ground, suggesting the sun is high in the sky. The overall scene conveys a sense of adventure and exploration in a remote, natural setting.. The video is of high quality, and the view is very clear. 
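The test_v2v clips are named after the camera move they were rendered with: either <scene>_gradual_<five numbers> or <scene>_traj_<trajectory name>. The five numbers appear to correspond to the five-value --target_pose string that the Gradio demo later in this change passes to the inference script, though their exact semantics are not documented here; the hypothetical helper below only splits the name apart.

from pathlib import Path

def parse_v2v_name(filename: str):
    # Hypothetical helper; assumes the naming pattern seen in this change.
    stem = Path(filename).stem
    if "_gradual_" in stem:
        scene, _, params = stem.partition("_gradual_")
        return scene, "gradual", [float(x) for x in params.split("_")]
    scene, _, traj = stem.partition("_traj_")
    return scene, "traj", traj

print(parse_v2v_name("dog_gradual_0_-30_-0.6_0_0.mp4"))
# -> ('dog', 'gradual', [0.0, -30.0, -0.6, 0.0, 0.0])
print(parse_v2v_name("amalfi-coast_traj_loop2.mp4"))
# -> ('amalfi-coast', 'traj', 'loop2')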
High quality, masterpiece, best quality, highres, ultra-detailed, fantastic. \ No newline at end of file diff --git a/data/test_v2v/captions/vlogger-corgi_traj_loop2.txt b/data/test_v2v/captions/vlogger-corgi_traj_loop2.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0a12407552be683af967358c361d6632fdc9a1a --- /dev/null +++ b/data/test_v2v/captions/vlogger-corgi_traj_loop2.txt @@ -0,0 +1 @@ +A corgi dog is sitting on a vibrant blue beach towel adorned with yellow floral patterns, positioned on a sandy beach. The dog is wearing stylish sunglasses and a colorful lei around its neck, giving it a festive and playful appearance. A GoPro camera mounted on a selfie stick is pointed towards the dog, suggesting that it might be recording a video or taking photos. In the background, tall palm trees sway gently in the breeze, and the ocean stretches out to meet the horizon under a clear blue sky. The scene exudes a relaxed and joyful atmosphere, perfect for a day at the beach.. The video is of high quality, and the view is very clear. High quality, masterpiece, best quality, highres, ultra-detailed, fantastic. \ No newline at end of file diff --git a/data/test_v2v/masked_videos/amalfi-coast_traj_loop2.mp4 b/data/test_v2v/masked_videos/amalfi-coast_traj_loop2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9b5e99e26abe97d72be85b7f64f9883a284412be --- /dev/null +++ b/data/test_v2v/masked_videos/amalfi-coast_traj_loop2.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a88a29ce9e47e9f5c78222721f0adc5c5f7098f90c7371f3859d8742513f8ed +size 752999 diff --git a/data/test_v2v/masked_videos/art-museum_gradual_0_-30_0.0_0_0.mp4 b/data/test_v2v/masked_videos/art-museum_gradual_0_-30_0.0_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..6c871845177a11b84ab7c8015049e7f72b3deed3 --- /dev/null +++ b/data/test_v2v/masked_videos/art-museum_gradual_0_-30_0.0_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa32d57497c1ded042b4e7944f07b15828d2a3199db79f94c0c670f846dbde88 +size 442795 diff --git a/data/test_v2v/masked_videos/dog_gradual_0_-30_-0.6_0_0.mp4 b/data/test_v2v/masked_videos/dog_gradual_0_-30_-0.6_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..55392f3e0b45bd94d15dcf4e576c52506d16355c --- /dev/null +++ b/data/test_v2v/masked_videos/dog_gradual_0_-30_-0.6_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be94b77bd5367476198b3e0758614e8ef672b8d6ae8bcd451a005980d43e3ffe +size 334957 diff --git a/data/test_v2v/masked_videos/fish_gradual_25_0_0.0_0_0.mp4 b/data/test_v2v/masked_videos/fish_gradual_25_0_0.0_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..878ebb96b84497556d742dae93fc9d026a5f021e --- /dev/null +++ b/data/test_v2v/masked_videos/fish_gradual_25_0_0.0_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3875bc6b8ebc9cf35dc77625004b63d1ec816aca1e6133cea42664ef49f2a673 +size 675659 diff --git a/data/test_v2v/masked_videos/grandma-birthday_gradual_0_0_0.2_0_0.mp4 b/data/test_v2v/masked_videos/grandma-birthday_gradual_0_0_0.2_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..73da6b1b4ab582bfe0c66f7e4cc1fa4ea04fafbf --- /dev/null +++ b/data/test_v2v/masked_videos/grandma-birthday_gradual_0_0_0.2_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddfc81e8352a4b2bf6ea83c1789559e5917f4e917ef4ef42889876a6e315cb64 
+size 803428 diff --git a/data/test_v2v/masked_videos/happy-cat_gradual_-20_0_0.0_0_0.mp4 b/data/test_v2v/masked_videos/happy-cat_gradual_-20_0_0.0_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..c385a85fc389b0893ccf41ddc85be137bb122c77 --- /dev/null +++ b/data/test_v2v/masked_videos/happy-cat_gradual_-20_0_0.0_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280ff9d10ec40045740927967956d2eed80029185fc8d015087bd6438a1e719b +size 1146692 diff --git a/data/test_v2v/masked_videos/suv-in-the-dust_gradual_0_30_0.0_0_0.mp4 b/data/test_v2v/masked_videos/suv-in-the-dust_gradual_0_30_0.0_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3eed5ff8112aea29667c44da0d8fdc192e94a39c --- /dev/null +++ b/data/test_v2v/masked_videos/suv-in-the-dust_gradual_0_30_0.0_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:618411664cb18be3c8467b3c80ec33d0810e5ab5c21a5ee56983f4a93048dff2 +size 731799 diff --git a/data/test_v2v/masked_videos/vlogger-corgi_traj_loop2.mp4 b/data/test_v2v/masked_videos/vlogger-corgi_traj_loop2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..7930bf21e3186ce9dbf96f01b5d4121aa84084c2 --- /dev/null +++ b/data/test_v2v/masked_videos/vlogger-corgi_traj_loop2.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efc3dd326c6d7e596d828cb3d84cc99df5222af83ef0dff7d0cb67092992586f +size 1227836 diff --git a/data/test_v2v/masks/amalfi-coast_traj_loop2.npz b/data/test_v2v/masks/amalfi-coast_traj_loop2.npz new file mode 100644 index 0000000000000000000000000000000000000000..7d939f71e9c430dd829eef30b4cd289d55f7dc3f --- /dev/null +++ b/data/test_v2v/masks/amalfi-coast_traj_loop2.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe824736387d974f044c12a5d68cbac21a62a1cbf7e1fd6a5b740d2298137495 +size 251090 diff --git a/data/test_v2v/masks/art-museum_gradual_0_-30_0.0_0_0.npz b/data/test_v2v/masks/art-museum_gradual_0_-30_0.0_0_0.npz new file mode 100644 index 0000000000000000000000000000000000000000..25017fc0a059682fbc5f6eed4fa4098ea541fe37 --- /dev/null +++ b/data/test_v2v/masks/art-museum_gradual_0_-30_0.0_0_0.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0c69c77f6b049069df3439a4526b83e3eb66f626d42cc552a8b6f252b209976 +size 198329 diff --git a/data/test_v2v/masks/dog_gradual_0_-30_-0.6_0_0.npz b/data/test_v2v/masks/dog_gradual_0_-30_-0.6_0_0.npz new file mode 100644 index 0000000000000000000000000000000000000000..ada41a8c569cb48451d021a6fc0d3ef059d548a3 --- /dev/null +++ b/data/test_v2v/masks/dog_gradual_0_-30_-0.6_0_0.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8867600701e3290dcbd44144a67dc0b27e278c72775cc99e843d4ba4574c2db4 +size 218099 diff --git a/data/test_v2v/masks/fish_gradual_25_0_0.0_0_0.npz b/data/test_v2v/masks/fish_gradual_25_0_0.0_0_0.npz new file mode 100644 index 0000000000000000000000000000000000000000..794f43dccef34b19f302e6d710d59bd364b96c76 --- /dev/null +++ b/data/test_v2v/masks/fish_gradual_25_0_0.0_0_0.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:196f40587edfbbd8db425290bea90dbe3f16f0f0ef9d6ce9990202140551c3c2 +size 200344 diff --git a/data/test_v2v/masks/grandma-birthday_gradual_0_0_0.2_0_0.npz b/data/test_v2v/masks/grandma-birthday_gradual_0_0_0.2_0_0.npz new file mode 100644 index 0000000000000000000000000000000000000000..7a75315158f796d624f0e99928765ce9c945d394 --- /dev/null +++ 
b/data/test_v2v/masks/grandma-birthday_gradual_0_0_0.2_0_0.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ff7e2edb3bbaa1aa0343279ff24028ed5e968e136953205c8c3e03efcccf89d +size 245822 diff --git a/data/test_v2v/masks/happy-cat_gradual_-20_0_0.0_0_0.npz b/data/test_v2v/masks/happy-cat_gradual_-20_0_0.0_0_0.npz new file mode 100644 index 0000000000000000000000000000000000000000..025d858f1f6d0a27f5e8e0c080f353db8b764999 --- /dev/null +++ b/data/test_v2v/masks/happy-cat_gradual_-20_0_0.0_0_0.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8223427112d29ddcdc7bdc7fd435b0b1acfca5b6a01f7d6bfec0c5e803155551 +size 444784 diff --git a/data/test_v2v/masks/suv-in-the-dust_gradual_0_30_0.0_0_0.npz b/data/test_v2v/masks/suv-in-the-dust_gradual_0_30_0.0_0_0.npz new file mode 100644 index 0000000000000000000000000000000000000000..51c3c01cb9a33e741e6ed6d58dae6f69b6311a0a --- /dev/null +++ b/data/test_v2v/masks/suv-in-the-dust_gradual_0_30_0.0_0_0.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4285272bc666acf57c814ae864423977db9e861c497c374cf2e530f48e21d790 +size 240572 diff --git a/data/test_v2v/masks/vlogger-corgi_traj_loop2.npz b/data/test_v2v/masks/vlogger-corgi_traj_loop2.npz new file mode 100644 index 0000000000000000000000000000000000000000..e0e5f2f604043d7d366b796d72bec4cd1319ec0f --- /dev/null +++ b/data/test_v2v/masks/vlogger-corgi_traj_loop2.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2cd1df5f2b5b513054d316a750edb8597f8bd4a1471c032560b7878ee2794bb +size 295560 diff --git a/data/test_v2v/videos/amalfi-coast_traj_loop2.mp4 b/data/test_v2v/videos/amalfi-coast_traj_loop2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..9d3a9e212886ca5d4ae7aa2518bda1e6feb8489a --- /dev/null +++ b/data/test_v2v/videos/amalfi-coast_traj_loop2.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24ab6ff3d277b21654a3f373b24626e45991c6a2e991737a3d14354aa0aa3add +size 475271 diff --git a/data/test_v2v/videos/art-museum_gradual_0_-30_0.0_0_0.mp4 b/data/test_v2v/videos/art-museum_gradual_0_-30_0.0_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..4440ca45c727b3122d2afe318120093bdbbb383d --- /dev/null +++ b/data/test_v2v/videos/art-museum_gradual_0_-30_0.0_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:970e1f8806c21819c1a01feb2349c3cb401f2182deada3531df5a916bc38166b +size 335007 diff --git a/data/test_v2v/videos/dog_gradual_0_-30_-0.6_0_0.mp4 b/data/test_v2v/videos/dog_gradual_0_-30_-0.6_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..ac791a59376cd792833239edb84f15b8588a9b17 --- /dev/null +++ b/data/test_v2v/videos/dog_gradual_0_-30_-0.6_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77f36acaa7bc1af28432529717fd9ef27f1841ce6e20bd58f382b9ca35cefe1c +size 241268 diff --git a/data/test_v2v/videos/fish_gradual_25_0_0.0_0_0.mp4 b/data/test_v2v/videos/fish_gradual_25_0_0.0_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..692d03c296f7b25711c0c7b214f1e8aa9924610c --- /dev/null +++ b/data/test_v2v/videos/fish_gradual_25_0_0.0_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95c172cf161dd4f1fc1f5f42bfb0cb487614537a422ca963203bab57d22710ba +size 402460 diff --git a/data/test_v2v/videos/grandma-birthday_gradual_0_0_0.2_0_0.mp4 b/data/test_v2v/videos/grandma-birthday_gradual_0_0_0.2_0_0.mp4 new file 
mode 100644 index 0000000000000000000000000000000000000000..70d760c83bf7bd734a020cc8aac66e1ddeee4e10 --- /dev/null +++ b/data/test_v2v/videos/grandma-birthday_gradual_0_0_0.2_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e8e7c8f5a026c9fb9ee37bbc5c70914f9f0ef49898c8473f60324850d9ff82b +size 435918 diff --git a/data/test_v2v/videos/happy-cat_gradual_-20_0_0.0_0_0.mp4 b/data/test_v2v/videos/happy-cat_gradual_-20_0_0.0_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5a7df2d61fda26428809982a9f2404893346c07c --- /dev/null +++ b/data/test_v2v/videos/happy-cat_gradual_-20_0_0.0_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c121561255a284305fbc4b1963a20bfa96169ca60b43ab6b1c8e7d52c8e7668 +size 793419 diff --git a/data/test_v2v/videos/suv-in-the-dust_gradual_0_30_0.0_0_0.mp4 b/data/test_v2v/videos/suv-in-the-dust_gradual_0_30_0.0_0_0.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..dd1bc99359a63e769ce62903870faa3fcf6d7e65 --- /dev/null +++ b/data/test_v2v/videos/suv-in-the-dust_gradual_0_30_0.0_0_0.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70533ac543c9075abacfc9125f8cba0b689c1bdcce4992a7d99901e29cff526c +size 866252 diff --git a/data/test_v2v/videos/vlogger-corgi_traj_loop2.mp4 b/data/test_v2v/videos/vlogger-corgi_traj_loop2.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..2c26f27c1828aa6a137a0e96ec6ecfe3120ddaf9 --- /dev/null +++ b/data/test_v2v/videos/vlogger-corgi_traj_loop2.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdc535862e4ef89197fc92fd8387d48c093c378f028509e8f22b2f984be226cf +size 376949 diff --git a/download/download.py b/download/download.py new file mode 100644 index 0000000000000000000000000000000000000000..246f6fbc7510faea6ae7ea308435619e9192c1d5 --- /dev/null +++ b/download/download.py @@ -0,0 +1,25 @@ +from huggingface_hub import snapshot_download + +def download_model(): + snapshot_download( + repo_id="tencent/DepthCrafter", + local_dir="../pretrained/DepthCrafter", + local_dir_use_symlinks=False, + ) + snapshot_download( + repo_id="stabilityai/stable-video-diffusion-img2vid", + local_dir="../pretrained/stable-video-diffusion-img2vid", + local_dir_use_symlinks=False, + ) + snapshot_download( + repo_id= "Qwen/Qwen2.5-VL-7B-Instruct", + local_dir="../pretrained/Qwen2.5-VL-7B-Instruct", + local_dir_use_symlinks=False, + ) + snapshot_download( + repo_id="THUDM/CogVideoX-5b-I2V", + local_dir="../pretrained/CogVideoX-5b-I2V", + local_dir_use_symlinks=False, + ) + +download_model() \ No newline at end of file diff --git a/download/download_models.sh b/download/download_models.sh new file mode 100644 index 0000000000000000000000000000000000000000..bd96634c8e4bc65c45f2a0e208bae84034dbd1df --- /dev/null +++ b/download/download_models.sh @@ -0,0 +1,4 @@ + +mkdir -p ../pretrained/RAFT +gdown 1MqDajR89k-xLV0HIrmJ0k-n8ZpG6_suM -O ../pretrained/RAFT/raft-things.pth +python download.py \ No newline at end of file diff --git a/gradio_app.py b/gradio_app.py new file mode 100644 index 0000000000000000000000000000000000000000..0618b37abbafe69ea1a1e524e60d87581ea5f2f7 --- /dev/null +++ b/gradio_app.py @@ -0,0 +1,118 @@ +import os +import subprocess +from datetime import datetime +from pathlib import Path + +import gradio as gr + +# ----------------------------- +# Setup paths and env +# ----------------------------- +HF_HOME = "/app/hf_cache" +os.environ["HF_HOME"] = HF_HOME 
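As an aside, the two download helpers above are expected to leave the following layout under pretrained/ at the repository root (download.py writes to ../pretrained relative to download/); a quick, hypothetical sanity check:

from pathlib import Path

# Paths mirror the snapshot_download targets in download/download.py and the
# RAFT checkpoint fetched by download/download_models.sh.
EXPECTED = [
    "pretrained/RAFT/raft-things.pth",
    "pretrained/DepthCrafter",
    "pretrained/stable-video-diffusion-img2vid",
    "pretrained/Qwen2.5-VL-7B-Instruct",
    "pretrained/CogVideoX-5b-I2V",
]
missing = [p for p in EXPECTED if not Path(p).exists()]
print("missing pretrained assets:", missing or "none")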
+os.environ["TRANSFORMERS_CACHE"] = HF_HOME +os.makedirs(HF_HOME, exist_ok=True) + +PRETRAINED_DIR = "/app/pretrained" +os.makedirs(PRETRAINED_DIR, exist_ok=True) + + +# ----------------------------- +# Step 1: Optional Model Download +# ----------------------------- +def download_models(): + expected_model = os.path.join(PRETRAINED_DIR, "RAFT/raft-things.pth") + if not Path(expected_model).exists(): + print("⚙️ Downloading pretrained models...") + try: + subprocess.check_call(["bash", "download/download_models.sh"]) + print("✅ Models downloaded.") + except subprocess.CalledProcessError as e: + print(f"❌ Model download failed: {e}") + else: + print("✅ Pretrained models already exist.") + + +download_models() + + +# ----------------------------- +# Step 2: Inference Logic +# ----------------------------- +def run_epic_inference(video_path, caption, motion_type): + temp_input_path = "/app/temp_input.mp4" + output_dir = f"/app/output_{motion_type}" + traj_name = motion_type + traj_txt = f"/app/inference/v2v_data/test/trajs/{traj_name}.txt" + + # Save uploaded video + if video_path: + os.system(f"cp '{video_path}' {temp_input_path}") + + # Construct command to run inference + command = [ + "python", + "/app/inference/v2v_data/inference.py", + "--video_path", + temp_input_path, + "--stride", + "1", + "--out_dir", + output_dir, + "--radius_scale", + "1", + "--camera", + "traj", + "--mask", + "--target_pose", + "0 30 -0.6 0 0", + "--traj_txt", + traj_txt, + "--save_name", + f"amalfi-coast_traj_{traj_name}", + "--mode", + "gradual", + "--out_dir", + output_dir, + ] + + # Run inference command + try: + result = subprocess.run(command, capture_output=True, text=True, check=True) + logs = result.stdout + except subprocess.CalledProcessError as e: + logs = f"❌ Inference failed:\n{e.stderr}" + return logs, None + + # Locate the output video + output_video = Path(output_dir) / f"amalfi-coast_traj_{traj_name}.mp4" + if output_video.exists(): + return logs, str(output_video) + else: + return f"Inference succeeded but no output video found in {output_dir}", None + + +# ----------------------------- +# Step 3: Create Gradio UI +# ----------------------------- +demo = gr.Interface( + fn=run_epic_inference, + inputs=[ + gr.Video(label="Upload Video (MP4)"), + gr.Textbox(label="Caption", placeholder="e.g., Amalfi coast with boats"), + gr.Dropdown( + choices=["zoom_in", "rotate", "orbit", "pan", "loop1"], + label="Camera Motion Type", + value="zoom_in", + ), + ], + outputs=[gr.Textbox(label="Inference Logs"), gr.Video(label="Generated Video")], + title="🎬 EPiC: Efficient Video Camera Control", + description="Upload a video, describe the scene, and apply cinematic camera motion using pretrained EPiC models.", +) + +# ----------------------------- +# Step 4: Launch App +# ----------------------------- +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0", server_port=7860) \ No newline at end of file diff --git a/inference/__init__.py b/inference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/inference/cli_demo_camera_i2v_pcd.py b/inference/cli_demo_camera_i2v_pcd.py new file mode 100644 index 0000000000000000000000000000000000000000..f637bc9f805f49e2f245fd6bffe5b44ed108f0fa --- /dev/null +++ b/inference/cli_demo_camera_i2v_pcd.py @@ -0,0 +1,454 @@ +import sys +import os +sys.path.insert(0, os.getcwd()) +sys.path.append('.') +sys.path.append('..') +import argparse +import os + +import torch +from transformers import 
T5EncoderModel, T5Tokenizer +from diffusers import ( + CogVideoXDDIMScheduler, + CogVideoXDPMScheduler, + AutoencoderKLCogVideoX +) +from diffusers.utils import export_to_video, load_video + +from controlnet_pipeline import ControlnetCogVideoXImageToVideoPCDPipeline +from cogvideo_transformer import CustomCogVideoXTransformer3DModel +from cogvideo_controlnet_pcd import CogVideoXControlnetPCD +from training.controlnet_datasets_camera_pcd_mask import RealEstate10KPCDRenderDataset +from torchvision.transforms.functional import to_pil_image + +from inference.utils import stack_images_horizontally +from PIL import Image +import numpy as np +import torchvision.transforms as transforms +import cv2 + +import cv2 +import numpy as np +import torch +import torch.nn.functional as F + +import cv2 +import numpy as np +import torch + +def get_black_region_mask_tensor(video_tensor, threshold=2, kernel_size=15): + """ + Generate cleaned binary masks for black regions in a video tensor. + + Args: + video_tensor (torch.Tensor): shape (T, H, W, 3), RGB, uint8 + threshold (int): pixel intensity threshold to consider a pixel as black (default: 20) + kernel_size (int): morphological kernel size to smooth masks (default: 7) + + Returns: + torch.Tensor: binary mask tensor of shape (T, H, W), where 1 indicates black region + """ + video_uint8 = ((video_tensor + 1.0) * 127.5).clamp(0, 255).to(torch.uint8).permute(0, 2, 3, 1) # shape (T, H, W, C) + video_np = video_uint8.numpy() + + T, H, W, _ = video_np.shape + masks = np.empty((T, H, W), dtype=np.uint8) + kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (kernel_size, kernel_size)) + + for t in range(T): + img = video_np[t] # (H, W, 3), uint8 + gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + _, mask = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY_INV) + mask_cleaned = cv2.morphologyEx(mask, cv2.MORPH_CLOSE, kernel) + masks[t] = (mask_cleaned > 0).astype(np.uint8) + return torch.from_numpy(masks) + +def maxpool_mask_tensor(mask_tensor): + """ + Apply spatial and temporal max pooling to a binary mask tensor. + + Args: + mask_tensor (torch.Tensor): shape (T, H, W), binary mask (0 or 1) + + Returns: + torch.Tensor: shape (12, 30, 45), pooled binary mask + """ + T, H, W = mask_tensor.shape + assert T % 12 == 0, "T must be divisible by 12 (e.g., 48)" + assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45" + + # Reshape to (B=T, C=1, H, W) for 2D spatial pooling + x = mask_tensor.unsqueeze(1).float() # (T, 1, H, W) + x_pooled = F.max_pool2d(x, kernel_size=(H // 30, W // 45)) # → (T, 1, 30, 45) + + # Temporal pooling: reshape to (12, T//12, 30, 45) and max along dim=1 + t_groups = T // 12 + x_pooled = x_pooled.view(12, t_groups, 30, 45) + pooled_mask = torch.amax(x_pooled, dim=1) # → (12, 30, 45) + + # Add a zero frame at the beginning: shape (1, 30, 45) + zero_frame = torch.zeros_like(pooled_mask[0:1]) # (1, 30, 45) + pooled_mask = torch.cat([zero_frame, pooled_mask], dim=0) # → (13, 30, 45) + + return 1 - pooled_mask.int() + +def avgpool_mask_tensor(mask_tensor): + """ + Apply spatial and temporal average pooling to a binary mask tensor, + and threshold at 0.5 to retain only majority-active regions. 
+ + Args: + mask_tensor (torch.Tensor): shape (T, H, W), binary mask (0 or 1) + + Returns: + torch.Tensor: shape (13, 30, 45), pooled binary mask with first frame zeroed + """ + T, H, W = mask_tensor.shape + assert T % 12 == 0, "T must be divisible by 12 (e.g., 48)" + assert H % 30 == 0 and W % 45 == 0, "H and W must be divisible by 30 and 45" + + # Spatial average pooling + x = mask_tensor.unsqueeze(1).float() # (T, 1, H, W) + x_pooled = F.avg_pool2d(x, kernel_size=(H // 30, W // 45)) # → (T, 1, 30, 45) + + # Temporal pooling + t_groups = T // 12 + x_pooled = x_pooled.view(12, t_groups, 30, 45) + pooled_avg = torch.mean(x_pooled, dim=1) # → (12, 30, 45) + + # Threshold: keep only when > 0.5 + pooled_mask = (pooled_avg > 0.5).int() + + # Add zero frame + zero_frame = torch.zeros_like(pooled_mask[0:1]) + pooled_mask = torch.cat([zero_frame, pooled_mask], dim=0) # → (13, 30, 45) + + return 1 - pooled_mask # inverting as before + +@torch.no_grad() +def generate_video( + prompt, + image, + video_root_dir: str, + base_model_path: str, + use_zero_conv: bool, + controlnet_model_path: str, + controlnet_weights: float = 1.0, + controlnet_guidance_start: float = 0.0, + controlnet_guidance_end: float = 1.0, + use_dynamic_cfg: bool = True, + lora_path: str = None, + lora_rank: int = 128, + output_path: str = "./output/", + num_inference_steps: int = 50, + guidance_scale: float = 6.0, + num_videos_per_prompt: int = 1, + dtype: torch.dtype = torch.bfloat16, + seed: int = 42, + num_frames: int = 49, + height: int = 480, + width: int = 720, + start_camera_idx: int = 0, + end_camera_idx: int = 1, + controlnet_transformer_num_attn_heads: int = None, + controlnet_transformer_attention_head_dim: int = None, + controlnet_transformer_out_proj_dim_factor: int = None, + controlnet_transformer_out_proj_dim_zero_init: bool = False, + controlnet_transformer_num_layers: int = 8, + downscale_coef: int = 8, + controlnet_input_channels: int = 6, + infer_with_mask: bool = False, + pool_style: str = 'avg', + pipe_cpu_offload: bool = False, +): + """ + Generates a video based on the given prompt and saves it to the specified path. + + Parameters: + - prompt (str): The description of the video to be generated. + - video_root_dir (str): The path to the camera dataset + - annotation_json (str): Name of subset (train.json or test.json) + - base_model_path (str): The path of the pre-trained model to be used. + - controlnet_model_path (str): The path of the pre-trained conrolnet model to be used. + - controlnet_weights (float): Strenght of controlnet + - controlnet_guidance_start (float): The stage when the controlnet starts to be applied + - controlnet_guidance_end (float): The stage when the controlnet end to be applied + - lora_path (str): The path of the LoRA weights to be used. + - lora_rank (int): The rank of the LoRA weights. + - output_path (str): The path where the generated video will be saved. + - num_inference_steps (int): Number of steps for the inference process. More steps can result in better quality. + - guidance_scale (float): The scale for classifier-free guidance. Higher values can lead to better alignment with the prompt. + - num_videos_per_prompt (int): Number of videos to generate per prompt. + - dtype (torch.dtype): The data type for computation (default is torch.bfloat16). + - seed (int): The seed for reproducibility. + """ + os.makedirs(output_path, exist_ok=True) + # 1. Load the pre-trained CogVideoX pipeline with the specified precision (bfloat16). 
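# Editorial note (assumption, not part of the original file): the (13, 30, 45)
# grid produced by maxpool_mask_tensor / avgpool_mask_tensor above matches the
# token layout CogVideoX uses for a 49-frame, 480x720 input:
#   1 + 48 // 4 = 13 latent frames (temporal VAE compression of 4, first frame kept)
#   (480 // 8) // 2 = 30 and (720 // 8) // 2 = 45 spatial tokens
#   (VAE downscale of 8 followed by transformer patch size 2).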
+ tokenizer = T5Tokenizer.from_pretrained( + base_model_path, subfolder="tokenizer" + ) + text_encoder = T5EncoderModel.from_pretrained( + base_model_path, subfolder="text_encoder" + ) + transformer = CustomCogVideoXTransformer3DModel.from_pretrained( + base_model_path, subfolder="transformer" + ) + vae = AutoencoderKLCogVideoX.from_pretrained( + base_model_path, subfolder="vae" + ) + scheduler = CogVideoXDDIMScheduler.from_pretrained( + base_model_path, subfolder="scheduler" + ) + # ControlNet + num_attention_heads_orig = 48 if "5b" in base_model_path.lower() else 30 + controlnet_kwargs = {} + if controlnet_transformer_num_attn_heads is not None: + controlnet_kwargs["num_attention_heads"] = args.controlnet_transformer_num_attn_heads + else: + controlnet_kwargs["num_attention_heads"] = num_attention_heads_orig + if controlnet_transformer_attention_head_dim is not None: + controlnet_kwargs["attention_head_dim"] = controlnet_transformer_attention_head_dim + if controlnet_transformer_out_proj_dim_factor is not None: + controlnet_kwargs["out_proj_dim"] = num_attention_heads_orig * controlnet_transformer_out_proj_dim_factor + controlnet_kwargs["out_proj_dim_zero_init"] = controlnet_transformer_out_proj_dim_zero_init + controlnet = CogVideoXControlnetPCD( + num_layers=controlnet_transformer_num_layers, + downscale_coef=downscale_coef, + in_channels=controlnet_input_channels, + use_zero_conv=use_zero_conv, + **controlnet_kwargs, + ) + if controlnet_model_path: + ckpt = torch.load(controlnet_model_path, map_location='cpu', weights_only=False) + controlnet_state_dict = {} + for name, params in ckpt['state_dict'].items(): + controlnet_state_dict[name] = params + m, u = controlnet.load_state_dict(controlnet_state_dict, strict=False) + print(f'[ Weights from pretrained controlnet was loaded into controlnet ] [M: {len(m)} | U: {len(u)}]') + + # Full pipeline + pipe = ControlnetCogVideoXImageToVideoPCDPipeline( + tokenizer=tokenizer, + text_encoder=text_encoder, + transformer=transformer, + vae=vae, + controlnet=controlnet, + scheduler=scheduler, + ).to('cuda') + # If you're using with lora, add this code + if lora_path: + pipe.load_lora_weights(lora_path, weight_name="pytorch_lora_weights.safetensors", adapter_name="test_1") + pipe.fuse_lora(lora_scale=1 / lora_rank) + + # 2. Set Scheduler. + # Can be changed to `CogVideoXDPMScheduler` or `CogVideoXDDIMScheduler`. + # We recommend using `CogVideoXDDIMScheduler` for CogVideoX-2B. + # using `CogVideoXDPMScheduler` for CogVideoX-5B / CogVideoX-5B-I2V. + + # pipe.scheduler = CogVideoXDDIMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") + pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing") + + # 3. Enable CPU offload for the model. + # turn off if you have multiple GPUs or enough GPU memory(such as H100) and it will cost less time in inference + # and enable to("cuda") + + # pipe.to("cuda") + pipe = pipe.to(dtype=dtype) + # pipe.enable_sequential_cpu_offload() + if pipe_cpu_offload: + pipe.enable_model_cpu_offload() + + pipe.vae.enable_slicing() + pipe.vae.enable_tiling() + + # 4. 
Load dataset + eval_dataset = RealEstate10KPCDRenderDataset( + video_root_dir=video_root_dir, + image_size=(height, width), + sample_n_frames=num_frames, + ) + + None_prompt = True + if prompt: + None_prompt = False + print(eval_dataset.dataset) + + for camera_idx in range(start_camera_idx, end_camera_idx): + # Get data + data_dict = eval_dataset[camera_idx] + reference_video = data_dict['video'] + anchor_video = data_dict['anchor_video'] + print(eval_dataset.dataset[camera_idx],seed) + + if None_prompt: + # Set output directory + output_path_file = os.path.join(output_path, f"{camera_idx:05d}_{seed}_out.mp4") + prompt = data_dict['caption'] + else: + # Set output directory + output_path_file = os.path.join(output_path, f"{prompt[:10]}_{camera_idx:05d}_{seed}_out.mp4") + + if image is None: + input_images = reference_video[0].unsqueeze(0) + else: + input_images = torch.tensor(np.array(Image.open(image))).permute(2,0,1).unsqueeze(0)/255 + pixel_transforms = [transforms.Resize((480, 720)), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)] + for transform in pixel_transforms: + input_images = transform(input_images) + + # if image is None: + # input_images = reference_video[:24] + # else: + # input_images = torch.tensor(np.array(Image.open(image))).permute(2,0,1)/255 + # pixel_transforms = [transforms.Resize((480, 720)), + # transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)] + # for transform in pixel_transforms: + # input_images = transform(input_images) + + reference_frames = [to_pil_image(frame) for frame in ((reference_video)/2+0.5)] + + output_path_file_reference = output_path_file.replace("_out.mp4", "_reference.mp4") + output_path_file_out_reference = output_path_file.replace(".mp4", "_reference.mp4") + + if infer_with_mask: + try: + video_mask = 1 - torch.from_numpy(np.load(os.path.join(eval_dataset.root_path,'masks',eval_dataset.dataset[camera_idx]+'.npz'))['mask']*1) + except: + print('using derived mask') + video_mask = get_black_region_mask_tensor(anchor_video) + + if pool_style == 'max': + controlnet_output_mask = maxpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda') + elif pool_style == 'avg': + controlnet_output_mask = avgpool_mask_tensor(video_mask[1:]).flatten().unsqueeze(0).unsqueeze(-1).to('cuda') + else: + controlnet_output_mask = None + # if os.path.isfile(output_path_file): + # continue + + # 5. Generate the video frames based on the prompt. + # `num_frames` is the Number of frames to generate. + # This is the default value for 6 seconds video and 8 fps and will plus 1 frame for the first frame and 49 frames. + video_generate_all = pipe( + image=input_images, + anchor_video=anchor_video, + controlnet_output_mask=controlnet_output_mask, + prompt=prompt, + num_videos_per_prompt=num_videos_per_prompt, # Number of videos to generate per prompt + num_inference_steps=num_inference_steps, # Number of inference steps + num_frames=num_frames, # Number of frames to generate,changed to 49 for diffusers version `0.30.3` and after. + use_dynamic_cfg=use_dynamic_cfg, # This id used for DPM Sechduler, for DDIM scheduler, it should be False + guidance_scale=guidance_scale, + generator=torch.Generator().manual_seed(seed), # Set the seed for reproducibility + controlnet_weights=controlnet_weights, + controlnet_guidance_start=controlnet_guidance_start, + controlnet_guidance_end=controlnet_guidance_end, + ).frames + video_generate = video_generate_all[0] + + # 6. 
Export the generated frames to a video file. fps must be 8 for original video. + export_to_video(video_generate, output_path_file, fps=8) + export_to_video(reference_frames, output_path_file_reference, fps=8) + out_reference_frames = [ + stack_images_horizontally(frame_reference, frame_out) + for frame_out, frame_reference in zip(video_generate, reference_frames) + ] + + anchor_video = [to_pil_image(frame) for frame in ((anchor_video)/2+0.5)] + out_reference_frames = [ + stack_images_horizontally(frame_out, frame_reference) + for frame_out, frame_reference in zip(out_reference_frames, anchor_video) + ] + export_to_video(out_reference_frames, output_path_file_out_reference, fps=8) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate a video from a text prompt using CogVideoX") + parser.add_argument("--prompt", type=str, default=None, help="The description of the video to be generated") + parser.add_argument("--image", type=str, default=None, help="The reference image of the video to be generated") + parser.add_argument( + "--video_root_dir", + type=str, + required=True, + help="The path of the video for controlnet processing.", + ) + parser.add_argument( + "--base_model_path", type=str, default="THUDM/CogVideoX-5b", help="The path of the pre-trained model to be used" + ) + parser.add_argument( + "--controlnet_model_path", type=str, default="TheDenk/cogvideox-5b-controlnet-hed-v1", help="The path of the controlnet pre-trained model to be used" + ) + parser.add_argument("--controlnet_weights", type=float, default=0.5, help="Strenght of controlnet") + parser.add_argument("--use_zero_conv", action="store_true", default=False, help="Use zero conv") + parser.add_argument("--infer_with_mask", action="store_true", default=False, help="add mask to controlnet") + parser.add_argument("--pool_style", default='max', help="max pool or avg pool") + parser.add_argument("--controlnet_guidance_start", type=float, default=0.0, help="The stage when the controlnet starts to be applied") + parser.add_argument("--controlnet_guidance_end", type=float, default=0.5, help="The stage when the controlnet end to be applied") + parser.add_argument("--use_dynamic_cfg", type=bool, default=True, help="Use dynamic cfg") + parser.add_argument("--lora_path", type=str, default=None, help="The path of the LoRA weights to be used") + parser.add_argument("--lora_rank", type=int, default=128, help="The rank of the LoRA weights") + parser.add_argument( + "--output_path", type=str, default="./output", help="The path where the generated video will be saved" + ) + parser.add_argument("--guidance_scale", type=float, default=6.0, help="The scale for classifier-free guidance") + parser.add_argument( + "--num_inference_steps", type=int, default=50, help="Number of steps for the inference process" + ) + parser.add_argument("--num_videos_per_prompt", type=int, default=1, help="Number of videos to generate per prompt") + parser.add_argument( + "--dtype", type=str, default="bfloat16", help="The data type for computation (e.g., 'float16' or 'bfloat16')" + ) + parser.add_argument("--seed", type=int, default=42, help="The seed for reproducibility") + parser.add_argument("--height", type=int, default=480) + parser.add_argument("--width", type=int, default=720) + parser.add_argument("--num_frames", type=int, default=49) + parser.add_argument("--start_camera_idx", type=int, default=0) + parser.add_argument("--end_camera_idx", type=int, default=1) + parser.add_argument("--controlnet_transformer_num_attn_heads", 
type=int, default=None) + parser.add_argument("--controlnet_transformer_attention_head_dim", type=int, default=None) + parser.add_argument("--controlnet_transformer_out_proj_dim_factor", type=int, default=None) + parser.add_argument("--controlnet_transformer_out_proj_dim_zero_init", action="store_true", default=False, help=("Init project zero."), + ) + parser.add_argument("--downscale_coef", type=int, default=8) + parser.add_argument("--vae_channels", type=int, default=16) + parser.add_argument("--controlnet_input_channels", type=int, default=6) + parser.add_argument("--controlnet_transformer_num_layers", type=int, default=8) + parser.add_argument("--enable_model_cpu_offload", action="store_true", default=False, help="Enable model CPU offload") + + args = parser.parse_args() + dtype = torch.float16 if args.dtype == "float16" else torch.bfloat16 + generate_video( + prompt=args.prompt, + image=args.image, + video_root_dir=args.video_root_dir, + base_model_path=args.base_model_path, + use_zero_conv=args.use_zero_conv, + controlnet_model_path=args.controlnet_model_path, + controlnet_weights=args.controlnet_weights, + controlnet_guidance_start=args.controlnet_guidance_start, + controlnet_guidance_end=args.controlnet_guidance_end, + use_dynamic_cfg=args.use_dynamic_cfg, + lora_path=args.lora_path, + lora_rank=args.lora_rank, + output_path=args.output_path, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + num_videos_per_prompt=args.num_videos_per_prompt, + dtype=dtype, + seed=args.seed, + height=args.height, + width=args.width, + num_frames=args.num_frames, + start_camera_idx=args.start_camera_idx, + end_camera_idx=args.end_camera_idx, + controlnet_transformer_num_attn_heads=args.controlnet_transformer_num_attn_heads, + controlnet_transformer_attention_head_dim=args.controlnet_transformer_attention_head_dim, + controlnet_transformer_out_proj_dim_factor=args.controlnet_transformer_out_proj_dim_factor, + controlnet_transformer_num_layers=args.controlnet_transformer_num_layers, + downscale_coef=args.downscale_coef, + controlnet_input_channels=args.controlnet_input_channels, + infer_with_mask=args.infer_with_mask, + pool_style=args.pool_style, + pipe_cpu_offload=args.enable_model_cpu_offload, + ) diff --git a/inference/utils.py b/inference/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..50a529a8d90e011f09f4c45c6d1ac04d6a7d91ad --- /dev/null +++ b/inference/utils.py @@ -0,0 +1,15 @@ +from PIL import Image + +def stack_images_horizontally(image1: Image.Image, image2: Image.Image) -> Image.Image: + # Ensure both images have the same height + height = max(image1.height, image2.height) + width = image1.width + image2.width + + # Create a new blank image with the combined width and the maximum height + new_image = Image.new('RGB', (width, height)) + + # Paste the images into the new image + new_image.paste(image1, (0, 0)) + new_image.paste(image2, (image1.width, 0)) + + return new_image \ No newline at end of file diff --git a/inference/v2v_data/DepthCrafter/LICENSE b/inference/v2v_data/DepthCrafter/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..eafbe27468e9e25869eef89e6434328e5ff96c02 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/LICENSE @@ -0,0 +1,32 @@ +Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. The below software in this distribution may have been modified by THL A29 Limited ("Tencent Modifications"). 
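For reference, here is a hypothetical invocation of the inference CLI defined above (inference/cli_demo_camera_i2v_pcd.py), assembled the same way gradio_app.py builds its subprocess command; every path is a placeholder and the ControlNet checkpoint name in particular is an assumption.

import subprocess

command = [
    "python", "inference/cli_demo_camera_i2v_pcd.py",
    "--video_root_dir", "data/test_i2v",                      # dataset added in this change
    "--base_model_path", "pretrained/CogVideoX-5b-I2V",       # snapshot from download/download.py
    "--controlnet_model_path", "pretrained/EPiC/checkpoint.pt",  # placeholder checkpoint path
    "--start_camera_idx", "0",
    "--end_camera_idx", "1",
    "--infer_with_mask",
    "--pool_style", "avg",
    "--output_path", "./output",
]
subprocess.run(command, check=True)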
+ +License Terms of the inference code of DepthCrafter: +-------------------------------------------------------------------- + +Permission is hereby granted, free of charge, to any person obtaining a copy of this Software and associated documentation files, to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sublicense copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +- You agree to use the DepthCrafter only for academic, research and education purposes, and refrain from using it for any commercial or production purposes under any circumstances. + +- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +For avoidance of doubts, “Software” means the DepthCrafter model inference code and weights made available under this license excluding any pre-trained data and other AI components. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + +Other dependencies and licenses: + +Open Source Software Licensed under the MIT License: +-------------------------------------------------------------------- +1. Stability AI - Code +Copyright (c) 2023 Stability AI + +Terms of the MIT License: +-------------------------------------------------------------------- +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +**You may find the code license of Stability AI at the following links: https://github.com/Stability-AI/generative-models/blob/main/LICENSE-CODE diff --git a/inference/v2v_data/DepthCrafter/app.py b/inference/v2v_data/DepthCrafter/app.py new file mode 100644 index 0000000000000000000000000000000000000000..26d961573f019d8d794195640f94e00d4dfd9578 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/app.py @@ -0,0 +1,239 @@ +import gc +import os + +import numpy as np +import spaces +import gradio as gr +import torch +from diffusers.training_utils import set_seed + +from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline +from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter + +import uuid +import random +from huggingface_hub import hf_hub_download + +from depthcrafter.utils import read_video_frames, vis_sequence_depth, save_video + +examples = [ + ["examples/example_01.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_02.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_03.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_04.mp4", 5, 1.0, 1024, -1, -1], + ["examples/example_05.mp4", 5, 1.0, 1024, -1, -1], +] + + +unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained( + "tencent/DepthCrafter", + low_cpu_mem_usage=True, + torch_dtype=torch.float16, +) +pipe = DepthCrafterPipeline.from_pretrained( + "stabilityai/stable-video-diffusion-img2vid-xt", + unet=unet, + torch_dtype=torch.float16, + variant="fp16", +) +pipe.to("cuda") + + +@spaces.GPU(duration=120) +def infer_depth( + video: str, + num_denoising_steps: int, + guidance_scale: float, + max_res: int = 1024, + process_length: int = -1, + target_fps: int = -1, + # + save_folder: str = "./demo_output", + window_size: int = 110, + overlap: int = 25, + seed: int = 42, + track_time: bool = True, + save_npz: bool = False, +): + set_seed(seed) + pipe.enable_xformers_memory_efficient_attention() + + frames, target_fps = read_video_frames(video, process_length, target_fps, max_res) + + # inference the depth map using the DepthCrafter pipeline + with torch.inference_mode(): + res = pipe( + frames, + height=frames.shape[1], + width=frames.shape[2], + output_type="np", + guidance_scale=guidance_scale, + num_inference_steps=num_denoising_steps, + window_size=window_size, + overlap=overlap, + track_time=track_time, + ).frames[0] + # convert the three-channel output to a single channel depth map + res = res.sum(-1) / res.shape[-1] + # normalize the depth map to [0, 1] across the whole video + res = (res - res.min()) / (res.max() - res.min()) + # visualize the depth map and save the results + vis = vis_sequence_depth(res) + # save the depth map and visualization with the target FPS + save_path = os.path.join(save_folder, os.path.splitext(os.path.basename(video))[0]) + print(f"==> saving results to {save_path}") + os.makedirs(os.path.dirname(save_path), exist_ok=True) + if save_npz: + np.savez_compressed(save_path + ".npz", depth=res) + save_video(res, save_path + "_depth.mp4", fps=target_fps) + save_video(vis, save_path + "_vis.mp4", fps=target_fps) + save_video(frames, save_path + "_input.mp4", fps=target_fps) + + # clear the cache for the next video + gc.collect() + torch.cuda.empty_cache() + + return [ + save_path + "_input.mp4", + save_path + "_vis.mp4", + # save_path + "_depth.mp4", + ] + + +def construct_demo(): + with gr.Blocks(analytics_enabled=False) as depthcrafter_iface: + gr.Markdown( + """ +

+ DepthCrafter: Generating Consistent Long Depth Sequences for Open-world Videos
+
+ Wenbo Hu, Xiangjun Gao, Xiaoyu Li, Sijie Zhao, Xiaodong Cun, Yong Zhang, Long Quan, Ying Shan
+
+ If you find DepthCrafter useful, please help ⭐ the [Github Repo], which is important to Open-Source projects. Thanks! [ArXiv] [Project Page]
+ """ + ) + + with gr.Row(equal_height=True): + with gr.Column(scale=1): + input_video = gr.Video(label="Input Video") + + # with gr.Tab(label="Output"): + with gr.Column(scale=2): + with gr.Row(equal_height=True): + output_video_1 = gr.Video( + label="Preprocessed video", + interactive=False, + autoplay=True, + loop=True, + show_share_button=True, + scale=5, + ) + output_video_2 = gr.Video( + label="Generated Depth Video", + interactive=False, + autoplay=True, + loop=True, + show_share_button=True, + scale=5, + ) + + with gr.Row(equal_height=True): + with gr.Column(scale=1): + with gr.Row(equal_height=False): + with gr.Accordion("Advanced Settings", open=False): + num_denoising_steps = gr.Slider( + label="num denoising steps", + minimum=1, + maximum=25, + value=5, + step=1, + ) + guidance_scale = gr.Slider( + label="cfg scale", + minimum=1.0, + maximum=1.2, + value=1.0, + step=0.1, + ) + max_res = gr.Slider( + label="max resolution", + minimum=512, + maximum=2048, + value=1024, + step=64, + ) + process_length = gr.Slider( + label="process length", + minimum=-1, + maximum=280, + value=60, + step=1, + ) + process_target_fps = gr.Slider( + label="target FPS", + minimum=-1, + maximum=30, + value=15, + step=1, + ) + generate_btn = gr.Button("Generate") + with gr.Column(scale=2): + pass + + gr.Examples( + examples=examples, + inputs=[ + input_video, + num_denoising_steps, + guidance_scale, + max_res, + process_length, + process_target_fps, + ], + outputs=[output_video_1, output_video_2], + fn=infer_depth, + cache_examples="lazy", + ) + gr.Markdown( + """ + Note: + For time quota consideration, we set the default parameters to be more efficient here, + with a trade-off of shorter video length and slightly lower quality. + You may adjust the parameters according to our + [Github Repo] + for better results if you have enough time quota. 
+ + """ + ) + + generate_btn.click( + fn=infer_depth, + inputs=[ + input_video, + num_denoising_steps, + guidance_scale, + max_res, + process_length, + process_target_fps, + ], + outputs=[output_video_1, output_video_2], + ) + + return depthcrafter_iface + + +if __name__ == "__main__": + demo = construct_demo() + demo.queue() + # demo.launch(server_name="0.0.0.0", server_port=12345, debug=True, share=False) + demo.launch(share=True) diff --git a/inference/v2v_data/DepthCrafter/benchmark/__init__.py b/inference/v2v_data/DepthCrafter/benchmark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/inference/v2v_data/DepthCrafter/benchmark/csv/meta_bonn.csv b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_bonn.csv new file mode 100644 index 0000000000000000000000000000000000000000..6d40d857a1b75a07672a5d8a2ef5f4f715858766 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_bonn.csv @@ -0,0 +1,6 @@ +filepath_left,filepath_disparity +bonn/rgbd_bonn_synchronous_rgb_left.mp4,bonn/rgbd_bonn_synchronous_disparity.npz +bonn/rgbd_bonn_person_tracking_rgb_left.mp4,bonn/rgbd_bonn_person_tracking_disparity.npz +bonn/rgbd_bonn_crowd2_rgb_left.mp4,bonn/rgbd_bonn_crowd2_disparity.npz +bonn/rgbd_bonn_crowd3_rgb_left.mp4,bonn/rgbd_bonn_crowd3_disparity.npz +bonn/rgbd_bonn_balloon2_rgb_left.mp4,bonn/rgbd_bonn_balloon2_disparity.npz diff --git a/inference/v2v_data/DepthCrafter/benchmark/csv/meta_kitti_val.csv b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_kitti_val.csv new file mode 100644 index 0000000000000000000000000000000000000000..2cc17599dd8dac44348b501ea1556a5e28d875ac --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_kitti_val.csv @@ -0,0 +1,14 @@ +filepath_left,filepath_disparity +KITTI/2011_09_28_drive_0037_sync_rgb_left.mp4,KITTI/2011_09_28_drive_0037_sync_disparity.npz +KITTI/2011_09_26_drive_0005_sync_rgb_left.mp4,KITTI/2011_09_26_drive_0005_sync_disparity.npz +KITTI/2011_09_30_drive_0016_sync_rgb_left.mp4,KITTI/2011_09_30_drive_0016_sync_disparity.npz +KITTI/2011_09_26_drive_0079_sync_rgb_left.mp4,KITTI/2011_09_26_drive_0079_sync_disparity.npz +KITTI/2011_09_26_drive_0020_sync_rgb_left.mp4,KITTI/2011_09_26_drive_0020_sync_disparity.npz +KITTI/2011_09_26_drive_0095_sync_rgb_left.mp4,KITTI/2011_09_26_drive_0095_sync_disparity.npz +KITTI/2011_10_03_drive_0047_sync_rgb_left.mp4,KITTI/2011_10_03_drive_0047_sync_disparity.npz +KITTI/2011_09_26_drive_0113_sync_rgb_left.mp4,KITTI/2011_09_26_drive_0113_sync_disparity.npz +KITTI/2011_09_26_drive_0036_sync_rgb_left.mp4,KITTI/2011_09_26_drive_0036_sync_disparity.npz +KITTI/2011_09_26_drive_0013_sync_rgb_left.mp4,KITTI/2011_09_26_drive_0013_sync_disparity.npz +KITTI/2011_09_26_drive_0002_sync_rgb_left.mp4,KITTI/2011_09_26_drive_0002_sync_disparity.npz +KITTI/2011_09_29_drive_0026_sync_rgb_left.mp4,KITTI/2011_09_29_drive_0026_sync_disparity.npz +KITTI/2011_09_26_drive_0023_sync_rgb_left.mp4,KITTI/2011_09_26_drive_0023_sync_disparity.npz diff --git a/inference/v2v_data/DepthCrafter/benchmark/csv/meta_nyu_test.csv b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_nyu_test.csv new file mode 100644 index 0000000000000000000000000000000000000000..119dddf4da8ddd55b9c2ed6196c03d439785ea52 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_nyu_test.csv @@ -0,0 +1,655 @@ +filepath_left,filepath_disparity +NYUv2/test/kitchen_0004/rgb_0001_rgb_left.mp4,NYUv2/test/kitchen_0004/rgb_0001_disparity.npz 
+NYUv2/test/kitchen_0004/rgb_0002_rgb_left.mp4,NYUv2/test/kitchen_0004/rgb_0002_disparity.npz +NYUv2/test/office_0005/rgb_0009_rgb_left.mp4,NYUv2/test/office_0005/rgb_0009_disparity.npz +NYUv2/test/office_0007/rgb_0014_rgb_left.mp4,NYUv2/test/office_0007/rgb_0014_disparity.npz +NYUv2/test/office_0008/rgb_0015_rgb_left.mp4,NYUv2/test/office_0008/rgb_0015_disparity.npz +NYUv2/test/office_0008/rgb_0016_rgb_left.mp4,NYUv2/test/office_0008/rgb_0016_disparity.npz +NYUv2/test/office_0008/rgb_0017_rgb_left.mp4,NYUv2/test/office_0008/rgb_0017_disparity.npz +NYUv2/test/office_0008/rgb_0018_rgb_left.mp4,NYUv2/test/office_0008/rgb_0018_disparity.npz +NYUv2/test/office_0010/rgb_0021_rgb_left.mp4,NYUv2/test/office_0010/rgb_0021_disparity.npz +NYUv2/test/office_0013/rgb_0028_rgb_left.mp4,NYUv2/test/office_0013/rgb_0028_disparity.npz +NYUv2/test/office_0013/rgb_0029_rgb_left.mp4,NYUv2/test/office_0013/rgb_0029_disparity.npz +NYUv2/test/office_0013/rgb_0030_rgb_left.mp4,NYUv2/test/office_0013/rgb_0030_disparity.npz +NYUv2/test/office_0013/rgb_0031_rgb_left.mp4,NYUv2/test/office_0013/rgb_0031_disparity.npz +NYUv2/test/office_0013/rgb_0032_rgb_left.mp4,NYUv2/test/office_0013/rgb_0032_disparity.npz +NYUv2/test/office_0013/rgb_0033_rgb_left.mp4,NYUv2/test/office_0013/rgb_0033_disparity.npz +NYUv2/test/office_0013/rgb_0034_rgb_left.mp4,NYUv2/test/office_0013/rgb_0034_disparity.npz +NYUv2/test/office_0014/rgb_0035_rgb_left.mp4,NYUv2/test/office_0014/rgb_0035_disparity.npz +NYUv2/test/office_0014/rgb_0036_rgb_left.mp4,NYUv2/test/office_0014/rgb_0036_disparity.npz +NYUv2/test/office_0014/rgb_0037_rgb_left.mp4,NYUv2/test/office_0014/rgb_0037_disparity.npz +NYUv2/test/office_0014/rgb_0038_rgb_left.mp4,NYUv2/test/office_0014/rgb_0038_disparity.npz +NYUv2/test/office_0015/rgb_0039_rgb_left.mp4,NYUv2/test/office_0015/rgb_0039_disparity.npz +NYUv2/test/office_0015/rgb_0040_rgb_left.mp4,NYUv2/test/office_0015/rgb_0040_disparity.npz +NYUv2/test/office_0015/rgb_0041_rgb_left.mp4,NYUv2/test/office_0015/rgb_0041_disparity.npz +NYUv2/test/office_0015/rgb_0042_rgb_left.mp4,NYUv2/test/office_0015/rgb_0042_disparity.npz +NYUv2/test/office_0015/rgb_0043_rgb_left.mp4,NYUv2/test/office_0015/rgb_0043_disparity.npz +NYUv2/test/bathroom_0003/rgb_0046_rgb_left.mp4,NYUv2/test/bathroom_0003/rgb_0046_disparity.npz +NYUv2/test/bathroom_0004/rgb_0047_rgb_left.mp4,NYUv2/test/bathroom_0004/rgb_0047_disparity.npz +NYUv2/test/bedroom_0011/rgb_0056_rgb_left.mp4,NYUv2/test/bedroom_0011/rgb_0056_disparity.npz +NYUv2/test/bedroom_0011/rgb_0057_rgb_left.mp4,NYUv2/test/bedroom_0011/rgb_0057_disparity.npz +NYUv2/test/bedroom_0013/rgb_0059_rgb_left.mp4,NYUv2/test/bedroom_0013/rgb_0059_disparity.npz +NYUv2/test/bedroom_0013/rgb_0060_rgb_left.mp4,NYUv2/test/bedroom_0013/rgb_0060_disparity.npz +NYUv2/test/bedroom_0013/rgb_0061_rgb_left.mp4,NYUv2/test/bedroom_0013/rgb_0061_disparity.npz +NYUv2/test/bedroom_0013/rgb_0062_rgb_left.mp4,NYUv2/test/bedroom_0013/rgb_0062_disparity.npz +NYUv2/test/bedroom_0013/rgb_0063_rgb_left.mp4,NYUv2/test/bedroom_0013/rgb_0063_disparity.npz +NYUv2/test/bedroom_0018/rgb_0076_rgb_left.mp4,NYUv2/test/bedroom_0018/rgb_0076_disparity.npz +NYUv2/test/bedroom_0018/rgb_0077_rgb_left.mp4,NYUv2/test/bedroom_0018/rgb_0077_disparity.npz +NYUv2/test/bedroom_0018/rgb_0078_rgb_left.mp4,NYUv2/test/bedroom_0018/rgb_0078_disparity.npz +NYUv2/test/bedroom_0018/rgb_0079_rgb_left.mp4,NYUv2/test/bedroom_0018/rgb_0079_disparity.npz +NYUv2/test/bookstore_0001/rgb_0084_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0084_disparity.npz 
+NYUv2/test/bookstore_0001/rgb_0085_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0085_disparity.npz +NYUv2/test/bookstore_0001/rgb_0086_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0086_disparity.npz +NYUv2/test/bookstore_0001/rgb_0087_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0087_disparity.npz +NYUv2/test/bookstore_0001/rgb_0088_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0088_disparity.npz +NYUv2/test/bookstore_0001/rgb_0089_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0089_disparity.npz +NYUv2/test/bookstore_0001/rgb_0090_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0090_disparity.npz +NYUv2/test/bookstore_0001/rgb_0091_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0091_disparity.npz +NYUv2/test/bookstore_0001/rgb_0117_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0117_disparity.npz +NYUv2/test/bookstore_0001/rgb_0118_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0118_disparity.npz +NYUv2/test/bookstore_0001/rgb_0119_rgb_left.mp4,NYUv2/test/bookstore_0001/rgb_0119_disparity.npz +NYUv2/test/kitchen_0005/rgb_0125_rgb_left.mp4,NYUv2/test/kitchen_0005/rgb_0125_disparity.npz +NYUv2/test/kitchen_0005/rgb_0126_rgb_left.mp4,NYUv2/test/kitchen_0005/rgb_0126_disparity.npz +NYUv2/test/kitchen_0005/rgb_0127_rgb_left.mp4,NYUv2/test/kitchen_0005/rgb_0127_disparity.npz +NYUv2/test/kitchen_0005/rgb_0128_rgb_left.mp4,NYUv2/test/kitchen_0005/rgb_0128_disparity.npz +NYUv2/test/kitchen_0005/rgb_0129_rgb_left.mp4,NYUv2/test/kitchen_0005/rgb_0129_disparity.npz +NYUv2/test/kitchen_0007/rgb_0131_rgb_left.mp4,NYUv2/test/kitchen_0007/rgb_0131_disparity.npz +NYUv2/test/kitchen_0007/rgb_0132_rgb_left.mp4,NYUv2/test/kitchen_0007/rgb_0132_disparity.npz +NYUv2/test/kitchen_0007/rgb_0133_rgb_left.mp4,NYUv2/test/kitchen_0007/rgb_0133_disparity.npz +NYUv2/test/kitchen_0007/rgb_0134_rgb_left.mp4,NYUv2/test/kitchen_0007/rgb_0134_disparity.npz +NYUv2/test/kitchen_0009/rgb_0137_rgb_left.mp4,NYUv2/test/kitchen_0009/rgb_0137_disparity.npz +NYUv2/test/living_room_0008/rgb_0153_rgb_left.mp4,NYUv2/test/living_room_0008/rgb_0153_disparity.npz +NYUv2/test/living_room_0008/rgb_0154_rgb_left.mp4,NYUv2/test/living_room_0008/rgb_0154_disparity.npz +NYUv2/test/living_room_0009/rgb_0155_rgb_left.mp4,NYUv2/test/living_room_0009/rgb_0155_disparity.npz +NYUv2/test/living_room_0013/rgb_0167_rgb_left.mp4,NYUv2/test/living_room_0013/rgb_0167_disparity.npz +NYUv2/test/living_room_0013/rgb_0168_rgb_left.mp4,NYUv2/test/living_room_0013/rgb_0168_disparity.npz +NYUv2/test/living_room_0014/rgb_0169_rgb_left.mp4,NYUv2/test/living_room_0014/rgb_0169_disparity.npz +NYUv2/test/bedroom_0003/rgb_0171_rgb_left.mp4,NYUv2/test/bedroom_0003/rgb_0171_disparity.npz +NYUv2/test/bedroom_0003/rgb_0172_rgb_left.mp4,NYUv2/test/bedroom_0003/rgb_0172_disparity.npz +NYUv2/test/bedroom_0003/rgb_0173_rgb_left.mp4,NYUv2/test/bedroom_0003/rgb_0173_disparity.npz +NYUv2/test/bedroom_0003/rgb_0174_rgb_left.mp4,NYUv2/test/bedroom_0003/rgb_0174_disparity.npz +NYUv2/test/bedroom_0003/rgb_0175_rgb_left.mp4,NYUv2/test/bedroom_0003/rgb_0175_disparity.npz +NYUv2/test/bedroom_0003/rgb_0176_rgb_left.mp4,NYUv2/test/bedroom_0003/rgb_0176_disparity.npz +NYUv2/test/bedroom_0005/rgb_0180_rgb_left.mp4,NYUv2/test/bedroom_0005/rgb_0180_disparity.npz +NYUv2/test/bedroom_0005/rgb_0181_rgb_left.mp4,NYUv2/test/bedroom_0005/rgb_0181_disparity.npz +NYUv2/test/bedroom_0005/rgb_0182_rgb_left.mp4,NYUv2/test/bedroom_0005/rgb_0182_disparity.npz +NYUv2/test/bedroom_0006/rgb_0183_rgb_left.mp4,NYUv2/test/bedroom_0006/rgb_0183_disparity.npz 
+NYUv2/test/bedroom_0006/rgb_0184_rgb_left.mp4,NYUv2/test/bedroom_0006/rgb_0184_disparity.npz +NYUv2/test/bedroom_0006/rgb_0185_rgb_left.mp4,NYUv2/test/bedroom_0006/rgb_0185_disparity.npz +NYUv2/test/bedroom_0006/rgb_0186_rgb_left.mp4,NYUv2/test/bedroom_0006/rgb_0186_disparity.npz +NYUv2/test/bedroom_0006/rgb_0187_rgb_left.mp4,NYUv2/test/bedroom_0006/rgb_0187_disparity.npz +NYUv2/test/bedroom_0006/rgb_0188_rgb_left.mp4,NYUv2/test/bedroom_0006/rgb_0188_disparity.npz +NYUv2/test/bedroom_0007/rgb_0189_rgb_left.mp4,NYUv2/test/bedroom_0007/rgb_0189_disparity.npz +NYUv2/test/bedroom_0007/rgb_0190_rgb_left.mp4,NYUv2/test/bedroom_0007/rgb_0190_disparity.npz +NYUv2/test/bedroom_0007/rgb_0191_rgb_left.mp4,NYUv2/test/bedroom_0007/rgb_0191_disparity.npz +NYUv2/test/bedroom_0007/rgb_0192_rgb_left.mp4,NYUv2/test/bedroom_0007/rgb_0192_disparity.npz +NYUv2/test/bedroom_0007/rgb_0193_rgb_left.mp4,NYUv2/test/bedroom_0007/rgb_0193_disparity.npz +NYUv2/test/kitchen_0002/rgb_0194_rgb_left.mp4,NYUv2/test/kitchen_0002/rgb_0194_disparity.npz +NYUv2/test/kitchen_0002/rgb_0195_rgb_left.mp4,NYUv2/test/kitchen_0002/rgb_0195_disparity.npz +NYUv2/test/kitchen_0002/rgb_0196_rgb_left.mp4,NYUv2/test/kitchen_0002/rgb_0196_disparity.npz +NYUv2/test/kitchen_0002/rgb_0197_rgb_left.mp4,NYUv2/test/kitchen_0002/rgb_0197_disparity.npz +NYUv2/test/kitchen_0002/rgb_0198_rgb_left.mp4,NYUv2/test/kitchen_0002/rgb_0198_disparity.npz +NYUv2/test/kitchen_0002/rgb_0199_rgb_left.mp4,NYUv2/test/kitchen_0002/rgb_0199_disparity.npz +NYUv2/test/kitchen_0002/rgb_0200_rgb_left.mp4,NYUv2/test/kitchen_0002/rgb_0200_disparity.npz +NYUv2/test/kitchen_0002/rgb_0201_rgb_left.mp4,NYUv2/test/kitchen_0002/rgb_0201_disparity.npz +NYUv2/test/kitchen_0002/rgb_0202_rgb_left.mp4,NYUv2/test/kitchen_0002/rgb_0202_disparity.npz +NYUv2/test/living_room_0002/rgb_0207_rgb_left.mp4,NYUv2/test/living_room_0002/rgb_0207_disparity.npz +NYUv2/test/living_room_0003/rgb_0208_rgb_left.mp4,NYUv2/test/living_room_0003/rgb_0208_disparity.npz +NYUv2/test/living_room_0003/rgb_0209_rgb_left.mp4,NYUv2/test/living_room_0003/rgb_0209_disparity.npz +NYUv2/test/living_room_0003/rgb_0210_rgb_left.mp4,NYUv2/test/living_room_0003/rgb_0210_disparity.npz +NYUv2/test/living_room_0003/rgb_0211_rgb_left.mp4,NYUv2/test/living_room_0003/rgb_0211_disparity.npz +NYUv2/test/living_room_0003/rgb_0212_rgb_left.mp4,NYUv2/test/living_room_0003/rgb_0212_disparity.npz +NYUv2/test/bedroom_0022/rgb_0220_rgb_left.mp4,NYUv2/test/bedroom_0022/rgb_0220_disparity.npz +NYUv2/test/bedroom_0024/rgb_0221_rgb_left.mp4,NYUv2/test/bedroom_0024/rgb_0221_disparity.npz +NYUv2/test/bedroom_0024/rgb_0222_rgb_left.mp4,NYUv2/test/bedroom_0024/rgb_0222_disparity.npz +NYUv2/test/kitchen_0015/rgb_0250_rgb_left.mp4,NYUv2/test/kitchen_0015/rgb_0250_disparity.npz +NYUv2/test/living_room_0021/rgb_0264_rgb_left.mp4,NYUv2/test/living_room_0021/rgb_0264_disparity.npz +NYUv2/test/office_0016/rgb_0271_rgb_left.mp4,NYUv2/test/office_0016/rgb_0271_disparity.npz +NYUv2/test/office_0017/rgb_0272_rgb_left.mp4,NYUv2/test/office_0017/rgb_0272_disparity.npz +NYUv2/test/study_room_0001/rgb_0273_rgb_left.mp4,NYUv2/test/study_room_0001/rgb_0273_disparity.npz +NYUv2/test/study_room_0006/rgb_0279_rgb_left.mp4,NYUv2/test/study_room_0006/rgb_0279_disparity.npz +NYUv2/test/bedroom_0131/rgb_0280_rgb_left.mp4,NYUv2/test/bedroom_0131/rgb_0280_disparity.npz +NYUv2/test/bedroom_0131/rgb_0281_rgb_left.mp4,NYUv2/test/bedroom_0131/rgb_0281_disparity.npz +NYUv2/test/bedroom_0131/rgb_0282_rgb_left.mp4,NYUv2/test/bedroom_0131/rgb_0282_disparity.npz 
+NYUv2/test/bedroom_0131/rgb_0283_rgb_left.mp4,NYUv2/test/bedroom_0131/rgb_0283_disparity.npz +NYUv2/test/classroom_0001/rgb_0284_rgb_left.mp4,NYUv2/test/classroom_0001/rgb_0284_disparity.npz +NYUv2/test/classroom_0001/rgb_0285_rgb_left.mp4,NYUv2/test/classroom_0001/rgb_0285_disparity.npz +NYUv2/test/classroom_0007/rgb_0296_rgb_left.mp4,NYUv2/test/classroom_0007/rgb_0296_disparity.npz +NYUv2/test/classroom_0007/rgb_0297_rgb_left.mp4,NYUv2/test/classroom_0007/rgb_0297_disparity.npz +NYUv2/test/classroom_0007/rgb_0298_rgb_left.mp4,NYUv2/test/classroom_0007/rgb_0298_disparity.npz +NYUv2/test/classroom_0008/rgb_0299_rgb_left.mp4,NYUv2/test/classroom_0008/rgb_0299_disparity.npz +NYUv2/test/classroom_0008/rgb_0300_rgb_left.mp4,NYUv2/test/classroom_0008/rgb_0300_disparity.npz +NYUv2/test/classroom_0009/rgb_0301_rgb_left.mp4,NYUv2/test/classroom_0009/rgb_0301_disparity.npz +NYUv2/test/classroom_0009/rgb_0302_rgb_left.mp4,NYUv2/test/classroom_0009/rgb_0302_disparity.npz +NYUv2/test/classroom_0014/rgb_0310_rgb_left.mp4,NYUv2/test/classroom_0014/rgb_0310_disparity.npz +NYUv2/test/classroom_0014/rgb_0311_rgb_left.mp4,NYUv2/test/classroom_0014/rgb_0311_disparity.npz +NYUv2/test/classroom_0015/rgb_0312_rgb_left.mp4,NYUv2/test/classroom_0015/rgb_0312_disparity.npz +NYUv2/test/classroom_0017/rgb_0315_rgb_left.mp4,NYUv2/test/classroom_0017/rgb_0315_disparity.npz +NYUv2/test/classroom_0017/rgb_0316_rgb_left.mp4,NYUv2/test/classroom_0017/rgb_0316_disparity.npz +NYUv2/test/classroom_0017/rgb_0317_rgb_left.mp4,NYUv2/test/classroom_0017/rgb_0317_disparity.npz +NYUv2/test/classroom_0023/rgb_0325_rgb_left.mp4,NYUv2/test/classroom_0023/rgb_0325_disparity.npz +NYUv2/test/classroom_0023/rgb_0326_rgb_left.mp4,NYUv2/test/classroom_0023/rgb_0326_disparity.npz +NYUv2/test/classroom_0023/rgb_0327_rgb_left.mp4,NYUv2/test/classroom_0023/rgb_0327_disparity.npz +NYUv2/test/classroom_0023/rgb_0328_rgb_left.mp4,NYUv2/test/classroom_0023/rgb_0328_disparity.npz +NYUv2/test/classroom_0024/rgb_0329_rgb_left.mp4,NYUv2/test/classroom_0024/rgb_0329_disparity.npz +NYUv2/test/classroom_0024/rgb_0330_rgb_left.mp4,NYUv2/test/classroom_0024/rgb_0330_disparity.npz +NYUv2/test/classroom_0026/rgb_0331_rgb_left.mp4,NYUv2/test/classroom_0026/rgb_0331_disparity.npz +NYUv2/test/classroom_0026/rgb_0332_rgb_left.mp4,NYUv2/test/classroom_0026/rgb_0332_disparity.npz +NYUv2/test/computer_lab_0001/rgb_0333_rgb_left.mp4,NYUv2/test/computer_lab_0001/rgb_0333_disparity.npz +NYUv2/test/computer_lab_0001/rgb_0334_rgb_left.mp4,NYUv2/test/computer_lab_0001/rgb_0334_disparity.npz +NYUv2/test/computer_lab_0001/rgb_0335_rgb_left.mp4,NYUv2/test/computer_lab_0001/rgb_0335_disparity.npz +NYUv2/test/foyer_0001/rgb_0351_rgb_left.mp4,NYUv2/test/foyer_0001/rgb_0351_disparity.npz +NYUv2/test/foyer_0001/rgb_0352_rgb_left.mp4,NYUv2/test/foyer_0001/rgb_0352_disparity.npz +NYUv2/test/home_office_0001/rgb_0355_rgb_left.mp4,NYUv2/test/home_office_0001/rgb_0355_disparity.npz +NYUv2/test/home_office_0001/rgb_0356_rgb_left.mp4,NYUv2/test/home_office_0001/rgb_0356_disparity.npz +NYUv2/test/home_office_0001/rgb_0357_rgb_left.mp4,NYUv2/test/home_office_0001/rgb_0357_disparity.npz +NYUv2/test/home_office_0001/rgb_0358_rgb_left.mp4,NYUv2/test/home_office_0001/rgb_0358_disparity.npz +NYUv2/test/home_office_0002/rgb_0359_rgb_left.mp4,NYUv2/test/home_office_0002/rgb_0359_disparity.npz +NYUv2/test/home_office_0002/rgb_0360_rgb_left.mp4,NYUv2/test/home_office_0002/rgb_0360_disparity.npz +NYUv2/test/home_office_0002/rgb_0361_rgb_left.mp4,NYUv2/test/home_office_0002/rgb_0361_disparity.npz 
+NYUv2/test/home_office_0002/rgb_0362_rgb_left.mp4,NYUv2/test/home_office_0002/rgb_0362_disparity.npz +NYUv2/test/home_office_0003/rgb_0363_rgb_left.mp4,NYUv2/test/home_office_0003/rgb_0363_disparity.npz +NYUv2/test/home_office_0003/rgb_0364_rgb_left.mp4,NYUv2/test/home_office_0003/rgb_0364_disparity.npz +NYUv2/test/home_office_0009/rgb_0384_rgb_left.mp4,NYUv2/test/home_office_0009/rgb_0384_disparity.npz +NYUv2/test/home_office_0009/rgb_0385_rgb_left.mp4,NYUv2/test/home_office_0009/rgb_0385_disparity.npz +NYUv2/test/home_office_0009/rgb_0386_rgb_left.mp4,NYUv2/test/home_office_0009/rgb_0386_disparity.npz +NYUv2/test/home_office_0010/rgb_0387_rgb_left.mp4,NYUv2/test/home_office_0010/rgb_0387_disparity.npz +NYUv2/test/home_office_0010/rgb_0388_rgb_left.mp4,NYUv2/test/home_office_0010/rgb_0388_disparity.npz +NYUv2/test/home_office_0010/rgb_0389_rgb_left.mp4,NYUv2/test/home_office_0010/rgb_0389_disparity.npz +NYUv2/test/home_office_0010/rgb_0390_rgb_left.mp4,NYUv2/test/home_office_0010/rgb_0390_disparity.npz +NYUv2/test/home_office_0012/rgb_0395_rgb_left.mp4,NYUv2/test/home_office_0012/rgb_0395_disparity.npz +NYUv2/test/home_office_0012/rgb_0396_rgb_left.mp4,NYUv2/test/home_office_0012/rgb_0396_disparity.npz +NYUv2/test/home_office_0012/rgb_0397_rgb_left.mp4,NYUv2/test/home_office_0012/rgb_0397_disparity.npz +NYUv2/test/office_kitchen_0002/rgb_0411_rgb_left.mp4,NYUv2/test/office_kitchen_0002/rgb_0411_disparity.npz +NYUv2/test/office_kitchen_0002/rgb_0412_rgb_left.mp4,NYUv2/test/office_kitchen_0002/rgb_0412_disparity.npz +NYUv2/test/office_kitchen_0002/rgb_0413_rgb_left.mp4,NYUv2/test/office_kitchen_0002/rgb_0413_disparity.npz +NYUv2/test/office_kitchen_0002/rgb_0414_rgb_left.mp4,NYUv2/test/office_kitchen_0002/rgb_0414_disparity.npz +NYUv2/test/playroom_0005/rgb_0430_rgb_left.mp4,NYUv2/test/playroom_0005/rgb_0430_disparity.npz +NYUv2/test/playroom_0005/rgb_0431_rgb_left.mp4,NYUv2/test/playroom_0005/rgb_0431_disparity.npz +NYUv2/test/playroom_0005/rgb_0432_rgb_left.mp4,NYUv2/test/playroom_0005/rgb_0432_disparity.npz +NYUv2/test/playroom_0005/rgb_0433_rgb_left.mp4,NYUv2/test/playroom_0005/rgb_0433_disparity.npz +NYUv2/test/playroom_0005/rgb_0434_rgb_left.mp4,NYUv2/test/playroom_0005/rgb_0434_disparity.npz +NYUv2/test/playroom_0005/rgb_0435_rgb_left.mp4,NYUv2/test/playroom_0005/rgb_0435_disparity.npz +NYUv2/test/playroom_0007/rgb_0441_rgb_left.mp4,NYUv2/test/playroom_0007/rgb_0441_disparity.npz +NYUv2/test/playroom_0007/rgb_0442_rgb_left.mp4,NYUv2/test/playroom_0007/rgb_0442_disparity.npz +NYUv2/test/playroom_0007/rgb_0443_rgb_left.mp4,NYUv2/test/playroom_0007/rgb_0443_disparity.npz +NYUv2/test/playroom_0008/rgb_0444_rgb_left.mp4,NYUv2/test/playroom_0008/rgb_0444_disparity.npz +NYUv2/test/playroom_0008/rgb_0445_rgb_left.mp4,NYUv2/test/playroom_0008/rgb_0445_disparity.npz +NYUv2/test/playroom_0008/rgb_0446_rgb_left.mp4,NYUv2/test/playroom_0008/rgb_0446_disparity.npz +NYUv2/test/playroom_0008/rgb_0447_rgb_left.mp4,NYUv2/test/playroom_0008/rgb_0447_disparity.npz +NYUv2/test/playroom_0008/rgb_0448_rgb_left.mp4,NYUv2/test/playroom_0008/rgb_0448_disparity.npz +NYUv2/test/reception_room_0003/rgb_0462_rgb_left.mp4,NYUv2/test/reception_room_0003/rgb_0462_disparity.npz +NYUv2/test/reception_room_0003/rgb_0463_rgb_left.mp4,NYUv2/test/reception_room_0003/rgb_0463_disparity.npz +NYUv2/test/reception_room_0003/rgb_0464_rgb_left.mp4,NYUv2/test/reception_room_0003/rgb_0464_disparity.npz +NYUv2/test/reception_room_0003/rgb_0465_rgb_left.mp4,NYUv2/test/reception_room_0003/rgb_0465_disparity.npz 
+NYUv2/test/reception_room_0003/rgb_0466_rgb_left.mp4,NYUv2/test/reception_room_0003/rgb_0466_disparity.npz +NYUv2/test/study_0001/rgb_0469_rgb_left.mp4,NYUv2/test/study_0001/rgb_0469_disparity.npz +NYUv2/test/study_0001/rgb_0470_rgb_left.mp4,NYUv2/test/study_0001/rgb_0470_disparity.npz +NYUv2/test/study_0001/rgb_0471_rgb_left.mp4,NYUv2/test/study_0001/rgb_0471_disparity.npz +NYUv2/test/study_0001/rgb_0472_rgb_left.mp4,NYUv2/test/study_0001/rgb_0472_disparity.npz +NYUv2/test/study_0001/rgb_0473_rgb_left.mp4,NYUv2/test/study_0001/rgb_0473_disparity.npz +NYUv2/test/study_0002/rgb_0474_rgb_left.mp4,NYUv2/test/study_0002/rgb_0474_disparity.npz +NYUv2/test/study_0002/rgb_0475_rgb_left.mp4,NYUv2/test/study_0002/rgb_0475_disparity.npz +NYUv2/test/study_0002/rgb_0476_rgb_left.mp4,NYUv2/test/study_0002/rgb_0476_disparity.npz +NYUv2/test/study_0002/rgb_0477_rgb_left.mp4,NYUv2/test/study_0002/rgb_0477_disparity.npz +NYUv2/test/bathroom_0058/rgb_0508_rgb_left.mp4,NYUv2/test/bathroom_0058/rgb_0508_disparity.npz +NYUv2/test/bathroom_0058/rgb_0509_rgb_left.mp4,NYUv2/test/bathroom_0058/rgb_0509_disparity.npz +NYUv2/test/bathroom_0058/rgb_0510_rgb_left.mp4,NYUv2/test/bathroom_0058/rgb_0510_disparity.npz +NYUv2/test/bathroom_0060/rgb_0511_rgb_left.mp4,NYUv2/test/bathroom_0060/rgb_0511_disparity.npz +NYUv2/test/bathroom_0060/rgb_0512_rgb_left.mp4,NYUv2/test/bathroom_0060/rgb_0512_disparity.npz +NYUv2/test/bathroom_0060/rgb_0513_rgb_left.mp4,NYUv2/test/bathroom_0060/rgb_0513_disparity.npz +NYUv2/test/bedroom_0133/rgb_0515_rgb_left.mp4,NYUv2/test/bedroom_0133/rgb_0515_disparity.npz +NYUv2/test/bedroom_0133/rgb_0516_rgb_left.mp4,NYUv2/test/bedroom_0133/rgb_0516_disparity.npz +NYUv2/test/bedroom_0133/rgb_0517_rgb_left.mp4,NYUv2/test/bedroom_0133/rgb_0517_disparity.npz +NYUv2/test/bedroom_0133/rgb_0518_rgb_left.mp4,NYUv2/test/bedroom_0133/rgb_0518_disparity.npz +NYUv2/test/bedroom_0133/rgb_0519_rgb_left.mp4,NYUv2/test/bedroom_0133/rgb_0519_disparity.npz +NYUv2/test/bedroom_0134/rgb_0520_rgb_left.mp4,NYUv2/test/bedroom_0134/rgb_0520_disparity.npz +NYUv2/test/bedroom_0134/rgb_0521_rgb_left.mp4,NYUv2/test/bedroom_0134/rgb_0521_disparity.npz +NYUv2/test/bedroom_0134/rgb_0522_rgb_left.mp4,NYUv2/test/bedroom_0134/rgb_0522_disparity.npz +NYUv2/test/bedroom_0135/rgb_0523_rgb_left.mp4,NYUv2/test/bedroom_0135/rgb_0523_disparity.npz +NYUv2/test/bedroom_0135/rgb_0524_rgb_left.mp4,NYUv2/test/bedroom_0135/rgb_0524_disparity.npz +NYUv2/test/bedroom_0135/rgb_0525_rgb_left.mp4,NYUv2/test/bedroom_0135/rgb_0525_disparity.npz +NYUv2/test/bedroom_0135/rgb_0526_rgb_left.mp4,NYUv2/test/bedroom_0135/rgb_0526_disparity.npz +NYUv2/test/bedroom_0137/rgb_0531_rgb_left.mp4,NYUv2/test/bedroom_0137/rgb_0531_disparity.npz +NYUv2/test/bedroom_0137/rgb_0532_rgb_left.mp4,NYUv2/test/bedroom_0137/rgb_0532_disparity.npz +NYUv2/test/bedroom_0137/rgb_0533_rgb_left.mp4,NYUv2/test/bedroom_0137/rgb_0533_disparity.npz +NYUv2/test/bedroom_0139/rgb_0537_rgb_left.mp4,NYUv2/test/bedroom_0139/rgb_0537_disparity.npz +NYUv2/test/bedroom_0139/rgb_0538_rgb_left.mp4,NYUv2/test/bedroom_0139/rgb_0538_disparity.npz +NYUv2/test/bedroom_0139/rgb_0539_rgb_left.mp4,NYUv2/test/bedroom_0139/rgb_0539_disparity.npz +NYUv2/test/dining_room_0038/rgb_0549_rgb_left.mp4,NYUv2/test/dining_room_0038/rgb_0549_disparity.npz +NYUv2/test/dining_room_0038/rgb_0550_rgb_left.mp4,NYUv2/test/dining_room_0038/rgb_0550_disparity.npz +NYUv2/test/dining_room_0038/rgb_0551_rgb_left.mp4,NYUv2/test/dining_room_0038/rgb_0551_disparity.npz 
+NYUv2/test/home_office_0014/rgb_0555_rgb_left.mp4,NYUv2/test/home_office_0014/rgb_0555_disparity.npz +NYUv2/test/home_office_0014/rgb_0556_rgb_left.mp4,NYUv2/test/home_office_0014/rgb_0556_disparity.npz +NYUv2/test/home_office_0014/rgb_0557_rgb_left.mp4,NYUv2/test/home_office_0014/rgb_0557_disparity.npz +NYUv2/test/home_office_0014/rgb_0558_rgb_left.mp4,NYUv2/test/home_office_0014/rgb_0558_disparity.npz +NYUv2/test/kitchen_0055/rgb_0559_rgb_left.mp4,NYUv2/test/kitchen_0055/rgb_0559_disparity.npz +NYUv2/test/kitchen_0055/rgb_0560_rgb_left.mp4,NYUv2/test/kitchen_0055/rgb_0560_disparity.npz +NYUv2/test/kitchen_0056/rgb_0561_rgb_left.mp4,NYUv2/test/kitchen_0056/rgb_0561_disparity.npz +NYUv2/test/kitchen_0056/rgb_0562_rgb_left.mp4,NYUv2/test/kitchen_0056/rgb_0562_disparity.npz +NYUv2/test/kitchen_0056/rgb_0563_rgb_left.mp4,NYUv2/test/kitchen_0056/rgb_0563_disparity.npz +NYUv2/test/kitchen_0056/rgb_0564_rgb_left.mp4,NYUv2/test/kitchen_0056/rgb_0564_disparity.npz +NYUv2/test/kitchen_0057/rgb_0565_rgb_left.mp4,NYUv2/test/kitchen_0057/rgb_0565_disparity.npz +NYUv2/test/kitchen_0057/rgb_0566_rgb_left.mp4,NYUv2/test/kitchen_0057/rgb_0566_disparity.npz +NYUv2/test/kitchen_0057/rgb_0567_rgb_left.mp4,NYUv2/test/kitchen_0057/rgb_0567_disparity.npz +NYUv2/test/kitchen_0057/rgb_0568_rgb_left.mp4,NYUv2/test/kitchen_0057/rgb_0568_disparity.npz +NYUv2/test/kitchen_0058/rgb_0569_rgb_left.mp4,NYUv2/test/kitchen_0058/rgb_0569_disparity.npz +NYUv2/test/kitchen_0058/rgb_0570_rgb_left.mp4,NYUv2/test/kitchen_0058/rgb_0570_disparity.npz +NYUv2/test/kitchen_0058/rgb_0571_rgb_left.mp4,NYUv2/test/kitchen_0058/rgb_0571_disparity.npz +NYUv2/test/living_room_0081/rgb_0579_rgb_left.mp4,NYUv2/test/living_room_0081/rgb_0579_disparity.npz +NYUv2/test/living_room_0081/rgb_0580_rgb_left.mp4,NYUv2/test/living_room_0081/rgb_0580_disparity.npz +NYUv2/test/living_room_0081/rgb_0581_rgb_left.mp4,NYUv2/test/living_room_0081/rgb_0581_disparity.npz +NYUv2/test/living_room_0081/rgb_0582_rgb_left.mp4,NYUv2/test/living_room_0081/rgb_0582_disparity.npz +NYUv2/test/living_room_0081/rgb_0583_rgb_left.mp4,NYUv2/test/living_room_0081/rgb_0583_disparity.npz +NYUv2/test/living_room_0084/rgb_0591_rgb_left.mp4,NYUv2/test/living_room_0084/rgb_0591_disparity.npz +NYUv2/test/living_room_0084/rgb_0592_rgb_left.mp4,NYUv2/test/living_room_0084/rgb_0592_disparity.npz +NYUv2/test/living_room_0084/rgb_0593_rgb_left.mp4,NYUv2/test/living_room_0084/rgb_0593_disparity.npz +NYUv2/test/living_room_0084/rgb_0594_rgb_left.mp4,NYUv2/test/living_room_0084/rgb_0594_disparity.npz +NYUv2/test/living_room_0087/rgb_0603_rgb_left.mp4,NYUv2/test/living_room_0087/rgb_0603_disparity.npz +NYUv2/test/living_room_0087/rgb_0604_rgb_left.mp4,NYUv2/test/living_room_0087/rgb_0604_disparity.npz +NYUv2/test/living_room_0087/rgb_0605_rgb_left.mp4,NYUv2/test/living_room_0087/rgb_0605_disparity.npz +NYUv2/test/living_room_0087/rgb_0606_rgb_left.mp4,NYUv2/test/living_room_0087/rgb_0606_disparity.npz +NYUv2/test/living_room_0087/rgb_0607_rgb_left.mp4,NYUv2/test/living_room_0087/rgb_0607_disparity.npz +NYUv2/test/office_0020/rgb_0612_rgb_left.mp4,NYUv2/test/office_0020/rgb_0612_disparity.npz +NYUv2/test/office_0020/rgb_0613_rgb_left.mp4,NYUv2/test/office_0020/rgb_0613_disparity.npz +NYUv2/test/office_0022/rgb_0617_rgb_left.mp4,NYUv2/test/office_0022/rgb_0617_disparity.npz +NYUv2/test/office_0022/rgb_0618_rgb_left.mp4,NYUv2/test/office_0022/rgb_0618_disparity.npz +NYUv2/test/office_0022/rgb_0619_rgb_left.mp4,NYUv2/test/office_0022/rgb_0619_disparity.npz 
+NYUv2/test/office_0022/rgb_0620_rgb_left.mp4,NYUv2/test/office_0022/rgb_0620_disparity.npz +NYUv2/test/office_0022/rgb_0621_rgb_left.mp4,NYUv2/test/office_0022/rgb_0621_disparity.npz +NYUv2/test/office_0027/rgb_0633_rgb_left.mp4,NYUv2/test/office_0027/rgb_0633_disparity.npz +NYUv2/test/office_0027/rgb_0634_rgb_left.mp4,NYUv2/test/office_0027/rgb_0634_disparity.npz +NYUv2/test/office_0027/rgb_0635_rgb_left.mp4,NYUv2/test/office_0027/rgb_0635_disparity.npz +NYUv2/test/office_0027/rgb_0636_rgb_left.mp4,NYUv2/test/office_0027/rgb_0636_disparity.npz +NYUv2/test/office_0027/rgb_0637_rgb_left.mp4,NYUv2/test/office_0027/rgb_0637_disparity.npz +NYUv2/test/office_0027/rgb_0638_rgb_left.mp4,NYUv2/test/office_0027/rgb_0638_disparity.npz +NYUv2/test/study_0007/rgb_0644_rgb_left.mp4,NYUv2/test/study_0007/rgb_0644_disparity.npz +NYUv2/test/study_0007/rgb_0645_rgb_left.mp4,NYUv2/test/study_0007/rgb_0645_disparity.npz +NYUv2/test/bathroom_0008/rgb_0650_rgb_left.mp4,NYUv2/test/bathroom_0008/rgb_0650_disparity.npz +NYUv2/test/bathroom_0009/rgb_0651_rgb_left.mp4,NYUv2/test/bathroom_0009/rgb_0651_disparity.npz +NYUv2/test/bathroom_0012/rgb_0656_rgb_left.mp4,NYUv2/test/bathroom_0012/rgb_0656_disparity.npz +NYUv2/test/bathroom_0012/rgb_0657_rgb_left.mp4,NYUv2/test/bathroom_0012/rgb_0657_disparity.npz +NYUv2/test/bathroom_0012/rgb_0658_rgb_left.mp4,NYUv2/test/bathroom_0012/rgb_0658_disparity.npz +NYUv2/test/bathroom_0015/rgb_0663_rgb_left.mp4,NYUv2/test/bathroom_0015/rgb_0663_disparity.npz +NYUv2/test/bathroom_0015/rgb_0664_rgb_left.mp4,NYUv2/test/bathroom_0015/rgb_0664_disparity.npz +NYUv2/test/bathroom_0017/rgb_0668_rgb_left.mp4,NYUv2/test/bathroom_0017/rgb_0668_disparity.npz +NYUv2/test/bathroom_0017/rgb_0669_rgb_left.mp4,NYUv2/test/bathroom_0017/rgb_0669_disparity.npz +NYUv2/test/bathroom_0017/rgb_0670_rgb_left.mp4,NYUv2/test/bathroom_0017/rgb_0670_disparity.npz +NYUv2/test/bathroom_0018/rgb_0671_rgb_left.mp4,NYUv2/test/bathroom_0018/rgb_0671_disparity.npz +NYUv2/test/bathroom_0018/rgb_0672_rgb_left.mp4,NYUv2/test/bathroom_0018/rgb_0672_disparity.npz +NYUv2/test/bathroom_0018/rgb_0673_rgb_left.mp4,NYUv2/test/bathroom_0018/rgb_0673_disparity.npz +NYUv2/test/bathroom_0020/rgb_0676_rgb_left.mp4,NYUv2/test/bathroom_0020/rgb_0676_disparity.npz +NYUv2/test/bathroom_0020/rgb_0677_rgb_left.mp4,NYUv2/test/bathroom_0020/rgb_0677_disparity.npz +NYUv2/test/bathroom_0021/rgb_0678_rgb_left.mp4,NYUv2/test/bathroom_0021/rgb_0678_disparity.npz +NYUv2/test/bathroom_0021/rgb_0679_rgb_left.mp4,NYUv2/test/bathroom_0021/rgb_0679_disparity.npz +NYUv2/test/bathroom_0022/rgb_0680_rgb_left.mp4,NYUv2/test/bathroom_0022/rgb_0680_disparity.npz +NYUv2/test/bathroom_0022/rgb_0681_rgb_left.mp4,NYUv2/test/bathroom_0022/rgb_0681_disparity.npz +NYUv2/test/bathroom_0025/rgb_0686_rgb_left.mp4,NYUv2/test/bathroom_0025/rgb_0686_disparity.npz +NYUv2/test/bathroom_0025/rgb_0687_rgb_left.mp4,NYUv2/test/bathroom_0025/rgb_0687_disparity.npz +NYUv2/test/bathroom_0026/rgb_0688_rgb_left.mp4,NYUv2/test/bathroom_0026/rgb_0688_disparity.npz +NYUv2/test/bathroom_0026/rgb_0689_rgb_left.mp4,NYUv2/test/bathroom_0026/rgb_0689_disparity.npz +NYUv2/test/bathroom_0026/rgb_0690_rgb_left.mp4,NYUv2/test/bathroom_0026/rgb_0690_disparity.npz +NYUv2/test/bathroom_0029/rgb_0693_rgb_left.mp4,NYUv2/test/bathroom_0029/rgb_0693_disparity.npz +NYUv2/test/bathroom_0029/rgb_0694_rgb_left.mp4,NYUv2/test/bathroom_0029/rgb_0694_disparity.npz +NYUv2/test/bathroom_0031/rgb_0697_rgb_left.mp4,NYUv2/test/bathroom_0031/rgb_0697_disparity.npz 
+NYUv2/test/bathroom_0031/rgb_0698_rgb_left.mp4,NYUv2/test/bathroom_0031/rgb_0698_disparity.npz +NYUv2/test/bathroom_0031/rgb_0699_rgb_left.mp4,NYUv2/test/bathroom_0031/rgb_0699_disparity.npz +NYUv2/test/bathroom_0036/rgb_0706_rgb_left.mp4,NYUv2/test/bathroom_0036/rgb_0706_disparity.npz +NYUv2/test/bathroom_0036/rgb_0707_rgb_left.mp4,NYUv2/test/bathroom_0036/rgb_0707_disparity.npz +NYUv2/test/bathroom_0036/rgb_0708_rgb_left.mp4,NYUv2/test/bathroom_0036/rgb_0708_disparity.npz +NYUv2/test/bathroom_0037/rgb_0709_rgb_left.mp4,NYUv2/test/bathroom_0037/rgb_0709_disparity.npz +NYUv2/test/bathroom_0037/rgb_0710_rgb_left.mp4,NYUv2/test/bathroom_0037/rgb_0710_disparity.npz +NYUv2/test/bathroom_0038/rgb_0711_rgb_left.mp4,NYUv2/test/bathroom_0038/rgb_0711_disparity.npz +NYUv2/test/bathroom_0038/rgb_0712_rgb_left.mp4,NYUv2/test/bathroom_0038/rgb_0712_disparity.npz +NYUv2/test/bathroom_0038/rgb_0713_rgb_left.mp4,NYUv2/test/bathroom_0038/rgb_0713_disparity.npz +NYUv2/test/bathroom_0040/rgb_0717_rgb_left.mp4,NYUv2/test/bathroom_0040/rgb_0717_disparity.npz +NYUv2/test/bathroom_0040/rgb_0718_rgb_left.mp4,NYUv2/test/bathroom_0040/rgb_0718_disparity.npz +NYUv2/test/bathroom_0043/rgb_0724_rgb_left.mp4,NYUv2/test/bathroom_0043/rgb_0724_disparity.npz +NYUv2/test/bathroom_0043/rgb_0725_rgb_left.mp4,NYUv2/test/bathroom_0043/rgb_0725_disparity.npz +NYUv2/test/bathroom_0043/rgb_0726_rgb_left.mp4,NYUv2/test/bathroom_0043/rgb_0726_disparity.npz +NYUv2/test/bathroom_0044/rgb_0727_rgb_left.mp4,NYUv2/test/bathroom_0044/rgb_0727_disparity.npz +NYUv2/test/bathroom_0044/rgb_0728_rgb_left.mp4,NYUv2/test/bathroom_0044/rgb_0728_disparity.npz +NYUv2/test/bathroom_0046/rgb_0731_rgb_left.mp4,NYUv2/test/bathroom_0046/rgb_0731_disparity.npz +NYUv2/test/bathroom_0046/rgb_0732_rgb_left.mp4,NYUv2/test/bathroom_0046/rgb_0732_disparity.npz +NYUv2/test/bathroom_0047/rgb_0733_rgb_left.mp4,NYUv2/test/bathroom_0047/rgb_0733_disparity.npz +NYUv2/test/bathroom_0047/rgb_0734_rgb_left.mp4,NYUv2/test/bathroom_0047/rgb_0734_disparity.npz +NYUv2/test/bathroom_0052/rgb_0743_rgb_left.mp4,NYUv2/test/bathroom_0052/rgb_0743_disparity.npz +NYUv2/test/bathroom_0052/rgb_0744_rgb_left.mp4,NYUv2/test/bathroom_0052/rgb_0744_disparity.npz +NYUv2/test/kitchen_0021/rgb_0759_rgb_left.mp4,NYUv2/test/kitchen_0021/rgb_0759_disparity.npz +NYUv2/test/kitchen_0021/rgb_0760_rgb_left.mp4,NYUv2/test/kitchen_0021/rgb_0760_disparity.npz +NYUv2/test/kitchen_0021/rgb_0761_rgb_left.mp4,NYUv2/test/kitchen_0021/rgb_0761_disparity.npz +NYUv2/test/kitchen_0022/rgb_0762_rgb_left.mp4,NYUv2/test/kitchen_0022/rgb_0762_disparity.npz +NYUv2/test/kitchen_0022/rgb_0763_rgb_left.mp4,NYUv2/test/kitchen_0022/rgb_0763_disparity.npz +NYUv2/test/kitchen_0022/rgb_0764_rgb_left.mp4,NYUv2/test/kitchen_0022/rgb_0764_disparity.npz +NYUv2/test/kitchen_0022/rgb_0765_rgb_left.mp4,NYUv2/test/kitchen_0022/rgb_0765_disparity.npz +NYUv2/test/kitchen_0022/rgb_0766_rgb_left.mp4,NYUv2/test/kitchen_0022/rgb_0766_disparity.npz +NYUv2/test/kitchen_0023/rgb_0767_rgb_left.mp4,NYUv2/test/kitchen_0023/rgb_0767_disparity.npz +NYUv2/test/kitchen_0023/rgb_0768_rgb_left.mp4,NYUv2/test/kitchen_0023/rgb_0768_disparity.npz +NYUv2/test/kitchen_0023/rgb_0769_rgb_left.mp4,NYUv2/test/kitchen_0023/rgb_0769_disparity.npz +NYUv2/test/kitchen_0023/rgb_0770_rgb_left.mp4,NYUv2/test/kitchen_0023/rgb_0770_disparity.npz +NYUv2/test/kitchen_0023/rgb_0771_rgb_left.mp4,NYUv2/test/kitchen_0023/rgb_0771_disparity.npz +NYUv2/test/kitchen_0024/rgb_0772_rgb_left.mp4,NYUv2/test/kitchen_0024/rgb_0772_disparity.npz 
+NYUv2/test/kitchen_0024/rgb_0773_rgb_left.mp4,NYUv2/test/kitchen_0024/rgb_0773_disparity.npz +NYUv2/test/kitchen_0024/rgb_0774_rgb_left.mp4,NYUv2/test/kitchen_0024/rgb_0774_disparity.npz +NYUv2/test/kitchen_0024/rgb_0775_rgb_left.mp4,NYUv2/test/kitchen_0024/rgb_0775_disparity.npz +NYUv2/test/kitchen_0024/rgb_0776_rgb_left.mp4,NYUv2/test/kitchen_0024/rgb_0776_disparity.npz +NYUv2/test/kitchen_0025/rgb_0777_rgb_left.mp4,NYUv2/test/kitchen_0025/rgb_0777_disparity.npz +NYUv2/test/kitchen_0025/rgb_0778_rgb_left.mp4,NYUv2/test/kitchen_0025/rgb_0778_disparity.npz +NYUv2/test/kitchen_0025/rgb_0779_rgb_left.mp4,NYUv2/test/kitchen_0025/rgb_0779_disparity.npz +NYUv2/test/kitchen_0026/rgb_0780_rgb_left.mp4,NYUv2/test/kitchen_0026/rgb_0780_disparity.npz +NYUv2/test/kitchen_0026/rgb_0781_rgb_left.mp4,NYUv2/test/kitchen_0026/rgb_0781_disparity.npz +NYUv2/test/kitchen_0026/rgb_0782_rgb_left.mp4,NYUv2/test/kitchen_0026/rgb_0782_disparity.npz +NYUv2/test/kitchen_0027/rgb_0783_rgb_left.mp4,NYUv2/test/kitchen_0027/rgb_0783_disparity.npz +NYUv2/test/kitchen_0027/rgb_0784_rgb_left.mp4,NYUv2/test/kitchen_0027/rgb_0784_disparity.npz +NYUv2/test/kitchen_0027/rgb_0785_rgb_left.mp4,NYUv2/test/kitchen_0027/rgb_0785_disparity.npz +NYUv2/test/kitchen_0027/rgb_0786_rgb_left.mp4,NYUv2/test/kitchen_0027/rgb_0786_disparity.npz +NYUv2/test/kitchen_0027/rgb_0787_rgb_left.mp4,NYUv2/test/kitchen_0027/rgb_0787_disparity.npz +NYUv2/test/kitchen_0030/rgb_0800_rgb_left.mp4,NYUv2/test/kitchen_0030/rgb_0800_disparity.npz +NYUv2/test/kitchen_0030/rgb_0801_rgb_left.mp4,NYUv2/test/kitchen_0030/rgb_0801_disparity.npz +NYUv2/test/kitchen_0030/rgb_0802_rgb_left.mp4,NYUv2/test/kitchen_0030/rgb_0802_disparity.npz +NYUv2/test/kitchen_0030/rgb_0803_rgb_left.mp4,NYUv2/test/kitchen_0030/rgb_0803_disparity.npz +NYUv2/test/kitchen_0030/rgb_0804_rgb_left.mp4,NYUv2/test/kitchen_0030/rgb_0804_disparity.npz +NYUv2/test/kitchen_0032/rgb_0810_rgb_left.mp4,NYUv2/test/kitchen_0032/rgb_0810_disparity.npz +NYUv2/test/kitchen_0032/rgb_0811_rgb_left.mp4,NYUv2/test/kitchen_0032/rgb_0811_disparity.npz +NYUv2/test/kitchen_0032/rgb_0812_rgb_left.mp4,NYUv2/test/kitchen_0032/rgb_0812_disparity.npz +NYUv2/test/kitchen_0032/rgb_0813_rgb_left.mp4,NYUv2/test/kitchen_0032/rgb_0813_disparity.npz +NYUv2/test/kitchen_0032/rgb_0814_rgb_left.mp4,NYUv2/test/kitchen_0032/rgb_0814_disparity.npz +NYUv2/test/kitchen_0034/rgb_0821_rgb_left.mp4,NYUv2/test/kitchen_0034/rgb_0821_disparity.npz +NYUv2/test/kitchen_0034/rgb_0822_rgb_left.mp4,NYUv2/test/kitchen_0034/rgb_0822_disparity.npz +NYUv2/test/kitchen_0034/rgb_0823_rgb_left.mp4,NYUv2/test/kitchen_0034/rgb_0823_disparity.npz +NYUv2/test/kitchen_0038/rgb_0833_rgb_left.mp4,NYUv2/test/kitchen_0038/rgb_0833_disparity.npz +NYUv2/test/kitchen_0038/rgb_0834_rgb_left.mp4,NYUv2/test/kitchen_0038/rgb_0834_disparity.npz +NYUv2/test/kitchen_0038/rgb_0835_rgb_left.mp4,NYUv2/test/kitchen_0038/rgb_0835_disparity.npz +NYUv2/test/kitchen_0038/rgb_0836_rgb_left.mp4,NYUv2/test/kitchen_0038/rgb_0836_disparity.npz +NYUv2/test/kitchen_0039/rgb_0837_rgb_left.mp4,NYUv2/test/kitchen_0039/rgb_0837_disparity.npz +NYUv2/test/kitchen_0039/rgb_0838_rgb_left.mp4,NYUv2/test/kitchen_0039/rgb_0838_disparity.npz +NYUv2/test/kitchen_0039/rgb_0839_rgb_left.mp4,NYUv2/test/kitchen_0039/rgb_0839_disparity.npz +NYUv2/test/kitchen_0039/rgb_0840_rgb_left.mp4,NYUv2/test/kitchen_0039/rgb_0840_disparity.npz +NYUv2/test/kitchen_0039/rgb_0841_rgb_left.mp4,NYUv2/test/kitchen_0039/rgb_0841_disparity.npz 
+NYUv2/test/kitchen_0039/rgb_0842_rgb_left.mp4,NYUv2/test/kitchen_0039/rgb_0842_disparity.npz +NYUv2/test/kitchen_0040/rgb_0843_rgb_left.mp4,NYUv2/test/kitchen_0040/rgb_0843_disparity.npz +NYUv2/test/kitchen_0040/rgb_0844_rgb_left.mp4,NYUv2/test/kitchen_0040/rgb_0844_disparity.npz +NYUv2/test/kitchen_0040/rgb_0845_rgb_left.mp4,NYUv2/test/kitchen_0040/rgb_0845_disparity.npz +NYUv2/test/kitchen_0040/rgb_0846_rgb_left.mp4,NYUv2/test/kitchen_0040/rgb_0846_disparity.npz +NYUv2/test/kitchen_0042/rgb_0850_rgb_left.mp4,NYUv2/test/kitchen_0042/rgb_0850_disparity.npz +NYUv2/test/kitchen_0042/rgb_0851_rgb_left.mp4,NYUv2/test/kitchen_0042/rgb_0851_disparity.npz +NYUv2/test/kitchen_0042/rgb_0852_rgb_left.mp4,NYUv2/test/kitchen_0042/rgb_0852_disparity.npz +NYUv2/test/kitchen_0044/rgb_0857_rgb_left.mp4,NYUv2/test/kitchen_0044/rgb_0857_disparity.npz +NYUv2/test/kitchen_0044/rgb_0858_rgb_left.mp4,NYUv2/test/kitchen_0044/rgb_0858_disparity.npz +NYUv2/test/kitchen_0044/rgb_0859_rgb_left.mp4,NYUv2/test/kitchen_0044/rgb_0859_disparity.npz +NYUv2/test/kitchen_0044/rgb_0860_rgb_left.mp4,NYUv2/test/kitchen_0044/rgb_0860_disparity.npz +NYUv2/test/kitchen_0044/rgb_0861_rgb_left.mp4,NYUv2/test/kitchen_0044/rgb_0861_disparity.npz +NYUv2/test/kitchen_0044/rgb_0862_rgb_left.mp4,NYUv2/test/kitchen_0044/rgb_0862_disparity.npz +NYUv2/test/kitchen_0046/rgb_0869_rgb_left.mp4,NYUv2/test/kitchen_0046/rgb_0869_disparity.npz +NYUv2/test/kitchen_0046/rgb_0870_rgb_left.mp4,NYUv2/test/kitchen_0046/rgb_0870_disparity.npz +NYUv2/test/kitchen_0046/rgb_0871_rgb_left.mp4,NYUv2/test/kitchen_0046/rgb_0871_disparity.npz +NYUv2/test/kitchen_0054/rgb_0906_rgb_left.mp4,NYUv2/test/kitchen_0054/rgb_0906_disparity.npz +NYUv2/test/kitchen_0054/rgb_0907_rgb_left.mp4,NYUv2/test/kitchen_0054/rgb_0907_disparity.npz +NYUv2/test/kitchen_0054/rgb_0908_rgb_left.mp4,NYUv2/test/kitchen_0054/rgb_0908_disparity.npz +NYUv2/test/bedroom_0027/rgb_0917_rgb_left.mp4,NYUv2/test/bedroom_0027/rgb_0917_disparity.npz +NYUv2/test/bedroom_0027/rgb_0918_rgb_left.mp4,NYUv2/test/bedroom_0027/rgb_0918_disparity.npz +NYUv2/test/bedroom_0027/rgb_0919_rgb_left.mp4,NYUv2/test/bedroom_0027/rgb_0919_disparity.npz +NYUv2/test/bedroom_0030/rgb_0926_rgb_left.mp4,NYUv2/test/bedroom_0030/rgb_0926_disparity.npz +NYUv2/test/bedroom_0030/rgb_0927_rgb_left.mp4,NYUv2/test/bedroom_0030/rgb_0927_disparity.npz +NYUv2/test/bedroom_0030/rgb_0928_rgb_left.mp4,NYUv2/test/bedroom_0030/rgb_0928_disparity.npz +NYUv2/test/bedroom_0032/rgb_0932_rgb_left.mp4,NYUv2/test/bedroom_0032/rgb_0932_disparity.npz +NYUv2/test/bedroom_0032/rgb_0933_rgb_left.mp4,NYUv2/test/bedroom_0032/rgb_0933_disparity.npz +NYUv2/test/bedroom_0032/rgb_0934_rgb_left.mp4,NYUv2/test/bedroom_0032/rgb_0934_disparity.npz +NYUv2/test/bedroom_0032/rgb_0935_rgb_left.mp4,NYUv2/test/bedroom_0032/rgb_0935_disparity.npz +NYUv2/test/bedroom_0037/rgb_0945_rgb_left.mp4,NYUv2/test/bedroom_0037/rgb_0945_disparity.npz +NYUv2/test/bedroom_0037/rgb_0946_rgb_left.mp4,NYUv2/test/bedroom_0037/rgb_0946_disparity.npz +NYUv2/test/bedroom_0037/rgb_0947_rgb_left.mp4,NYUv2/test/bedroom_0037/rgb_0947_disparity.npz +NYUv2/test/bedroom_0043/rgb_0959_rgb_left.mp4,NYUv2/test/bedroom_0043/rgb_0959_disparity.npz +NYUv2/test/bedroom_0043/rgb_0960_rgb_left.mp4,NYUv2/test/bedroom_0043/rgb_0960_disparity.npz +NYUv2/test/bedroom_0044/rgb_0961_rgb_left.mp4,NYUv2/test/bedroom_0044/rgb_0961_disparity.npz +NYUv2/test/bedroom_0044/rgb_0962_rgb_left.mp4,NYUv2/test/bedroom_0044/rgb_0962_disparity.npz 
+NYUv2/test/bedroom_0046/rgb_0965_rgb_left.mp4,NYUv2/test/bedroom_0046/rgb_0965_disparity.npz +NYUv2/test/bedroom_0046/rgb_0966_rgb_left.mp4,NYUv2/test/bedroom_0046/rgb_0966_disparity.npz +NYUv2/test/bedroom_0046/rgb_0967_rgb_left.mp4,NYUv2/test/bedroom_0046/rgb_0967_disparity.npz +NYUv2/test/bedroom_0048/rgb_0970_rgb_left.mp4,NYUv2/test/bedroom_0048/rgb_0970_disparity.npz +NYUv2/test/bedroom_0048/rgb_0971_rgb_left.mp4,NYUv2/test/bedroom_0048/rgb_0971_disparity.npz +NYUv2/test/bedroom_0048/rgb_0972_rgb_left.mp4,NYUv2/test/bedroom_0048/rgb_0972_disparity.npz +NYUv2/test/bedroom_0048/rgb_0973_rgb_left.mp4,NYUv2/test/bedroom_0048/rgb_0973_disparity.npz +NYUv2/test/bedroom_0048/rgb_0974_rgb_left.mp4,NYUv2/test/bedroom_0048/rgb_0974_disparity.npz +NYUv2/test/bedroom_0049/rgb_0975_rgb_left.mp4,NYUv2/test/bedroom_0049/rgb_0975_disparity.npz +NYUv2/test/bedroom_0049/rgb_0976_rgb_left.mp4,NYUv2/test/bedroom_0049/rgb_0976_disparity.npz +NYUv2/test/bedroom_0049/rgb_0977_rgb_left.mp4,NYUv2/test/bedroom_0049/rgb_0977_disparity.npz +NYUv2/test/bedroom_0054/rgb_0991_rgb_left.mp4,NYUv2/test/bedroom_0054/rgb_0991_disparity.npz +NYUv2/test/bedroom_0054/rgb_0992_rgb_left.mp4,NYUv2/test/bedroom_0054/rgb_0992_disparity.npz +NYUv2/test/bedroom_0054/rgb_0993_rgb_left.mp4,NYUv2/test/bedroom_0054/rgb_0993_disparity.npz +NYUv2/test/bedroom_0055/rgb_0994_rgb_left.mp4,NYUv2/test/bedroom_0055/rgb_0994_disparity.npz +NYUv2/test/bedroom_0055/rgb_0995_rgb_left.mp4,NYUv2/test/bedroom_0055/rgb_0995_disparity.npz +NYUv2/test/bedroom_0058/rgb_1001_rgb_left.mp4,NYUv2/test/bedroom_0058/rgb_1001_disparity.npz +NYUv2/test/bedroom_0058/rgb_1002_rgb_left.mp4,NYUv2/test/bedroom_0058/rgb_1002_disparity.npz +NYUv2/test/bedroom_0058/rgb_1003_rgb_left.mp4,NYUv2/test/bedroom_0058/rgb_1003_disparity.npz +NYUv2/test/bedroom_0058/rgb_1004_rgb_left.mp4,NYUv2/test/bedroom_0058/rgb_1004_disparity.npz +NYUv2/test/bedroom_0061/rgb_1010_rgb_left.mp4,NYUv2/test/bedroom_0061/rgb_1010_disparity.npz +NYUv2/test/bedroom_0061/rgb_1011_rgb_left.mp4,NYUv2/test/bedroom_0061/rgb_1011_disparity.npz +NYUv2/test/bedroom_0061/rgb_1012_rgb_left.mp4,NYUv2/test/bedroom_0061/rgb_1012_disparity.npz +NYUv2/test/bedroom_0064/rgb_1021_rgb_left.mp4,NYUv2/test/bedroom_0064/rgb_1021_disparity.npz +NYUv2/test/bedroom_0064/rgb_1022_rgb_left.mp4,NYUv2/test/bedroom_0064/rgb_1022_disparity.npz +NYUv2/test/bedroom_0064/rgb_1023_rgb_left.mp4,NYUv2/test/bedroom_0064/rgb_1023_disparity.npz +NYUv2/test/bedroom_0068/rgb_1032_rgb_left.mp4,NYUv2/test/bedroom_0068/rgb_1032_disparity.npz +NYUv2/test/bedroom_0068/rgb_1033_rgb_left.mp4,NYUv2/test/bedroom_0068/rgb_1033_disparity.npz +NYUv2/test/bedroom_0068/rgb_1034_rgb_left.mp4,NYUv2/test/bedroom_0068/rgb_1034_disparity.npz +NYUv2/test/bedroom_0070/rgb_1038_rgb_left.mp4,NYUv2/test/bedroom_0070/rgb_1038_disparity.npz +NYUv2/test/bedroom_0070/rgb_1039_rgb_left.mp4,NYUv2/test/bedroom_0070/rgb_1039_disparity.npz +NYUv2/test/bedroom_0073/rgb_1048_rgb_left.mp4,NYUv2/test/bedroom_0073/rgb_1048_disparity.npz +NYUv2/test/bedroom_0073/rgb_1049_rgb_left.mp4,NYUv2/test/bedroom_0073/rgb_1049_disparity.npz +NYUv2/test/bedroom_0075/rgb_1052_rgb_left.mp4,NYUv2/test/bedroom_0075/rgb_1052_disparity.npz +NYUv2/test/bedroom_0075/rgb_1053_rgb_left.mp4,NYUv2/test/bedroom_0075/rgb_1053_disparity.npz +NYUv2/test/bedroom_0077/rgb_1057_rgb_left.mp4,NYUv2/test/bedroom_0077/rgb_1057_disparity.npz +NYUv2/test/bedroom_0077/rgb_1058_rgb_left.mp4,NYUv2/test/bedroom_0077/rgb_1058_disparity.npz 
+NYUv2/test/bedroom_0083/rgb_1075_rgb_left.mp4,NYUv2/test/bedroom_0083/rgb_1075_disparity.npz +NYUv2/test/bedroom_0083/rgb_1076_rgb_left.mp4,NYUv2/test/bedroom_0083/rgb_1076_disparity.npz +NYUv2/test/bedroom_0084/rgb_1077_rgb_left.mp4,NYUv2/test/bedroom_0084/rgb_1077_disparity.npz +NYUv2/test/bedroom_0084/rgb_1078_rgb_left.mp4,NYUv2/test/bedroom_0084/rgb_1078_disparity.npz +NYUv2/test/bedroom_0084/rgb_1079_rgb_left.mp4,NYUv2/test/bedroom_0084/rgb_1079_disparity.npz +NYUv2/test/bedroom_0084/rgb_1080_rgb_left.mp4,NYUv2/test/bedroom_0084/rgb_1080_disparity.npz +NYUv2/test/bedroom_0085/rgb_1081_rgb_left.mp4,NYUv2/test/bedroom_0085/rgb_1081_disparity.npz +NYUv2/test/bedroom_0085/rgb_1082_rgb_left.mp4,NYUv2/test/bedroom_0085/rgb_1082_disparity.npz +NYUv2/test/bedroom_0085/rgb_1083_rgb_left.mp4,NYUv2/test/bedroom_0085/rgb_1083_disparity.npz +NYUv2/test/bedroom_0085/rgb_1084_rgb_left.mp4,NYUv2/test/bedroom_0085/rgb_1084_disparity.npz +NYUv2/test/bedroom_0087/rgb_1088_rgb_left.mp4,NYUv2/test/bedroom_0087/rgb_1088_disparity.npz +NYUv2/test/bedroom_0087/rgb_1089_rgb_left.mp4,NYUv2/test/bedroom_0087/rgb_1089_disparity.npz +NYUv2/test/bedroom_0087/rgb_1090_rgb_left.mp4,NYUv2/test/bedroom_0087/rgb_1090_disparity.npz +NYUv2/test/bedroom_0088/rgb_1091_rgb_left.mp4,NYUv2/test/bedroom_0088/rgb_1091_disparity.npz +NYUv2/test/bedroom_0088/rgb_1092_rgb_left.mp4,NYUv2/test/bedroom_0088/rgb_1092_disparity.npz +NYUv2/test/bedroom_0088/rgb_1093_rgb_left.mp4,NYUv2/test/bedroom_0088/rgb_1093_disparity.npz +NYUv2/test/bedroom_0089/rgb_1094_rgb_left.mp4,NYUv2/test/bedroom_0089/rgb_1094_disparity.npz +NYUv2/test/bedroom_0089/rgb_1095_rgb_left.mp4,NYUv2/test/bedroom_0089/rgb_1095_disparity.npz +NYUv2/test/bedroom_0089/rgb_1096_rgb_left.mp4,NYUv2/test/bedroom_0089/rgb_1096_disparity.npz +NYUv2/test/bedroom_0091/rgb_1098_rgb_left.mp4,NYUv2/test/bedroom_0091/rgb_1098_disparity.npz +NYUv2/test/bedroom_0091/rgb_1099_rgb_left.mp4,NYUv2/test/bedroom_0091/rgb_1099_disparity.npz +NYUv2/test/bedroom_0092/rgb_1100_rgb_left.mp4,NYUv2/test/bedroom_0092/rgb_1100_disparity.npz +NYUv2/test/bedroom_0092/rgb_1101_rgb_left.mp4,NYUv2/test/bedroom_0092/rgb_1101_disparity.npz +NYUv2/test/bedroom_0092/rgb_1102_rgb_left.mp4,NYUv2/test/bedroom_0092/rgb_1102_disparity.npz +NYUv2/test/bedroom_0093/rgb_1103_rgb_left.mp4,NYUv2/test/bedroom_0093/rgb_1103_disparity.npz +NYUv2/test/bedroom_0093/rgb_1104_rgb_left.mp4,NYUv2/test/bedroom_0093/rgb_1104_disparity.npz +NYUv2/test/bedroom_0095/rgb_1106_rgb_left.mp4,NYUv2/test/bedroom_0095/rgb_1106_disparity.npz +NYUv2/test/bedroom_0095/rgb_1107_rgb_left.mp4,NYUv2/test/bedroom_0095/rgb_1107_disparity.npz +NYUv2/test/bedroom_0095/rgb_1108_rgb_left.mp4,NYUv2/test/bedroom_0095/rgb_1108_disparity.npz +NYUv2/test/bedroom_0095/rgb_1109_rgb_left.mp4,NYUv2/test/bedroom_0095/rgb_1109_disparity.npz +NYUv2/test/bedroom_0099/rgb_1117_rgb_left.mp4,NYUv2/test/bedroom_0099/rgb_1117_disparity.npz +NYUv2/test/bedroom_0099/rgb_1118_rgb_left.mp4,NYUv2/test/bedroom_0099/rgb_1118_disparity.npz +NYUv2/test/bedroom_0099/rgb_1119_rgb_left.mp4,NYUv2/test/bedroom_0099/rgb_1119_disparity.npz +NYUv2/test/bedroom_0101/rgb_1123_rgb_left.mp4,NYUv2/test/bedroom_0101/rgb_1123_disparity.npz +NYUv2/test/bedroom_0101/rgb_1124_rgb_left.mp4,NYUv2/test/bedroom_0101/rgb_1124_disparity.npz +NYUv2/test/bedroom_0101/rgb_1125_rgb_left.mp4,NYUv2/test/bedroom_0101/rgb_1125_disparity.npz +NYUv2/test/bedroom_0101/rgb_1126_rgb_left.mp4,NYUv2/test/bedroom_0101/rgb_1126_disparity.npz 
+NYUv2/test/bedroom_0102/rgb_1127_rgb_left.mp4,NYUv2/test/bedroom_0102/rgb_1127_disparity.npz +NYUv2/test/bedroom_0102/rgb_1128_rgb_left.mp4,NYUv2/test/bedroom_0102/rgb_1128_disparity.npz +NYUv2/test/bedroom_0103/rgb_1129_rgb_left.mp4,NYUv2/test/bedroom_0103/rgb_1129_disparity.npz +NYUv2/test/bedroom_0103/rgb_1130_rgb_left.mp4,NYUv2/test/bedroom_0103/rgb_1130_disparity.npz +NYUv2/test/bedroom_0103/rgb_1131_rgb_left.mp4,NYUv2/test/bedroom_0103/rgb_1131_disparity.npz +NYUv2/test/bedroom_0105/rgb_1135_rgb_left.mp4,NYUv2/test/bedroom_0105/rgb_1135_disparity.npz +NYUv2/test/bedroom_0105/rgb_1136_rgb_left.mp4,NYUv2/test/bedroom_0105/rgb_1136_disparity.npz +NYUv2/test/bedroom_0108/rgb_1144_rgb_left.mp4,NYUv2/test/bedroom_0108/rgb_1144_disparity.npz +NYUv2/test/bedroom_0108/rgb_1145_rgb_left.mp4,NYUv2/test/bedroom_0108/rgb_1145_disparity.npz +NYUv2/test/bedroom_0108/rgb_1146_rgb_left.mp4,NYUv2/test/bedroom_0108/rgb_1146_disparity.npz +NYUv2/test/bedroom_0109/rgb_1147_rgb_left.mp4,NYUv2/test/bedroom_0109/rgb_1147_disparity.npz +NYUv2/test/bedroom_0109/rgb_1148_rgb_left.mp4,NYUv2/test/bedroom_0109/rgb_1148_disparity.npz +NYUv2/test/bedroom_0109/rgb_1149_rgb_left.mp4,NYUv2/test/bedroom_0109/rgb_1149_disparity.npz +NYUv2/test/bedroom_0110/rgb_1150_rgb_left.mp4,NYUv2/test/bedroom_0110/rgb_1150_disparity.npz +NYUv2/test/bedroom_0110/rgb_1151_rgb_left.mp4,NYUv2/test/bedroom_0110/rgb_1151_disparity.npz +NYUv2/test/bedroom_0110/rgb_1152_rgb_left.mp4,NYUv2/test/bedroom_0110/rgb_1152_disparity.npz +NYUv2/test/bedroom_0111/rgb_1153_rgb_left.mp4,NYUv2/test/bedroom_0111/rgb_1153_disparity.npz +NYUv2/test/bedroom_0111/rgb_1154_rgb_left.mp4,NYUv2/test/bedroom_0111/rgb_1154_disparity.npz +NYUv2/test/bedroom_0111/rgb_1155_rgb_left.mp4,NYUv2/test/bedroom_0111/rgb_1155_disparity.npz +NYUv2/test/bedroom_0112/rgb_1156_rgb_left.mp4,NYUv2/test/bedroom_0112/rgb_1156_disparity.npz +NYUv2/test/bedroom_0112/rgb_1157_rgb_left.mp4,NYUv2/test/bedroom_0112/rgb_1157_disparity.npz +NYUv2/test/bedroom_0112/rgb_1158_rgb_left.mp4,NYUv2/test/bedroom_0112/rgb_1158_disparity.npz +NYUv2/test/bedroom_0114/rgb_1162_rgb_left.mp4,NYUv2/test/bedroom_0114/rgb_1162_disparity.npz +NYUv2/test/bedroom_0114/rgb_1163_rgb_left.mp4,NYUv2/test/bedroom_0114/rgb_1163_disparity.npz +NYUv2/test/bedroom_0114/rgb_1164_rgb_left.mp4,NYUv2/test/bedroom_0114/rgb_1164_disparity.npz +NYUv2/test/bedroom_0115/rgb_1165_rgb_left.mp4,NYUv2/test/bedroom_0115/rgb_1165_disparity.npz +NYUv2/test/bedroom_0115/rgb_1166_rgb_left.mp4,NYUv2/test/bedroom_0115/rgb_1166_disparity.npz +NYUv2/test/bedroom_0115/rgb_1167_rgb_left.mp4,NYUv2/test/bedroom_0115/rgb_1167_disparity.npz +NYUv2/test/bedroom_0117/rgb_1170_rgb_left.mp4,NYUv2/test/bedroom_0117/rgb_1170_disparity.npz +NYUv2/test/bedroom_0117/rgb_1171_rgb_left.mp4,NYUv2/test/bedroom_0117/rgb_1171_disparity.npz +NYUv2/test/bedroom_0119/rgb_1174_rgb_left.mp4,NYUv2/test/bedroom_0119/rgb_1174_disparity.npz +NYUv2/test/bedroom_0119/rgb_1175_rgb_left.mp4,NYUv2/test/bedroom_0119/rgb_1175_disparity.npz +NYUv2/test/bedroom_0119/rgb_1176_rgb_left.mp4,NYUv2/test/bedroom_0119/rgb_1176_disparity.npz +NYUv2/test/bedroom_0121/rgb_1179_rgb_left.mp4,NYUv2/test/bedroom_0121/rgb_1179_disparity.npz +NYUv2/test/bedroom_0121/rgb_1180_rgb_left.mp4,NYUv2/test/bedroom_0121/rgb_1180_disparity.npz +NYUv2/test/bedroom_0122/rgb_1181_rgb_left.mp4,NYUv2/test/bedroom_0122/rgb_1181_disparity.npz +NYUv2/test/bedroom_0122/rgb_1182_rgb_left.mp4,NYUv2/test/bedroom_0122/rgb_1182_disparity.npz 
+NYUv2/test/bedroom_0123/rgb_1183_rgb_left.mp4,NYUv2/test/bedroom_0123/rgb_1183_disparity.npz +NYUv2/test/bedroom_0123/rgb_1184_rgb_left.mp4,NYUv2/test/bedroom_0123/rgb_1184_disparity.npz +NYUv2/test/bedroom_0127/rgb_1192_rgb_left.mp4,NYUv2/test/bedroom_0127/rgb_1192_disparity.npz +NYUv2/test/bedroom_0127/rgb_1193_rgb_left.mp4,NYUv2/test/bedroom_0127/rgb_1193_disparity.npz +NYUv2/test/bedroom_0127/rgb_1194_rgb_left.mp4,NYUv2/test/bedroom_0127/rgb_1194_disparity.npz +NYUv2/test/bedroom_0128/rgb_1195_rgb_left.mp4,NYUv2/test/bedroom_0128/rgb_1195_disparity.npz +NYUv2/test/bedroom_0128/rgb_1196_rgb_left.mp4,NYUv2/test/bedroom_0128/rgb_1196_disparity.npz +NYUv2/test/living_room_0025/rgb_1201_rgb_left.mp4,NYUv2/test/living_room_0025/rgb_1201_disparity.npz +NYUv2/test/living_room_0025/rgb_1202_rgb_left.mp4,NYUv2/test/living_room_0025/rgb_1202_disparity.npz +NYUv2/test/living_room_0025/rgb_1203_rgb_left.mp4,NYUv2/test/living_room_0025/rgb_1203_disparity.npz +NYUv2/test/living_room_0026/rgb_1204_rgb_left.mp4,NYUv2/test/living_room_0026/rgb_1204_disparity.npz +NYUv2/test/living_room_0026/rgb_1205_rgb_left.mp4,NYUv2/test/living_room_0026/rgb_1205_disparity.npz +NYUv2/test/living_room_0027/rgb_1206_rgb_left.mp4,NYUv2/test/living_room_0027/rgb_1206_disparity.npz +NYUv2/test/living_room_0027/rgb_1207_rgb_left.mp4,NYUv2/test/living_room_0027/rgb_1207_disparity.npz +NYUv2/test/living_room_0027/rgb_1208_rgb_left.mp4,NYUv2/test/living_room_0027/rgb_1208_disparity.npz +NYUv2/test/living_room_0028/rgb_1209_rgb_left.mp4,NYUv2/test/living_room_0028/rgb_1209_disparity.npz +NYUv2/test/living_room_0028/rgb_1210_rgb_left.mp4,NYUv2/test/living_room_0028/rgb_1210_disparity.npz +NYUv2/test/living_room_0028/rgb_1211_rgb_left.mp4,NYUv2/test/living_room_0028/rgb_1211_disparity.npz +NYUv2/test/living_room_0028/rgb_1212_rgb_left.mp4,NYUv2/test/living_room_0028/rgb_1212_disparity.npz +NYUv2/test/living_room_0030/rgb_1216_rgb_left.mp4,NYUv2/test/living_room_0030/rgb_1216_disparity.npz +NYUv2/test/living_room_0030/rgb_1217_rgb_left.mp4,NYUv2/test/living_room_0030/rgb_1217_disparity.npz +NYUv2/test/living_room_0031/rgb_1218_rgb_left.mp4,NYUv2/test/living_room_0031/rgb_1218_disparity.npz +NYUv2/test/living_room_0031/rgb_1219_rgb_left.mp4,NYUv2/test/living_room_0031/rgb_1219_disparity.npz +NYUv2/test/living_room_0031/rgb_1220_rgb_left.mp4,NYUv2/test/living_room_0031/rgb_1220_disparity.npz +NYUv2/test/living_room_0034/rgb_1226_rgb_left.mp4,NYUv2/test/living_room_0034/rgb_1226_disparity.npz +NYUv2/test/living_room_0034/rgb_1227_rgb_left.mp4,NYUv2/test/living_room_0034/rgb_1227_disparity.npz +NYUv2/test/living_room_0034/rgb_1228_rgb_left.mp4,NYUv2/test/living_room_0034/rgb_1228_disparity.npz +NYUv2/test/living_room_0034/rgb_1229_rgb_left.mp4,NYUv2/test/living_room_0034/rgb_1229_disparity.npz +NYUv2/test/living_room_0034/rgb_1230_rgb_left.mp4,NYUv2/test/living_room_0034/rgb_1230_disparity.npz +NYUv2/test/living_room_0036/rgb_1233_rgb_left.mp4,NYUv2/test/living_room_0036/rgb_1233_disparity.npz +NYUv2/test/living_room_0036/rgb_1234_rgb_left.mp4,NYUv2/test/living_room_0036/rgb_1234_disparity.npz +NYUv2/test/living_room_0036/rgb_1235_rgb_left.mp4,NYUv2/test/living_room_0036/rgb_1235_disparity.npz +NYUv2/test/living_room_0041/rgb_1247_rgb_left.mp4,NYUv2/test/living_room_0041/rgb_1247_disparity.npz +NYUv2/test/living_room_0041/rgb_1248_rgb_left.mp4,NYUv2/test/living_room_0041/rgb_1248_disparity.npz +NYUv2/test/living_room_0041/rgb_1249_rgb_left.mp4,NYUv2/test/living_room_0041/rgb_1249_disparity.npz 
+NYUv2/test/living_room_0041/rgb_1250_rgb_left.mp4,NYUv2/test/living_room_0041/rgb_1250_disparity.npz +NYUv2/test/living_room_0043/rgb_1254_rgb_left.mp4,NYUv2/test/living_room_0043/rgb_1254_disparity.npz +NYUv2/test/living_room_0043/rgb_1255_rgb_left.mp4,NYUv2/test/living_room_0043/rgb_1255_disparity.npz +NYUv2/test/living_room_0043/rgb_1256_rgb_left.mp4,NYUv2/test/living_room_0043/rgb_1256_disparity.npz +NYUv2/test/living_room_0043/rgb_1257_rgb_left.mp4,NYUv2/test/living_room_0043/rgb_1257_disparity.npz +NYUv2/test/living_room_0044/rgb_1258_rgb_left.mp4,NYUv2/test/living_room_0044/rgb_1258_disparity.npz +NYUv2/test/living_room_0044/rgb_1259_rgb_left.mp4,NYUv2/test/living_room_0044/rgb_1259_disparity.npz +NYUv2/test/living_room_0044/rgb_1260_rgb_left.mp4,NYUv2/test/living_room_0044/rgb_1260_disparity.npz +NYUv2/test/living_room_0044/rgb_1261_rgb_left.mp4,NYUv2/test/living_room_0044/rgb_1261_disparity.npz +NYUv2/test/living_room_0045/rgb_1262_rgb_left.mp4,NYUv2/test/living_room_0045/rgb_1262_disparity.npz +NYUv2/test/living_room_0045/rgb_1263_rgb_left.mp4,NYUv2/test/living_room_0045/rgb_1263_disparity.npz +NYUv2/test/living_room_0045/rgb_1264_rgb_left.mp4,NYUv2/test/living_room_0045/rgb_1264_disparity.npz +NYUv2/test/living_room_0045/rgb_1265_rgb_left.mp4,NYUv2/test/living_room_0045/rgb_1265_disparity.npz +NYUv2/test/living_room_0048/rgb_1275_rgb_left.mp4,NYUv2/test/living_room_0048/rgb_1275_disparity.npz +NYUv2/test/living_room_0048/rgb_1276_rgb_left.mp4,NYUv2/test/living_room_0048/rgb_1276_disparity.npz +NYUv2/test/living_room_0049/rgb_1277_rgb_left.mp4,NYUv2/test/living_room_0049/rgb_1277_disparity.npz +NYUv2/test/living_room_0049/rgb_1278_rgb_left.mp4,NYUv2/test/living_room_0049/rgb_1278_disparity.npz +NYUv2/test/living_room_0049/rgb_1279_rgb_left.mp4,NYUv2/test/living_room_0049/rgb_1279_disparity.npz +NYUv2/test/living_room_0049/rgb_1280_rgb_left.mp4,NYUv2/test/living_room_0049/rgb_1280_disparity.npz +NYUv2/test/living_room_0051/rgb_1285_rgb_left.mp4,NYUv2/test/living_room_0051/rgb_1285_disparity.npz +NYUv2/test/living_room_0051/rgb_1286_rgb_left.mp4,NYUv2/test/living_room_0051/rgb_1286_disparity.npz +NYUv2/test/living_room_0051/rgb_1287_rgb_left.mp4,NYUv2/test/living_room_0051/rgb_1287_disparity.npz +NYUv2/test/living_room_0051/rgb_1288_rgb_left.mp4,NYUv2/test/living_room_0051/rgb_1288_disparity.npz +NYUv2/test/living_room_0052/rgb_1289_rgb_left.mp4,NYUv2/test/living_room_0052/rgb_1289_disparity.npz +NYUv2/test/living_room_0052/rgb_1290_rgb_left.mp4,NYUv2/test/living_room_0052/rgb_1290_disparity.npz +NYUv2/test/living_room_0053/rgb_1291_rgb_left.mp4,NYUv2/test/living_room_0053/rgb_1291_disparity.npz +NYUv2/test/living_room_0053/rgb_1292_rgb_left.mp4,NYUv2/test/living_room_0053/rgb_1292_disparity.npz +NYUv2/test/living_room_0053/rgb_1293_rgb_left.mp4,NYUv2/test/living_room_0053/rgb_1293_disparity.npz +NYUv2/test/living_room_0054/rgb_1294_rgb_left.mp4,NYUv2/test/living_room_0054/rgb_1294_disparity.npz +NYUv2/test/living_room_0054/rgb_1295_rgb_left.mp4,NYUv2/test/living_room_0054/rgb_1295_disparity.npz +NYUv2/test/living_room_0056/rgb_1297_rgb_left.mp4,NYUv2/test/living_room_0056/rgb_1297_disparity.npz +NYUv2/test/living_room_0057/rgb_1298_rgb_left.mp4,NYUv2/test/living_room_0057/rgb_1298_disparity.npz +NYUv2/test/living_room_0057/rgb_1299_rgb_left.mp4,NYUv2/test/living_room_0057/rgb_1299_disparity.npz +NYUv2/test/living_room_0059/rgb_1302_rgb_left.mp4,NYUv2/test/living_room_0059/rgb_1302_disparity.npz 
+NYUv2/test/living_room_0059/rgb_1303_rgb_left.mp4,NYUv2/test/living_room_0059/rgb_1303_disparity.npz +NYUv2/test/living_room_0059/rgb_1304_rgb_left.mp4,NYUv2/test/living_room_0059/rgb_1304_disparity.npz +NYUv2/test/living_room_0059/rgb_1305_rgb_left.mp4,NYUv2/test/living_room_0059/rgb_1305_disparity.npz +NYUv2/test/living_room_0060/rgb_1306_rgb_left.mp4,NYUv2/test/living_room_0060/rgb_1306_disparity.npz +NYUv2/test/living_room_0060/rgb_1307_rgb_left.mp4,NYUv2/test/living_room_0060/rgb_1307_disparity.npz +NYUv2/test/living_room_0061/rgb_1308_rgb_left.mp4,NYUv2/test/living_room_0061/rgb_1308_disparity.npz +NYUv2/test/living_room_0064/rgb_1314_rgb_left.mp4,NYUv2/test/living_room_0064/rgb_1314_disparity.npz +NYUv2/test/living_room_0066/rgb_1315_rgb_left.mp4,NYUv2/test/living_room_0066/rgb_1315_disparity.npz +NYUv2/test/living_room_0072/rgb_1329_rgb_left.mp4,NYUv2/test/living_room_0072/rgb_1329_disparity.npz +NYUv2/test/living_room_0075/rgb_1330_rgb_left.mp4,NYUv2/test/living_room_0075/rgb_1330_disparity.npz +NYUv2/test/living_room_0075/rgb_1331_rgb_left.mp4,NYUv2/test/living_room_0075/rgb_1331_disparity.npz +NYUv2/test/living_room_0076/rgb_1332_rgb_left.mp4,NYUv2/test/living_room_0076/rgb_1332_disparity.npz +NYUv2/test/living_room_0079/rgb_1335_rgb_left.mp4,NYUv2/test/living_room_0079/rgb_1335_disparity.npz +NYUv2/test/living_room_0079/rgb_1336_rgb_left.mp4,NYUv2/test/living_room_0079/rgb_1336_disparity.npz +NYUv2/test/living_room_0079/rgb_1337_rgb_left.mp4,NYUv2/test/living_room_0079/rgb_1337_disparity.npz +NYUv2/test/living_room_0080/rgb_1338_rgb_left.mp4,NYUv2/test/living_room_0080/rgb_1338_disparity.npz +NYUv2/test/living_room_0080/rgb_1339_rgb_left.mp4,NYUv2/test/living_room_0080/rgb_1339_disparity.npz +NYUv2/test/living_room_0080/rgb_1340_rgb_left.mp4,NYUv2/test/living_room_0080/rgb_1340_disparity.npz +NYUv2/test/dining_room_0003/rgb_1347_rgb_left.mp4,NYUv2/test/dining_room_0003/rgb_1347_disparity.npz +NYUv2/test/dining_room_0003/rgb_1348_rgb_left.mp4,NYUv2/test/dining_room_0003/rgb_1348_disparity.npz +NYUv2/test/dining_room_0003/rgb_1349_rgb_left.mp4,NYUv2/test/dining_room_0003/rgb_1349_disparity.npz +NYUv2/test/dining_room_0005/rgb_1353_rgb_left.mp4,NYUv2/test/dining_room_0005/rgb_1353_disparity.npz +NYUv2/test/dining_room_0005/rgb_1354_rgb_left.mp4,NYUv2/test/dining_room_0005/rgb_1354_disparity.npz +NYUv2/test/dining_room_0006/rgb_1355_rgb_left.mp4,NYUv2/test/dining_room_0006/rgb_1355_disparity.npz +NYUv2/test/dining_room_0006/rgb_1356_rgb_left.mp4,NYUv2/test/dining_room_0006/rgb_1356_disparity.npz +NYUv2/test/dining_room_0009/rgb_1364_rgb_left.mp4,NYUv2/test/dining_room_0009/rgb_1364_disparity.npz +NYUv2/test/dining_room_0009/rgb_1365_rgb_left.mp4,NYUv2/test/dining_room_0009/rgb_1365_disparity.npz +NYUv2/test/dining_room_0011/rgb_1368_rgb_left.mp4,NYUv2/test/dining_room_0011/rgb_1368_disparity.npz +NYUv2/test/dining_room_0011/rgb_1369_rgb_left.mp4,NYUv2/test/dining_room_0011/rgb_1369_disparity.npz +NYUv2/test/dining_room_0017/rgb_1384_rgb_left.mp4,NYUv2/test/dining_room_0017/rgb_1384_disparity.npz +NYUv2/test/dining_room_0017/rgb_1385_rgb_left.mp4,NYUv2/test/dining_room_0017/rgb_1385_disparity.npz +NYUv2/test/dining_room_0017/rgb_1386_rgb_left.mp4,NYUv2/test/dining_room_0017/rgb_1386_disparity.npz +NYUv2/test/dining_room_0018/rgb_1387_rgb_left.mp4,NYUv2/test/dining_room_0018/rgb_1387_disparity.npz +NYUv2/test/dining_room_0018/rgb_1388_rgb_left.mp4,NYUv2/test/dining_room_0018/rgb_1388_disparity.npz 
+NYUv2/test/dining_room_0018/rgb_1389_rgb_left.mp4,NYUv2/test/dining_room_0018/rgb_1389_disparity.npz +NYUv2/test/dining_room_0018/rgb_1390_rgb_left.mp4,NYUv2/test/dining_room_0018/rgb_1390_disparity.npz +NYUv2/test/dining_room_0018/rgb_1391_rgb_left.mp4,NYUv2/test/dining_room_0018/rgb_1391_disparity.npz +NYUv2/test/dining_room_0020/rgb_1394_rgb_left.mp4,NYUv2/test/dining_room_0020/rgb_1394_disparity.npz +NYUv2/test/dining_room_0020/rgb_1395_rgb_left.mp4,NYUv2/test/dining_room_0020/rgb_1395_disparity.npz +NYUv2/test/dining_room_0020/rgb_1396_rgb_left.mp4,NYUv2/test/dining_room_0020/rgb_1396_disparity.npz +NYUv2/test/dining_room_0021/rgb_1397_rgb_left.mp4,NYUv2/test/dining_room_0021/rgb_1397_disparity.npz +NYUv2/test/dining_room_0021/rgb_1398_rgb_left.mp4,NYUv2/test/dining_room_0021/rgb_1398_disparity.npz +NYUv2/test/dining_room_0021/rgb_1399_rgb_left.mp4,NYUv2/test/dining_room_0021/rgb_1399_disparity.npz +NYUv2/test/dining_room_0022/rgb_1400_rgb_left.mp4,NYUv2/test/dining_room_0022/rgb_1400_disparity.npz +NYUv2/test/dining_room_0022/rgb_1401_rgb_left.mp4,NYUv2/test/dining_room_0022/rgb_1401_disparity.npz +NYUv2/test/dining_room_0025/rgb_1407_rgb_left.mp4,NYUv2/test/dining_room_0025/rgb_1407_disparity.npz +NYUv2/test/dining_room_0025/rgb_1408_rgb_left.mp4,NYUv2/test/dining_room_0025/rgb_1408_disparity.npz +NYUv2/test/dining_room_0025/rgb_1409_rgb_left.mp4,NYUv2/test/dining_room_0025/rgb_1409_disparity.npz +NYUv2/test/dining_room_0025/rgb_1410_rgb_left.mp4,NYUv2/test/dining_room_0025/rgb_1410_disparity.npz +NYUv2/test/dining_room_0025/rgb_1411_rgb_left.mp4,NYUv2/test/dining_room_0025/rgb_1411_disparity.npz +NYUv2/test/dining_room_0026/rgb_1412_rgb_left.mp4,NYUv2/test/dining_room_0026/rgb_1412_disparity.npz +NYUv2/test/dining_room_0026/rgb_1413_rgb_left.mp4,NYUv2/test/dining_room_0026/rgb_1413_disparity.npz +NYUv2/test/dining_room_0026/rgb_1414_rgb_left.mp4,NYUv2/test/dining_room_0026/rgb_1414_disparity.npz +NYUv2/test/dining_room_0030/rgb_1421_rgb_left.mp4,NYUv2/test/dining_room_0030/rgb_1421_disparity.npz +NYUv2/test/dining_room_0030/rgb_1422_rgb_left.mp4,NYUv2/test/dining_room_0030/rgb_1422_disparity.npz +NYUv2/test/dining_room_0030/rgb_1423_rgb_left.mp4,NYUv2/test/dining_room_0030/rgb_1423_disparity.npz +NYUv2/test/dining_room_0030/rgb_1424_rgb_left.mp4,NYUv2/test/dining_room_0030/rgb_1424_disparity.npz +NYUv2/test/dining_room_0032/rgb_1430_rgb_left.mp4,NYUv2/test/dining_room_0032/rgb_1430_disparity.npz +NYUv2/test/dining_room_0032/rgb_1431_rgb_left.mp4,NYUv2/test/dining_room_0032/rgb_1431_disparity.npz +NYUv2/test/dining_room_0032/rgb_1432_rgb_left.mp4,NYUv2/test/dining_room_0032/rgb_1432_disparity.npz +NYUv2/test/dining_room_0032/rgb_1433_rgb_left.mp4,NYUv2/test/dining_room_0032/rgb_1433_disparity.npz +NYUv2/test/dining_room_0035/rgb_1441_rgb_left.mp4,NYUv2/test/dining_room_0035/rgb_1441_disparity.npz +NYUv2/test/dining_room_0035/rgb_1442_rgb_left.mp4,NYUv2/test/dining_room_0035/rgb_1442_disparity.npz +NYUv2/test/dining_room_0035/rgb_1443_rgb_left.mp4,NYUv2/test/dining_room_0035/rgb_1443_disparity.npz +NYUv2/test/dining_room_0036/rgb_1444_rgb_left.mp4,NYUv2/test/dining_room_0036/rgb_1444_disparity.npz +NYUv2/test/dining_room_0036/rgb_1445_rgb_left.mp4,NYUv2/test/dining_room_0036/rgb_1445_disparity.npz +NYUv2/test/dining_room_0036/rgb_1446_rgb_left.mp4,NYUv2/test/dining_room_0036/rgb_1446_disparity.npz +NYUv2/test/dining_room_0036/rgb_1447_rgb_left.mp4,NYUv2/test/dining_room_0036/rgb_1447_disparity.npz 
+NYUv2/test/dining_room_0036/rgb_1448_rgb_left.mp4,NYUv2/test/dining_room_0036/rgb_1448_disparity.npz +NYUv2/test/dining_room_0036/rgb_1449_rgb_left.mp4,NYUv2/test/dining_room_0036/rgb_1449_disparity.npz diff --git a/inference/v2v_data/DepthCrafter/benchmark/csv/meta_scannet_test.csv b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_scannet_test.csv new file mode 100644 index 0000000000000000000000000000000000000000..50de835479755c0d6fbead33b810d9262b298939 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_scannet_test.csv @@ -0,0 +1,101 @@ +filepath_left,filepath_disparity +scannet/scene0707_00_rgb_left.mp4,scannet/scene0707_00_disparity.npz +scannet/scene0708_00_rgb_left.mp4,scannet/scene0708_00_disparity.npz +scannet/scene0709_00_rgb_left.mp4,scannet/scene0709_00_disparity.npz +scannet/scene0710_00_rgb_left.mp4,scannet/scene0710_00_disparity.npz +scannet/scene0711_00_rgb_left.mp4,scannet/scene0711_00_disparity.npz +scannet/scene0712_00_rgb_left.mp4,scannet/scene0712_00_disparity.npz +scannet/scene0713_00_rgb_left.mp4,scannet/scene0713_00_disparity.npz +scannet/scene0714_00_rgb_left.mp4,scannet/scene0714_00_disparity.npz +scannet/scene0715_00_rgb_left.mp4,scannet/scene0715_00_disparity.npz +scannet/scene0716_00_rgb_left.mp4,scannet/scene0716_00_disparity.npz +scannet/scene0717_00_rgb_left.mp4,scannet/scene0717_00_disparity.npz +scannet/scene0718_00_rgb_left.mp4,scannet/scene0718_00_disparity.npz +scannet/scene0719_00_rgb_left.mp4,scannet/scene0719_00_disparity.npz +scannet/scene0720_00_rgb_left.mp4,scannet/scene0720_00_disparity.npz +scannet/scene0721_00_rgb_left.mp4,scannet/scene0721_00_disparity.npz +scannet/scene0722_00_rgb_left.mp4,scannet/scene0722_00_disparity.npz +scannet/scene0723_00_rgb_left.mp4,scannet/scene0723_00_disparity.npz +scannet/scene0724_00_rgb_left.mp4,scannet/scene0724_00_disparity.npz +scannet/scene0725_00_rgb_left.mp4,scannet/scene0725_00_disparity.npz +scannet/scene0726_00_rgb_left.mp4,scannet/scene0726_00_disparity.npz +scannet/scene0727_00_rgb_left.mp4,scannet/scene0727_00_disparity.npz +scannet/scene0728_00_rgb_left.mp4,scannet/scene0728_00_disparity.npz +scannet/scene0729_00_rgb_left.mp4,scannet/scene0729_00_disparity.npz +scannet/scene0730_00_rgb_left.mp4,scannet/scene0730_00_disparity.npz +scannet/scene0731_00_rgb_left.mp4,scannet/scene0731_00_disparity.npz +scannet/scene0732_00_rgb_left.mp4,scannet/scene0732_00_disparity.npz +scannet/scene0733_00_rgb_left.mp4,scannet/scene0733_00_disparity.npz +scannet/scene0734_00_rgb_left.mp4,scannet/scene0734_00_disparity.npz +scannet/scene0735_00_rgb_left.mp4,scannet/scene0735_00_disparity.npz +scannet/scene0736_00_rgb_left.mp4,scannet/scene0736_00_disparity.npz +scannet/scene0737_00_rgb_left.mp4,scannet/scene0737_00_disparity.npz +scannet/scene0738_00_rgb_left.mp4,scannet/scene0738_00_disparity.npz +scannet/scene0739_00_rgb_left.mp4,scannet/scene0739_00_disparity.npz +scannet/scene0740_00_rgb_left.mp4,scannet/scene0740_00_disparity.npz +scannet/scene0741_00_rgb_left.mp4,scannet/scene0741_00_disparity.npz +scannet/scene0742_00_rgb_left.mp4,scannet/scene0742_00_disparity.npz +scannet/scene0743_00_rgb_left.mp4,scannet/scene0743_00_disparity.npz +scannet/scene0744_00_rgb_left.mp4,scannet/scene0744_00_disparity.npz +scannet/scene0745_00_rgb_left.mp4,scannet/scene0745_00_disparity.npz +scannet/scene0746_00_rgb_left.mp4,scannet/scene0746_00_disparity.npz +scannet/scene0747_00_rgb_left.mp4,scannet/scene0747_00_disparity.npz +scannet/scene0748_00_rgb_left.mp4,scannet/scene0748_00_disparity.npz 
+scannet/scene0749_00_rgb_left.mp4,scannet/scene0749_00_disparity.npz +scannet/scene0750_00_rgb_left.mp4,scannet/scene0750_00_disparity.npz +scannet/scene0751_00_rgb_left.mp4,scannet/scene0751_00_disparity.npz +scannet/scene0752_00_rgb_left.mp4,scannet/scene0752_00_disparity.npz +scannet/scene0753_00_rgb_left.mp4,scannet/scene0753_00_disparity.npz +scannet/scene0754_00_rgb_left.mp4,scannet/scene0754_00_disparity.npz +scannet/scene0755_00_rgb_left.mp4,scannet/scene0755_00_disparity.npz +scannet/scene0756_00_rgb_left.mp4,scannet/scene0756_00_disparity.npz +scannet/scene0757_00_rgb_left.mp4,scannet/scene0757_00_disparity.npz +scannet/scene0758_00_rgb_left.mp4,scannet/scene0758_00_disparity.npz +scannet/scene0759_00_rgb_left.mp4,scannet/scene0759_00_disparity.npz +scannet/scene0760_00_rgb_left.mp4,scannet/scene0760_00_disparity.npz +scannet/scene0761_00_rgb_left.mp4,scannet/scene0761_00_disparity.npz +scannet/scene0762_00_rgb_left.mp4,scannet/scene0762_00_disparity.npz +scannet/scene0763_00_rgb_left.mp4,scannet/scene0763_00_disparity.npz +scannet/scene0764_00_rgb_left.mp4,scannet/scene0764_00_disparity.npz +scannet/scene0765_00_rgb_left.mp4,scannet/scene0765_00_disparity.npz +scannet/scene0766_00_rgb_left.mp4,scannet/scene0766_00_disparity.npz +scannet/scene0767_00_rgb_left.mp4,scannet/scene0767_00_disparity.npz +scannet/scene0768_00_rgb_left.mp4,scannet/scene0768_00_disparity.npz +scannet/scene0769_00_rgb_left.mp4,scannet/scene0769_00_disparity.npz +scannet/scene0770_00_rgb_left.mp4,scannet/scene0770_00_disparity.npz +scannet/scene0771_00_rgb_left.mp4,scannet/scene0771_00_disparity.npz +scannet/scene0772_00_rgb_left.mp4,scannet/scene0772_00_disparity.npz +scannet/scene0773_00_rgb_left.mp4,scannet/scene0773_00_disparity.npz +scannet/scene0774_00_rgb_left.mp4,scannet/scene0774_00_disparity.npz +scannet/scene0775_00_rgb_left.mp4,scannet/scene0775_00_disparity.npz +scannet/scene0776_00_rgb_left.mp4,scannet/scene0776_00_disparity.npz +scannet/scene0777_00_rgb_left.mp4,scannet/scene0777_00_disparity.npz +scannet/scene0778_00_rgb_left.mp4,scannet/scene0778_00_disparity.npz +scannet/scene0779_00_rgb_left.mp4,scannet/scene0779_00_disparity.npz +scannet/scene0780_00_rgb_left.mp4,scannet/scene0780_00_disparity.npz +scannet/scene0781_00_rgb_left.mp4,scannet/scene0781_00_disparity.npz +scannet/scene0782_00_rgb_left.mp4,scannet/scene0782_00_disparity.npz +scannet/scene0783_00_rgb_left.mp4,scannet/scene0783_00_disparity.npz +scannet/scene0784_00_rgb_left.mp4,scannet/scene0784_00_disparity.npz +scannet/scene0785_00_rgb_left.mp4,scannet/scene0785_00_disparity.npz +scannet/scene0786_00_rgb_left.mp4,scannet/scene0786_00_disparity.npz +scannet/scene0787_00_rgb_left.mp4,scannet/scene0787_00_disparity.npz +scannet/scene0788_00_rgb_left.mp4,scannet/scene0788_00_disparity.npz +scannet/scene0789_00_rgb_left.mp4,scannet/scene0789_00_disparity.npz +scannet/scene0790_00_rgb_left.mp4,scannet/scene0790_00_disparity.npz +scannet/scene0791_00_rgb_left.mp4,scannet/scene0791_00_disparity.npz +scannet/scene0792_00_rgb_left.mp4,scannet/scene0792_00_disparity.npz +scannet/scene0793_00_rgb_left.mp4,scannet/scene0793_00_disparity.npz +scannet/scene0794_00_rgb_left.mp4,scannet/scene0794_00_disparity.npz +scannet/scene0795_00_rgb_left.mp4,scannet/scene0795_00_disparity.npz +scannet/scene0796_00_rgb_left.mp4,scannet/scene0796_00_disparity.npz +scannet/scene0797_00_rgb_left.mp4,scannet/scene0797_00_disparity.npz +scannet/scene0798_00_rgb_left.mp4,scannet/scene0798_00_disparity.npz 
+scannet/scene0799_00_rgb_left.mp4,scannet/scene0799_00_disparity.npz +scannet/scene0800_00_rgb_left.mp4,scannet/scene0800_00_disparity.npz +scannet/scene0801_00_rgb_left.mp4,scannet/scene0801_00_disparity.npz +scannet/scene0802_00_rgb_left.mp4,scannet/scene0802_00_disparity.npz +scannet/scene0803_00_rgb_left.mp4,scannet/scene0803_00_disparity.npz +scannet/scene0804_00_rgb_left.mp4,scannet/scene0804_00_disparity.npz +scannet/scene0805_00_rgb_left.mp4,scannet/scene0805_00_disparity.npz +scannet/scene0806_00_rgb_left.mp4,scannet/scene0806_00_disparity.npz diff --git a/inference/v2v_data/DepthCrafter/benchmark/csv/meta_sintel.csv b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_sintel.csv new file mode 100644 index 0000000000000000000000000000000000000000..2e84b64aaa86fe4a9497799514be1e62fd87aa05 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/csv/meta_sintel.csv @@ -0,0 +1,24 @@ +filepath_left,filepath_disparity +sintel/ambush_5_rgb_left.mp4,sintel/ambush_5_disparity.npz +sintel/bamboo_2_rgb_left.mp4,sintel/bamboo_2_disparity.npz +sintel/mountain_1_rgb_left.mp4,sintel/mountain_1_disparity.npz +sintel/bamboo_1_rgb_left.mp4,sintel/bamboo_1_disparity.npz +sintel/shaman_2_rgb_left.mp4,sintel/shaman_2_disparity.npz +sintel/ambush_6_rgb_left.mp4,sintel/ambush_6_disparity.npz +sintel/bandage_1_rgb_left.mp4,sintel/bandage_1_disparity.npz +sintel/alley_1_rgb_left.mp4,sintel/alley_1_disparity.npz +sintel/temple_3_rgb_left.mp4,sintel/temple_3_disparity.npz +sintel/shaman_3_rgb_left.mp4,sintel/shaman_3_disparity.npz +sintel/ambush_2_rgb_left.mp4,sintel/ambush_2_disparity.npz +sintel/cave_4_rgb_left.mp4,sintel/cave_4_disparity.npz +sintel/cave_2_rgb_left.mp4,sintel/cave_2_disparity.npz +sintel/alley_2_rgb_left.mp4,sintel/alley_2_disparity.npz +sintel/market_5_rgb_left.mp4,sintel/market_5_disparity.npz +sintel/sleeping_2_rgb_left.mp4,sintel/sleeping_2_disparity.npz +sintel/ambush_4_rgb_left.mp4,sintel/ambush_4_disparity.npz +sintel/sleeping_1_rgb_left.mp4,sintel/sleeping_1_disparity.npz +sintel/market_6_rgb_left.mp4,sintel/market_6_disparity.npz +sintel/market_2_rgb_left.mp4,sintel/market_2_disparity.npz +sintel/bandage_2_rgb_left.mp4,sintel/bandage_2_disparity.npz +sintel/ambush_7_rgb_left.mp4,sintel/ambush_7_disparity.npz +sintel/temple_2_rgb_left.mp4,sintel/temple_2_disparity.npz diff --git a/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_bonn.py b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_bonn.py new file mode 100644 index 0000000000000000000000000000000000000000..a62abaafa807506dc67e3c94017f5a716794234c --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_bonn.py @@ -0,0 +1,155 @@ +import os +import numpy as np +import os.path as osp +from PIL import Image +from tqdm import tqdm +import imageio +import csv + + +def depth_read(filename): + # loads depth map D from png file + # and returns it as a numpy array + + depth_png = np.asarray(Image.open(filename)) + # make sure we have a proper 16bit depth map here.. not 8bit! 
+ assert np.max(depth_png) > 255 + + depth = depth_png.astype(np.float64) / 5000.0 + depth[depth_png == 0] = -1.0 + return depth + + +def extract_bonn( + root, + depth_root, + sample_len=-1, + csv_save_path="", + datatset_name="", + saved_rgb_dir="", + saved_disp_dir="", + start_frame=0, + end_frame=110, +): + scenes_names = os.listdir(depth_root) + all_samples = [] + for i, seq_name in enumerate(tqdm(scenes_names)): + # load all images + all_img_names = os.listdir(osp.join(depth_root, seq_name, "rgb")) + all_img_names = [x for x in all_img_names if x.endswith(".png")] + print(f"sequence frame number: {len(all_img_names)}") + + # for not zero padding image name + all_img_names.sort() + all_img_names = sorted(all_img_names, key=lambda x: int(x.split(".")[0][-4:])) + all_img_names = all_img_names[start_frame:end_frame] + + all_depth_names = os.listdir(osp.join(depth_root, seq_name, "depth")) + all_depth_names = [x for x in all_depth_names if x.endswith(".png")] + print(f"sequence depth number: {len(all_depth_names)}") + + # for not zero padding image name + all_depth_names.sort() + all_depth_names = sorted( + all_depth_names, key=lambda x: int(x.split(".")[0][-4:]) + ) + all_depth_names = all_depth_names[start_frame:end_frame] + + seq_len = len(all_img_names) + step = sample_len if sample_len > 0 else seq_len + + for ref_idx in range(0, seq_len, step): + print(f"Progress: {seq_name}, {ref_idx // step + 1} / {seq_len//step}") + + video_imgs = [] + video_depths = [] + + if (ref_idx + step) <= seq_len: + ref_e = ref_idx + step + else: + continue + + # for idx in range(ref_idx, ref_idx + step): + for idx in range(ref_idx, ref_e): + im_path = osp.join(root, seq_name, "rgb", all_img_names[idx]) + depth_path = osp.join( + depth_root, seq_name, "depth", all_depth_names[idx] + ) + + depth = depth_read(depth_path) + disp = depth + + video_depths.append(disp) + video_imgs.append(np.array(Image.open(im_path))) + + disp_video = np.array(video_depths)[:, None] # [:, 0:1, :, :, 0] + img_video = np.array(video_imgs)[..., 0:3] # [:, 0, :, :, 0:3] + + print(disp_video.max(), disp_video.min()) + + def even_or_odd(num): + if num % 2 == 0: + return num + else: + return num - 1 + + # print(disp_video.shape) + # print(img_video.shape) + height = disp_video.shape[-2] + width = disp_video.shape[-1] + height = even_or_odd(height) + width = even_or_odd(width) + disp_video = disp_video[:, :, 0:height, 0:width] + img_video = img_video[:, 0:height, 0:width] + + data_root = saved_rgb_dir + datatset_name + disp_root = saved_disp_dir + datatset_name + os.makedirs(data_root, exist_ok=True) + os.makedirs(disp_root, exist_ok=True) + + img_video_dir = data_root + disp_video_dir = disp_root + + img_video_path = os.path.join(img_video_dir, f"{seq_name}_rgb_left.mp4") + disp_video_path = os.path.join(disp_video_dir, f"{seq_name}_disparity.npz") + + imageio.mimsave( + img_video_path, img_video, fps=15, quality=9, macro_block_size=1 + ) + np.savez(disp_video_path, disparity=disp_video) + + sample = {} + sample["filepath_left"] = os.path.join( + f"{datatset_name}/{seq_name}_rgb_left.mp4" + ) # img_video_path + sample["filepath_disparity"] = os.path.join( + f"{datatset_name}/{seq_name}_disparity.npz" + ) # disp_video_path + + all_samples.append(sample) + + # save csv file + + filename_ = csv_save_path + os.makedirs(os.path.dirname(filename_), exist_ok=True) + fields = ["filepath_left", "filepath_disparity"] + with open(filename_, "w") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fields) + writer.writeheader() + 
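+        # one row per extracted sequence: relative paths to the RGB .mp4 and the
+        # ground-truth disparity .npz, matching the schema of the meta_*.csv files above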
writer.writerows(all_samples) + + print(f"{filename_} has been saved.") + + +if __name__ == "__main__": + extract_bonn( + root="path/to/Bonn-RGBD", + depth_root="path/to/Bonn-RGBD", + saved_rgb_dir="./benchmark/datasets/", + saved_disp_dir="./benchmark/datasets/", + csv_save_path=f"./benchmark/datasets/bonn.csv", + sample_len=-1, + datatset_name="bonn", + start_frame=30, + end_frame=140, + ) diff --git a/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_kitti.py b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_kitti.py new file mode 100644 index 0000000000000000000000000000000000000000..6df251187a18c700b36778155754ba3397d4f1b4 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_kitti.py @@ -0,0 +1,140 @@ +import os +import numpy as np +import os.path as osp +from PIL import Image +from tqdm import tqdm +import csv +import imageio + + +def depth_read(filename): + # loads depth map D from png file + # and returns it as a numpy array, + + depth_png = np.array(Image.open(filename), dtype=int) + # make sure we have a proper 16bit depth map here.. not 8bit! + assert np.max(depth_png) > 255 + + depth = depth_png.astype(np.float64) / 256.0 + depth[depth_png == 0] = -1.0 + return depth + + +def extract_kitti( + root, + depth_root, + sample_len=-1, + csv_save_path="", + datatset_name="", + saved_rgb_dir="", + saved_disp_dir="", + start_frame=0, + end_frame=110, +): + scenes_names = os.listdir(depth_root) + all_samples = [] + for i, seq_name in enumerate(tqdm(scenes_names)): + all_img_names = os.listdir( + osp.join(depth_root, seq_name, "proj_depth/groundtruth/image_02") + ) + all_img_names = [x for x in all_img_names if x.endswith(".png")] + print(f"sequence frame number: {len(all_img_names)}") + + all_img_names.sort() + all_img_names = sorted(all_img_names, key=lambda x: int(x.split(".")[0][-4:])) + all_img_names = all_img_names[start_frame:end_frame] + + seq_len = len(all_img_names) + step = sample_len if sample_len > 0 else seq_len + + for ref_idx in range(0, seq_len, step): + print(f"Progress: {seq_name}, {ref_idx // step + 1} / {seq_len//step}") + + video_imgs = [] + video_depths = [] + + if (ref_idx + step) <= seq_len: + ref_e = ref_idx + step + else: + continue + + for idx in range(ref_idx, ref_e): + im_path = osp.join( + root, seq_name[0:10], seq_name, "image_02/data", all_img_names[idx] + ) + depth_path = osp.join( + depth_root, + seq_name, + "proj_depth/groundtruth/image_02", + all_img_names[idx], + ) + + depth = depth_read(depth_path) + disp = depth + + video_depths.append(disp) + video_imgs.append(np.array(Image.open(im_path))) + + disp_video = np.array(video_depths)[:, None] + img_video = np.array(video_imgs)[..., 0:3] + + def even_or_odd(num): + if num % 2 == 0: + return num + else: + return num - 1 + + height = disp_video.shape[-2] + width = disp_video.shape[-1] + height = even_or_odd(height) + width = even_or_odd(width) + disp_video = disp_video[:, :, 0:height, 0:width] + img_video = img_video[:, 0:height, 0:width] + + data_root = saved_rgb_dir + datatset_name + disp_root = saved_disp_dir + datatset_name + os.makedirs(data_root, exist_ok=True) + os.makedirs(disp_root, exist_ok=True) + + img_video_dir = data_root + disp_video_dir = disp_root + + img_video_path = os.path.join(img_video_dir, f"{seq_name}_rgb_left.mp4") + disp_video_path = os.path.join(disp_video_dir, f"{seq_name}_disparity.npz") + + imageio.mimsave( + img_video_path, img_video, fps=15, quality=10, macro_block_size=1 + ) + 
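+            # depth maps are written under the key "disparity" with shape [t, 1, h, w];
+            # eval.py later reloads them via np.load(...)["disparity"]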
np.savez(disp_video_path, disparity=disp_video) + + sample = {} + sample["filepath_left"] = os.path.join(f"KITTI/{seq_name}_rgb_left.mp4") + sample["filepath_disparity"] = os.path.join( + f"KITTI/{seq_name}_disparity.npz" + ) + + all_samples.append(sample) + + filename_ = csv_save_path + os.makedirs(os.path.dirname(filename_), exist_ok=True) + fields = ["filepath_left", "filepath_disparity"] + with open(filename_, "w") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fields) + writer.writeheader() + writer.writerows(all_samples) + + print(f"{filename_} has been saved.") + + +if __name__ == "__main__": + extract_kitti( + root="path/to/KITTI/raw_data", + depth_root="path/to/KITTI/data_depth_annotated/val", + saved_rgb_dir="./benchmark/datasets/", + saved_disp_dir="./benchmark/datasets/", + csv_save_path=f"./benchmark/datasets/KITTI.csv", + sample_len=-1, + datatset_name="KITTI", + start_frame=0, + end_frame=110, + ) diff --git a/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_nyu.py b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_nyu.py new file mode 100644 index 0000000000000000000000000000000000000000..f1286b244a55f8266e93f074d567d9b8e1baa29e --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_nyu.py @@ -0,0 +1,106 @@ +import os +import numpy as np +import os.path as osp +from PIL import Image +from tqdm import tqdm +import csv +import imageio + + +def _read_image(img_rel_path) -> np.ndarray: + image_to_read = img_rel_path + image = Image.open(image_to_read) + image = np.asarray(image) + return image + + +def depth_read(filename): + depth_in = _read_image(filename) + depth_decoded = depth_in / 1000.0 + return depth_decoded + + +def extract_nyu( + root, + depth_root, + csv_save_path="", + datatset_name="", + filename_ls_path="", + saved_rgb_dir="", + saved_disp_dir="", +): + with open(filename_ls_path, "r") as f: + filenames = [s.split() for s in f.readlines()] + + all_samples = [] + for i, pair_names in enumerate(tqdm(filenames)): + img_name = pair_names[0] + filled_depth_name = pair_names[2] + + im_path = osp.join(root, img_name) + depth_path = osp.join(depth_root, filled_depth_name) + + depth = depth_read(depth_path) + disp = depth + + video_depths = [disp] + video_imgs = [np.array(Image.open(im_path))] + + disp_video = np.array(video_depths)[:, None] + img_video = np.array(video_imgs)[..., 0:3] + + disp_video = disp_video[:, :, 45:471, 41:601] + img_video = img_video[:, 45:471, 41:601, :] + + data_root = saved_rgb_dir + datatset_name + disp_root = saved_disp_dir + datatset_name + os.makedirs(data_root, exist_ok=True) + os.makedirs(disp_root, exist_ok=True) + + img_video_dir = data_root + disp_video_dir = disp_root + + img_video_path = os.path.join(img_video_dir, f"{img_name[:-4]}_rgb_left.mp4") + disp_video_path = os.path.join(disp_video_dir, f"{img_name[:-4]}_disparity.npz") + + dir_name = os.path.dirname(img_video_path) + os.makedirs(dir_name, exist_ok=True) + dir_name = os.path.dirname(disp_video_path) + os.makedirs(dir_name, exist_ok=True) + + imageio.mimsave( + img_video_path, img_video, fps=15, quality=10, macro_block_size=1 + ) + np.savez(disp_video_path, disparity=disp_video) + + sample = {} + sample["filepath_left"] = os.path.join( + f"{datatset_name}/{img_name[:-4]}_rgb_left.mp4" + ) + sample["filepath_disparity"] = os.path.join( + f"{datatset_name}/{img_name[:-4]}_disparity.npz" + ) + + all_samples.append(sample) + + filename_ = csv_save_path + 
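+    # write the aggregate CSV (one single-frame "video" per NYUv2 test image)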
os.makedirs(os.path.dirname(filename_), exist_ok=True) + fields = ["filepath_left", "filepath_disparity"] + with open(filename_, "w") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fields) + writer.writeheader() + writer.writerows(all_samples) + + print(f"{filename_} has been saved.") + + +if __name__ == "__main__": + extract_nyu( + root="path/to/NYUv2/", + depth_root="path/to/NYUv2/", + filename_ls_path="path/to/NYUv2/filename_list_test.txt", + saved_rgb_dir="./benchmark/datasets/", + saved_disp_dir="./benchmark/datasets/", + csv_save_path=f"./benchmark/datasets/NYUv2.csv", + datatset_name="NYUv2", + ) diff --git a/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_scannet.py b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_scannet.py new file mode 100644 index 0000000000000000000000000000000000000000..8096bc3553d762ed6aa8227fbf39749214b47ac2 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_scannet.py @@ -0,0 +1,124 @@ +import os +import numpy as np +import os.path as osp +from PIL import Image +from tqdm import tqdm +import csv +import imageio + + +def _read_image(img_rel_path) -> np.ndarray: + image_to_read = img_rel_path + image = Image.open(image_to_read) # [H, W, rgb] + image = np.asarray(image) + return image + + +def depth_read(filename): + depth_in = _read_image(filename) + depth_decoded = depth_in / 1000.0 + return depth_decoded + + +def extract_scannet( + root, + sample_len=-1, + csv_save_path="", + datatset_name="", + scene_number=16, + scene_frames_len=120, + stride=1, + saved_rgb_dir="", + saved_disp_dir="", +): + scenes_names = os.listdir(root) + scenes_names = sorted(scenes_names)[:scene_number] + all_samples = [] + for i, seq_name in enumerate(tqdm(scenes_names)): + all_img_names = os.listdir(osp.join(root, seq_name, "color")) + all_img_names = [x for x in all_img_names if x.endswith(".jpg")] + all_img_names = sorted(all_img_names, key=lambda x: int(x.split(".")[0])) + all_img_names = all_img_names[:scene_frames_len:stride] + print(f"sequence frame number: {len(all_img_names)}") + + seq_len = len(all_img_names) + step = sample_len if sample_len > 0 else seq_len + + for ref_idx in range(0, seq_len, step): + print(f"Progress: {seq_name}, {ref_idx // step + 1} / {seq_len//step}") + + video_imgs = [] + video_depths = [] + + if (ref_idx + step) <= seq_len: + ref_e = ref_idx + step + else: + continue + + for idx in range(ref_idx, ref_e): + im_path = osp.join(root, seq_name, "color", all_img_names[idx]) + depth_path = osp.join( + root, seq_name, "depth", all_img_names[idx][:-3] + "png" + ) + + depth = depth_read(depth_path) + disp = depth + + video_depths.append(disp) + video_imgs.append(np.array(Image.open(im_path))) + + disp_video = np.array(video_depths)[:, None] + img_video = np.array(video_imgs)[..., 0:3] + + disp_video = disp_video[:, :, 8:-8, 11:-11] + img_video = img_video[:, 8:-8, 11:-11, :] + + data_root = saved_rgb_dir + datatset_name + disp_root = saved_disp_dir + datatset_name + os.makedirs(data_root, exist_ok=True) + os.makedirs(disp_root, exist_ok=True) + + img_video_dir = data_root + disp_video_dir = disp_root + + img_video_path = os.path.join(img_video_dir, f"{seq_name}_rgb_left.mp4") + disp_video_path = os.path.join(disp_video_dir, f"{seq_name}_disparity.npz") + + imageio.mimsave( + img_video_path, img_video, fps=15, quality=9, macro_block_size=1 + ) + np.savez(disp_video_path, disparity=disp_video) + + sample = {} + sample["filepath_left"] = os.path.join( + 
f"{datatset_name}/{seq_name}_rgb_left.mp4" + ) + sample["filepath_disparity"] = os.path.join( + f"{datatset_name}/{seq_name}_disparity.npz" + ) + + all_samples.append(sample) + + filename_ = csv_save_path + os.makedirs(os.path.dirname(filename_), exist_ok=True) + fields = ["filepath_left", "filepath_disparity"] + with open(filename_, "w") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fields) + writer.writeheader() + writer.writerows(all_samples) + + print(f"{filename_} has been saved.") + + +if __name__ == "__main__": + extract_scannet( + root="path/to/ScanNet_v2/raw/scans_test", + saved_rgb_dir="./benchmark/datasets/", + saved_disp_dir="./benchmark/datasets/", + csv_save_path=f"./benchmark/datasets/scannet.csv", + sample_len=-1, + datatset_name="scannet", + scene_number=100, + scene_frames_len=90 * 3, + stride=3, + ) diff --git a/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_sintel.py b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_sintel.py new file mode 100644 index 0000000000000000000000000000000000000000..5da97b39b4fd8653f79df0ae5cafa77983352ba0 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/dataset_extract/dataset_extract_sintel.py @@ -0,0 +1,137 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. +# # Data loading based on https://github.com/NVIDIA/flownet2-pytorch + + +import os +import numpy as np +import os.path as osp +from PIL import Image +from tqdm import tqdm +import csv +import imageio + + +# Check for endianness, based on Daniel Scharstein's optical flow code. +# Using little-endian architecture, these two should be equal. +TAG_FLOAT = 202021.25 +TAG_CHAR = "PIEH" + + +def depth_read(filename): + """Read depth data from file, return as numpy array.""" + f = open(filename, "rb") + check = np.fromfile(f, dtype=np.float32, count=1)[0] + assert ( + check == TAG_FLOAT + ), " depth_read:: Wrong tag in flow file (should be: {0}, is: {1}). Big-endian machine? 
".format( + TAG_FLOAT, check + ) + width = np.fromfile(f, dtype=np.int32, count=1)[0] + height = np.fromfile(f, dtype=np.int32, count=1)[0] + size = width * height + assert ( + width > 0 and height > 0 and size > 1 and size < 100000000 + ), " depth_read:: Wrong input size (width = {0}, height = {1}).".format( + width, height + ) + depth = np.fromfile(f, dtype=np.float32, count=-1).reshape((height, width)) + return depth + + +def extract_sintel( + root, + depth_root, + sample_len=-1, + csv_save_path="", + datatset_name="", + saved_rgb_dir="", + saved_disp_dir="", +): + scenes_names = os.listdir(root) + all_samples = [] + for i, seq_name in enumerate(tqdm(scenes_names)): + all_img_names = os.listdir(os.path.join(root, seq_name)) + all_img_names = [x for x in all_img_names if x.endswith(".png")] + all_img_names.sort() + all_img_names = sorted(all_img_names, key=lambda x: int(x.split(".")[0][-4:])) + + seq_len = len(all_img_names) + step = sample_len if sample_len > 0 else seq_len + + for ref_idx in range(0, seq_len, step): + print(f"Progress: {seq_name}, {ref_idx // step} / {seq_len // step}") + + video_imgs = [] + video_depths = [] + + if (ref_idx + step) <= seq_len: + ref_e = ref_idx + step + else: + continue + + for idx in range(ref_idx, ref_e): + im_path = osp.join(root, seq_name, all_img_names[idx]) + depth_path = osp.join( + depth_root, seq_name, all_img_names[idx][:-3] + "dpt" + ) + + depth = depth_read(depth_path) + disp = depth + + video_depths.append(disp) + video_imgs.append(np.array(Image.open(im_path))) + + disp_video = np.array(video_depths)[:, None] + img_video = np.array(video_imgs)[..., 0:3] + + data_root = saved_rgb_dir + datatset_name + disp_root = saved_disp_dir + datatset_name + os.makedirs(data_root, exist_ok=True) + os.makedirs(disp_root, exist_ok=True) + + img_video_dir = data_root + disp_video_dir = disp_root + + img_video_path = os.path.join(img_video_dir, f"{seq_name}_rgb_left.mp4") + disp_video_path = os.path.join(disp_video_dir, f"{seq_name}_disparity.npz") + + imageio.mimsave( + img_video_path, img_video, fps=15, quality=10, macro_block_size=1 + ) + np.savez(disp_video_path, disparity=disp_video) + + sample = {} + sample["filepath_left"] = os.path.join( + f"{datatset_name}/{seq_name}_rgb_left.mp4" + ) + sample["filepath_disparity"] = os.path.join( + f"{datatset_name}/{seq_name}_disparity.npz" + ) + + all_samples.append(sample) + + filename_ = csv_save_path + os.makedirs(os.path.dirname(filename_), exist_ok=True) + fields = ["filepath_left", "filepath_disparity"] + with open(filename_, "w") as csvfile: + writer = csv.DictWriter(csvfile, fieldnames=fields) + writer.writeheader() + writer.writerows(all_samples) + + print(f"{filename_} has been saved.") + + +if __name__ == "__main__": + extract_sintel( + root="path/to/Sintel-Depth/training_image/clean", + depth_root="path/to/Sintel-Depth/MPI-Sintel-depth-training-20150305/training/depth", + saved_rgb_dir="./benchmark/datasets/", + saved_disp_dir="./benchmark/datasets/", + csv_save_path=f"./benchmark/datasets/sintel.csv", + sample_len=-1, + datatset_name="sintel", + ) diff --git a/inference/v2v_data/DepthCrafter/benchmark/demo.sh b/inference/v2v_data/DepthCrafter/benchmark/demo.sh new file mode 100644 index 0000000000000000000000000000000000000000..7cc9f1f1b5b83f1cd3971604d21101bb1afdc19b --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/demo.sh @@ -0,0 +1,18 @@ +#!/bin/sh +set -x +set -e + +test_case=$1 +gpu_id=$2 +process_length=$3 +saved_root=$4 +saved_dataset_folder=$5 +overlap=$6 +dataset=$7 + 
+CUDA_VISIBLE_DEVICES=${gpu_id} PYTHONPATH=. python run.py \ + --video-path ${test_case} \ + --save-folder ${saved_root}/${saved_dataset_folder} \ + --process-length ${process_length} \ + --dataset ${dataset} \ + --overlap ${overlap} \ No newline at end of file diff --git a/inference/v2v_data/DepthCrafter/benchmark/eval/eval.py b/inference/v2v_data/DepthCrafter/benchmark/eval/eval.py new file mode 100644 index 0000000000000000000000000000000000000000..6ae6d20f915b6cd145f00071984d7744d29d86a8 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/eval/eval.py @@ -0,0 +1,282 @@ +import numpy as np +import os +import torch +import cv2 +import csv +from metric import * +import metric +import argparse +from tqdm import tqdm +import json + + +device = 'cuda' +eval_metrics = [ + "abs_relative_difference", + "rmse_linear", + "delta1_acc", + # "squared_relative_difference", + # "rmse_log", + # "log10", + # "delta2_acc", + # "delta3_acc", + # "i_rmse", + # "silog_rmse", +] + + +def depth2disparity(depth, return_mask=False): + if isinstance(depth, torch.Tensor): + disparity = torch.zeros_like(depth) + elif isinstance(depth, np.ndarray): + disparity = np.zeros_like(depth) + non_negtive_mask = depth > 0 + disparity[non_negtive_mask] = 1.0 / depth[non_negtive_mask] + if return_mask: + return disparity, non_negtive_mask + else: + return disparity + + +def resize_images(images, new_size): + resized_images = np.empty( + (images.shape[0], new_size[0], new_size[1], images.shape[3]) + ) + + for i, image in enumerate(images): + if image.shape[2]==1: + resized_images[i] = cv2.resize(image, (new_size[1], new_size[0]))[..., None] + else: + resized_images[i] = cv2.resize(image, (new_size[1], new_size[0])) + + return resized_images + + +def eval_single( + pred_disp_path, + gt_disp_path, + seq_len=98, + domain='depth', + method_type="ours", + dataset_max_depth="70" +): + # load data + gt_disp = np.load(gt_disp_path)['disparity'] \ + if 'disparity' in np.load(gt_disp_path).files else \ + np.load(gt_disp_path)['arr_0'] # (t, 1, h, w) + + if method_type=="ours": + pred_disp = np.load(pred_disp_path)['depth'] # (t, h, w) + if method_type=="depth_anything": + pred_disp = np.load(pred_disp_path)['disparity'] # (t, h, w) + + # seq_len + if pred_disp.shape[0] < seq_len: + seq_len = pred_disp.shape[0] + + # preprocess + pred_disp = resize_images(pred_disp[..., None], (gt_disp.shape[-2], gt_disp.shape[-1])) # (t, h, w) + pred_disp = pred_disp[..., 0] # (t, h, w) + pred_disp = pred_disp[:seq_len] + gt_disp = gt_disp[:seq_len, 0] # (t, h, w) + + # valid mask + valid_mask = np.logical_and( + (gt_disp > 1e-3), + (gt_disp < dataset_max_depth) + ) + pred_disp = np.clip(pred_disp, a_min=1e-3, a_max=None) + pred_disp_masked = pred_disp[valid_mask].reshape((-1, 1)) + + # choose evaluation domain + DOMAIN = domain + if DOMAIN=='disp': + # align in real disp, calc in disp + gt_disp_maksed = gt_disp[valid_mask].reshape((-1, 1)).astype(np.float64) + elif DOMAIN=='depth': + # align in disp = 1/depth, calc in depth + gt_disp_maksed = 1. 
/ (gt_disp[valid_mask].reshape((-1, 1)).astype(np.float64) + 1e-8) + else: + pass + + + # calc scale and shift + _ones = np.ones_like(pred_disp_masked) + A = np.concatenate([pred_disp_masked, _ones], axis=-1) + X = np.linalg.lstsq(A, gt_disp_maksed, rcond=None)[0] + scale, shift = X # gt = scale * pred + shift + + # align + aligned_pred = scale * pred_disp + shift + aligned_pred = np.clip(aligned_pred, a_min=1e-3, a_max=None) + + + # align in real disp, calc in disp + if DOMAIN=='disp': + pred_depth = aligned_pred + gt_depth = gt_disp + # align in disp = 1/depth, calc in depth + elif DOMAIN=='depth': + pred_depth = depth2disparity(aligned_pred) + gt_depth = gt_disp + else: + pass + + # metric evaluation, clip to dataset min max + pred_depth = np.clip( + pred_depth, a_min=1e-3, a_max=dataset_max_depth + ) + + # evaluate metric + sample_metric = [] + metric_funcs = [getattr(metric, _met) for _met in eval_metrics] + + # Evaluate + sample_metric = [] + pred_depth_ts = torch.from_numpy(pred_depth).to(device) + gt_depth_ts = torch.from_numpy(gt_depth).to(device) + valid_mask_ts = torch.from_numpy(valid_mask).to(device) + + n = valid_mask.sum((-1, -2)) + valid_frame = (n > 0) + pred_depth_ts = pred_depth_ts[valid_frame] + gt_depth_ts = gt_depth_ts[valid_frame] + valid_mask_ts = valid_mask_ts[valid_frame] + + for met_func in metric_funcs: + _metric_name = met_func.__name__ + _metric = met_func(pred_depth_ts, gt_depth_ts, valid_mask_ts).item() + sample_metric.append(_metric) + + return sample_metric + + + +if __name__=="__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--seq_len", + type=int, + default=50, + help="Max video frame length for evaluation." + ) + + parser.add_argument( + "--domain", + type=str, + default="depth", + choices=["depth", "disp"], + help="Domain of metric calculation." + ) + + parser.add_argument( + "--method_type", + type=str, + default="ours", + choices=["ours", "depth_anything"], + help="Choose the methods." + ) + + parser.add_argument( + "--dataset_max_depth", + type=int, + default=70, + help="Dataset max depth clip." + ) + + parser.add_argument( + "--pred_disp_root", + type=str, + default="./demo_output", + help="Predicted output directory." + ) + + parser.add_argument( + "--gt_disp_root", + type=str, + required=True, + help="GT depth directory." + ) + + parser.add_argument( + "--dataset", + type=str, + required=True, + help="Choose the datasets." + ) + + parser.add_argument( + "--meta_path", + type=str, + required=True, + help="Path of test dataset csv file." 
+ ) + + + args = parser.parse_args() + + SEQ_LEN = args.seq_len + method_type = args.method_type + if method_type == "ours": + pred_disp_root = os.path.join(args.pred_disp_root, f'results_{args.dataset}') + else: + # pred_disp_root = args.pred_disp_root + pred_disp_root = os.path.join(args.pred_disp_root, f'results_{args.dataset}') + domain = args.domain + dataset_max_depth = args.dataset_max_depth + saved_json_path = os.path.join(args.pred_disp_root, f"results_{args.dataset}.json") + + meta_path = args.meta_path + + assert method_type in ["depth_anything", "ours"], "Invalid method type, must be in ['depth_anything', 'ours']" + assert domain in ["depth", "disp"], "Invalid domain type, must be in ['depth', 'disp']" + + with open(meta_path, mode="r", encoding="utf-8") as csvfile: + csv_reader = csv.DictReader(csvfile) + samples = list(csv_reader) + + # iterate all cases + results_all = [] + for i, sample in enumerate(tqdm(samples)): + gt_disp_path = os.path.join(args.gt_disp_root, samples[i]['filepath_disparity']) + if method_type=="ours": + pred_disp_path = os.path.join(pred_disp_root, samples[i]['filepath_disparity']) + pred_disp_path = pred_disp_path.replace("disparity", "rgb_left") + + if method_type=="depth_anything": + pred_disp_path = os.path.join(pred_disp_root, samples[i]['filepath_disparity']) + pred_disp_path = pred_disp_path.replace("disparity", "rgb_left_depth") + + results_single = eval_single( + pred_disp_path, + gt_disp_path, + seq_len=SEQ_LEN, + domain=domain, + method_type=method_type, + dataset_max_depth=dataset_max_depth + ) + + results_all.append(results_single) + + # avarage + final_results = np.array(results_all) + final_results_mean = np.mean(final_results, axis=0) + print("") + + # save mean to json + result_dict = { 'name': method_type } + for i, metric in enumerate(eval_metrics): + result_dict[metric] = final_results_mean[i] + print(f"{metric}: {final_results_mean[i]:04f}") + + # save each case to json + for i, results in enumerate(results_all): + result_dict[samples[i]['filepath_disparity']] = results + + # write json + with open(saved_json_path, 'w') as f: + json.dump(result_dict, f, indent=4) + print("") + print(f"Evaluation results json are saved to {saved_json_path}") + diff --git a/inference/v2v_data/DepthCrafter/benchmark/eval/eval.sh b/inference/v2v_data/DepthCrafter/benchmark/eval/eval.sh new file mode 100644 index 0000000000000000000000000000000000000000..ddf051b2ac6f1797573d4fb84c81cc804bcefee6 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/eval/eval.sh @@ -0,0 +1,51 @@ +#!/bin/sh +set -x +set -e + +pred_disp_root=/path/to/saved/root_directory # The parent directory that contaning [sintel, scannet, KITTI, bonn, NYUv2] prediction +gt_disp_root=/path/to/gt_depth/root_directory # The parent directory that contaning [sintel, scannet, KITTI, bonn, NYUv2] ground truth + +# eval sintel +python benchmark/eval/eval.py \ + --meta_path ./eval/csv/meta_sintel.csv \ + --dataset_max_depth 70 \ + --dataset sintel \ + --seq_len 50 \ + --pred_disp_root ${pred_disp_root} \ + --gt_disp_root ${gt_disp_root} \ + +# eval scannet +python benchmark/eval/eval.py \ + --meta_path ./eval/csv/meta_scannet_test.csv \ + --dataset_max_depth 10 \ + --dataset scannet \ + --seq_len 90 \ + --pred_disp_root ${pred_disp_root} \ + --gt_disp_root ${gt_disp_root} \ + +# eval kitti +python benchmark/eval/eval.py \ + --meta_path ./eval/csv/meta_kitti_val.csv \ + --dataset_max_depth 80 \ + --dataset kitti \ + --seq_len 110 \ + --pred_disp_root ${pred_disp_root} \ + --gt_disp_root 
${gt_disp_root} \ + +# eval bonn +python benchmark/eval/eval.py \ + --meta_path ./eval/csv/meta_bonn.csv \ + --dataset_max_depth 10 \ + --dataset bonn \ + --seq_len 110 \ + --pred_disp_root ${pred_disp_root} \ + --gt_disp_root ${gt_disp_root} \ + +# eval nyu +python benchmark/eval/eval.py \ + --meta_path ./eval/csv/meta_nyu_test.csv \ + --dataset_max_depth 10 \ + --dataset nyu \ + --seq_len 1 \ + --pred_disp_root ${pred_disp_root} \ + --gt_disp_root ${gt_disp_root} \ diff --git a/inference/v2v_data/DepthCrafter/benchmark/eval/metric.py b/inference/v2v_data/DepthCrafter/benchmark/eval/metric.py new file mode 100644 index 0000000000000000000000000000000000000000..bec934cdf750847cd1458dd9a632c43d63b31372 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/eval/metric.py @@ -0,0 +1,128 @@ +import torch + + +def abs_relative_difference(output, target, valid_mask=None): + actual_output = output + actual_target = target + abs_relative_diff = torch.abs(actual_output - actual_target) / actual_target + if valid_mask is not None: + abs_relative_diff[~valid_mask] = 0 + n = valid_mask.sum((-1, -2)) + else: + n = output.shape[-1] * output.shape[-2] + abs_relative_diff = torch.sum(abs_relative_diff, (-1, -2)) / n + return abs_relative_diff.mean() + + +def squared_relative_difference(output, target, valid_mask=None): + actual_output = output + actual_target = target + square_relative_diff = ( + torch.pow(torch.abs(actual_output - actual_target), 2) / actual_target + ) + if valid_mask is not None: + square_relative_diff[~valid_mask] = 0 + n = valid_mask.sum((-1, -2)) + else: + n = output.shape[-1] * output.shape[-2] + square_relative_diff = torch.sum(square_relative_diff, (-1, -2)) / n + return square_relative_diff.mean() + + +def rmse_linear(output, target, valid_mask=None): + actual_output = output + actual_target = target + diff = actual_output - actual_target + if valid_mask is not None: + diff[~valid_mask] = 0 + n = valid_mask.sum((-1, -2)) + else: + n = output.shape[-1] * output.shape[-2] + diff2 = torch.pow(diff, 2) + mse = torch.sum(diff2, (-1, -2)) / n + rmse = torch.sqrt(mse) + return rmse.mean() + + +def rmse_log(output, target, valid_mask=None): + diff = torch.log(output) - torch.log(target) + if valid_mask is not None: + diff[~valid_mask] = 0 + n = valid_mask.sum((-1, -2)) + else: + n = output.shape[-1] * output.shape[-2] + diff2 = torch.pow(diff, 2) + mse = torch.sum(diff2, (-1, -2)) / n # [B] + rmse = torch.sqrt(mse) + return rmse.mean() + + +def log10(output, target, valid_mask=None): + if valid_mask is not None: + diff = torch.abs( + torch.log10(output[valid_mask]) - torch.log10(target[valid_mask]) + ) + else: + diff = torch.abs(torch.log10(output) - torch.log10(target)) + return diff.mean() + + +# adapt from: https://github.com/imran3180/depth-map-prediction/blob/master/main.py +def threshold_percentage(output, target, threshold_val, valid_mask=None): + d1 = output / target + d2 = target / output + max_d1_d2 = torch.max(d1, d2) + zero = torch.zeros(*output.shape) + one = torch.ones(*output.shape) + bit_mat = torch.where(max_d1_d2.cpu() < threshold_val, one, zero) + if valid_mask is not None: + bit_mat[~valid_mask] = 0 + n = valid_mask.sum((-1, -2)) + else: + n = output.shape[-1] * output.shape[-2] + count_mat = torch.sum(bit_mat, (-1, -2)) + threshold_mat = count_mat / n.cpu() + return threshold_mat.mean() + + +def delta1_acc(pred, gt, valid_mask): + return threshold_percentage(pred, gt, 1.25, valid_mask) + + +def delta2_acc(pred, gt, valid_mask): + return 
threshold_percentage(pred, gt, 1.25**2, valid_mask) + + +def delta3_acc(pred, gt, valid_mask): + return threshold_percentage(pred, gt, 1.25**3, valid_mask) + + +def i_rmse(output, target, valid_mask=None): + output_inv = 1.0 / output + target_inv = 1.0 / target + diff = output_inv - target_inv + if valid_mask is not None: + diff[~valid_mask] = 0 + n = valid_mask.sum((-1, -2)) + else: + n = output.shape[-1] * output.shape[-2] + diff2 = torch.pow(diff, 2) + mse = torch.sum(diff2, (-1, -2)) / n # [B] + rmse = torch.sqrt(mse) + return rmse.mean() + + +def silog_rmse(depth_pred, depth_gt, valid_mask=None): + diff = torch.log(depth_pred) - torch.log(depth_gt) + if valid_mask is not None: + diff[~valid_mask] = 0 + n = valid_mask.sum((-1, -2)) + else: + n = depth_gt.shape[-2] * depth_gt.shape[-1] + + diff2 = torch.pow(diff, 2) + + first_term = torch.sum(diff2, (-1, -2)) / n + second_term = torch.pow(torch.sum(diff, (-1, -2)), 2) / (n**2) + loss = torch.sqrt(torch.mean(first_term - second_term)) * 100 + return loss diff --git a/inference/v2v_data/DepthCrafter/benchmark/infer/infer.sh b/inference/v2v_data/DepthCrafter/benchmark/infer/infer.sh new file mode 100644 index 0000000000000000000000000000000000000000..2079169308823ba9c8a7c8eb2d5aa59c93818c3d --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/infer/infer.sh @@ -0,0 +1,59 @@ +#!/bin/sh +set -x +set -e + +input_rgb_root=/path/to/input/RGB/root_directory # The parent directory that contaning [sintel, scannet, KITTI, bonn, NYUv2] input RGB +saved_root=/path/to/saved/root_directory # The parent directory that saving [sintel, scannet, KITTI, bonn, NYUv2] prediction +gpus=0,1,2,3 # Using 4 GPU, you can adjust it according to your device + + +# infer sintel +python benchmark/infer/infer_batch.py \ + --meta_path ./eval/csv/meta_sintel.csv \ + --saved_root ${saved_root} \ + --saved_dataset_folder results_sintel \ + --input_rgb_root ${input_rgb_root} \ + --process_length 50 \ + --gpus ${gpus} \ + --dataset sintel \ + +# infer scannet +python benchmark/infer/infer_batch.py \ + --meta_path ./eval/csv/meta_scannet_test.csv \ + --saved_root ${saved_root} \ + --saved_dataset_folder results_scannet \ + --input_rgb_root ${input_rgb_root} \ + --process_length 90 \ + --gpus ${gpus} \ + --dataset scannet \ + +# infer kitti +python benchmark/infer/infer_batch.py \ + --meta_path ./eval/csv/meta_kitti_val.csv \ + --saved_root ${saved_root} \ + --saved_dataset_folder results_kitti \ + --input_rgb_root ${input_rgb_root} \ + --process_length 110 \ + --gpus ${gpus} \ + --dataset kitti \ + +# infer bonn +python benchmark/infer/infer_batch.py \ + --meta_path ./eval/csv/meta_bonn.csv \ + --saved_root ${saved_root} \ + --saved_dataset_folder results_bonn \ + --input_rgb_root ${input_rgb_root} \ + --process_length 110 \ + --gpus ${gpus} \ + --dataset bonn \ + +# infer nyu +python benchmark/infer/infer_batch.py \ + --meta_path ./eval/csv/meta_nyu_test.csv \ + --saved_root ${saved_root} \ + --saved_dataset_folder results_nyu \ + --input_rgb_root ${input_rgb_root} \ + --process_length 1 \ + --gpus ${gpus} \ + --overlap 0 \ + --dataset nyu \ diff --git a/inference/v2v_data/DepthCrafter/benchmark/infer/infer_batch.py b/inference/v2v_data/DepthCrafter/benchmark/infer/infer_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..72a74ebab4c5ee6c67574070e789360c56e99e0a --- /dev/null +++ b/inference/v2v_data/DepthCrafter/benchmark/infer/infer_batch.py @@ -0,0 +1,46 @@ +import os +import multiprocessing as mp +import csv +import argparse + + +def 
process_video(video_path, gpu_id, save_folder, args): + os.system(f'sh ./benchmark/demo.sh {video_path} {gpu_id} {int(args.process_length)} {args.saved_root} {save_folder} {args.overlap} {args.dataset}') + +if __name__ == '__main__': + + parser = argparse.ArgumentParser() + + parser.add_argument('--meta_path', type=str) + parser.add_argument('--saved_dataset_folder', type=str) + parser.add_argument('--saved_root', type=str, default="./output") + parser.add_argument('--input_rgb_root', type=str) + + parser.add_argument('--process_length', type=int, default=110) + parser.add_argument('--gpus', type=str, default="0,1,2,3") + + parser.add_argument('--overlap', type=int, default=1) + parser.add_argument('--dataset', type=str, default="open") + + args = parser.parse_args() + gpus = args.gpus.strip().split(',') + + with open(args.meta_path, mode="r", encoding="utf-8") as csvfile: + csv_reader = csv.DictReader(csvfile) + test_samples = list(csv_reader) + batch_size = len(gpus) + video_batches = [test_samples[i:i+batch_size] for i in range(0, len(test_samples), batch_size)] + print("gpus+++: ", gpus) + + processes = [] + for video_batch in video_batches: + for i, sample in enumerate(video_batch): + video_path = os.path.join(args.input_rgb_root, sample["filepath_left"]) + save_folder = os.path.join(args.saved_dataset_folder, os.path.dirname(sample["filepath_left"])) + gpu_id = gpus[i % len(gpus)] + p = mp.Process(target=process_video, args=(video_path, gpu_id, save_folder, args)) + p.start() + processes.append(p) + + for p in processes: + p.join() \ No newline at end of file diff --git a/inference/v2v_data/DepthCrafter/depthcrafter/__init__.py b/inference/v2v_data/DepthCrafter/depthcrafter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/inference/v2v_data/DepthCrafter/depthcrafter/depth_crafter_ppl.py b/inference/v2v_data/DepthCrafter/depthcrafter/depth_crafter_ppl.py new file mode 100644 index 0000000000000000000000000000000000000000..b7d070d496aec9d1217aac83625878f0159a4ca2 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/depthcrafter/depth_crafter_ppl.py @@ -0,0 +1,366 @@ +from typing import Callable, Dict, List, Optional, Union + +import numpy as np +import torch + +from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import ( + _resize_with_antialiasing, + StableVideoDiffusionPipelineOutput, + StableVideoDiffusionPipeline, + retrieve_timesteps, +) +from diffusers.utils import logging +from diffusers.utils.torch_utils import randn_tensor + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +class DepthCrafterPipeline(StableVideoDiffusionPipeline): + + @torch.inference_mode() + def encode_video( + self, + video: torch.Tensor, + chunk_size: int = 14, + ) -> torch.Tensor: + """ + :param video: [b, c, h, w] in range [-1, 1], the b may contain multiple videos or frames + :param chunk_size: the chunk size to encode video + :return: image_embeddings in shape of [b, 1024] + """ + + video_224 = _resize_with_antialiasing(video.float(), (224, 224)) + video_224 = (video_224 + 1.0) / 2.0 # [-1, 1] -> [0, 1] + + embeddings = [] + for i in range(0, video_224.shape[0], chunk_size): + tmp = self.feature_extractor( + images=video_224[i : i + chunk_size], + do_normalize=True, + do_center_crop=False, + do_resize=False, + do_rescale=False, + return_tensors="pt", + ).pixel_values.to(video.device, dtype=video.dtype) + embeddings.append(self.image_encoder(tmp).image_embeds) # [b, 
1024] + + embeddings = torch.cat(embeddings, dim=0) # [t, 1024] + return embeddings + + @torch.inference_mode() + def encode_vae_video( + self, + video: torch.Tensor, + chunk_size: int = 14, + ): + """ + :param video: [b, c, h, w] in range [-1, 1], the b may contain multiple videos or frames + :param chunk_size: the chunk size to encode video + :return: vae latents in shape of [b, c, h, w] + """ + video_latents = [] + for i in range(0, video.shape[0], chunk_size): + video_latents.append( + self.vae.encode(video[i : i + chunk_size]).latent_dist.mode() + ) + video_latents = torch.cat(video_latents, dim=0) + return video_latents + + @staticmethod + def check_inputs(video, height, width): + """ + :param video: + :param height: + :param width: + :return: + """ + if not isinstance(video, torch.Tensor) and not isinstance(video, np.ndarray): + raise ValueError( + f"Expected `video` to be a `torch.Tensor` or `VideoReader`, but got a {type(video)}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError( + f"`height` and `width` have to be divisible by 8 but are {height} and {width}." + ) + + @torch.no_grad() + def __call__( + self, + video: Union[np.ndarray, torch.Tensor], + height: int = 576, + width: int = 1024, + num_inference_steps: int = 25, + guidance_scale: float = 1.0, + window_size: Optional[int] = 110, + noise_aug_strength: float = 0.02, + decode_chunk_size: Optional[int] = None, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + return_dict: bool = True, + overlap: int = 25, + track_time: bool = False, + ): + """ + :param video: in shape [t, h, w, c] if np.ndarray or [t, c, h, w] if torch.Tensor, in range [0, 1] + :param height: + :param width: + :param num_inference_steps: + :param guidance_scale: + :param window_size: sliding window processing size + :param fps: + :param motion_bucket_id: + :param noise_aug_strength: + :param decode_chunk_size: + :param generator: + :param latents: + :param output_type: + :param callback_on_step_end: + :param callback_on_step_end_tensor_inputs: + :param return_dict: + :return: + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + num_frames = video.shape[0] + decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else 8 + if num_frames <= window_size: + window_size = num_frames + overlap = 0 + stride = window_size - overlap + + # 1. Check inputs. Raise error if not correct + self.check_inputs(video, height, width) + + # 2. Define call parameters + batch_size = 1 + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + self._guidance_scale = guidance_scale + + # 3. 
Encode input video + if isinstance(video, np.ndarray): + video = torch.from_numpy(video.transpose(0, 3, 1, 2)) + else: + assert isinstance(video, torch.Tensor) + video = video.to(device=device, dtype=self.dtype) + video = video * 2.0 - 1.0 # [0,1] -> [-1,1], in [t, c, h, w] + + if track_time: + start_event = torch.cuda.Event(enable_timing=True) + encode_event = torch.cuda.Event(enable_timing=True) + denoise_event = torch.cuda.Event(enable_timing=True) + decode_event = torch.cuda.Event(enable_timing=True) + start_event.record() + + video_embeddings = self.encode_video( + video, chunk_size=decode_chunk_size + ).unsqueeze( + 0 + ) # [1, t, 1024] + torch.cuda.empty_cache() + # 4. Encode input image using VAE + noise = randn_tensor( + video.shape, generator=generator, device=device, dtype=video.dtype + ) + video = video + noise_aug_strength * noise # in [t, c, h, w] + + # pdb.set_trace() + needs_upcasting = ( + self.vae.dtype == torch.float16 and self.vae.config.force_upcast + ) + if needs_upcasting: + self.vae.to(dtype=torch.float32) + + video_latents = self.encode_vae_video( + video.to(self.vae.dtype), + chunk_size=decode_chunk_size, + ).unsqueeze( + 0 + ) # [1, t, c, h, w] + + if track_time: + encode_event.record() + torch.cuda.synchronize() + elapsed_time_ms = start_event.elapsed_time(encode_event) + print(f"Elapsed time for encoding video: {elapsed_time_ms} ms") + + torch.cuda.empty_cache() + + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + + # 5. Get Added Time IDs + added_time_ids = self._get_add_time_ids( + 7, + 127, + noise_aug_strength, + video_embeddings.dtype, + batch_size, + 1, + False, + ) # [1 or 2, 3] + added_time_ids = added_time_ids.to(device) + + # 6. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, None, None + ) + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + + # 7. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents_init = self.prepare_latents( + batch_size, + window_size, + num_channels_latents, + height, + width, + video_embeddings.dtype, + device, + generator, + latents, + ) # [1, t, c, h, w] + latents_all = None + + idx_start = 0 + if overlap > 0: + weights = torch.linspace(0, 1, overlap, device=device) + weights = weights.view(1, overlap, 1, 1, 1) + else: + weights = None + + torch.cuda.empty_cache() + + # inference strategy for long videos + # two main strategies: 1. noise init from previous frame, 2. segments stitching + while idx_start < num_frames - overlap: + idx_end = min(idx_start + window_size, num_frames) + self.scheduler.set_timesteps(num_inference_steps, device=device) + + # 9. 
Denoising loop + latents = latents_init[:, : idx_end - idx_start].clone() + latents_init = torch.cat( + [latents_init[:, -overlap:], latents_init[:, :stride]], dim=1 + ) + + video_latents_current = video_latents[:, idx_start:idx_end] + video_embeddings_current = video_embeddings[:, idx_start:idx_end] + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + if latents_all is not None and i == 0: + latents[:, :overlap] = ( + latents_all[:, -overlap:] + + latents[:, :overlap] + / self.scheduler.init_noise_sigma + * self.scheduler.sigmas[i] + ) + + latent_model_input = latents # [1, t, c, h, w] + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t + ) # [1, t, c, h, w] + latent_model_input = torch.cat( + [latent_model_input, video_latents_current], dim=2 + ) + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=video_embeddings_current, + added_time_ids=added_time_ids, + return_dict=False, + )[0] + # perform guidance + if self.do_classifier_free_guidance: + latent_model_input = latents + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t + ) + latent_model_input = torch.cat( + [latent_model_input, torch.zeros_like(latent_model_input)], + dim=2, + ) + noise_pred_uncond = self.unet( + latent_model_input, + t, + encoder_hidden_states=torch.zeros_like( + video_embeddings_current + ), + added_time_ids=added_time_ids, + return_dict=False, + )[0] + + noise_pred = noise_pred_uncond + self.guidance_scale * ( + noise_pred - noise_pred_uncond + ) + latents = self.scheduler.step(noise_pred, t, latents).prev_sample + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end( + self, i, t, callback_kwargs + ) + + latents = callback_outputs.pop("latents", latents) + + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps + and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + + if latents_all is None: + latents_all = latents.clone() + else: + assert weights is not None + # latents_all[:, -overlap:] = ( + # latents[:, :overlap] + latents_all[:, -overlap:] + # ) / 2.0 + latents_all[:, -overlap:] = latents[ + :, :overlap + ] * weights + latents_all[:, -overlap:] * (1 - weights) + latents_all = torch.cat([latents_all, latents[:, overlap:]], dim=1) + + idx_start += stride + + if track_time: + denoise_event.record() + torch.cuda.synchronize() + elapsed_time_ms = encode_event.elapsed_time(denoise_event) + print(f"Elapsed time for denoising video: {elapsed_time_ms} ms") + + if not output_type == "latent": + # cast back to fp16 if needed + if needs_upcasting: + self.vae.to(dtype=torch.float16) + frames = self.decode_latents(latents_all, num_frames, decode_chunk_size) + + if track_time: + decode_event.record() + torch.cuda.synchronize() + elapsed_time_ms = denoise_event.elapsed_time(decode_event) + print(f"Elapsed time for decoding video: {elapsed_time_ms} ms") + + frames = self.video_processor.postprocess_video( + video=frames, output_type=output_type + ) + else: + frames = latents_all + + self.maybe_free_model_hooks() + + if not return_dict: + return frames + + return StableVideoDiffusionPipelineOutput(frames=frames) diff --git a/inference/v2v_data/DepthCrafter/depthcrafter/unet.py b/inference/v2v_data/DepthCrafter/depthcrafter/unet.py new file mode 100644 index 
0000000000000000000000000000000000000000..0066a71c7a054d2e729f45baacc3a223276c1f44 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/depthcrafter/unet.py @@ -0,0 +1,142 @@ +from typing import Union, Tuple + +import torch +from diffusers import UNetSpatioTemporalConditionModel +from diffusers.models.unets.unet_spatio_temporal_condition import UNetSpatioTemporalConditionOutput + + +class DiffusersUNetSpatioTemporalConditionModelDepthCrafter( + UNetSpatioTemporalConditionModel +): + + def forward( + self, + sample: torch.Tensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + added_time_ids: torch.Tensor, + return_dict: bool = True, + ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]: + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + batch_size, num_frames = sample.shape[:2] + timesteps = timesteps.expand(batch_size) + + t_emb = self.time_proj(timesteps) + + # `Timesteps` does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=self.conv_in.weight.dtype) + + emb = self.time_embedding(t_emb) # [batch_size * num_frames, channels] + + time_embeds = self.add_time_proj(added_time_ids.flatten()) + time_embeds = time_embeds.reshape((batch_size, -1)) + time_embeds = time_embeds.to(emb.dtype) + aug_emb = self.add_embedding(time_embeds) + emb = emb + aug_emb + + # Flatten the batch and frames dimensions + # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width] + sample = sample.flatten(0, 1) + # Repeat the embeddings num_video_frames times + # emb: [batch, channels] -> [batch * frames, channels] + emb = emb.repeat_interleave(num_frames, dim=0) + # encoder_hidden_states: [batch, frames, channels] -> [batch * frames, 1, channels] + encoder_hidden_states = encoder_hidden_states.flatten(0, 1).unsqueeze(1) + + # 2. pre-process + sample = sample.to(dtype=self.conv_in.weight.dtype) + assert sample.dtype == self.conv_in.weight.dtype, ( + f"sample.dtype: {sample.dtype}, " + f"self.conv_in.weight.dtype: {self.conv_in.weight.dtype}" + ) + sample = self.conv_in(sample) + + image_only_indicator = torch.zeros( + batch_size, num_frames, dtype=sample.dtype, device=sample.device + ) + + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if ( + hasattr(downsample_block, "has_cross_attention") + and downsample_block.has_cross_attention + ): + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + ) + + else: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + image_only_indicator=image_only_indicator, + ) + + down_block_res_samples += res_samples + + # 4. 
mid + sample = self.mid_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + ) + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[ + : -len(upsample_block.resnets) + ] + + if ( + hasattr(upsample_block, "has_cross_attention") + and upsample_block.has_cross_attention + ): + sample = upsample_block( + hidden_states=sample, + res_hidden_states_tuple=res_samples, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + image_only_indicator=image_only_indicator, + ) + else: + sample = upsample_block( + hidden_states=sample, + res_hidden_states_tuple=res_samples, + temb=emb, + image_only_indicator=image_only_indicator, + ) + + # 6. post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + # 7. Reshape back to original shape + sample = sample.reshape(batch_size, num_frames, *sample.shape[1:]) + + if not return_dict: + return (sample,) + + return UNetSpatioTemporalConditionOutput(sample=sample) diff --git a/inference/v2v_data/DepthCrafter/depthcrafter/utils.py b/inference/v2v_data/DepthCrafter/depthcrafter/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2ac50e8dae660b3998eb4532b88eb7bb9c460ae3 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/depthcrafter/utils.py @@ -0,0 +1,96 @@ +from typing import Union, List +import tempfile +import numpy as np +import PIL.Image +import matplotlib.cm as cm +import mediapy +import torch +from decord import VideoReader, cpu + +dataset_res_dict = { + "sintel": [448, 1024], + "scannet": [640, 832], + "KITTI": [384, 1280], + "bonn": [512, 640], + "NYUv2": [448, 640], +} + + +def read_video_frames(video_path, process_length, target_fps, max_res, dataset="open"): + if dataset == "open": + print("==> processing video: ", video_path) + vid = VideoReader(video_path, ctx=cpu(0)) + print("==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:])) + original_height, original_width = vid.get_batch([0]).shape[1:3] + height = round(original_height / 64) * 64 + width = round(original_width / 64) * 64 + if max(height, width) > max_res: + scale = max_res / max(original_height, original_width) + height = round(original_height * scale / 64) * 64 + width = round(original_width * scale / 64) * 64 + else: + height = dataset_res_dict[dataset][0] + width = dataset_res_dict[dataset][1] + + vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height) + + fps = vid.get_avg_fps() if target_fps == -1 else target_fps + stride = round(vid.get_avg_fps() / fps) + stride = max(stride, 1) + frames_idx = list(range(0, len(vid), stride)) + print( + f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}" + ) + if process_length != -1 and process_length < len(frames_idx): + frames_idx = frames_idx[:process_length] + print( + f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}" + ) + frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0 + + return frames, fps + + +def save_video( + video_frames: Union[List[np.ndarray], List[PIL.Image.Image]], + output_video_path: str = None, + fps: int = 10, + crf: int = 18, +) -> str: + if output_video_path is None: + output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name + + if isinstance(video_frames[0], np.ndarray): + video_frames = 
[(frame * 255).astype(np.uint8) for frame in video_frames] + + elif isinstance(video_frames[0], PIL.Image.Image): + video_frames = [np.array(frame) for frame in video_frames] + mediapy.write_video(output_video_path, video_frames, fps=fps, crf=crf) + return output_video_path + + +class ColorMapper: + # a color mapper to map depth values to a certain colormap + def __init__(self, colormap: str = "inferno"): + self.colormap = torch.tensor(cm.get_cmap(colormap).colors) + + def apply(self, image: torch.Tensor, v_min=None, v_max=None): + # assert len(image.shape) == 2 + if v_min is None: + v_min = image.min() + if v_max is None: + v_max = image.max() + image = (image - v_min) / (v_max - v_min) + image = (image * 255).long() + image = self.colormap[image] + return image + + +def vis_sequence_depth(depths: np.ndarray, v_min=None, v_max=None): + visualizer = ColorMapper() + if v_min is None: + v_min = depths.min() + if v_max is None: + v_max = depths.max() + res = visualizer.apply(torch.tensor(depths), v_min=v_min, v_max=v_max).numpy() + return res diff --git a/inference/v2v_data/DepthCrafter/requirements.txt b/inference/v2v_data/DepthCrafter/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c696daaf87e909e98b86ae36f909d34f5308cf5 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/requirements.txt @@ -0,0 +1,11 @@ +torch==2.0.1 +diffusers==0.29.1 +numpy==1.26.4 +matplotlib==3.8.4 +transformers==4.41.2 +accelerate==0.30.1 +xformers==0.0.20 +mediapy==1.2.0 +fire==0.6.0 +decord==0.6.0 +OpenEXR==3.2.4 diff --git a/inference/v2v_data/DepthCrafter/run.py b/inference/v2v_data/DepthCrafter/run.py new file mode 100644 index 0000000000000000000000000000000000000000..7279cf108a49cd82c0c4412ea6e3c25290c7d455 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/run.py @@ -0,0 +1,209 @@ +import gc +import os +import numpy as np +import torch + +from diffusers.training_utils import set_seed +from fire import Fire + +from depthcrafter.depth_crafter_ppl import DepthCrafterPipeline +from depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter +from depthcrafter.utils import vis_sequence_depth, save_video, read_video_frames + + +class DepthCrafterDemo: + def __init__( + self, + unet_path: str, + pre_train_path: str, + cpu_offload: str = "model", + ): + unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained( + unet_path, + low_cpu_mem_usage=True, + torch_dtype=torch.float16, + ) + # load weights of other components from the provided checkpoint + self.pipe = DepthCrafterPipeline.from_pretrained( + pre_train_path, + unet=unet, + torch_dtype=torch.float16, + variant="fp16", + ) + + # for saving memory, we can offload the model to CPU, or even run the model sequentially to save more memory + if cpu_offload is not None: + if cpu_offload == "sequential": + # This will slow, but save more memory + self.pipe.enable_sequential_cpu_offload() + elif cpu_offload == "model": + self.pipe.enable_model_cpu_offload() + else: + raise ValueError(f"Unknown cpu offload option: {cpu_offload}") + else: + self.pipe.to("cuda") + # enable attention slicing and xformers memory efficient attention + try: + self.pipe.enable_xformers_memory_efficient_attention() + except Exception as e: + print(e) + print("Xformers is not enabled") + self.pipe.enable_attention_slicing() + + def infer( + self, + video: str, + num_denoising_steps: int, + guidance_scale: float, + save_folder: str = "./demo_output", + window_size: int = 110, + process_length: int = 195, + overlap: int = 
25, + max_res: int = 1024, + dataset: str = "open", + target_fps: int = 15, + seed: int = 42, + track_time: bool = True, + save_npz: bool = False, + save_exr: bool = False, + ): + set_seed(seed) + + frames, target_fps = read_video_frames( + video, + process_length, + target_fps, + max_res, + dataset, + ) + # inference the depth map using the DepthCrafter pipeline + with torch.inference_mode(): + res = self.pipe( + frames, + height=frames.shape[1], + width=frames.shape[2], + output_type="np", + guidance_scale=guidance_scale, + num_inference_steps=num_denoising_steps, + window_size=window_size, + overlap=overlap, + track_time=track_time, + ).frames[0] + # convert the three-channel output to a single channel depth map + res = res.sum(-1) / res.shape[-1] + # normalize the depth map to [0, 1] across the whole video + res = (res - res.min()) / (res.max() - res.min()) + # visualize the depth map and save the results + vis = vis_sequence_depth(res) + # save the depth map and visualization with the target FPS + save_path = os.path.join( + save_folder, os.path.splitext(os.path.basename(video))[0] + ) + os.makedirs(os.path.dirname(save_path), exist_ok=True) + save_video(res, save_path + "_depth.mp4", fps=target_fps) + save_video(vis, save_path + "_vis.mp4", fps=target_fps) + save_video(frames, save_path + "_input.mp4", fps=target_fps) + if save_npz: + np.savez_compressed(save_path + ".npz", depth=res) + if save_exr: + import OpenEXR + import Imath + + os.makedirs(save_path, exist_ok=True) + print(f"==> saving EXR results to {save_path}") + # Iterate over each frame and save as a separate EXR file + for i, frame in enumerate(res): + output_exr = f"{save_path}/frame_{i:04d}.exr" + + # Prepare EXR header for each frame + header = OpenEXR.Header(frame.shape[1], frame.shape[0]) + header["channels"] = { + "Z": Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT)) + } + + # Create EXR file and write the frame + exr_file = OpenEXR.OutputFile(output_exr, header) + exr_file.writePixels({"Z": frame.tobytes()}) + exr_file.close() + + return [ + save_path + "_input.mp4", + save_path + "_vis.mp4", + save_path + "_depth.mp4", + ] + + def run( + self, + input_video, + num_denoising_steps, + guidance_scale, + max_res=1024, + process_length=195, + ): + res_path = self.infer( + input_video, + num_denoising_steps, + guidance_scale, + max_res=max_res, + process_length=process_length, + ) + # clear the cache for the next video + gc.collect() + torch.cuda.empty_cache() + return res_path[:2] + + +def main( + video_path: str, + save_folder: str = "./demo_output", + unet_path: str = "tencent/DepthCrafter", + pre_train_path: str = "stabilityai/stable-video-diffusion-img2vid-xt", + process_length: int = -1, + cpu_offload: str = "model", + target_fps: int = -1, + seed: int = 42, + num_inference_steps: int = 5, + guidance_scale: float = 1.0, + window_size: int = 110, + overlap: int = 25, + max_res: int = 1024, + dataset: str = "open", + save_npz: bool = False, + save_exr: bool = False, + track_time: bool = False, +): + depthcrafter_demo = DepthCrafterDemo( + unet_path=unet_path, + pre_train_path=pre_train_path, + cpu_offload=cpu_offload, + ) + # process the videos, the video paths are separated by comma + video_paths = video_path.split(",") + for video in video_paths: + depthcrafter_demo.infer( + video, + num_inference_steps, + guidance_scale, + save_folder=save_folder, + window_size=window_size, + process_length=process_length, + overlap=overlap, + max_res=max_res, + dataset=dataset, + target_fps=target_fps, + seed=seed, + 
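            # Illustrative usage note (not part of the original script): `main` is
            # exposed through python-fire at the bottom of this file, so a typical
            # run looks like
            #   python run.py --video_path clip_a.mp4,clip_b.mp4 \
            #       --save_folder ./demo_output --save_npz True
            # where the flags mirror the keyword arguments of `main` and several
            # input videos are passed as one comma-separated string.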
track_time=track_time, + save_npz=save_npz, + save_exr=save_exr, + ) + # clear the cache for the next video + gc.collect() + torch.cuda.empty_cache() + + +if __name__ == "__main__": + # running configs + # the most important arguments for memory saving are `cpu_offload`, `enable_xformers`, `max_res`, and `window_size` + # the most important arguments for trade-off between quality and speed are + # `num_inference_steps`, `guidance_scale`, and `max_res` + Fire(main) diff --git a/inference/v2v_data/DepthCrafter/tools/npz_to_exr.py b/inference/v2v_data/DepthCrafter/tools/npz_to_exr.py new file mode 100644 index 0000000000000000000000000000000000000000..ad16a190200a3011d61c8965fbc8092edaf9904b --- /dev/null +++ b/inference/v2v_data/DepthCrafter/tools/npz_to_exr.py @@ -0,0 +1,36 @@ +import os + +import numpy as np +import OpenEXR +import Imath +from fire import Fire + + +def npz_to_multiframe_exr(input_npz: str, output_folder: str): + os.makedirs(output_folder, exist_ok=True) + # Load the .npz file + data = np.load(input_npz) + depth_frames = data["depth"] # Replace 'depth' with the correct key + num_frames, height, width = depth_frames.shape + + # Iterate over each frame and save as a separate EXR file + for i, frame in enumerate(depth_frames): + output_exr = f"{output_folder}/frame_{i:04d}.exr" + + # Prepare EXR header for each frame + header = OpenEXR.Header(width, height) + header["channels"] = { + "Z": Imath.Channel(Imath.PixelType(Imath.PixelType.FLOAT)) + } + + # Create EXR file and write the frame + exr_file = OpenEXR.OutputFile(output_exr, header) + exr_file.writePixels({"Z": frame.tobytes()}) + exr_file.close() + print(f"Saved frame {i} to {output_exr}") + + +if __name__ == "__main__": + # Specify the input .npz file and output folder + # npz_to_multiframe_exr(r"path_to_input", r"path_to_output") + Fire(npz_to_multiframe_exr) diff --git a/inference/v2v_data/DepthCrafter/visualization/visualization_pcd.py b/inference/v2v_data/DepthCrafter/visualization/visualization_pcd.py new file mode 100644 index 0000000000000000000000000000000000000000..36187baa9162a5966a815ddb930383bddebbaa99 --- /dev/null +++ b/inference/v2v_data/DepthCrafter/visualization/visualization_pcd.py @@ -0,0 +1,166 @@ +"""Record3D visualizer +""" + +import time +from decord import VideoReader, cpu + +import numpy as np +import tyro +import viser +import viser.extras +import viser.transforms as tf +from tqdm.auto import tqdm + + +def main( + data_path: str, + vid_name: str, + downsample_factor: int = 8, + max_frames: int = 100, + share: bool = False, + point_size=0.01, +) -> None: + + server = viser.ViserServer() + if share: + server.request_share_url() + + print("Loading frames!") + dis_path = data_path + "/" + vid_name + ".npz" + vid_path = data_path + "/" + vid_name + "_input.mp4" + + disp_map = np.load(dis_path)["depth"][:, :, :] + T = disp_map.shape[0] + H = disp_map.shape[1] + W = disp_map.shape[2] + + disp_max = disp_map.max() + disp_min = disp_map.min() + disp_map = (disp_map - disp_min) / (disp_max - disp_min) + + vr = VideoReader(vid_path, ctx=cpu(0)) + vid = vr[:].asnumpy()[:, 0:H, 0:W] + fps = vr.get_avg_fps() + num_frames = min(max_frames, T) + + # Add playback UI. 
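Editorial aside: this visualizer consumes exactly the pair of files written by `run.py` above when `--save_npz` is enabled, namely `<data_path>/<vid_name>.npz` (key `depth`, shape `[T, H, W]`) and `<data_path>/<vid_name>_input.mp4`. A minimal sanity check along those lines (an illustrative sketch, not part of the patch) could be:

```python
import numpy as np
from decord import VideoReader, cpu


def check_visualizer_inputs(data_path: str, vid_name: str) -> None:
    # Depth frames saved by run.py via np.savez_compressed(..., depth=res)
    depth = np.load(f"{data_path}/{vid_name}.npz")["depth"]  # [T, H, W], in [0, 1]
    # Matching RGB frames saved by run.py as <vid_name>_input.mp4
    rgb = VideoReader(f"{data_path}/{vid_name}_input.mp4", ctx=cpu(0))
    t, h, w = depth.shape
    assert len(rgb) >= t, "fewer RGB frames than depth frames"
    frame_h, frame_w = rgb[0].shape[0], rgb[0].shape[1]
    assert frame_h >= h and frame_w >= w, "RGB frames smaller than the depth maps"
    print(f"OK: {t} frames, depth {h}x{w}, rgb {frame_h}x{frame_w}")
```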
+ with server.gui.add_folder("Playback"): + gui_timestep = server.gui.add_slider( + "Timestep", + min=0, + max=num_frames - 1, + step=1, + initial_value=0, + disabled=True, + ) + gui_next_frame = server.gui.add_button("Next Frame", disabled=True) + gui_prev_frame = server.gui.add_button("Prev Frame", disabled=True) + gui_playing = server.gui.add_checkbox("Playing", True) + gui_framerate = server.gui.add_slider( + "FPS", min=1, max=60, step=0.1, initial_value=fps + ) + gui_framerate_options = server.gui.add_button_group( + "FPS options", ("10", "20", "30", "60") + ) + + # Frame step buttons. + @gui_next_frame.on_click + def _(_) -> None: + gui_timestep.value = (gui_timestep.value + 1) % num_frames + + @gui_prev_frame.on_click + def _(_) -> None: + gui_timestep.value = (gui_timestep.value - 1) % num_frames + + # Disable frame controls when we're playing. + @gui_playing.on_update + def _(_) -> None: + gui_timestep.disabled = gui_playing.value + gui_next_frame.disabled = gui_playing.value + gui_prev_frame.disabled = gui_playing.value + + # Set the framerate when we click one of the options. + @gui_framerate_options.on_click + def _(_) -> None: + gui_framerate.value = int(gui_framerate_options.value) + + prev_timestep = gui_timestep.value + + # Toggle frame visibility when the timestep slider changes. + @gui_timestep.on_update + def _(_) -> None: + nonlocal prev_timestep + current_timestep = gui_timestep.value + with server.atomic(): + frame_nodes[current_timestep].visible = True + frame_nodes[prev_timestep].visible = False + prev_timestep = current_timestep + server.flush() # Optional! + + # Load in frames. + server.scene.add_frame( + "/frames", + wxyz=tf.SO3.exp(np.array([0.0, 0.0, 0.0])).wxyz, + position=(0, 0, 0), + show_axes=False, + ) + frame_nodes: list[viser.FrameHandle] = [] + for i in tqdm(range(num_frames)): + + # Add base frame. + frame_nodes.append(server.scene.add_frame(f"/frames/t{i}", show_axes=False)) + + position_image = np.where(np.zeros([H, W]) == 0) + v = np.array(position_image[0]) + u = np.array(position_image[1]) + d = disp_map[i, v, u] + + zc = 1.0 / (d + 0.1) + # zc = 1.0 / (d + 1e-8) + + xc = zc * (u - (W / 2.0)) / (W / 2.0) + yc = zc * (v - (H / 2.0)) / (H / 2.0) + + zc -= 4 # disp_max * 0.2 + + points = np.stack((xc, yc, zc), axis=1) + colors = vid[i, v, u] + + points = points[::downsample_factor] + colors = colors[::downsample_factor] + + # Place the point cloud in the frame. + server.scene.add_point_cloud( + name=f"/frames/t{i}/point_cloud", + points=points, + colors=colors, + point_size=point_size, # 0.007, + point_shape="rounded", + ) + + # Hide all but the current frame. + for i, frame_node in enumerate(frame_nodes): + frame_node.visible = i == gui_timestep.value + + # Playback update loop. 
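The loop above turns each disparity map into a point cloud with a simplified pinhole model: inverse depth `1 / (d + 0.1)`, implicit focal lengths of `W/2` and `H/2` pixels, and a constant shift of 4 along z to recenter the cloud. A self-contained version of that mapping (illustrative only, mirroring the code above) is:

```python
import numpy as np


def disparity_to_points(disp: np.ndarray, z_shift: float = 4.0) -> np.ndarray:
    """disp: [H, W] disparity normalized to [0, 1]; returns [H*W, 3] camera-space points."""
    H, W = disp.shape
    v, u = np.mgrid[0:H, 0:W]              # pixel coordinates for every location
    z = 1.0 / (disp + 0.1)                 # inverse disparity; the offset avoids div-by-zero
    x = z * (u - W / 2.0) / (W / 2.0)      # implicit focal length of W/2 pixels
    y = z * (v - H / 2.0) / (H / 2.0)      # implicit focal length of H/2 pixels
    z = z - z_shift                        # same recentering as `zc -= 4` above
    return np.stack((x.ravel(), y.ravel(), z.ravel()), axis=1)
```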
+ prev_timestep = gui_timestep.value + while True: + if gui_playing.value: + gui_timestep.value = (gui_timestep.value + 1) % num_frames + + time.sleep(1.0 / gui_framerate.value) + + +if __name__ == "__main__": + tyro.cli( + main( + # dir path of saved rgb.mp4 and disp.npz, modify it to your own dir + data_path="./demo_output", + # sample name, modify it to your own sample name + vid_name="example_01", + # downsample factor of dense pcd + downsample_factor=8, + # point cloud size + point_size=0.007, + ) + ) diff --git a/inference/v2v_data/config_help.md b/inference/v2v_data/config_help.md new file mode 100644 index 0000000000000000000000000000000000000000..36c49dc06ce7a6bdaf4c50b02eb9c2ea4e334f8f --- /dev/null +++ b/inference/v2v_data/config_help.md @@ -0,0 +1,29 @@ +## Important configuration for [inference.py](../inference.py): + +### 1. General configs +| Configuration | Default Value | Explanation | +|:----------------- |:--------------- |:-------------------------------------------------------- | +| `--video_path` | `None` | Input video file path | +| `--out_dir` | `./experiments/`| Output directory | +| `--device` | `cuda:0` | The device to use (e.g., CPU or GPU) | +| `--exp_name` | `None` | Experiment name, defaults to video file name | +| `--seed` | `43` | Random seed for reproducibility | +| `--video_length` | `49` | Length of the video frames (number of frames) | +| `--fps` | `10` | fps for saved video | +| `--stride` | `1` | Sampling stride for input video (frame interval) | +| `--server_name` | `None` | Server IP address for gradio | +### 2. Point cloud render configs + +| Configuration | Default Value | Explanation | +|:----------------- |:--------------- |:-------------------------------------------------------- | +| `--radius_scale` | `1.0` | Scale factor for the spherical radius | +| `--camera` | `traj` | Camera pose type, either 'traj' or 'target' | +| `--mode` | `gradual` | Mode of operation, 'gradual', 'bullet', or 'direct' | +| `--mask` | `False` | Clean the point cloud data if true | +| `--target_pose` | `None` | Required for 'target' camera pose type, specifies target camera poses (theta, phi, r, x, y). The initial camera is at (0,0,radius,0,0), +theta (theta<60) rotates camera upward, +phi (phi<60) rotates camera to right, +r (r<0.6) moves camera forward, +x (x<4) pans the camera to right, +y (y<4) pans the camera upward | +| `--traj_txt` | `None` | Required for 'traj' camera pose type, a txt file specifying a complex camera trajectory ([examples](../test/trajs/loop1.txt)). The fist line is the theta sequence, the second line the phi sequence, and the last line the r sequence. Our script will interpolate each of them into lists of length 49, producing theta_list, phi_list, r_list. 
The initial camera is at (0,0,radius), then move to (theta_list[0], phi_list[0], r_list[0]), finally (theta_list[-1], phi_list[-1], r_list[-1])| +| `--near` | `0.0001` | Near clipping plane distance | +| `--far` | `10000.0` | Far clipping plane distance | +| `--anchor_idx` | `0` | One GT frame for anchor frame | + + diff --git a/inference/v2v_data/demo.py b/inference/v2v_data/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..f129cb4efb0809fb668360b9e1f7800c883983f1 --- /dev/null +++ b/inference/v2v_data/demo.py @@ -0,0 +1,671 @@ +import gc +import os +import torch +from models.infer import DepthCrafterDemo +import numpy as np +import torch +from PIL import Image +from models.utils import * + +import torch +import torch.nn.functional as F + +from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor +from qwen_vl_utils import process_vision_info + +def get_center_crop_resolution(original_resoultion, target_aspect_ratio=(2, 3)): + target_h, target_w = target_aspect_ratio + aspect_ratio = target_w / target_h + + original_h, original_w = original_resoultion + crop_h = original_h + crop_w = int(crop_h * aspect_ratio) + if crop_w > original_w: + crop_w = original_w + crop_h = int(crop_w / aspect_ratio) + + resized_h = 576 + resized_w = 1024 + + h_ratio = resized_h / original_h + w_ratio = resized_w / original_w + + crop_h = int(crop_h * h_ratio) + crop_w = int(crop_w * w_ratio) + return crop_h, crop_w + +def process_video_tensor(video, resolution=(480, 720)): + video_resized = F.interpolate(video, size=(resolution[0], resolution[1]), mode='bilinear', align_corners=False) + + video_uint8 = (video_resized.clamp(0, 1) * 255).byte() + + return video_uint8 + +def process_mask_tensor(video, resolution=(480, 720)): + video_resized = F.interpolate(video, size=(resolution[0], resolution[1]), mode='bilinear', align_corners=False) + + return (video_resized==1).bool() + +def center_crop_to_ratio(tensor: torch.Tensor, resolution=(480, 720)): + """ + Args: + tensor: [T, C, H, W], float32 or uint8 + Returns: + cropped: [T, C, H_crop, W_crop], where H_crop:W_crop = 2:3 (480:720 ratio) + """ + T, C, H, W = tensor.shape + h, w = resolution + target_ratio = w / h + + crop_h = H + crop_w = int(H * target_ratio) + if crop_w > W: + crop_w = W + crop_h = int(W / target_ratio) + + top = (H - crop_h) // 2 + left = (W - crop_w) // 2 + + return tensor[:, :, top:top + crop_h, left:left + crop_w] + +import imageio +import numpy as np + +def save_video_as_mp4(video_tensor, save_path, fps=24): + """ + video_tensor: [T, 3, H, W], dtype=uint8, values in [0, 255] + save_path: e.g., "output_video.mp4" + """ + assert video_tensor.dtype == torch.uint8 and video_tensor.ndim == 4 + T, C, H, W = video_tensor.shape + + video_np = video_tensor.permute(0, 2, 3, 1).cpu().numpy() + print(video_np.shape) + + imageio.mimwrite( + save_path, + video_np, + fps=fps, + ) + + +class GetAnchorVideos: + def __init__(self, opts, gradio=False): + self.funwarp = Warper(device=opts.device) + self.depth_estimater = DepthCrafterDemo( + unet_path=opts.unet_path, + pre_train_path=opts.pre_train_path, + cpu_offload=opts.cpu_offload, + device=opts.device, + ) + + # default: Load the model on the available device(s) + self.caption_model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + opts.qwen_path, torch_dtype="auto", device_map="auto" + ) + # default processer + self.caption_processor = AutoProcessor.from_pretrained(opts.qwen_path) + + if gradio: + self.opts = opts + + def infer_gradual(self, 
opts): + frames = read_video_frames( + opts.video_path, opts.video_length, opts.stride, opts.max_res + ) + vr = VideoReader(opts.video_path, ctx=cpu(0)) + frame_shape = vr[0].shape # (H, W, 3) + ori_resolution = frame_shape[:2] + print(f"==> original video shape: {frame_shape}") + target_resolution = get_center_crop_resolution(ori_resolution) + print(f"==> target video shape resized: {target_resolution}") + + prompt = self.get_caption(opts, opts.video_path) + depths = self.depth_estimater.infer( + frames, + opts.near, + opts.far, + opts.depth_inference_steps, + opts.depth_guidance_scale, + window_size=opts.window_size, + overlap=opts.overlap, + ).to(opts.device) + frames = ( + torch.from_numpy(frames).permute(0, 3, 1, 2).to(opts.device) * 2.0 - 1.0 + ) # 49 576 1024 3 -> 49 3 576 1024, [-1,1] + assert frames.shape[0] == opts.video_length + + depths = center_crop_to_ratio(depths, resolution=target_resolution) + frames = center_crop_to_ratio(frames, resolution=target_resolution) + pose_s, pose_t, K = self.get_poses(opts, depths, num_frames=opts.video_length) + warped_images = [] + masks = [] + for i in tqdm(range(opts.video_length)): + warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp( + frames[i : i + 1], + None, + depths[i : i + 1], + pose_s[i : i + 1], + pose_t[i : i + 1], + K[i : i + 1], + None, + opts.mask, + twice=False, + ) + warped_images.append(warped_frame2) + masks.append(mask2) + cond_video = (torch.cat(warped_images) + 1.0) / 2.0 + + cond_video_save = process_video_tensor(cond_video).cpu() + ori_video_save = process_video_tensor((frames+1.0) / 2.0).cpu() + save_cated = torch.cat([ori_video_save, cond_video_save], dim=3) + # post_t captions depth intrinsics joint_videos + save_name = os.path.basename(opts.video_path).split('.')[0] + save_name = opts.save_name + + os.makedirs(f'{opts.out_dir}', exist_ok=True) + os.makedirs(f'{opts.out_dir}/videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masked_videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/depth', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masks', exist_ok=True) + os.makedirs(f'{opts.out_dir}/post_t', exist_ok=True) + os.makedirs(f'{opts.out_dir}/pose_s', exist_ok=True) + os.makedirs(f'{opts.out_dir}/intrinsics', exist_ok=True) + os.makedirs(f'{opts.out_dir}/captions', exist_ok=True) + + mask_save = process_mask_tensor(torch.cat(masks)).squeeze().cpu().numpy() + np.save(f"{opts.out_dir}/depth/{save_name}.npy",depths.cpu().numpy()) + np.savez_compressed(f"{opts.out_dir}/masks/{save_name}.npz",mask=mask_save) + save_video_as_mp4(ori_video_save,f"{opts.out_dir}/videos/{save_name}.mp4", fps=8) + save_video_as_mp4(cond_video_save,f"{opts.out_dir}/masked_videos/{save_name}.mp4", fps=8) + np.save(f'{opts.out_dir}/post_t/' + save_name + '.npy',pose_t.cpu().numpy()) + np.save(f'{opts.out_dir}/pose_s/' + save_name + '.npy',pose_s.cpu().numpy()) + np.save(f'{opts.out_dir}/intrinsics/' + save_name + '.npy',K[0].cpu().numpy()) + # save prompt to txt + with open(f'{opts.out_dir}/captions/' + save_name + '.txt', 'w') as f: + f.write(prompt) + + def infer_image(self, opts): + frames = read_video_frames( + opts.video_path, opts.video_length, opts.stride, opts.max_res + ) + frames = frames[:1].repeat(opts.video_length, 0) + if opts.video_path.lower().endswith(('.mp4', '.avi', '.mov', '.webm')): + vr = VideoReader(opts.video_path, ctx=cpu(0)) + frame_shape = vr[0].shape # (H, W, 3) + ori_resolution = frame_shape[:2] + elif opts.video_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp')): + img = 
Image.open(opts.video_path) + ori_resolution = img.size[::-1] # PIL gives (W, H), convert to (H, W) + print(f"==> original video shape: {ori_resolution}") + target_resolution = get_center_crop_resolution(ori_resolution) + print(f"==> target video shape resized: {target_resolution}") + # prompt = self.get_caption(opts, frames[opts.video_length // 2]) + prompt = self.get_caption(opts, opts.video_path) + # depths= self.depth_estimater.infer(frames, opts.near, opts.far).to(opts.device) + depths = self.depth_estimater.infer( + frames, + opts.near, + opts.far, + opts.depth_inference_steps, + opts.depth_guidance_scale, + window_size=opts.window_size, + overlap=opts.overlap, + ).to(opts.device) + frames = ( + torch.from_numpy(frames).permute(0, 3, 1, 2).to(opts.device) * 2.0 - 1.0 + ) # 49 576 1024 3 -> 49 3 576 1024, [-1,1] + assert frames.shape[0] == opts.video_length + + # depths = mask_out_cropped_edges(depths) + depths = center_crop_to_ratio(depths, resolution=target_resolution) + frames = center_crop_to_ratio(frames, resolution=target_resolution) + assert frames.shape[0] == opts.video_length + pose_s, pose_t, K = self.get_poses(opts, depths, num_frames=opts.video_length) + warped_images = [] + masks = [] + for i in tqdm(range(opts.video_length)): + warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp( + frames[i : i + 1], + None, + depths[i : i + 1], + pose_s[i : i + 1], + pose_t[i : i + 1], + K[i : i + 1], + None, + opts.mask, + twice=False, + ) + warped_images.append(warped_frame2) + masks.append(mask2) + cond_video = (torch.cat(warped_images) + 1.0) / 2.0 + + cond_video_save = process_video_tensor(cond_video).cpu() + ori_video_save = process_video_tensor((frames+1.0) / 2.0).cpu() + save_cated = torch.cat([ori_video_save, cond_video_save], dim=3) + # post_t captions depth intrinsics joint_videos + save_name = os.path.basename(opts.video_path).split('.')[0] + # save_name = f"{save_name}_" + save_name = opts.save_name + + os.makedirs(f'{opts.out_dir}', exist_ok=True) + os.makedirs(f'{opts.out_dir}/videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masked_videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/depth', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masks', exist_ok=True) + os.makedirs(f'{opts.out_dir}/post_t', exist_ok=True) + os.makedirs(f'{opts.out_dir}/pose_s', exist_ok=True) + os.makedirs(f'{opts.out_dir}/intrinsics', exist_ok=True) + os.makedirs(f'{opts.out_dir}/captions', exist_ok=True) + + mask_save = process_mask_tensor(torch.cat(masks)).squeeze().cpu().numpy() + np.save(f"{opts.out_dir}/depth/{save_name}.npy",depths.cpu().numpy()) + np.savez_compressed(f"{opts.out_dir}/masks/{save_name}.npz",mask=mask_save) + save_video_as_mp4(ori_video_save,f"{opts.out_dir}/videos/{save_name}.mp4", fps=8) + save_video_as_mp4(cond_video_save,f"{opts.out_dir}/masked_videos/{save_name}.mp4", fps=8) + np.save(f'{opts.out_dir}/post_t/' + save_name + '.npy',pose_t.cpu().numpy()) + np.save(f'{opts.out_dir}/pose_s/' + save_name + '.npy',pose_s.cpu().numpy()) + np.save(f'{opts.out_dir}/intrinsics/' + save_name + '.npy',K[0].cpu().numpy()) + # save prompt to txt + with open(f'{opts.out_dir}/captions/' + save_name + '.txt', 'w') as f: + f.write(prompt) + + + def infer_direct(self, opts): + opts.cut = 20 + frames = read_video_frames( + opts.video_path, opts.video_length, opts.stride, opts.max_res + ) + vr = VideoReader(opts.video_path, ctx=cpu(0)) + frame_shape = vr[0].shape # (H, W, 3) + ori_resolution = frame_shape[:2] + print(f"==> original video shape: {frame_shape}") + 
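        # Editorial note (not in the original code): as in infer_gradual above, depth
        # and frames are center-cropped below to a 2:3 (H:W) window of the resized
        # 576x1024 video. For example, a 720x1280 source gives
        # get_center_crop_resolution((720, 1280)) == (576, 864), i.e. the full height
        # and a centered 864-pixel-wide region.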
target_resolution = get_center_crop_resolution(ori_resolution) + print(f"==> target video shape resized: {target_resolution}") + + prompt = self.get_caption(opts, opts.video_path) + + depths = self.depth_estimater.infer( + frames, + opts.near, + opts.far, + opts.depth_inference_steps, + opts.depth_guidance_scale, + window_size=opts.window_size, + overlap=opts.overlap, + ).to(opts.device) + frames = ( + torch.from_numpy(frames).permute(0, 3, 1, 2).to(opts.device) * 2.0 - 1.0 + ) # 49 576 1024 3 -> 49 3 576 1024, [-1,1] + assert frames.shape[0] == opts.video_length + depths = center_crop_to_ratio(depths, resolution=target_resolution) + frames = center_crop_to_ratio(frames, resolution=target_resolution) + + pose_s, pose_t, K = self.get_poses(opts, depths, num_frames=opts.cut) + + warped_images = [] + masks = [] + for i in tqdm(range(opts.video_length)): + if i < opts.cut: + warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp( + frames[0:1], + None, + depths[0:1], + pose_s[0:1], + pose_t[i : i + 1], + K[0:1], + None, + opts.mask, + twice=False, + ) + warped_images.append(warped_frame2) + masks.append(mask2) + else: + warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp( + frames[i - opts.cut : i - opts.cut + 1], + None, + depths[i - opts.cut : i - opts.cut + 1], + pose_s[0:1], + pose_t[-1:], + K[0:1], + None, + opts.mask, + twice=False, + ) + warped_images.append(warped_frame2) + masks.append(mask2) + cond_video = (torch.cat(warped_images) + 1.0) / 2.0 + cond_video_save = process_video_tensor(cond_video).cpu() + ori_video_save = process_video_tensor((frames+1.0) / 2.0).cpu() + save_cated = torch.cat([ori_video_save, cond_video_save], dim=3) + # post_t captions depth intrinsics joint_videos + save_name = os.path.basename(opts.video_path).split('.')[0] + save_name = opts.save_name + + os.makedirs(f'{opts.out_dir}', exist_ok=True) + os.makedirs(f'{opts.out_dir}/videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masked_videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/depth', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masks', exist_ok=True) + os.makedirs(f'{opts.out_dir}/post_t', exist_ok=True) + os.makedirs(f'{opts.out_dir}/pose_s', exist_ok=True) + os.makedirs(f'{opts.out_dir}/intrinsics', exist_ok=True) + os.makedirs(f'{opts.out_dir}/captions', exist_ok=True) + + mask_save = process_mask_tensor(torch.cat(masks)).squeeze().cpu().numpy() + np.save(f"{opts.out_dir}/depth/{save_name}.npy",depths.cpu().numpy()) + np.savez_compressed(f"{opts.out_dir}/masks/{save_name}.npz",mask=mask_save) + save_video_as_mp4(ori_video_save,f"{opts.out_dir}/videos/{save_name}.mp4", fps=8) + save_video_as_mp4(cond_video_save,f"{opts.out_dir}/masked_videos/{save_name}.mp4", fps=8) + np.save(f'{opts.out_dir}/post_t/' + save_name + '.npy',pose_t.cpu().numpy()) + np.save(f'{opts.out_dir}/pose_s/' + save_name + '.npy',pose_s.cpu().numpy()) + np.save(f'{opts.out_dir}/intrinsics/' + save_name + '.npy',K[0].cpu().numpy()) + # save prompt to txt + with open(f'{opts.out_dir}/captions/' + save_name + '.txt', 'w') as f: + f.write(prompt) + + + def infer_bullet(self, opts): + frames = read_video_frames( + opts.video_path, opts.video_length, opts.stride, opts.max_res + ) + vr = VideoReader(opts.video_path, ctx=cpu(0)) + frame_shape = vr[0].shape # (H, W, 3) + ori_resolution = frame_shape[:2] + print(f"==> original video shape: {frame_shape}") + target_resolution = get_center_crop_resolution(ori_resolution) + print(f"==> target video shape resized: {target_resolution}") + + prompt = 
self.get_caption(opts, opts.video_path) + + depths = self.depth_estimater.infer( + frames, + opts.near, + opts.far, + opts.depth_inference_steps, + opts.depth_guidance_scale, + window_size=opts.window_size, + overlap=opts.overlap, + ).to(opts.device) + + frames = ( + torch.from_numpy(frames).permute(0, 3, 1, 2).to(opts.device) * 2.0 - 1.0 + ) # 49 576 1024 3 -> 49 3 576 1024, [-1,1] + assert frames.shape[0] == opts.video_length + + depths = center_crop_to_ratio(depths, resolution=target_resolution) + frames = center_crop_to_ratio(frames, resolution=target_resolution) + + pose_s, pose_t, K = self.get_poses(opts, depths, num_frames=opts.video_length) + + warped_images = [] + masks = [] + for i in tqdm(range(opts.video_length)): + warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp( + frames[-1:], + None, + depths[-1:], + pose_s[0:1], + pose_t[i : i + 1], + K[0:1], + None, + opts.mask, + twice=False, + ) + warped_images.append(warped_frame2) + masks.append(mask2) + cond_video = (torch.cat(warped_images) + 1.0) / 2.0 + + cond_video_save = process_video_tensor(cond_video).cpu() + ori_video_save = process_video_tensor((frames+1.0) / 2.0).cpu() + save_cated = torch.cat([ori_video_save, cond_video_save], dim=3) + # post_t captions depth intrinsics joint_videos + save_name = os.path.basename(opts.video_path).split('.')[0] + save_name = opts.save_name + + os.makedirs(f'{opts.out_dir}', exist_ok=True) + os.makedirs(f'{opts.out_dir}/videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masked_videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/depth', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masks', exist_ok=True) + os.makedirs(f'{opts.out_dir}/post_t', exist_ok=True) + os.makedirs(f'{opts.out_dir}/pose_s', exist_ok=True) + os.makedirs(f'{opts.out_dir}/intrinsics', exist_ok=True) + os.makedirs(f'{opts.out_dir}/captions', exist_ok=True) + + mask_save = process_mask_tensor(torch.cat(masks)).squeeze().cpu().numpy() + np.save(f"{opts.out_dir}/depth/{save_name}.npy",depths.cpu().numpy()) + np.savez_compressed(f"{opts.out_dir}/masks/{save_name}.npz",mask=mask_save) + save_video_as_mp4(ori_video_save,f"{opts.out_dir}/videos/{save_name}.mp4", fps=8) + save_video_as_mp4(cond_video_save,f"{opts.out_dir}/masked_videos/{save_name}.mp4", fps=8) + np.save(f'{opts.out_dir}/post_t/' + save_name + '.npy',pose_t.cpu().numpy()) + np.save(f'{opts.out_dir}/pose_s/' + save_name + '.npy',pose_s.cpu().numpy()) + np.save(f'{opts.out_dir}/intrinsics/' + save_name + '.npy',K[0].cpu().numpy()) + # save prompt to txt + with open(f'{opts.out_dir}/captions/' + save_name + '.txt', 'w') as f: + f.write(prompt) + + def infer_zoom(self, opts): + frames = read_video_frames( + opts.video_path, opts.video_length, opts.stride, opts.max_res + ) + vr = VideoReader(opts.video_path, ctx=cpu(0)) + frame_shape = vr[0].shape # (H, W, 3) + ori_resolution = frame_shape[:2] + print(f"==> original video shape: {frame_shape}") + target_resolution = get_center_crop_resolution(ori_resolution) + print(f"==> target video shape resized: {target_resolution}") + + prompt = self.get_caption(opts, opts.video_path) + + depths = self.depth_estimater.infer( + frames, + opts.near, + opts.far, + opts.depth_inference_steps, + opts.depth_guidance_scale, + window_size=opts.window_size, + overlap=opts.overlap, + ).to(opts.device) + frames = ( + torch.from_numpy(frames).permute(0, 3, 1, 2).to(opts.device) * 2.0 - 1.0 + ) # 49 576 1024 3 -> 49 3 576 1024, [-1,1] + assert frames.shape[0] == opts.video_length + + depths = 
center_crop_to_ratio(depths, resolution=target_resolution) + frames = center_crop_to_ratio(frames, resolution=target_resolution) + + pose_s, pose_t, K = self.get_poses_f(opts, depths, num_frames=opts.video_length, f_new=250) + + warped_images = [] + masks = [] + for i in tqdm(range(opts.video_length)): + warped_frame2, mask2, warped_depth2, flow12 = self.funwarp.forward_warp( + frames[i : i + 1], + None, + depths[i : i + 1], + pose_s[i : i + 1], + pose_t[i : i + 1], + K[0 : 1], + K[i : i + 1], + opts.mask, + twice=False, + ) + warped_images.append(warped_frame2) + masks.append(mask2) + cond_video = (torch.cat(warped_images) + 1.0) / 2.0 + + cond_video_save = process_video_tensor(cond_video).cpu() + ori_video_save = process_video_tensor((frames+1.0) / 2.0).cpu() + save_cated = torch.cat([ori_video_save, cond_video_save], dim=3) + # post_t captions depth intrinsics joint_videos + save_name = os.path.basename(opts.video_path).split('.')[0] + save_name = opts.save_name + + os.makedirs(f'{opts.out_dir}', exist_ok=True) + os.makedirs(f'{opts.out_dir}/videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masked_videos', exist_ok=True) + os.makedirs(f'{opts.out_dir}/depth', exist_ok=True) + os.makedirs(f'{opts.out_dir}/masks', exist_ok=True) + os.makedirs(f'{opts.out_dir}/post_t', exist_ok=True) + os.makedirs(f'{opts.out_dir}/pose_s', exist_ok=True) + os.makedirs(f'{opts.out_dir}/intrinsics', exist_ok=True) + os.makedirs(f'{opts.out_dir}/captions', exist_ok=True) + + mask_save = process_mask_tensor(torch.cat(masks)).squeeze().cpu().numpy() + np.save(f"{opts.out_dir}/depth/{save_name}.npy",depths.cpu().numpy()) + np.savez_compressed(f"{opts.out_dir}/masks/{save_name}.npz",mask=mask_save) + save_video_as_mp4(ori_video_save,f"{opts.out_dir}/videos/{save_name}.mp4", fps=8) + save_video_as_mp4(cond_video_save,f"{opts.out_dir}/masked_videos/{save_name}.mp4", fps=8) + np.save(f'{opts.out_dir}/post_t/' + save_name + '.npy',pose_t.cpu().numpy()) + np.save(f'{opts.out_dir}/pose_s/' + save_name + '.npy',pose_s.cpu().numpy()) + np.save(f'{opts.out_dir}/intrinsics/' + save_name + '.npy',K[0].cpu().numpy()) + # save prompt to txt + with open(f'{opts.out_dir}/captions/' + save_name + '.txt', 'w') as f: + f.write(prompt) + + def get_caption(self, opts, video_path): + + messages = [ + { + "role": "user", + "content": [ + { + "type": "video", + "video": video_path, + "max_pixels": 360 * 420, + "fps": 1.0, + }, + {"type": "text", "text": "Give me a detailed caption of this video. Directly discribe the content of the video. 
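                # Note (editorial): this message follows the Qwen2.5-VL chat schema
                # consumed by qwen_vl_utils.process_vision_info below: one "video"
                # entry with sampling hints (max_pixels, fps) plus one "text"
                # instruction. The generated caption is later suffixed with
                # opts.refine_prompt before being returned.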
Don't start with \"in the video\" stuff."}, + ], + } + ] + + text = self.caption_processor.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + + image_inputs, video_inputs, video_kwargs = process_vision_info(messages, return_video_kwargs=True) + inputs = self.caption_processor( + text=[text], + images=image_inputs, + videos=video_inputs, + # fps=fps, + padding=True, + return_tensors="pt", + **video_kwargs, + ) + inputs = inputs.to("cuda") + + # Inference + generated_ids = self.caption_model.generate(**inputs, max_new_tokens=128) + generated_ids_trimmed = [ + out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) + ] + generated_text = self.caption_processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + return generated_text[0] + opts.refine_prompt + + def get_poses(self, opts, depths, num_frames): + radius = ( + depths[0, 0, depths.shape[-2] // 2, depths.shape[-1] // 2].cpu() + * opts.radius_scale + ) + radius = min(radius, 5) + # cx = 512.0 # depths.shape[-1]//2 + # cy = 288.0 # depths.shape[-2]//2 + cx = depths.shape[-1]//2 + cy = depths.shape[-2]//2 + f = 500 # 500. + K = ( + torch.tensor([[f, 0.0, cx], [0.0, f, cy], [0.0, 0.0, 1.0]]) + .repeat(num_frames, 1, 1) + .to(opts.device) + ) + c2w_init = ( + torch.tensor( + [ + [-1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, -1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + ) + .to(opts.device) + .unsqueeze(0) + ) + if opts.camera == 'target': + dtheta, dphi, dr, dx, dy = opts.target_pose + poses = generate_traj_specified( + c2w_init, dtheta, dphi, dr * radius, dx, dy, num_frames, opts.device + ) + elif opts.camera == 'target_fast': + dtheta, dphi, dr, dx, dy = opts.target_pose + poses = generate_traj_specified_fast( + c2w_init, dtheta, dphi, dr * radius, dx, dy, num_frames, opts.device + ) + elif opts.camera == 'traj': + with open(opts.traj_txt, 'r') as file: + lines = file.readlines() + theta = [float(i) for i in lines[0].split()] + phi = [float(i) for i in lines[1].split()] + r = [float(i) * radius for i in lines[2].split()] + poses = generate_traj_txt(c2w_init, phi, theta, r, num_frames, opts.device) + poses[:, 2, 3] = poses[:, 2, 3] + radius + pose_s = poses[opts.anchor_idx : opts.anchor_idx + 1].repeat(num_frames, 1, 1) + pose_t = poses + return pose_s, pose_t, K + + def get_poses_f(self, opts, depths, num_frames, f_new): + radius = ( + depths[0, 0, depths.shape[-2] // 2, depths.shape[-1] // 2].cpu() + * opts.radius_scale + ) + radius = min(radius, 5) + cx = depths.shape[-1]//2 + cy = depths.shape[-2]//2 + # cx = 512.0 + # cy = 288.0 + f = 500 + # f_new,d_r: 250,0.5; 1000,-0.9 + f_values = torch.linspace(f, f_new, num_frames, device=opts.device) + K = torch.zeros((num_frames, 3, 3), device=opts.device) + K[:, 0, 0] = f_values + K[:, 1, 1] = f_values + K[:, 0, 2] = cx + K[:, 1, 2] = cy + K[:, 2, 2] = 1.0 + c2w_init = ( + torch.tensor( + [ + [-1.0, 0.0, 0.0, 0.0], + [0.0, 1.0, 0.0, 0.0], + [0.0, 0.0, -1.0, 0.0], + [0.0, 0.0, 0.0, 1.0], + ] + ) + .to(opts.device) + .unsqueeze(0) + ) + if opts.camera == 'target': + dtheta, dphi, dr, dx, dy = opts.target_pose + poses = generate_traj_specified( + c2w_init, dtheta, dphi, dr * radius, dx, dy, num_frames, opts.device + ) + elif opts.camera == 'target_fast': + dtheta, dphi, dr, dx, dy = opts.target_pose + poses = generate_traj_specified_fast( + c2w_init, dtheta, dphi, dr * radius, dx, dy, num_frames, opts.device + ) + elif opts.camera == 'traj': + with open(opts.traj_txt, 'r') 
as file: + lines = file.readlines() + theta = [float(i) for i in lines[0].split()] + phi = [float(i) for i in lines[1].split()] + r = [float(i) * radius for i in lines[2].split()] + poses = generate_traj_txt(c2w_init, phi, theta, r, num_frames, opts.device) + poses[:, 2, 3] = poses[:, 2, 3] + radius + pose_s = poses[opts.anchor_idx : opts.anchor_idx + 1].repeat(num_frames, 1, 1) + pose_t = poses + return pose_s, pose_t, K \ No newline at end of file diff --git a/inference/v2v_data/get_anchor_videos.sh b/inference/v2v_data/get_anchor_videos.sh new file mode 100644 index 0000000000000000000000000000000000000000..cf6351df79d124ff4bb84d88f860cfd6903a7f50 --- /dev/null +++ b/inference/v2v_data/get_anchor_videos.sh @@ -0,0 +1,38 @@ +trap 'kill 0' SIGINT + +target_pose="0 30 -0.6 0 0" +target_pose_str="0_30_-0.6_0_0" + +traj_name="loop1" +traj_txt="test/trajs/${traj_name}.txt" + +video="/app/data/test_v2v/videos/amalfi-coast_traj_loop2.mp4" + +processed_data_name=$1 +# filename=$(basename "$video" .mp4) +filename="amalfi-coast" +CUDA_VISIBLE_DEVICES=0 python inference.py \ + --video_path "$video" \ + --stride 1 \ + --out_dir experiments \ + --radius_scale 1 \ + --camera 'traj' \ + --mask \ + --target_pose $target_pose \ + --traj_txt "$traj_txt" \ + --save_name "${filename}_traj_${traj_name}" \ + --mode "gradual" \ + --out_dir ../../data/${processed_data_name} + +CUDA_VISIBLE_DEVICES=0 python inference.py \ + --video_path "$video" \ + --stride 1 \ + --out_dir experiments \ + --radius_scale 1 \ + --camera 'target' \ + --mask \ + --target_pose $target_pose \ + --traj_txt "$traj_txt" \ + --save_name "${filename}_target_${target_pose_str}" \ + --mode "gradual" \ + --out_dir ../../data/${processed_data_name} diff --git a/inference/v2v_data/inference.py b/inference/v2v_data/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..8293804638d9b7a4ec5b957eb14b165cd979aa93 --- /dev/null +++ b/inference/v2v_data/inference.py @@ -0,0 +1,197 @@ +from demo import GetAnchorVideos +import os +from datetime import datetime +import argparse +import torch + + +def get_parser(): + parser = argparse.ArgumentParser() + + ## general + parser.add_argument('--video_path', type=str, help='Input path') + parser.add_argument( + '--out_dir', type=str, required=True, help='Output dir' + ) + parser.add_argument( + '--device', type=str, default='cuda:0', help='The device to use' + ) + parser.add_argument( + '--exp_name', + type=str, + default=None, + help='Experiment name, use video file name by default', + ) + parser.add_argument( + '--save_name', + type=str, + default=None, + help='Experiment name, use video file name by default', + ) + parser.add_argument( + '--seed', type=int, default=43, help='Random seed for reproducibility' + ) + parser.add_argument( + '--video_length', type=int, default=49, help='Length of the video frames' + ) + parser.add_argument('--fps', type=int, default=10, help='Fps for saved video') + parser.add_argument( + '--stride', type=int, default=1, help='Sampling stride for input video' + ) + parser.add_argument('--server_name', type=str, help='Server IP address') + + ## render + parser.add_argument( + '--radius_scale', + type=float, + default=1.0, + help='Scale factor for the spherical radius', + ) + parser.add_argument('--camera', type=str, default='traj', help='traj or target') + parser.add_argument( + '--mode', type=str, default='gradual', help='gradual, bullet or direct' + ) + parser.add_argument( + '--mask', action='store_true', default=False, help='Clean the pcd if true' + 
) + parser.add_argument( + '--traj_txt', + type=str, + help="Required for 'traj' camera, a txt file that specify camera trajectory", + ) + parser.add_argument( + '--target_pose', + nargs=5, + type=float, + help="Required for 'target' mode, specify target camera pose, ", + ) + parser.add_argument( + '--near', type=float, default=0.0001, help='Near clipping plane distance' + ) + parser.add_argument( + '--far', type=float, default=10000.0, help='Far clipping plane distance' + ) + parser.add_argument('--anchor_idx', type=int, default=0, help='One GT frame') + + ## diffusion + parser.add_argument( + '--low_gpu_memory_mode', + type=bool, + default=False, + help='Enable low GPU memory mode', + ) + # parser.add_argument('--model_name', type=str, default='checkpoints/CogVideoX-Fun-V1.1-5b-InP', help='Path to the model') + parser.add_argument( + '--model_name', + type=str, + default='../../pretrained/CogVideoX-Fun-V1.1-5b-InP', + help='Path to the model', + ) + parser.add_argument( + '--sampler_name', + type=str, + choices=["Euler", "Euler A", "DPM++", "PNDM", "DDIM_Cog", "DDIM_Origin"], + default='DDIM_Origin', + help='Choose the sampler', + ) + # parser.add_argument('--transformer_path', type=str, default='checkpoints/TrajectoryCrafter/crosstransformer', help='Path to the pretrained transformer model') + parser.add_argument( + '--transformer_path', + type=str, + default="../../pretrained/TrajectoryCrafter", + help='Path to the pretrained transformer model', + ) + parser.add_argument( + '--sample_size', + type=int, + nargs=2, + default=[384, 672], + help='Sample size as [height, width]', + ) + parser.add_argument( + '--diffusion_guidance_scale', + type=float, + default=6.0, + help='Guidance scale for inference', + ) + parser.add_argument( + '--diffusion_inference_steps', + type=int, + default=50, + help='Number of inference steps', + ) + parser.add_argument( + '--prompt', type=str, default=None, help='Prompt for video generation' + ) + parser.add_argument( + '--negative_prompt', + type=str, + default="The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion.", + help='Negative prompt for video generation', + ) + parser.add_argument( + '--refine_prompt', + type=str, + default=". The video is of high quality, and the view is very clear. 
High quality, masterpiece, best quality, highres, ultra-detailed, fantastic.", + help='Prompt for video generation', + ) + parser.add_argument('--qwen_path', type=str, default="../../pretrained/Qwen2.5-VL-7B-Instruct") + + ## depth + # parser.add_argument('--unet_path', type=str, default='checkpoints/DepthCrafter', help='Path to the UNet model') + parser.add_argument( + '--unet_path', + type=str, + default="../../pretrained/DepthCrafter", + help='Path to the UNet model', + ) + + # parser.add_argument('--pre_train_path', type=str, default='checkpoints/stable-video-diffusion-img2vid-xt', help='Path to the pre-trained model') + parser.add_argument( + '--pre_train_path', + type=str, + default="../../pretrained/stable-video-diffusion-img2vid", + help='Path to the pre-trained model', + ) + parser.add_argument( + '--cpu_offload', type=str, default='model', help='CPU offload strategy' + ) + parser.add_argument( + '--depth_inference_steps', type=int, default=5, help='Number of inference steps' + ) + parser.add_argument( + '--depth_guidance_scale', + type=float, + default=1.0, + help='Guidance scale for inference', + ) + parser.add_argument( + '--window_size', type=int, default=110, help='Window size for processing' + ) + parser.add_argument( + '--overlap', type=int, default=25, help='Overlap size for processing' + ) + parser.add_argument( + '--max_res', type=int, default=1024, help='Maximum resolution for processing' + ) + + return parser + + +if __name__ == "__main__": + parser = get_parser() # infer config.py + opts = parser.parse_args() + opts.weight_dtype = torch.bfloat16 + pvd = GetAnchorVideos(opts) + if opts.mode == 'gradual': + pvd.infer_gradual(opts) + elif opts.mode == 'direct': + pvd.infer_direct(opts) + elif opts.mode == 'bullet': + pvd.infer_bullet(opts) + elif opts.mode == 'image': + pvd.infer_image(opts) + elif opts.mode == 'start_end': + pvd.infer_start_end(opts) + elif opts.mode == 'zoom': + pvd.infer_zoom(opts) \ No newline at end of file diff --git a/inference/v2v_data/models/infer.py b/inference/v2v_data/models/infer.py new file mode 100644 index 0000000000000000000000000000000000000000..51ca50794211e7f374f52706162bc13b6c1be4fc --- /dev/null +++ b/inference/v2v_data/models/infer.py @@ -0,0 +1,92 @@ +import gc +import os +import numpy as np +import torch + +from diffusers.training_utils import set_seed +# from models.depth_crafter_ppl import DepthCrafterPipeline +# from models.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter +from DepthCrafter.depthcrafter.depth_crafter_ppl import DepthCrafterPipeline +from DepthCrafter.depthcrafter.unet import DiffusersUNetSpatioTemporalConditionModelDepthCrafter + +class DepthCrafterDemo: + def __init__( + self, + unet_path: str, + pre_train_path: str, + cpu_offload: str = "model", + device: str = "cuda:0", + ): + unet = DiffusersUNetSpatioTemporalConditionModelDepthCrafter.from_pretrained( + unet_path, + low_cpu_mem_usage=True, + torch_dtype=torch.float16, + ) + # load weights of other components from the provided checkpoint + self.pipe = DepthCrafterPipeline.from_pretrained( + pre_train_path, + unet=unet, + torch_dtype=torch.float16, + variant="fp16", + ) + + # for saving memory, we can offload the model to CPU, or even run the model sequentially to save more memory + if cpu_offload is not None: + if cpu_offload == "sequential": + # This will slow, but save more memory + self.pipe.enable_sequential_cpu_offload() + elif cpu_offload == "model": + self.pipe.enable_model_cpu_offload() + else: + raise ValueError(f"Unknown cpu 
offload option: {cpu_offload}") + else: + self.pipe.to(device) + # enable attention slicing and xformers memory efficient attention + try: + self.pipe.enable_xformers_memory_efficient_attention() + except Exception as e: + print(e) + print("Xformers is not enabled") + self.pipe.enable_attention_slicing() + + def infer( + self, + frames, + near, + far, + num_denoising_steps: int, + guidance_scale: float, + window_size: int = 110, + overlap: int = 25, + seed: int = 42, + track_time: bool = True, + ): + set_seed(seed) + + # inference the depth map using the DepthCrafter pipeline + with torch.inference_mode(): + res = self.pipe( + frames, + height=frames.shape[1], + width=frames.shape[2], + output_type="np", + guidance_scale=guidance_scale, + num_inference_steps=num_denoising_steps, + window_size=window_size, + overlap=overlap, + track_time=track_time, + ).frames[0] + # convert the three-channel output to a single channel depth map + res = res.sum(-1) / res.shape[-1] + # normalize the depth map to [0, 1] across the whole video + depths = (res - res.min()) / (res.max() - res.min()) + # visualize the depth map and save the results + # vis = vis_sequence_depth(res) + # save the depth map and visualization with the target FPS + depths = torch.from_numpy(depths).unsqueeze(1) # 49 576 1024 -> + depths *= 3900 # compatible with da output + depths[depths < 1e-5] = 1e-5 + depths = 10000.0 / depths + depths = depths.clip(near, far) + + return depths diff --git a/inference/v2v_data/models/utils.py b/inference/v2v_data/models/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..807eb18d02f40c0350fd79b4b7ca22421a857628 --- /dev/null +++ b/inference/v2v_data/models/utils.py @@ -0,0 +1,622 @@ +import numpy as np +import cv2 +import PIL +from PIL import Image +import os +from datetime import datetime +import pdb +import torch.nn.functional as F +import numpy as np +import os +import cv2 +import copy +from scipy.interpolate import UnivariateSpline, interp1d +import numpy as np +import PIL.Image +import torch +import torchvision +from tqdm import tqdm +from pathlib import Path +from typing import Tuple, Optional +import cv2 +import PIL +import numpy +import skimage.io +import torch +import torch.nn.functional as F +from decord import VideoReader, cpu + +from PIL import Image + +def read_video_frames(video_path, process_length, stride, max_res, dataset="open"): + def is_image(path): + return any(path.lower().endswith(ext) for ext in ['.jpg', '.jpeg', '.png', '.bmp']) + + if is_image(video_path): + print("==> Detected image. Loading as single-frame video:", video_path) + img = Image.open(video_path).convert("RGB") + # FIXME: hard coded + width = 1024 + height = 576 + img = img.resize((width, height), Image.BICUBIC) + img = np.array(img).astype("float32") / 255.0 # [H, W, 3] + frames = img[None, ...] 
# [1, H, W, 3] + print(f"==> image shape: {frames.shape}") + return frames + + if dataset == "open": + print("==> processing video:", video_path) + vid = VideoReader(video_path, ctx=cpu(0)) + print("==> original video shape:", (len(vid), *vid.get_batch([0]).shape[1:])) + + # FIXME: hard coded + width = 1024 + height = 576 + + vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height) + + frames_idx = list(range(0, len(vid), stride)) + print(f"==> downsampled shape: {(len(frames_idx), *vid.get_batch([0]).shape[1:])}, with stride: {stride}") + if process_length != -1 and process_length < len(frames_idx): + frames_idx = frames_idx[:process_length] + print(f"==> final processing shape: {(len(frames_idx), *vid.get_batch([0]).shape[1:])}") + frames = vid.get_batch(frames_idx).asnumpy().astype("float32") / 255.0 # [T, H, W, 3] + + return frames + + + +def save_video(data, images_path, folder=None, fps=8): + if isinstance(data, np.ndarray): + tensor_data = (torch.from_numpy(data) * 255).to(torch.uint8) + elif isinstance(data, torch.Tensor): + tensor_data = (data.detach().cpu() * 255).to(torch.uint8) + elif isinstance(data, list): + folder = [folder] * len(data) + images = [ + np.array(Image.open(os.path.join(folder_name, path))) + for folder_name, path in zip(folder, data) + ] + stacked_images = np.stack(images, axis=0) + tensor_data = torch.from_numpy(stacked_images).to(torch.uint8) + torchvision.io.write_video( + images_path, tensor_data, fps=fps, video_codec='h264', options={'crf': '10'} + ) + + +def sphere2pose(c2ws_input, theta, phi, r, device, x=None, y=None): + c2ws = copy.deepcopy(c2ws_input) + # c2ws[:,2, 3] = c2ws[:,2, 3] - radius + + # translate along the world-frame z axis first, then rotate + c2ws[:, 2, 3] -= r + if x is not None: + c2ws[:, 1, 3] += y + if y is not None: + c2ws[:, 0, 3] -= x + + theta = torch.deg2rad(torch.tensor(theta)).to(device) + sin_value_x = torch.sin(theta) + cos_value_x = torch.cos(theta) + rot_mat_x = ( + torch.tensor( + [ + [1, 0, 0, 0], + [0, cos_value_x, -sin_value_x, 0], + [0, sin_value_x, cos_value_x, 0], + [0, 0, 0, 1], + ] + ) + .unsqueeze(0) + .repeat(c2ws.shape[0], 1, 1) + .to(device) + ) + + phi = torch.deg2rad(torch.tensor(phi)).to(device) + sin_value_y = torch.sin(phi) + cos_value_y = torch.cos(phi) + rot_mat_y = ( + torch.tensor( + [ + [cos_value_y, 0, sin_value_y, 0], + [0, 1, 0, 0], + [-sin_value_y, 0, cos_value_y, 0], + [0, 0, 0, 1], + ] + ) + .unsqueeze(0) + .repeat(c2ws.shape[0], 1, 1) + .to(device) + ) + + c2ws = torch.matmul(rot_mat_x, c2ws) + c2ws = torch.matmul(rot_mat_y, c2ws) + # c2ws[:,2, 3] = c2ws[:,2, 3] + radius + return c2ws + + +def generate_traj_specified(c2ws_anchor, theta, phi, d_r, d_x, d_y, frame, device): + # Initialize a camera.
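+ # Each pose parameter (theta, phi, the radius offset d_r and the x/y shifts) is linearly interpolated from 0 to its target value over frame steps via np.linspace + # each interpolated (theta, phi, r, x, y) tuple is passed through sphere2pose and the resulting camera-to-world matrices are concatenated into a (frame, 4, 4) tensor + # e.g. theta=30 with frame=49 advances theta by 30/48 = 0.625 degrees per frame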
+ thetas = np.linspace(0, theta, frame) + phis = np.linspace(0, phi, frame) + rs = np.linspace(0, d_r, frame) + xs = np.linspace(0, d_x, frame) + ys = np.linspace(0, d_y, frame) + c2ws_list = [] + for th, ph, r, x, y in zip(thetas, phis, rs, xs, ys): + c2w_new = sphere2pose( + c2ws_anchor, + np.float32(th), + np.float32(ph), + np.float32(r), + device, + np.float32(x), + np.float32(y), + ) + c2ws_list.append(c2w_new) + c2ws = torch.cat(c2ws_list, dim=0) + return c2ws + +def generate_traj_specified_fast(c2ws_anchor, theta, phi, d_r, d_x, d_y, frame, device): + half = frame // 3 + + thetas = np.linspace(0, theta, half) + phis = np.linspace(0, phi, half) + rs = np.linspace(0, d_r, half) + xs = np.linspace(0, d_x, half) + ys = np.linspace(0, d_y, half) + + c2ws_list = [] + + for th, ph, r, x, y in zip(thetas, phis, rs, xs, ys): + c2w_new = sphere2pose( + c2ws_anchor, + np.float32(th), + np.float32(ph), + np.float32(r), + device, + np.float32(x), + np.float32(y), + ) + c2ws_list.append(c2w_new) + + last_c2w = c2ws_list[-1] + for _ in range(frame - half): + c2ws_list.append(last_c2w.clone()) + + c2ws = torch.cat(c2ws_list, dim=0) + return c2ws + + +def txt_interpolation(input_list, n, mode='smooth'): + x = np.linspace(0, 1, len(input_list)) + if mode == 'smooth': + f = UnivariateSpline(x, input_list, k=3) + elif mode == 'linear': + f = interp1d(x, input_list) + else: + raise KeyError(f"Invalid txt interpolation mode: {mode}") + xnew = np.linspace(0, 1, n) + ynew = f(xnew) + return ynew + + +def generate_traj_txt(c2ws_anchor, phi, theta, r, frame, device): + # Initialize a camera. + """ + The camera coordinate system in COLMAP is right-down-forward; + PyTorch3D is left-up-forward + """ + + if len(phi) > 3: + phis = txt_interpolation(phi, frame, mode='smooth') + phis[0] = phi[0] + phis[-1] = phi[-1] + else: + phis = txt_interpolation(phi, frame, mode='linear') + + if len(theta) > 3: + thetas = txt_interpolation(theta, frame, mode='smooth') + thetas[0] = theta[0] + thetas[-1] = theta[-1] + else: + thetas = txt_interpolation(theta, frame, mode='linear') + + if len(r) > 3: + rs = txt_interpolation(r, frame, mode='smooth') + rs[0] = r[0] + rs[-1] = r[-1] + else: + rs = txt_interpolation(r, frame, mode='linear') + # rs = rs*c2ws_anchor[0,2,3].cpu().numpy() + + c2ws_list = [] + for th, ph, r in zip(thetas, phis, rs): + c2w_new = sphere2pose( + c2ws_anchor, np.float32(th), np.float32(ph), np.float32(r), device + ) + c2ws_list.append(c2w_new) + c2ws = torch.cat(c2ws_list, dim=0) + return c2ws + + +class Warper: + def __init__(self, resolution: tuple = None, device: str = 'gpu0'): + self.resolution = resolution + self.device = self.get_device(device) + self.dtype = torch.float32 + return + + def forward_warp( + self, + frame1: torch.Tensor, + mask1: Optional[torch.Tensor], + depth1: torch.Tensor, + transformation1: torch.Tensor, + transformation2: torch.Tensor, + intrinsic1: torch.Tensor, + intrinsic2: Optional[torch.Tensor], + mask=False, + twice=False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Given a frame1 and global transformations transformation1 and transformation2, warps frame1 to the next view using + bilinear splatting. + All arrays should be torch tensors with batch dimension and channel first. + :param frame1: (b, 3, h, w). If frame1 is not in the range [-1, 1], either set is_image=False when calling + bilinear_splatting on frame within this function, or modify clipping in bilinear_splatting() + method accordingly.
+ :param mask1: (b, 1, h, w) - 1 for known, 0 for unknown. Optional + :param depth1: (b, 1, h, w) + :param transformation1: (b, 4, 4) extrinsic transformation matrix of first view: [R, t; 0, 1] + :param transformation2: (b, 4, 4) extrinsic transformation matrix of second view: [R, t; 0, 1] + :param intrinsic1: (b, 3, 3) camera intrinsic matrix + :param intrinsic2: (b, 3, 3) camera intrinsic matrix. Optional + """ + if self.resolution is not None: + assert frame1.shape[2:4] == self.resolution + b, c, h, w = frame1.shape + if mask1 is None: + mask1 = torch.ones(size=(b, 1, h, w)).to(frame1) + if intrinsic2 is None: + intrinsic2 = intrinsic1.clone() + + assert frame1.shape == (b, 3, h, w) + assert mask1.shape == (b, 1, h, w) + assert depth1.shape == (b, 1, h, w) + assert transformation1.shape == (b, 4, 4) + assert transformation2.shape == (b, 4, 4) + assert intrinsic1.shape == (b, 3, 3) + assert intrinsic2.shape == (b, 3, 3) + + frame1 = frame1.to(self.device).to(self.dtype) + mask1 = mask1.to(self.device).to(self.dtype) + depth1 = depth1.to(self.device).to(self.dtype) + transformation1 = transformation1.to(self.device).to(self.dtype) + transformation2 = transformation2.to(self.device).to(self.dtype) + intrinsic1 = intrinsic1.to(self.device).to(self.dtype) + intrinsic2 = intrinsic2.to(self.device).to(self.dtype) + + trans_points1 = self.compute_transformed_points( + depth1, transformation1, transformation2, intrinsic1, intrinsic2 + ) + trans_coordinates = ( + trans_points1[:, :, :, :2, 0] / trans_points1[:, :, :, 2:3, 0] + ) + trans_depth1 = trans_points1[:, :, :, 2, 0] + grid = self.create_grid(b, h, w).to(trans_coordinates) + flow12 = trans_coordinates.permute(0, 3, 1, 2) - grid + if not twice: + warped_frame2, mask2 = self.bilinear_splatting( + frame1, mask1, trans_depth1, flow12, None, is_image=True + ) + if mask: + warped_frame2, mask2 = self.clean_points(warped_frame2, mask2) + return warped_frame2, mask2, None, flow12 + + else: + warped_frame2, mask2 = self.bilinear_splatting( + frame1, mask1, trans_depth1, flow12, None, is_image=True + ) + # warped_frame2, mask2 = self.clean_points(warped_frame2, mask2) + warped_flow, _ = self.bilinear_splatting( + flow12, mask1, trans_depth1, flow12, None, is_image=False + ) + twice_warped_frame1, _ = self.bilinear_splatting( + warped_frame2, + mask2, + depth1.squeeze(1), + -warped_flow, + None, + is_image=True, + ) + return twice_warped_frame1, warped_frame2, None, None + + def compute_transformed_points( + self, + depth1: torch.Tensor, + transformation1: torch.Tensor, + transformation2: torch.Tensor, + intrinsic1: torch.Tensor, + intrinsic2: Optional[torch.Tensor], + ): + """ + Computes transformed position for each pixel location + """ + if self.resolution is not None: + assert depth1.shape[2:4] == self.resolution + b, _, h, w = depth1.shape + if intrinsic2 is None: + intrinsic2 = intrinsic1.clone() + transformation = torch.bmm( + transformation2, torch.linalg.inv(transformation1) + ) # (b, 4, 4) + + x1d = torch.arange(0, w)[None] + y1d = torch.arange(0, h)[:, None] + x2d = x1d.repeat([h, 1]).to(depth1) # (h, w) + y2d = y1d.repeat([1, w]).to(depth1) # (h, w) + ones_2d = torch.ones(size=(h, w)).to(depth1) # (h, w) + ones_4d = ones_2d[None, :, :, None, None].repeat( + [b, 1, 1, 1, 1] + ) # (b, h, w, 1, 1) + pos_vectors_homo = torch.stack([x2d, y2d, ones_2d], dim=2)[ + None, :, :, :, None + ] # (1, h, w, 3, 1) + + intrinsic1_inv = torch.linalg.inv(intrinsic1) # (b, 3, 3) + intrinsic1_inv_4d = intrinsic1_inv[:, None, None] # (b, 1, 1, 3, 3) + 
intrinsic2_4d = intrinsic2[:, None, None] # (b, 1, 1, 3, 3) + depth_4d = depth1[:, 0][:, :, :, None, None] # (b, h, w, 1, 1) + trans_4d = transformation[:, None, None] # (b, 1, 1, 4, 4) + + unnormalized_pos = torch.matmul( + intrinsic1_inv_4d, pos_vectors_homo + ) # (b, h, w, 3, 1) + world_points = depth_4d * unnormalized_pos # (b, h, w, 3, 1) + world_points_homo = torch.cat([world_points, ones_4d], dim=3) # (b, h, w, 4, 1) + trans_world_homo = torch.matmul(trans_4d, world_points_homo) # (b, h, w, 4, 1) + trans_world = trans_world_homo[:, :, :, :3] # (b, h, w, 3, 1) + trans_norm_points = torch.matmul(intrinsic2_4d, trans_world) # (b, h, w, 3, 1) + return trans_norm_points + + def bilinear_splatting( + self, + frame1: torch.Tensor, + mask1: Optional[torch.Tensor], + depth1: torch.Tensor, + flow12: torch.Tensor, + flow12_mask: Optional[torch.Tensor], + is_image: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Bilinear splatting + :param frame1: (b,c,h,w) + :param mask1: (b,1,h,w): 1 for known, 0 for unknown. Optional + :param depth1: (b,1,h,w) + :param flow12: (b,2,h,w) + :param flow12_mask: (b,1,h,w): 1 for valid flow, 0 for invalid flow. Optional + :param is_image: if true, output will be clipped to (-1,1) range + :return: warped_frame2: (b,c,h,w) + mask2: (b,1,h,w): 1 for known and 0 for unknown + """ + if self.resolution is not None: + assert frame1.shape[2:4] == self.resolution + b, c, h, w = frame1.shape + if mask1 is None: + mask1 = torch.ones(size=(b, 1, h, w)).to(frame1) + if flow12_mask is None: + flow12_mask = torch.ones(size=(b, 1, h, w)).to(flow12) + grid = self.create_grid(b, h, w).to(frame1) + trans_pos = flow12 + grid + + trans_pos_offset = trans_pos + 1 + trans_pos_floor = torch.floor(trans_pos_offset).long() + trans_pos_ceil = torch.ceil(trans_pos_offset).long() + trans_pos_offset = torch.stack( + [ + torch.clamp(trans_pos_offset[:, 0], min=0, max=w + 1), + torch.clamp(trans_pos_offset[:, 1], min=0, max=h + 1), + ], + dim=1, + ) + trans_pos_floor = torch.stack( + [ + torch.clamp(trans_pos_floor[:, 0], min=0, max=w + 1), + torch.clamp(trans_pos_floor[:, 1], min=0, max=h + 1), + ], + dim=1, + ) + trans_pos_ceil = torch.stack( + [ + torch.clamp(trans_pos_ceil[:, 0], min=0, max=w + 1), + torch.clamp(trans_pos_ceil[:, 1], min=0, max=h + 1), + ], + dim=1, + ) + + prox_weight_nw = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * ( + 1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1]) + ) + prox_weight_sw = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * ( + 1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1]) + ) + prox_weight_ne = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * ( + 1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1]) + ) + prox_weight_se = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * ( + 1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1]) + ) + + sat_depth1 = torch.clamp(depth1, min=0, max=1000) + log_depth1 = torch.log(1 + sat_depth1) + depth_weights = torch.exp(log_depth1 / log_depth1.max() * 50) + + weight_nw = torch.moveaxis( + prox_weight_nw * mask1 * flow12_mask / depth_weights.unsqueeze(1), + [0, 1, 2, 3], + [0, 3, 1, 2], + ) + weight_sw = torch.moveaxis( + prox_weight_sw * mask1 * flow12_mask / depth_weights.unsqueeze(1), + [0, 1, 2, 3], + [0, 3, 1, 2], + ) + weight_ne = torch.moveaxis( + prox_weight_ne * mask1 * flow12_mask / depth_weights.unsqueeze(1), + [0, 1, 2, 3], + [0, 3, 1, 2], + ) + weight_se = torch.moveaxis( + prox_weight_se * mask1 * flow12_mask / 
depth_weights.unsqueeze(1), + [0, 1, 2, 3], + [0, 3, 1, 2], + ) + + warped_frame = torch.zeros(size=(b, h + 2, w + 2, c), dtype=torch.float32).to( + frame1 + ) + warped_weights = torch.zeros(size=(b, h + 2, w + 2, 1), dtype=torch.float32).to( + frame1 + ) + + frame1_cl = torch.moveaxis(frame1, [0, 1, 2, 3], [0, 3, 1, 2]) + batch_indices = torch.arange(b)[:, None, None].to(frame1.device) + warped_frame.index_put_( + (batch_indices, trans_pos_floor[:, 1], trans_pos_floor[:, 0]), + frame1_cl * weight_nw, + accumulate=True, + ) + warped_frame.index_put_( + (batch_indices, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]), + frame1_cl * weight_sw, + accumulate=True, + ) + warped_frame.index_put_( + (batch_indices, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]), + frame1_cl * weight_ne, + accumulate=True, + ) + warped_frame.index_put_( + (batch_indices, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]), + frame1_cl * weight_se, + accumulate=True, + ) + + warped_weights.index_put_( + (batch_indices, trans_pos_floor[:, 1], trans_pos_floor[:, 0]), + weight_nw, + accumulate=True, + ) + warped_weights.index_put_( + (batch_indices, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]), + weight_sw, + accumulate=True, + ) + warped_weights.index_put_( + (batch_indices, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]), + weight_ne, + accumulate=True, + ) + warped_weights.index_put_( + (batch_indices, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]), + weight_se, + accumulate=True, + ) + + warped_frame_cf = torch.moveaxis(warped_frame, [0, 1, 2, 3], [0, 2, 3, 1]) + warped_weights_cf = torch.moveaxis(warped_weights, [0, 1, 2, 3], [0, 2, 3, 1]) + cropped_warped_frame = warped_frame_cf[:, :, 1:-1, 1:-1] + cropped_weights = warped_weights_cf[:, :, 1:-1, 1:-1] + + mask = cropped_weights > 0 + zero_value = -1 if is_image else 0 + zero_tensor = torch.tensor(zero_value, dtype=frame1.dtype, device=frame1.device) + warped_frame2 = torch.where( + mask, cropped_warped_frame / cropped_weights, zero_tensor + ) + mask2 = mask.to(frame1) + + if is_image: + assert warped_frame2.min() >= -1.1 # Allow for rounding errors + assert warped_frame2.max() <= 1.1 + warped_frame2 = torch.clamp(warped_frame2, min=-1, max=1) + return warped_frame2, mask2 + + def clean_points(self, warped_frame2, mask2): + warped_frame2 = (warped_frame2 + 1.0) / 2.0 + mask = 1 - mask2 + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = mask.squeeze(0).repeat(3, 1, 1).permute(1, 2, 0) * 255.0 + mask = mask.cpu().numpy() + kernel = numpy.ones((5, 5), numpy.uint8) + mask_erosion = cv2.dilate(numpy.array(mask), kernel, iterations=1) + mask_erosion = PIL.Image.fromarray(numpy.uint8(mask_erosion)) + mask_erosion_ = numpy.array(mask_erosion) / 255.0 + mask_erosion_[mask_erosion_ < 0.5] = 0 + mask_erosion_[mask_erosion_ >= 0.5] = 1 + mask_new = ( + torch.from_numpy(mask_erosion_) + .permute(2, 0, 1) + .unsqueeze(0) + .to(self.device) + ) + warped_frame2 = warped_frame2 * (1 - mask_new) + return warped_frame2 * 2.0 - 1.0, 1 - mask_new[:, 0:1, :, :] + + @staticmethod + def create_grid(b, h, w): + x_1d = torch.arange(0, w)[None] + y_1d = torch.arange(0, h)[:, None] + x_2d = x_1d.repeat([h, 1]) + y_2d = y_1d.repeat([1, w]) + grid = torch.stack([x_2d, y_2d], dim=0) + batch_grid = grid[None].repeat([b, 1, 1, 1]) + return batch_grid + + @staticmethod + def read_image(path: Path) -> torch.Tensor: + image = skimage.io.imread(path.as_posix()) + return image + + @staticmethod + def read_depth(path: Path) -> torch.Tensor: + if path.suffix == '.png': + depth = skimage.io.imread(path.as_posix()) + 
elif path.suffix == '.npy': + depth = numpy.load(path.as_posix()) + elif path.suffix == '.npz': + with numpy.load(path.as_posix()) as depth_data: + depth = depth_data['depth'] + else: + raise RuntimeError(f'Unknown depth format: {path.suffix}') + return depth + + @staticmethod + def camera_intrinsic_transform( + capture_width=1920, capture_height=1080, patch_start_point: tuple = (0, 0) + ): + start_y, start_x = patch_start_point + camera_intrinsics = numpy.eye(4) + camera_intrinsics[0, 0] = 2100 + camera_intrinsics[0, 2] = capture_width / 2.0 - start_x + camera_intrinsics[1, 1] = 2100 + camera_intrinsics[1, 2] = capture_height / 2.0 - start_y + return camera_intrinsics + + @staticmethod + def get_device(device: str): + """ + Returns torch device object + :param device: cpu/gpu0/gpu1 + :return: + """ + if device == 'cpu': + device = torch.device('cpu') + elif device.startswith('gpu') and torch.cuda.is_available(): + gpu_num = int(device[3:]) + device = torch.device(f'cuda:{gpu_num}') + else: + device = torch.device('cpu') + return device diff --git a/inference/v2v_data/test/trajs/loop1.txt b/inference/v2v_data/test/trajs/loop1.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9bb29374e74be0149175c464f69bbe5a066f1ea --- /dev/null +++ b/inference/v2v_data/test/trajs/loop1.txt @@ -0,0 +1,3 @@ +0 2 10 15 12 6 0 -2 -5 -12 -8 -3 0 +0 -3 -10 -20 -30 -25 -17 -10 0 +0 0.02 0.09 0.16 0.25 0.2 0.09 0 \ No newline at end of file diff --git a/inference/v2v_data/test/trajs/loop2.txt b/inference/v2v_data/test/trajs/loop2.txt new file mode 100644 index 0000000000000000000000000000000000000000..57a50b4db393d851aada3a1bfee872a61f1b0b8f --- /dev/null +++ b/inference/v2v_data/test/trajs/loop2.txt @@ -0,0 +1,3 @@ +0 2 10 15 12 6 0 -2 -5 -12 -8 -3 0 +0 3 10 20 30 25 17 10 0 +0 0.02 0.09 0.16 0.25 0.28 0.19 0.09 0 \ No newline at end of file diff --git a/inference/v2v_data/test/videos/0-NNvgaTcVzAG0-r.mp4 b/inference/v2v_data/test/videos/0-NNvgaTcVzAG0-r.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..5c0f3681886326cd8cf86617f1d7ec6d8f5123ac --- /dev/null +++ b/inference/v2v_data/test/videos/0-NNvgaTcVzAG0-r.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26d5168e468959d0e4d0f30d47f322209de20c988c1d5315426bc4a9c9ee623a +size 654822 diff --git a/inference/v2v_data/test/videos/p7.mp4 b/inference/v2v_data/test/videos/p7.mp4 new file mode 100644 index 0000000000000000000000000000000000000000..3fbc188564781c3b1adcd857d3a001e588c517e8 --- /dev/null +++ b/inference/v2v_data/test/videos/p7.mp4 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2db72296fb9c7c3b6f7dd832db7722069855f0d7d921e397d51b7e26631b8af +size 1326612 diff --git a/out/EPiC_pretrained/checkpoint-500.pt b/out/EPiC_pretrained/checkpoint-500.pt new file mode 100644 index 0000000000000000000000000000000000000000..d3a2701f8b042a469237bea33198de09224846a4 --- /dev/null +++ b/out/EPiC_pretrained/checkpoint-500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1a096dee69e5bd627b9d903bf935b016b9999fabf992e3580589c6c38145a2 +size 53523334 diff --git a/preprocess/RAFT/LICENSE b/preprocess/RAFT/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..ed13d8404f0f1315ee323b2c8d1b2d8f77b5c82f --- /dev/null +++ b/preprocess/RAFT/LICENSE @@ -0,0 +1,29 @@ +BSD 3-Clause License + +Copyright (c) 2020, princeton-vl +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/preprocess/RAFT/RAFT.png b/preprocess/RAFT/RAFT.png new file mode 100644 index 0000000000000000000000000000000000000000..176b48c0e7d51e284d86771ae11c1c6afaddb4b1 --- /dev/null +++ b/preprocess/RAFT/RAFT.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f9fe7730c2289d694d93627b60c272f94ded023ee04a201bd4803dc1028dd09 +size 204077 diff --git a/preprocess/RAFT/alt_cuda_corr/correlation.cpp b/preprocess/RAFT/alt_cuda_corr/correlation.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b01584d19edb99e7feec5f2e4c51169a1ed208db --- /dev/null +++ b/preprocess/RAFT/alt_cuda_corr/correlation.cpp @@ -0,0 +1,54 @@ +#include +#include + +// CUDA forward declarations +std::vector corr_cuda_forward( + torch::Tensor fmap1, + torch::Tensor fmap2, + torch::Tensor coords, + int radius); + +std::vector corr_cuda_backward( + torch::Tensor fmap1, + torch::Tensor fmap2, + torch::Tensor coords, + torch::Tensor corr_grad, + int radius); + +// C++ interface +#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector corr_forward( + torch::Tensor fmap1, + torch::Tensor fmap2, + torch::Tensor coords, + int radius) { + CHECK_INPUT(fmap1); + CHECK_INPUT(fmap2); + CHECK_INPUT(coords); + + return corr_cuda_forward(fmap1, fmap2, coords, radius); +} + + +std::vector corr_backward( + torch::Tensor fmap1, + torch::Tensor fmap2, + torch::Tensor coords, + torch::Tensor corr_grad, + int radius) { + CHECK_INPUT(fmap1); + CHECK_INPUT(fmap2); + CHECK_INPUT(coords); + CHECK_INPUT(corr_grad); + + return corr_cuda_backward(fmap1, fmap2, coords, corr_grad, radius); +} + + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("forward", &corr_forward, "CORR forward"); + m.def("backward", &corr_backward, "CORR backward"); +} \ No newline at end of file diff --git a/preprocess/RAFT/alt_cuda_corr/correlation_kernel.cu b/preprocess/RAFT/alt_cuda_corr/correlation_kernel.cu 
new file mode 100644 index 0000000000000000000000000000000000000000..145e5804a16ece51b8ff5f1cb61ae8dab4fc3bb7 --- /dev/null +++ b/preprocess/RAFT/alt_cuda_corr/correlation_kernel.cu @@ -0,0 +1,324 @@ +#include +#include +#include +#include + + +#define BLOCK_H 4 +#define BLOCK_W 8 +#define BLOCK_HW BLOCK_H * BLOCK_W +#define CHANNEL_STRIDE 32 + + +__forceinline__ __device__ +bool within_bounds(int h, int w, int H, int W) { + return h >= 0 && h < H && w >= 0 && w < W; +} + +template +__global__ void corr_forward_kernel( + const torch::PackedTensorAccessor32 fmap1, + const torch::PackedTensorAccessor32 fmap2, + const torch::PackedTensorAccessor32 coords, + torch::PackedTensorAccessor32 corr, + int r) +{ + const int b = blockIdx.x; + const int h0 = blockIdx.y * blockDim.x; + const int w0 = blockIdx.z * blockDim.y; + const int tid = threadIdx.x * blockDim.y + threadIdx.y; + + const int H1 = fmap1.size(1); + const int W1 = fmap1.size(2); + const int H2 = fmap2.size(1); + const int W2 = fmap2.size(2); + const int N = coords.size(1); + const int C = fmap1.size(3); + + __shared__ scalar_t f1[CHANNEL_STRIDE][BLOCK_HW+1]; + __shared__ scalar_t f2[CHANNEL_STRIDE][BLOCK_HW+1]; + __shared__ scalar_t x2s[BLOCK_HW]; + __shared__ scalar_t y2s[BLOCK_HW]; + + for (int c=0; c(floor(y2s[k1]))-r+iy; + int w2 = static_cast(floor(x2s[k1]))-r+ix; + int c2 = tid % CHANNEL_STRIDE; + + auto fptr = fmap2[b][h2][w2]; + if (within_bounds(h2, w2, H2, W2)) + f2[c2][k1] = fptr[c+c2]; + else + f2[c2][k1] = 0.0; + } + + __syncthreads(); + + scalar_t s = 0.0; + for (int k=0; k 0 && ix > 0 && within_bounds(h1, w1, H1, W1)) + *(corr_ptr + ix_nw) += nw; + + if (iy > 0 && ix < rd && within_bounds(h1, w1, H1, W1)) + *(corr_ptr + ix_ne) += ne; + + if (iy < rd && ix > 0 && within_bounds(h1, w1, H1, W1)) + *(corr_ptr + ix_sw) += sw; + + if (iy < rd && ix < rd && within_bounds(h1, w1, H1, W1)) + *(corr_ptr + ix_se) += se; + } + } + } + } +} + + +template +__global__ void corr_backward_kernel( + const torch::PackedTensorAccessor32 fmap1, + const torch::PackedTensorAccessor32 fmap2, + const torch::PackedTensorAccessor32 coords, + const torch::PackedTensorAccessor32 corr_grad, + torch::PackedTensorAccessor32 fmap1_grad, + torch::PackedTensorAccessor32 fmap2_grad, + torch::PackedTensorAccessor32 coords_grad, + int r) +{ + + const int b = blockIdx.x; + const int h0 = blockIdx.y * blockDim.x; + const int w0 = blockIdx.z * blockDim.y; + const int tid = threadIdx.x * blockDim.y + threadIdx.y; + + const int H1 = fmap1.size(1); + const int W1 = fmap1.size(2); + const int H2 = fmap2.size(1); + const int W2 = fmap2.size(2); + const int N = coords.size(1); + const int C = fmap1.size(3); + + __shared__ scalar_t f1[CHANNEL_STRIDE][BLOCK_HW+1]; + __shared__ scalar_t f2[CHANNEL_STRIDE][BLOCK_HW+1]; + + __shared__ scalar_t f1_grad[CHANNEL_STRIDE][BLOCK_HW+1]; + __shared__ scalar_t f2_grad[CHANNEL_STRIDE][BLOCK_HW+1]; + + __shared__ scalar_t x2s[BLOCK_HW]; + __shared__ scalar_t y2s[BLOCK_HW]; + + for (int c=0; c(floor(y2s[k1]))-r+iy; + int w2 = static_cast(floor(x2s[k1]))-r+ix; + int c2 = tid % CHANNEL_STRIDE; + + auto fptr = fmap2[b][h2][w2]; + if (within_bounds(h2, w2, H2, W2)) + f2[c2][k1] = fptr[c+c2]; + else + f2[c2][k1] = 0.0; + + f2_grad[c2][k1] = 0.0; + } + + __syncthreads(); + + const scalar_t* grad_ptr = &corr_grad[b][n][0][h1][w1]; + scalar_t g = 0.0; + + int ix_nw = H1*W1*((iy-1) + rd*(ix-1)); + int ix_ne = H1*W1*((iy-1) + rd*ix); + int ix_sw = H1*W1*(iy + rd*(ix-1)); + int ix_se = H1*W1*(iy + rd*ix); + + if (iy > 0 && ix > 0 && 
within_bounds(h1, w1, H1, W1)) + g += *(grad_ptr + ix_nw) * dy * dx; + + if (iy > 0 && ix < rd && within_bounds(h1, w1, H1, W1)) + g += *(grad_ptr + ix_ne) * dy * (1-dx); + + if (iy < rd && ix > 0 && within_bounds(h1, w1, H1, W1)) + g += *(grad_ptr + ix_sw) * (1-dy) * dx; + + if (iy < rd && ix < rd && within_bounds(h1, w1, H1, W1)) + g += *(grad_ptr + ix_se) * (1-dy) * (1-dx); + + for (int k=0; k(floor(y2s[k1]))-r+iy; + int w2 = static_cast(floor(x2s[k1]))-r+ix; + int c2 = tid % CHANNEL_STRIDE; + + scalar_t* fptr = &fmap2_grad[b][h2][w2][0]; + if (within_bounds(h2, w2, H2, W2)) + atomicAdd(fptr+c+c2, f2_grad[c2][k1]); + } + } + } + } + __syncthreads(); + + + for (int k=0; k corr_cuda_forward( + torch::Tensor fmap1, + torch::Tensor fmap2, + torch::Tensor coords, + int radius) +{ + const auto B = coords.size(0); + const auto N = coords.size(1); + const auto H = coords.size(2); + const auto W = coords.size(3); + + const auto rd = 2 * radius + 1; + auto opts = fmap1.options(); + auto corr = torch::zeros({B, N, rd*rd, H, W}, opts); + + const dim3 blocks(B, (H+BLOCK_H-1)/BLOCK_H, (W+BLOCK_W-1)/BLOCK_W); + const dim3 threads(BLOCK_H, BLOCK_W); + + corr_forward_kernel<<>>( + fmap1.packed_accessor32(), + fmap2.packed_accessor32(), + coords.packed_accessor32(), + corr.packed_accessor32(), + radius); + + return {corr}; +} + +std::vector corr_cuda_backward( + torch::Tensor fmap1, + torch::Tensor fmap2, + torch::Tensor coords, + torch::Tensor corr_grad, + int radius) +{ + const auto B = coords.size(0); + const auto N = coords.size(1); + + const auto H1 = fmap1.size(1); + const auto W1 = fmap1.size(2); + const auto H2 = fmap2.size(1); + const auto W2 = fmap2.size(2); + const auto C = fmap1.size(3); + + auto opts = fmap1.options(); + auto fmap1_grad = torch::zeros({B, H1, W1, C}, opts); + auto fmap2_grad = torch::zeros({B, H2, W2, C}, opts); + auto coords_grad = torch::zeros({B, N, H1, W1, 2}, opts); + + const dim3 blocks(B, (H1+BLOCK_H-1)/BLOCK_H, (W1+BLOCK_W-1)/BLOCK_W); + const dim3 threads(BLOCK_H, BLOCK_W); + + + corr_backward_kernel<<>>( + fmap1.packed_accessor32(), + fmap2.packed_accessor32(), + coords.packed_accessor32(), + corr_grad.packed_accessor32(), + fmap1_grad.packed_accessor32(), + fmap2_grad.packed_accessor32(), + coords_grad.packed_accessor32(), + radius); + + return {fmap1_grad, fmap2_grad, coords_grad}; +} \ No newline at end of file diff --git a/preprocess/RAFT/alt_cuda_corr/setup.py b/preprocess/RAFT/alt_cuda_corr/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..c0207ff285ffac4c8146c79d154f12416dbef48c --- /dev/null +++ b/preprocess/RAFT/alt_cuda_corr/setup.py @@ -0,0 +1,15 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + + +setup( + name='correlation', + ext_modules=[ + CUDAExtension('alt_cuda_corr', + sources=['correlation.cpp', 'correlation_kernel.cu'], + extra_compile_args={'cxx': [], 'nvcc': ['-O3']}), + ], + cmdclass={ + 'build_ext': BuildExtension + }) + diff --git a/preprocess/RAFT/chairs_split.txt b/preprocess/RAFT/chairs_split.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ae8f0b72a22fc061552604c94664e3a0287914e --- /dev/null +++ b/preprocess/RAFT/chairs_split.txt @@ -0,0 +1,22872 @@ +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 
+1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +2 +1 +1 +2 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +2 +1 +2 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +2 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +2 +1 +1 +2 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 
+1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 
+1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +2 +1 +1 +2 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +2 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +2 +1 +1 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +2 +1 +1 +2 +1 +1 +2 +1 +1 +1 +1 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +2 +1 +2 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 
[diff hunk truncated for readability: the rest of this hunk is a long run of added lines, each containing only "1" or "2"]
\ No newline at end of file
diff --git a/preprocess/RAFT/core/__init__.py b/preprocess/RAFT/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/preprocess/RAFT/core/corr.py b/preprocess/RAFT/core/corr.py
new file mode 100644
index 0000000000000000000000000000000000000000..cffcbc82af35cd00ac7537beec8e61e683ec68fe
--- /dev/null
+++ b/preprocess/RAFT/core/corr.py
@@ -0,0 +1,91 @@
+import torch
+import torch.nn.functional as F
+from utils.utils import bilinear_sampler, coords_grid
+
+try:
+    import alt_cuda_corr
+except:
+    # alt_cuda_corr is not compiled
+    pass
+
+
+class CorrBlock:
+    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
+        self.num_levels = num_levels
+        self.radius = radius
+        self.corr_pyramid = []
+
+        # all pairs correlation
+        corr = CorrBlock.corr(fmap1, fmap2)
+
+        batch, h1, w1, dim, h2, w2 = corr.shape
+        corr = corr.reshape(batch*h1*w1, dim, h2, w2)
+
+        self.corr_pyramid.append(corr)
+        for i in range(self.num_levels-1):
+            corr = F.avg_pool2d(corr, 2, stride=2)
+            self.corr_pyramid.append(corr)
+
+    def __call__(self, coords):
+        r = self.radius
+        coords = coords.permute(0, 2, 3, 1)
+        batch, h1, w1, _ = coords.shape
+
+        out_pyramid = []
+        for i in range(self.num_levels):
+            corr = self.corr_pyramid[i]
+            dx = torch.linspace(-r, r, 2*r+1, device=coords.device)
+            dy = torch.linspace(-r, r, 2*r+1, device=coords.device)
+            delta = torch.stack(torch.meshgrid(dy, dx), axis=-1)
+
+            centroid_lvl = coords.reshape(batch*h1*w1, 1, 1, 2) / 2**i
+            delta_lvl = delta.view(1, 2*r+1, 2*r+1, 2)
+            coords_lvl = centroid_lvl + delta_lvl
+
+            corr = bilinear_sampler(corr, coords_lvl)
+            corr = corr.view(batch, h1, w1, -1)
+            out_pyramid.append(corr)
+
+        out = torch.cat(out_pyramid, dim=-1)
+        return out.permute(0, 3, 1, 2).contiguous().float()
+
+    @staticmethod
+    def corr(fmap1, fmap2):
+        batch, dim, ht, wd = fmap1.shape
+        fmap1 = fmap1.view(batch, dim, ht*wd)
+        fmap2 = fmap2.view(batch, dim, ht*wd)
+
+        corr = torch.matmul(fmap1.transpose(1,2), fmap2)
+        corr = corr.view(batch, ht, wd, 1, ht, wd)
+        return corr / torch.sqrt(torch.tensor(dim).float())
+
+
+class AlternateCorrBlock:
+    def __init__(self, fmap1, fmap2, num_levels=4, radius=4):
+        self.num_levels = num_levels
+        self.radius = radius
+
+        self.pyramid = [(fmap1, fmap2)]
+        for i in range(self.num_levels):
+            fmap1 = F.avg_pool2d(fmap1, 2, stride=2)
+            fmap2 = F.avg_pool2d(fmap2, 2, stride=2)
+            self.pyramid.append((fmap1, fmap2))
+
+    def __call__(self, coords):
+        coords = coords.permute(0, 2, 3, 1)
+        B, H, W, _ = coords.shape
+        dim = self.pyramid[0][0].shape[1]
+
+        corr_list = []
+        for i in range(self.num_levels):
+            r = self.radius
+            fmap1_i = self.pyramid[0][0].permute(0, 2, 3, 1).contiguous()
+            fmap2_i = self.pyramid[i][1].permute(0, 2, 3, 1).contiguous()
+
+            coords_i = (coords / 2**i).reshape(B, 1, H, W, 2).contiguous()
+            corr, = alt_cuda_corr.forward(fmap1_i, fmap2_i, coords_i, r)
+            corr_list.append(corr.squeeze(1))
+
+        corr = torch.stack(corr_list, dim=1)
+        corr = corr.reshape(B, -1, H, W)
+        return corr / torch.sqrt(torch.tensor(dim).float())
diff --git a/preprocess/RAFT/core/datasets.py b/preprocess/RAFT/core/datasets.py
new file mode 100644
index 0000000000000000000000000000000000000000..3411fdacfb900024005e8997d07c600e963a95ca
--- /dev/null
+++ b/preprocess/RAFT/core/datasets.py
@@ -0,0 +1,235 @@
+# Data loading based on https://github.com/NVIDIA/flownet2-pytorch
+
+import numpy as np
+import torch
+import torch.utils.data as data
+import torch.nn.functional as F
+
+import os
+import math
+import random
+from glob import glob
+import os.path as osp
+
+from utils import frame_utils
+from utils.augmentor import FlowAugmentor, SparseFlowAugmentor
+
+
+class FlowDataset(data.Dataset):
+    def __init__(self, aug_params=None, sparse=False):
+        self.augmentor = None
+        self.sparse = sparse
+        if aug_params is not None:
+            if sparse:
+                self.augmentor = SparseFlowAugmentor(**aug_params)
+            else:
+                self.augmentor = FlowAugmentor(**aug_params)
+
+        self.is_test = False
+        self.init_seed = False
+        self.flow_list = []
+        self.image_list = []
+        self.extra_info = []
+
+    def __getitem__(self, index):
+
+        if self.is_test:
+            img1 = frame_utils.read_gen(self.image_list[index][0])
+            img2 = frame_utils.read_gen(self.image_list[index][1])
+            img1 = np.array(img1).astype(np.uint8)[..., :3]
+            img2 = np.array(img2).astype(np.uint8)[..., :3]
+            img1 = torch.from_numpy(img1).permute(2, 0, 1).float()
+            img2 = torch.from_numpy(img2).permute(2, 0, 1).float()
+            return img1, img2, self.extra_info[index]
+
+        if not self.init_seed:
+            worker_info = torch.utils.data.get_worker_info()
+            if worker_info is not None:
+                torch.manual_seed(worker_info.id)
+                np.random.seed(worker_info.id)
+                random.seed(worker_info.id)
+                self.init_seed = True
+
+        index = index % len(self.image_list)
+        valid = None
+        if self.sparse:
+            flow, valid = frame_utils.readFlowKITTI(self.flow_list[index])
+        else:
+            flow = frame_utils.read_gen(self.flow_list[index])
+
+        img1 = frame_utils.read_gen(self.image_list[index][0])
+        img2 = frame_utils.read_gen(self.image_list[index][1])
+
+        flow = np.array(flow).astype(np.float32)
+        img1 = np.array(img1).astype(np.uint8)
+        img2 = np.array(img2).astype(np.uint8)
+
+        # grayscale images
+        if len(img1.shape) == 2:
+            img1 = np.tile(img1[...,None], (1, 1, 3))
+            img2 = np.tile(img2[...,None], (1, 1, 3))
+        else:
+            img1 = img1[..., :3]
+            img2 = img2[..., :3]
+
+        if self.augmentor is not None:
+            if self.sparse:
+                img1, img2, flow, valid = self.augmentor(img1, img2, flow, valid)
+            else:
+                img1, img2, flow = self.augmentor(img1, img2, flow)
+
+        img1 = torch.from_numpy(img1).permute(2, 0, 1).float()
+        img2 = torch.from_numpy(img2).permute(2, 0, 1).float()
+        flow = torch.from_numpy(flow).permute(2, 0, 1).float()
+
+        if valid is not None:
+            valid = torch.from_numpy(valid)
+        else:
+            valid = (flow[0].abs() < 1000) & (flow[1].abs() < 1000)
+
+        return img1, img2, flow, valid.float()
+
+
+    def __rmul__(self, v):
+        self.flow_list = v * self.flow_list
+        self.image_list = v * self.image_list
+        return self
+
+    def __len__(self):
+        return len(self.image_list)
+
+
+class MpiSintel(FlowDataset):
+    def __init__(self, aug_params=None, split='training', root='datasets/Sintel', dstype='clean'):
+        super(MpiSintel, self).__init__(aug_params)
self).__init__(aug_params) + flow_root = osp.join(root, split, 'flow') + image_root = osp.join(root, split, dstype) + + if split == 'test': + self.is_test = True + + for scene in os.listdir(image_root): + image_list = sorted(glob(osp.join(image_root, scene, '*.png'))) + for i in range(len(image_list)-1): + self.image_list += [ [image_list[i], image_list[i+1]] ] + self.extra_info += [ (scene, i) ] # scene and frame_id + + if split != 'test': + self.flow_list += sorted(glob(osp.join(flow_root, scene, '*.flo'))) + + +class FlyingChairs(FlowDataset): + def __init__(self, aug_params=None, split='train', root='datasets/FlyingChairs_release/data'): + super(FlyingChairs, self).__init__(aug_params) + + images = sorted(glob(osp.join(root, '*.ppm'))) + flows = sorted(glob(osp.join(root, '*.flo'))) + assert (len(images)//2 == len(flows)) + + split_list = np.loadtxt('chairs_split.txt', dtype=np.int32) + for i in range(len(flows)): + xid = split_list[i] + if (split=='training' and xid==1) or (split=='validation' and xid==2): + self.flow_list += [ flows[i] ] + self.image_list += [ [images[2*i], images[2*i+1]] ] + + +class FlyingThings3D(FlowDataset): + def __init__(self, aug_params=None, root='datasets/FlyingThings3D', dstype='frames_cleanpass'): + super(FlyingThings3D, self).__init__(aug_params) + + for cam in ['left']: + for direction in ['into_future', 'into_past']: + image_dirs = sorted(glob(osp.join(root, dstype, 'TRAIN/*/*'))) + image_dirs = sorted([osp.join(f, cam) for f in image_dirs]) + + flow_dirs = sorted(glob(osp.join(root, 'optical_flow/TRAIN/*/*'))) + flow_dirs = sorted([osp.join(f, direction, cam) for f in flow_dirs]) + + for idir, fdir in zip(image_dirs, flow_dirs): + images = sorted(glob(osp.join(idir, '*.png')) ) + flows = sorted(glob(osp.join(fdir, '*.pfm')) ) + for i in range(len(flows)-1): + if direction == 'into_future': + self.image_list += [ [images[i], images[i+1]] ] + self.flow_list += [ flows[i] ] + elif direction == 'into_past': + self.image_list += [ [images[i+1], images[i]] ] + self.flow_list += [ flows[i+1] ] + + +class KITTI(FlowDataset): + def __init__(self, aug_params=None, split='training', root='datasets/KITTI'): + super(KITTI, self).__init__(aug_params, sparse=True) + if split == 'testing': + self.is_test = True + + root = osp.join(root, split) + images1 = sorted(glob(osp.join(root, 'image_2/*_10.png'))) + images2 = sorted(glob(osp.join(root, 'image_2/*_11.png'))) + + for img1, img2 in zip(images1, images2): + frame_id = img1.split('/')[-1] + self.extra_info += [ [frame_id] ] + self.image_list += [ [img1, img2] ] + + if split == 'training': + self.flow_list = sorted(glob(osp.join(root, 'flow_occ/*_10.png'))) + + +class HD1K(FlowDataset): + def __init__(self, aug_params=None, root='datasets/HD1k'): + super(HD1K, self).__init__(aug_params, sparse=True) + + seq_ix = 0 + while 1: + flows = sorted(glob(os.path.join(root, 'hd1k_flow_gt', 'flow_occ/%06d_*.png' % seq_ix))) + images = sorted(glob(os.path.join(root, 'hd1k_input', 'image_2/%06d_*.png' % seq_ix))) + + if len(flows) == 0: + break + + for i in range(len(flows)-1): + self.flow_list += [flows[i]] + self.image_list += [ [images[i], images[i+1]] ] + + seq_ix += 1 + + +def fetch_dataloader(args, TRAIN_DS='C+T+K+S+H'): + """ Create the data loader for the corresponding trainign set """ + + if args.stage == 'chairs': + aug_params = {'crop_size': args.image_size, 'min_scale': -0.1, 'max_scale': 1.0, 'do_flip': True} + train_dataset = FlyingChairs(aug_params, split='training') + + elif args.stage == 'things': + aug_params = 
{'crop_size': args.image_size, 'min_scale': -0.4, 'max_scale': 0.8, 'do_flip': True} + clean_dataset = FlyingThings3D(aug_params, dstype='frames_cleanpass') + final_dataset = FlyingThings3D(aug_params, dstype='frames_finalpass') + train_dataset = clean_dataset + final_dataset + + elif args.stage == 'sintel': + aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.6, 'do_flip': True} + things = FlyingThings3D(aug_params, dstype='frames_cleanpass') + sintel_clean = MpiSintel(aug_params, split='training', dstype='clean') + sintel_final = MpiSintel(aug_params, split='training', dstype='final') + + if TRAIN_DS == 'C+T+K+S+H': + kitti = KITTI({'crop_size': args.image_size, 'min_scale': -0.3, 'max_scale': 0.5, 'do_flip': True}) + hd1k = HD1K({'crop_size': args.image_size, 'min_scale': -0.5, 'max_scale': 0.2, 'do_flip': True}) + train_dataset = 100*sintel_clean + 100*sintel_final + 200*kitti + 5*hd1k + things + + elif TRAIN_DS == 'C+T+K/S': + train_dataset = 100*sintel_clean + 100*sintel_final + things + + elif args.stage == 'kitti': + aug_params = {'crop_size': args.image_size, 'min_scale': -0.2, 'max_scale': 0.4, 'do_flip': False} + train_dataset = KITTI(aug_params, split='training') + + train_loader = data.DataLoader(train_dataset, batch_size=args.batch_size, + pin_memory=False, shuffle=True, num_workers=4, drop_last=True) + + print('Training with %d image pairs' % len(train_dataset)) + return train_loader + diff --git a/preprocess/RAFT/core/extractor.py b/preprocess/RAFT/core/extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..9a9c759d1243d4694e8656c2f6f8a37e53edd009 --- /dev/null +++ b/preprocess/RAFT/core/extractor.py @@ -0,0 +1,267 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ResidualBlock(nn.Module): + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(ResidualBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, padding=1, stride=stride) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes) + self.norm2 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm3 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes) + self.norm2 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm3 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + if not stride == 1: + self.norm3 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm3) + + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x+y) + + + +class BottleneckBlock(nn.Module): + def __init__(self, in_planes, planes, norm_fn='group', stride=1): + super(BottleneckBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_planes, planes//4, kernel_size=1, padding=0) + self.conv2 = nn.Conv2d(planes//4, planes//4, 
kernel_size=3, padding=1, stride=stride) + self.conv3 = nn.Conv2d(planes//4, planes, kernel_size=1, padding=0) + self.relu = nn.ReLU(inplace=True) + + num_groups = planes // 8 + + if norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4) + self.norm2 = nn.GroupNorm(num_groups=num_groups, num_channels=planes//4) + self.norm3 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + if not stride == 1: + self.norm4 = nn.GroupNorm(num_groups=num_groups, num_channels=planes) + + elif norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(planes//4) + self.norm2 = nn.BatchNorm2d(planes//4) + self.norm3 = nn.BatchNorm2d(planes) + if not stride == 1: + self.norm4 = nn.BatchNorm2d(planes) + + elif norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(planes//4) + self.norm2 = nn.InstanceNorm2d(planes//4) + self.norm3 = nn.InstanceNorm2d(planes) + if not stride == 1: + self.norm4 = nn.InstanceNorm2d(planes) + + elif norm_fn == 'none': + self.norm1 = nn.Sequential() + self.norm2 = nn.Sequential() + self.norm3 = nn.Sequential() + if not stride == 1: + self.norm4 = nn.Sequential() + + if stride == 1: + self.downsample = None + + else: + self.downsample = nn.Sequential( + nn.Conv2d(in_planes, planes, kernel_size=1, stride=stride), self.norm4) + + + def forward(self, x): + y = x + y = self.relu(self.norm1(self.conv1(y))) + y = self.relu(self.norm2(self.conv2(y))) + y = self.relu(self.norm3(self.conv3(y))) + + if self.downsample is not None: + x = self.downsample(x) + + return self.relu(x+y) + +class BasicEncoder(nn.Module): + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(BasicEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=64) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(64) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(64) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 64 + self.layer1 = self._make_layer(64, stride=1) + self.layer2 = self._make_layer(96, stride=2) + self.layer3 = self._make_layer(128, stride=2) + + # output convolution + self.conv2 = nn.Conv2d(128, output_dim, kernel_size=1) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = ResidualBlock(self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = ResidualBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x + + +class 
SmallEncoder(nn.Module): + def __init__(self, output_dim=128, norm_fn='batch', dropout=0.0): + super(SmallEncoder, self).__init__() + self.norm_fn = norm_fn + + if self.norm_fn == 'group': + self.norm1 = nn.GroupNorm(num_groups=8, num_channels=32) + + elif self.norm_fn == 'batch': + self.norm1 = nn.BatchNorm2d(32) + + elif self.norm_fn == 'instance': + self.norm1 = nn.InstanceNorm2d(32) + + elif self.norm_fn == 'none': + self.norm1 = nn.Sequential() + + self.conv1 = nn.Conv2d(3, 32, kernel_size=7, stride=2, padding=3) + self.relu1 = nn.ReLU(inplace=True) + + self.in_planes = 32 + self.layer1 = self._make_layer(32, stride=1) + self.layer2 = self._make_layer(64, stride=2) + self.layer3 = self._make_layer(96, stride=2) + + self.dropout = None + if dropout > 0: + self.dropout = nn.Dropout2d(p=dropout) + + self.conv2 = nn.Conv2d(96, output_dim, kernel_size=1) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, (nn.BatchNorm2d, nn.InstanceNorm2d, nn.GroupNorm)): + if m.weight is not None: + nn.init.constant_(m.weight, 1) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + + def _make_layer(self, dim, stride=1): + layer1 = BottleneckBlock(self.in_planes, dim, self.norm_fn, stride=stride) + layer2 = BottleneckBlock(dim, dim, self.norm_fn, stride=1) + layers = (layer1, layer2) + + self.in_planes = dim + return nn.Sequential(*layers) + + + def forward(self, x): + + # if input is list, combine batch dimension + is_list = isinstance(x, tuple) or isinstance(x, list) + if is_list: + batch_dim = x[0].shape[0] + x = torch.cat(x, dim=0) + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu1(x) + + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.conv2(x) + + if self.training and self.dropout is not None: + x = self.dropout(x) + + if is_list: + x = torch.split(x, [batch_dim, batch_dim], dim=0) + + return x diff --git a/preprocess/RAFT/core/raft.py b/preprocess/RAFT/core/raft.py new file mode 100644 index 0000000000000000000000000000000000000000..652b81a33fb1393cfb58861f14076a53be3a9d5f --- /dev/null +++ b/preprocess/RAFT/core/raft.py @@ -0,0 +1,144 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from update import BasicUpdateBlock, SmallUpdateBlock +from extractor import BasicEncoder, SmallEncoder +from corr import CorrBlock, AlternateCorrBlock +from utils.utils import bilinear_sampler, coords_grid, upflow8 + +try: + autocast = torch.cuda.amp.autocast +except: + # dummy autocast for PyTorch < 1.6 + class autocast: + def __init__(self, enabled): + pass + def __enter__(self): + pass + def __exit__(self, *args): + pass + + +class RAFT(nn.Module): + def __init__(self, args): + super(RAFT, self).__init__() + self.args = args + + if args.small: + self.hidden_dim = hdim = 96 + self.context_dim = cdim = 64 + args.corr_levels = 4 + args.corr_radius = 3 + + else: + self.hidden_dim = hdim = 128 + self.context_dim = cdim = 128 + args.corr_levels = 4 + args.corr_radius = 4 + + if 'dropout' not in self.args: + self.args.dropout = 0 + + if 'alternate_corr' not in self.args: + self.args.alternate_corr = False + + # feature network, context network, and update block + if args.small: + self.fnet = SmallEncoder(output_dim=128, norm_fn='instance', dropout=args.dropout) + self.cnet = SmallEncoder(output_dim=hdim+cdim, norm_fn='none', dropout=args.dropout) + self.update_block = SmallUpdateBlock(self.args, hidden_dim=hdim) + + else: + self.fnet = 
BasicEncoder(output_dim=256, norm_fn='instance', dropout=args.dropout) + self.cnet = BasicEncoder(output_dim=hdim+cdim, norm_fn='batch', dropout=args.dropout) + self.update_block = BasicUpdateBlock(self.args, hidden_dim=hdim) + + def freeze_bn(self): + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + + def initialize_flow(self, img): + """ Flow is represented as difference between two coordinate grids flow = coords1 - coords0""" + N, C, H, W = img.shape + coords0 = coords_grid(N, H//8, W//8, device=img.device) + coords1 = coords_grid(N, H//8, W//8, device=img.device) + + # optical flow computed as difference: flow = coords1 - coords0 + return coords0, coords1 + + def upsample_flow(self, flow, mask): + """ Upsample flow field [H/8, W/8, 2] -> [H, W, 2] using convex combination """ + N, _, H, W = flow.shape + mask = mask.view(N, 1, 9, 8, 8, H, W) + mask = torch.softmax(mask, dim=2) + + up_flow = F.unfold(8 * flow, [3,3], padding=1) + up_flow = up_flow.view(N, 2, 9, 1, 1, H, W) + + up_flow = torch.sum(mask * up_flow, dim=2) + up_flow = up_flow.permute(0, 1, 4, 2, 5, 3) + return up_flow.reshape(N, 2, 8*H, 8*W) + + + def forward(self, image1, image2, iters=12, flow_init=None, upsample=True, test_mode=False): + """ Estimate optical flow between pair of frames """ + + image1 = 2 * (image1 / 255.0) - 1.0 + image2 = 2 * (image2 / 255.0) - 1.0 + + image1 = image1.contiguous() + image2 = image2.contiguous() + + hdim = self.hidden_dim + cdim = self.context_dim + + # run the feature network + with autocast(enabled=self.args.mixed_precision): + fmap1, fmap2 = self.fnet([image1, image2]) + + fmap1 = fmap1.float() + fmap2 = fmap2.float() + if self.args.alternate_corr: + corr_fn = AlternateCorrBlock(fmap1, fmap2, radius=self.args.corr_radius) + else: + corr_fn = CorrBlock(fmap1, fmap2, radius=self.args.corr_radius) + + # run the context network + with autocast(enabled=self.args.mixed_precision): + cnet = self.cnet(image1) + net, inp = torch.split(cnet, [hdim, cdim], dim=1) + net = torch.tanh(net) + inp = torch.relu(inp) + + coords0, coords1 = self.initialize_flow(image1) + + if flow_init is not None: + coords1 = coords1 + flow_init + + flow_predictions = [] + for itr in range(iters): + coords1 = coords1.detach() + corr = corr_fn(coords1) # index correlation volume + + flow = coords1 - coords0 + with autocast(enabled=self.args.mixed_precision): + net, up_mask, delta_flow = self.update_block(net, inp, corr, flow) + + # F(t+1) = F(t) + \Delta(t) + coords1 = coords1 + delta_flow + + # upsample predictions + if up_mask is None: + flow_up = upflow8(coords1 - coords0) + else: + flow_up = self.upsample_flow(coords1 - coords0, up_mask) + + flow_predictions.append(flow_up) + + if test_mode: + return coords1 - coords0, flow_up + + return flow_predictions diff --git a/preprocess/RAFT/core/update.py b/preprocess/RAFT/core/update.py new file mode 100644 index 0000000000000000000000000000000000000000..f940497f9b5eb1c12091574fe9a0223a1b196d50 --- /dev/null +++ b/preprocess/RAFT/core/update.py @@ -0,0 +1,139 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class FlowHead(nn.Module): + def __init__(self, input_dim=128, hidden_dim=256): + super(FlowHead, self).__init__() + self.conv1 = nn.Conv2d(input_dim, hidden_dim, 3, padding=1) + self.conv2 = nn.Conv2d(hidden_dim, 2, 3, padding=1) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + return self.conv2(self.relu(self.conv1(x))) + +class ConvGRU(nn.Module): + def __init__(self, hidden_dim=128, 
input_dim=192+128): + super(ConvGRU, self).__init__() + self.convz = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1) + self.convr = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1) + self.convq = nn.Conv2d(hidden_dim+input_dim, hidden_dim, 3, padding=1) + + def forward(self, h, x): + hx = torch.cat([h, x], dim=1) + + z = torch.sigmoid(self.convz(hx)) + r = torch.sigmoid(self.convr(hx)) + q = torch.tanh(self.convq(torch.cat([r*h, x], dim=1))) + + h = (1-z) * h + z * q + return h + +class SepConvGRU(nn.Module): + def __init__(self, hidden_dim=128, input_dim=192+128): + super(SepConvGRU, self).__init__() + self.convz1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convr1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + self.convq1 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (1,5), padding=(0,2)) + + self.convz2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convr2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + self.convq2 = nn.Conv2d(hidden_dim+input_dim, hidden_dim, (5,1), padding=(2,0)) + + + def forward(self, h, x): + # horizontal + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz1(hx)) + r = torch.sigmoid(self.convr1(hx)) + q = torch.tanh(self.convq1(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + # vertical + hx = torch.cat([h, x], dim=1) + z = torch.sigmoid(self.convz2(hx)) + r = torch.sigmoid(self.convr2(hx)) + q = torch.tanh(self.convq2(torch.cat([r*h, x], dim=1))) + h = (1-z) * h + z * q + + return h + +class SmallMotionEncoder(nn.Module): + def __init__(self, args): + super(SmallMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2*args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 96, 1, padding=0) + self.convf1 = nn.Conv2d(2, 64, 7, padding=3) + self.convf2 = nn.Conv2d(64, 32, 3, padding=1) + self.conv = nn.Conv2d(128, 80, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + +class BasicMotionEncoder(nn.Module): + def __init__(self, args): + super(BasicMotionEncoder, self).__init__() + cor_planes = args.corr_levels * (2*args.corr_radius + 1)**2 + self.convc1 = nn.Conv2d(cor_planes, 256, 1, padding=0) + self.convc2 = nn.Conv2d(256, 192, 3, padding=1) + self.convf1 = nn.Conv2d(2, 128, 7, padding=3) + self.convf2 = nn.Conv2d(128, 64, 3, padding=1) + self.conv = nn.Conv2d(64+192, 128-2, 3, padding=1) + + def forward(self, flow, corr): + cor = F.relu(self.convc1(corr)) + cor = F.relu(self.convc2(cor)) + flo = F.relu(self.convf1(flow)) + flo = F.relu(self.convf2(flo)) + + cor_flo = torch.cat([cor, flo], dim=1) + out = F.relu(self.conv(cor_flo)) + return torch.cat([out, flow], dim=1) + +class SmallUpdateBlock(nn.Module): + def __init__(self, args, hidden_dim=96): + super(SmallUpdateBlock, self).__init__() + self.encoder = SmallMotionEncoder(args) + self.gru = ConvGRU(hidden_dim=hidden_dim, input_dim=82+64) + self.flow_head = FlowHead(hidden_dim, hidden_dim=128) + + def forward(self, net, inp, corr, flow): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + return net, None, delta_flow + +class BasicUpdateBlock(nn.Module): + def __init__(self, args, hidden_dim=128, input_dim=128): + super(BasicUpdateBlock, 
self).__init__() + self.args = args + self.encoder = BasicMotionEncoder(args) + self.gru = SepConvGRU(hidden_dim=hidden_dim, input_dim=128+hidden_dim) + self.flow_head = FlowHead(hidden_dim, hidden_dim=256) + + self.mask = nn.Sequential( + nn.Conv2d(128, 256, 3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 64*9, 1, padding=0)) + + def forward(self, net, inp, corr, flow, upsample=True): + motion_features = self.encoder(flow, corr) + inp = torch.cat([inp, motion_features], dim=1) + + net = self.gru(net, inp) + delta_flow = self.flow_head(net) + + # scale mask to balence gradients + mask = .25 * self.mask(net) + return net, mask, delta_flow + + + diff --git a/preprocess/RAFT/core/utils/__init__.py b/preprocess/RAFT/core/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/preprocess/RAFT/core/utils/augmentor.py b/preprocess/RAFT/core/utils/augmentor.py new file mode 100644 index 0000000000000000000000000000000000000000..e81c4f2b5c16c31c0ae236d744f299d430228a04 --- /dev/null +++ b/preprocess/RAFT/core/utils/augmentor.py @@ -0,0 +1,246 @@ +import numpy as np +import random +import math +from PIL import Image + +import cv2 +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +import torch +from torchvision.transforms import ColorJitter +import torch.nn.functional as F + + +class FlowAugmentor: + def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=True): + + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 0.8 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params + self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.5/3.14) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + """ Photometric augmentation """ + + # asymmetric + if np.random.rand() < self.asymmetric_color_aug_prob: + img1 = np.array(self.photo_aug(Image.fromarray(img1)), dtype=np.uint8) + img2 = np.array(self.photo_aug(Image.fromarray(img2)), dtype=np.uint8) + + # symmetric + else: + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + + return img1, img2 + + def eraser_transform(self, img1, img2, bounds=[50, 100]): + """ Occlusion augmentation """ + + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(bounds[0], bounds[1]) + dy = np.random.randint(bounds[0], bounds[1]) + img2[y0:y0+dy, x0:x0+dx, :] = mean_color + + return img1, img2 + + def spatial_transform(self, img1, img2, flow): + # randomly sample scale + ht, wd = img1.shape[:2] + min_scale = np.maximum( + (self.crop_size[0] + 8) / float(ht), + (self.crop_size[1] + 8) / float(wd)) + + scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) + scale_x = scale + scale_y = scale + if np.random.rand() < self.stretch_prob: + scale_x *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + scale_y *= 2 ** np.random.uniform(-self.max_stretch, self.max_stretch) + + scale_x = np.clip(scale_x, min_scale, None) 
+ scale_y = np.clip(scale_y, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow = cv2.resize(flow, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow = flow * [scale_x, scale_y] + + if self.do_flip: + if np.random.rand() < self.h_flip_prob: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + + if np.random.rand() < self.v_flip_prob: # v-flip + img1 = img1[::-1, :] + img2 = img2[::-1, :] + flow = flow[::-1, :] * [1.0, -1.0] + + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0]) + x0 = np.random.randint(0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + + return img1, img2, flow + + def __call__(self, img1, img2, flow): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow = self.spatial_transform(img1, img2, flow) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + + return img1, img2, flow + +class SparseFlowAugmentor: + def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, do_flip=False): + # spatial augmentation params + self.crop_size = crop_size + self.min_scale = min_scale + self.max_scale = max_scale + self.spatial_aug_prob = 0.8 + self.stretch_prob = 0.8 + self.max_stretch = 0.2 + + # flip augmentation params + self.do_flip = do_flip + self.h_flip_prob = 0.5 + self.v_flip_prob = 0.1 + + # photometric augmentation params + self.photo_aug = ColorJitter(brightness=0.3, contrast=0.3, saturation=0.3, hue=0.3/3.14) + self.asymmetric_color_aug_prob = 0.2 + self.eraser_aug_prob = 0.5 + + def color_transform(self, img1, img2): + image_stack = np.concatenate([img1, img2], axis=0) + image_stack = np.array(self.photo_aug(Image.fromarray(image_stack)), dtype=np.uint8) + img1, img2 = np.split(image_stack, 2, axis=0) + return img1, img2 + + def eraser_transform(self, img1, img2): + ht, wd = img1.shape[:2] + if np.random.rand() < self.eraser_aug_prob: + mean_color = np.mean(img2.reshape(-1, 3), axis=0) + for _ in range(np.random.randint(1, 3)): + x0 = np.random.randint(0, wd) + y0 = np.random.randint(0, ht) + dx = np.random.randint(50, 100) + dy = np.random.randint(50, 100) + img2[y0:y0+dy, x0:x0+dx, :] = mean_color + + return img1, img2 + + def resize_sparse_flow_map(self, flow, valid, fx=1.0, fy=1.0): + ht, wd = flow.shape[:2] + coords = np.meshgrid(np.arange(wd), np.arange(ht)) + coords = np.stack(coords, axis=-1) + + coords = coords.reshape(-1, 2).astype(np.float32) + flow = flow.reshape(-1, 2).astype(np.float32) + valid = valid.reshape(-1).astype(np.float32) + + coords0 = coords[valid>=1] + flow0 = flow[valid>=1] + + ht1 = int(round(ht * fy)) + wd1 = int(round(wd * fx)) + + coords1 = coords0 * [fx, fy] + flow1 = flow0 * [fx, fy] + + xx = np.round(coords1[:,0]).astype(np.int32) + yy = np.round(coords1[:,1]).astype(np.int32) + + v = (xx > 0) & (xx < wd1) & (yy > 0) & (yy < ht1) + xx = xx[v] + yy = yy[v] + flow1 = flow1[v] + + flow_img = np.zeros([ht1, wd1, 2], dtype=np.float32) + valid_img = np.zeros([ht1, wd1], dtype=np.int32) + + flow_img[yy, xx] = flow1 + valid_img[yy, xx] = 1 + + return flow_img, 
valid_img + + def spatial_transform(self, img1, img2, flow, valid): + # randomly sample scale + + ht, wd = img1.shape[:2] + min_scale = np.maximum( + (self.crop_size[0] + 1) / float(ht), + (self.crop_size[1] + 1) / float(wd)) + + scale = 2 ** np.random.uniform(self.min_scale, self.max_scale) + scale_x = np.clip(scale, min_scale, None) + scale_y = np.clip(scale, min_scale, None) + + if np.random.rand() < self.spatial_aug_prob: + # rescale the images + img1 = cv2.resize(img1, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + img2 = cv2.resize(img2, None, fx=scale_x, fy=scale_y, interpolation=cv2.INTER_LINEAR) + flow, valid = self.resize_sparse_flow_map(flow, valid, fx=scale_x, fy=scale_y) + + if self.do_flip: + if np.random.rand() < 0.5: # h-flip + img1 = img1[:, ::-1] + img2 = img2[:, ::-1] + flow = flow[:, ::-1] * [-1.0, 1.0] + valid = valid[:, ::-1] + + margin_y = 20 + margin_x = 50 + + y0 = np.random.randint(0, img1.shape[0] - self.crop_size[0] + margin_y) + x0 = np.random.randint(-margin_x, img1.shape[1] - self.crop_size[1] + margin_x) + + y0 = np.clip(y0, 0, img1.shape[0] - self.crop_size[0]) + x0 = np.clip(x0, 0, img1.shape[1] - self.crop_size[1]) + + img1 = img1[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + img2 = img2[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + flow = flow[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + valid = valid[y0:y0+self.crop_size[0], x0:x0+self.crop_size[1]] + return img1, img2, flow, valid + + + def __call__(self, img1, img2, flow, valid): + img1, img2 = self.color_transform(img1, img2) + img1, img2 = self.eraser_transform(img1, img2) + img1, img2, flow, valid = self.spatial_transform(img1, img2, flow, valid) + + img1 = np.ascontiguousarray(img1) + img2 = np.ascontiguousarray(img2) + flow = np.ascontiguousarray(flow) + valid = np.ascontiguousarray(valid) + + return img1, img2, flow, valid diff --git a/preprocess/RAFT/core/utils/flow_viz.py b/preprocess/RAFT/core/utils/flow_viz.py new file mode 100644 index 0000000000000000000000000000000000000000..dcee65e89b91b07ee0496aeb4c7e7436abf99641 --- /dev/null +++ b/preprocess/RAFT/core/utils/flow_viz.py @@ -0,0 +1,132 @@ +# Flow visualization code used from https://github.com/tomrunia/OpticalFlow_Visualization + + +# MIT License +# +# Copyright (c) 2018 Tom Runia +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to conditions. +# +# Author: Tom Runia +# Date Created: 2018-08-03 + +import numpy as np + +def make_colorwheel(): + """ + Generates a color wheel for optical flow visualization as presented in: + Baker et al. "A Database and Evaluation Methodology for Optical Flow" (ICCV, 2007) + URL: http://vision.middlebury.edu/flow/flowEval-iccv07.pdf + + Code follows the original C++ source code of Daniel Scharstein. + Code follows the the Matlab source code of Deqing Sun. 
+ + Returns: + np.ndarray: Color wheel + """ + + RY = 15 + YG = 6 + GC = 4 + CB = 11 + BM = 13 + MR = 6 + + ncols = RY + YG + GC + CB + BM + MR + colorwheel = np.zeros((ncols, 3)) + col = 0 + + # RY + colorwheel[0:RY, 0] = 255 + colorwheel[0:RY, 1] = np.floor(255*np.arange(0,RY)/RY) + col = col+RY + # YG + colorwheel[col:col+YG, 0] = 255 - np.floor(255*np.arange(0,YG)/YG) + colorwheel[col:col+YG, 1] = 255 + col = col+YG + # GC + colorwheel[col:col+GC, 1] = 255 + colorwheel[col:col+GC, 2] = np.floor(255*np.arange(0,GC)/GC) + col = col+GC + # CB + colorwheel[col:col+CB, 1] = 255 - np.floor(255*np.arange(CB)/CB) + colorwheel[col:col+CB, 2] = 255 + col = col+CB + # BM + colorwheel[col:col+BM, 2] = 255 + colorwheel[col:col+BM, 0] = np.floor(255*np.arange(0,BM)/BM) + col = col+BM + # MR + colorwheel[col:col+MR, 2] = 255 - np.floor(255*np.arange(MR)/MR) + colorwheel[col:col+MR, 0] = 255 + return colorwheel + + +def flow_uv_to_colors(u, v, convert_to_bgr=False): + """ + Applies the flow color wheel to (possibly clipped) flow components u and v. + + According to the C++ source code of Daniel Scharstein + According to the Matlab source code of Deqing Sun + + Args: + u (np.ndarray): Input horizontal flow of shape [H,W] + v (np.ndarray): Input vertical flow of shape [H,W] + convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. + + Returns: + np.ndarray: Flow visualization image of shape [H,W,3] + """ + flow_image = np.zeros((u.shape[0], u.shape[1], 3), np.uint8) + colorwheel = make_colorwheel() # shape [55x3] + ncols = colorwheel.shape[0] + rad = np.sqrt(np.square(u) + np.square(v)) + a = np.arctan2(-v, -u)/np.pi + fk = (a+1) / 2*(ncols-1) + k0 = np.floor(fk).astype(np.int32) + k1 = k0 + 1 + k1[k1 == ncols] = 0 + f = fk - k0 + for i in range(colorwheel.shape[1]): + tmp = colorwheel[:,i] + col0 = tmp[k0] / 255.0 + col1 = tmp[k1] / 255.0 + col = (1-f)*col0 + f*col1 + idx = (rad <= 1) + col[idx] = 1 - rad[idx] * (1-col[idx]) + col[~idx] = col[~idx] * 0.75 # out of range + # Note the 2-i => BGR instead of RGB + ch_idx = 2-i if convert_to_bgr else i + flow_image[:,:,ch_idx] = np.floor(255 * col) + return flow_image + + +def flow_to_image(flow_uv, clip_flow=None, convert_to_bgr=False): + """ + Expects a two dimensional flow image of shape. + + Args: + flow_uv (np.ndarray): Flow UV image of shape [H,W,2] + clip_flow (float, optional): Clip maximum of flow values. Defaults to None. + convert_to_bgr (bool, optional): Convert output image to BGR. Defaults to False. 
+ + Returns: + np.ndarray: Flow visualization image of shape [H,W,3] + """ + assert flow_uv.ndim == 3, 'input flow must have three dimensions' + assert flow_uv.shape[2] == 2, 'input flow must have shape [H,W,2]' + if clip_flow is not None: + flow_uv = np.clip(flow_uv, 0, clip_flow) + u = flow_uv[:,:,0] + v = flow_uv[:,:,1] + rad = np.sqrt(np.square(u) + np.square(v)) + rad_max = np.max(rad) + epsilon = 1e-5 + u = u / (rad_max + epsilon) + v = v / (rad_max + epsilon) + return flow_uv_to_colors(u, v, convert_to_bgr) \ No newline at end of file diff --git a/preprocess/RAFT/core/utils/frame_utils.py b/preprocess/RAFT/core/utils/frame_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..6c491135efaffc25bd61ec3ecde99d236f5deb12 --- /dev/null +++ b/preprocess/RAFT/core/utils/frame_utils.py @@ -0,0 +1,137 @@ +import numpy as np +from PIL import Image +from os.path import * +import re + +import cv2 +cv2.setNumThreads(0) +cv2.ocl.setUseOpenCL(False) + +TAG_CHAR = np.array([202021.25], np.float32) + +def readFlow(fn): + """ Read .flo file in Middlebury format""" + # Code adapted from: + # http://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy + + # WARNING: this will work on little-endian architectures (eg Intel x86) only! + # print 'fn = %s'%(fn) + with open(fn, 'rb') as f: + magic = np.fromfile(f, np.float32, count=1) + if 202021.25 != magic: + print('Magic number incorrect. Invalid .flo file') + return None + else: + w = np.fromfile(f, np.int32, count=1) + h = np.fromfile(f, np.int32, count=1) + # print 'Reading %d x %d flo file\n' % (w, h) + data = np.fromfile(f, np.float32, count=2*int(w)*int(h)) + # Reshape data into 3D array (columns, rows, bands) + # The reshape here is for visualization, the original code is (w,h,2) + return np.resize(data, (int(h), int(w), 2)) + +def readPFM(file): + file = open(file, 'rb') + + color = None + width = None + height = None + scale = None + endian = None + + header = file.readline().rstrip() + if header == b'PF': + color = True + elif header == b'Pf': + color = False + else: + raise Exception('Not a PFM file.') + + dim_match = re.match(rb'^(\d+)\s(\d+)\s$', file.readline()) + if dim_match: + width, height = map(int, dim_match.groups()) + else: + raise Exception('Malformed PFM header.') + + scale = float(file.readline().rstrip()) + if scale < 0: # little-endian + endian = '<' + scale = -scale + else: + endian = '>' # big-endian + + data = np.fromfile(file, endian + 'f') + shape = (height, width, 3) if color else (height, width) + + data = np.reshape(data, shape) + data = np.flipud(data) + return data + +def writeFlow(filename,uv,v=None): + """ Write optical flow to file. + + If v is None, uv is assumed to contain both u and v channels, + stacked in depth. + Original code by Deqing Sun, adapted from Daniel Scharstein. 
+ """ + nBands = 2 + + if v is None: + assert(uv.ndim == 3) + assert(uv.shape[2] == 2) + u = uv[:,:,0] + v = uv[:,:,1] + else: + u = uv + + assert(u.shape == v.shape) + height,width = u.shape + f = open(filename,'wb') + # write the header + f.write(TAG_CHAR) + np.array(width).astype(np.int32).tofile(f) + np.array(height).astype(np.int32).tofile(f) + # arrange into matrix form + tmp = np.zeros((height, width*nBands)) + tmp[:,np.arange(width)*2] = u + tmp[:,np.arange(width)*2 + 1] = v + tmp.astype(np.float32).tofile(f) + f.close() + + +def readFlowKITTI(filename): + flow = cv2.imread(filename, cv2.IMREAD_ANYDEPTH|cv2.IMREAD_COLOR) + flow = flow[:,:,::-1].astype(np.float32) + flow, valid = flow[:, :, :2], flow[:, :, 2] + flow = (flow - 2**15) / 64.0 + return flow, valid + +def readDispKITTI(filename): + disp = cv2.imread(filename, cv2.IMREAD_ANYDEPTH) / 256.0 + valid = disp > 0.0 + flow = np.stack([-disp, np.zeros_like(disp)], -1) + return flow, valid + + +def writeFlowKITTI(filename, uv): + uv = 64.0 * uv + 2**15 + valid = np.ones([uv.shape[0], uv.shape[1], 1]) + uv = np.concatenate([uv, valid], axis=-1).astype(np.uint16) + cv2.imwrite(filename, uv[..., ::-1]) + + +def read_gen(file_name, pil=False): + ext = splitext(file_name)[-1] + if ext == '.png' or ext == '.jpeg' or ext == '.ppm' or ext == '.jpg': + return Image.open(file_name) + elif ext == '.bin' or ext == '.raw': + return np.load(file_name) + elif ext == '.flo': + return readFlow(file_name).astype(np.float32) + elif ext == '.pfm': + flow = readPFM(file_name).astype(np.float32) + if len(flow.shape) == 2: + return flow + else: + return flow[:, :, :-1] + return [] \ No newline at end of file diff --git a/preprocess/RAFT/core/utils/utils.py b/preprocess/RAFT/core/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..741ccfe4d0d778c3199c586d368edc2882d4fff8 --- /dev/null +++ b/preprocess/RAFT/core/utils/utils.py @@ -0,0 +1,82 @@ +import torch +import torch.nn.functional as F +import numpy as np +from scipy import interpolate + + +class InputPadder: + """ Pads images such that dimensions are divisible by 8 """ + def __init__(self, dims, mode='sintel'): + self.ht, self.wd = dims[-2:] + pad_ht = (((self.ht // 8) + 1) * 8 - self.ht) % 8 + pad_wd = (((self.wd // 8) + 1) * 8 - self.wd) % 8 + if mode == 'sintel': + self._pad = [pad_wd//2, pad_wd - pad_wd//2, pad_ht//2, pad_ht - pad_ht//2] + else: + self._pad = [pad_wd//2, pad_wd - pad_wd//2, 0, pad_ht] + + def pad(self, *inputs): + return [F.pad(x, self._pad, mode='replicate') for x in inputs] + + def unpad(self,x): + ht, wd = x.shape[-2:] + c = [self._pad[2], ht-self._pad[3], self._pad[0], wd-self._pad[1]] + return x[..., c[0]:c[1], c[2]:c[3]] + +def forward_interpolate(flow): + flow = flow.detach().cpu().numpy() + dx, dy = flow[0], flow[1] + + ht, wd = dx.shape + x0, y0 = np.meshgrid(np.arange(wd), np.arange(ht)) + + x1 = x0 + dx + y1 = y0 + dy + + x1 = x1.reshape(-1) + y1 = y1.reshape(-1) + dx = dx.reshape(-1) + dy = dy.reshape(-1) + + valid = (x1 > 0) & (x1 < wd) & (y1 > 0) & (y1 < ht) + x1 = x1[valid] + y1 = y1[valid] + dx = dx[valid] + dy = dy[valid] + + flow_x = interpolate.griddata( + (x1, y1), dx, (x0, y0), method='nearest', fill_value=0) + + flow_y = interpolate.griddata( + (x1, y1), dy, (x0, y0), method='nearest', fill_value=0) + + flow = np.stack([flow_x, flow_y], axis=0) + return torch.from_numpy(flow).float() + + +def bilinear_sampler(img, coords, mode='bilinear', mask=False): + """ Wrapper for grid_sample, uses pixel coordinates """ + H, W = 
img.shape[-2:] + xgrid, ygrid = coords.split([1,1], dim=-1) + xgrid = 2*xgrid/(W-1) - 1 + ygrid = 2*ygrid/(H-1) - 1 + + grid = torch.cat([xgrid, ygrid], dim=-1) + img = F.grid_sample(img, grid, align_corners=True) + + if mask: + mask = (xgrid > -1) & (ygrid > -1) & (xgrid < 1) & (ygrid < 1) + return img, mask.float() + + return img + + +def coords_grid(batch, ht, wd, device): + coords = torch.meshgrid(torch.arange(ht, device=device), torch.arange(wd, device=device)) + coords = torch.stack(coords[::-1], dim=0).float() + return coords[None].repeat(batch, 1, 1, 1) + + +def upflow8(flow, mode='bilinear'): + new_size = (8 * flow.shape[2], 8 * flow.shape[3]) + return 8 * F.interpolate(flow, size=new_size, mode=mode, align_corners=True) diff --git a/preprocess/RAFT/demo-frames/frame_0016.png b/preprocess/RAFT/demo-frames/frame_0016.png new file mode 100644 index 0000000000000000000000000000000000000000..e177213c0e1cbf2cadad84eaba85535f568f3382 --- /dev/null +++ b/preprocess/RAFT/demo-frames/frame_0016.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a815fe352c27c8fe94e19e5236bef3856ae22b9b9ee0b38ec406ef213faaaff +size 667971 diff --git a/preprocess/RAFT/demo-frames/frame_0017.png b/preprocess/RAFT/demo-frames/frame_0017.png new file mode 100644 index 0000000000000000000000000000000000000000..ebc73de25bfcbeb4c4082b1bac93312938ba92b3 --- /dev/null +++ b/preprocess/RAFT/demo-frames/frame_0017.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c6f31f001dadddb9578dcc03f8bd53a045fe752311a53b9f9bcd1e1a1b2a329 +size 668067 diff --git a/preprocess/RAFT/demo-frames/frame_0018.png b/preprocess/RAFT/demo-frames/frame_0018.png new file mode 100644 index 0000000000000000000000000000000000000000..5b9a5ac8bd2acfeaec502b72f96492be08e3d56f --- /dev/null +++ b/preprocess/RAFT/demo-frames/frame_0018.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5920eff6da437d8cb8bef7f2ac01428d72bae297b8b1690b5190aa9d12ca5bda +size 667952 diff --git a/preprocess/RAFT/demo-frames/frame_0019.png b/preprocess/RAFT/demo-frames/frame_0019.png new file mode 100644 index 0000000000000000000000000000000000000000..e6a7e9a999be3624652208cf9c5430319833a673 --- /dev/null +++ b/preprocess/RAFT/demo-frames/frame_0019.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:717bde9ea6425e4cfb8dede2d234d790c193caf68a20f154e0f1fc5b37f07757 +size 668893 diff --git a/preprocess/RAFT/demo-frames/frame_0020.png b/preprocess/RAFT/demo-frames/frame_0020.png new file mode 100644 index 0000000000000000000000000000000000000000..7341027a6a2f2ed4bf93e1f7f3c2a0b7c7948e7c --- /dev/null +++ b/preprocess/RAFT/demo-frames/frame_0020.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dcc6bf66864c11c33d357fe913ccfd4c5098c33660a6fe4fc18e8d2f6735eb2 +size 670304 diff --git a/preprocess/RAFT/demo-frames/frame_0021.png b/preprocess/RAFT/demo-frames/frame_0021.png new file mode 100644 index 0000000000000000000000000000000000000000..698b04cafe44c6fe4f6ee08ae14fd67953ef48dc --- /dev/null +++ b/preprocess/RAFT/demo-frames/frame_0021.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bffde53655579b67b1d092d0c7ac9a644a05655be989d5affd7f0b1962186e8 +size 672426 diff --git a/preprocess/RAFT/demo-frames/frame_0022.png b/preprocess/RAFT/demo-frames/frame_0022.png new file mode 100644 index 0000000000000000000000000000000000000000..11807dc868fc87fb53e91de583dc982806536675 --- /dev/null +++ 
b/preprocess/RAFT/demo-frames/frame_0022.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f842b5a1d7d50048e235818eb24cc3c41ce79ab4453eb0294b979097c9221231 +size 673863 diff --git a/preprocess/RAFT/demo-frames/frame_0023.png b/preprocess/RAFT/demo-frames/frame_0023.png new file mode 100644 index 0000000000000000000000000000000000000000..27ca98291493da2ccffc4839d341d9d7f96f501f --- /dev/null +++ b/preprocess/RAFT/demo-frames/frame_0023.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:800b478fe36a32472878382b8823a360142c2492c9e942a8ca2960fc52d05974 +size 674569 diff --git a/preprocess/RAFT/demo-frames/frame_0024.png b/preprocess/RAFT/demo-frames/frame_0024.png new file mode 100644 index 0000000000000000000000000000000000000000..861570b11b9b485c4e7f0a5098885cc005bd9d05 --- /dev/null +++ b/preprocess/RAFT/demo-frames/frame_0024.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5708ef12972068c85b18b6e252f70ddb71e447ac49c7ba3b1950846e3b5267 +size 675519 diff --git a/preprocess/RAFT/demo-frames/frame_0025.png b/preprocess/RAFT/demo-frames/frame_0025.png new file mode 100644 index 0000000000000000000000000000000000000000..cd56e006d0e383311423e4f4de2cb6630c2567d9 --- /dev/null +++ b/preprocess/RAFT/demo-frames/frame_0025.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc81e7a22534e6660492b1bbe3b90d1a86c01fb27f7fb52c76545ae745a60b0b +size 675832 diff --git a/preprocess/RAFT/demo.py b/preprocess/RAFT/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..5abc1da863f1231af1247209739402b05fa8bf85 --- /dev/null +++ b/preprocess/RAFT/demo.py @@ -0,0 +1,75 @@ +import sys +sys.path.append('core') + +import argparse +import os +import cv2 +import glob +import numpy as np +import torch +from PIL import Image + +from raft import RAFT +from utils import flow_viz +from utils.utils import InputPadder + + + +DEVICE = 'cuda' + +def load_image(imfile): + img = np.array(Image.open(imfile)).astype(np.uint8) + img = torch.from_numpy(img).permute(2, 0, 1).float() + return img[None].to(DEVICE) + + +def viz(img, flo): + img = img[0].permute(1,2,0).cpu().numpy() + flo = flo[0].permute(1,2,0).cpu().numpy() + + # map flow to rgb image + flo = flow_viz.flow_to_image(flo) + img_flo = np.concatenate([img, flo], axis=0) + + # import matplotlib.pyplot as plt + # plt.imshow(img_flo / 255.0) + # plt.show() + + cv2.imshow('image', img_flo[:, :, [2,1,0]]/255.0) + cv2.waitKey() + + +def demo(args): + model = torch.nn.DataParallel(RAFT(args)) + model.load_state_dict(torch.load(args.model)) + + model = model.module + model.to(DEVICE) + model.eval() + + with torch.no_grad(): + images = glob.glob(os.path.join(args.path, '*.png')) + \ + glob.glob(os.path.join(args.path, '*.jpg')) + + images = sorted(images) + for imfile1, imfile2 in zip(images[:-1], images[1:]): + image1 = load_image(imfile1) + image2 = load_image(imfile2) + + padder = InputPadder(image1.shape) + image1, image2 = padder.pad(image1, image2) + + flow_low, flow_up = model(image1, image2, iters=20, test_mode=True) + viz(image1, flow_up) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--model', help="restore checkpoint") + parser.add_argument('--path', help="dataset for evaluation") + parser.add_argument('--small', action='store_true', help='use small model') + parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') + parser.add_argument('--alternate_corr', 
action='store_true', help='use efficent correlation implementation') + args = parser.parse_args() + + demo(args) diff --git a/preprocess/RAFT/download_models.sh b/preprocess/RAFT/download_models.sh new file mode 100644 index 0000000000000000000000000000000000000000..dfd8d473f461edd999716fd38fe7ee32f5a39235 --- /dev/null +++ b/preprocess/RAFT/download_models.sh @@ -0,0 +1,3 @@ +#!/bin/bash +wget https://dl.dropboxusercontent.com/s/4j4z58wuv8o0mfz/models.zip +unzip models.zip diff --git a/preprocess/RAFT/evaluate.py b/preprocess/RAFT/evaluate.py new file mode 100644 index 0000000000000000000000000000000000000000..431a0f58891bede2804454fa7f28e9434c4c8746 --- /dev/null +++ b/preprocess/RAFT/evaluate.py @@ -0,0 +1,197 @@ +import sys +sys.path.append('core') + +from PIL import Image +import argparse +import os +import time +import numpy as np +import torch +import torch.nn.functional as F +import matplotlib.pyplot as plt + +import datasets +from utils import flow_viz +from utils import frame_utils + +from raft import RAFT +from utils.utils import InputPadder, forward_interpolate + + +@torch.no_grad() +def create_sintel_submission(model, iters=32, warm_start=False, output_path='sintel_submission'): + """ Create submission for the Sintel leaderboard """ + model.eval() + for dstype in ['clean', 'final']: + test_dataset = datasets.MpiSintel(split='test', aug_params=None, dstype=dstype) + + flow_prev, sequence_prev = None, None + for test_id in range(len(test_dataset)): + image1, image2, (sequence, frame) = test_dataset[test_id] + if sequence != sequence_prev: + flow_prev = None + + padder = InputPadder(image1.shape) + image1, image2 = padder.pad(image1[None].cuda(), image2[None].cuda()) + + flow_low, flow_pr = model(image1, image2, iters=iters, flow_init=flow_prev, test_mode=True) + flow = padder.unpad(flow_pr[0]).permute(1, 2, 0).cpu().numpy() + + if warm_start: + flow_prev = forward_interpolate(flow_low[0])[None].cuda() + + output_dir = os.path.join(output_path, dstype, sequence) + output_file = os.path.join(output_dir, 'frame%04d.flo' % (frame+1)) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + frame_utils.writeFlow(output_file, flow) + sequence_prev = sequence + + +@torch.no_grad() +def create_kitti_submission(model, iters=24, output_path='kitti_submission'): + """ Create submission for the Sintel leaderboard """ + model.eval() + test_dataset = datasets.KITTI(split='testing', aug_params=None) + + if not os.path.exists(output_path): + os.makedirs(output_path) + + for test_id in range(len(test_dataset)): + image1, image2, (frame_id, ) = test_dataset[test_id] + padder = InputPadder(image1.shape, mode='kitti') + image1, image2 = padder.pad(image1[None].cuda(), image2[None].cuda()) + + _, flow_pr = model(image1, image2, iters=iters, test_mode=True) + flow = padder.unpad(flow_pr[0]).permute(1, 2, 0).cpu().numpy() + + output_filename = os.path.join(output_path, frame_id) + frame_utils.writeFlowKITTI(output_filename, flow) + + +@torch.no_grad() +def validate_chairs(model, iters=24): + """ Perform evaluation on the FlyingChairs (test) split """ + model.eval() + epe_list = [] + + val_dataset = datasets.FlyingChairs(split='validation') + for val_id in range(len(val_dataset)): + image1, image2, flow_gt, _ = val_dataset[val_id] + image1 = image1[None].cuda() + image2 = image2[None].cuda() + + _, flow_pr = model(image1, image2, iters=iters, test_mode=True) + epe = torch.sum((flow_pr[0].cpu() - flow_gt)**2, dim=0).sqrt() + epe_list.append(epe.view(-1).numpy()) + + epe = 
np.mean(np.concatenate(epe_list)) + print("Validation Chairs EPE: %f" % epe) + return {'chairs': epe} + + +@torch.no_grad() +def validate_sintel(model, iters=32): + """ Peform validation using the Sintel (train) split """ + model.eval() + results = {} + for dstype in ['clean', 'final']: + val_dataset = datasets.MpiSintel(split='training', dstype=dstype) + epe_list = [] + + for val_id in range(len(val_dataset)): + image1, image2, flow_gt, _ = val_dataset[val_id] + image1 = image1[None].cuda() + image2 = image2[None].cuda() + + padder = InputPadder(image1.shape) + image1, image2 = padder.pad(image1, image2) + + flow_low, flow_pr = model(image1, image2, iters=iters, test_mode=True) + flow = padder.unpad(flow_pr[0]).cpu() + + epe = torch.sum((flow - flow_gt)**2, dim=0).sqrt() + epe_list.append(epe.view(-1).numpy()) + + epe_all = np.concatenate(epe_list) + epe = np.mean(epe_all) + px1 = np.mean(epe_all<1) + px3 = np.mean(epe_all<3) + px5 = np.mean(epe_all<5) + + print("Validation (%s) EPE: %f, 1px: %f, 3px: %f, 5px: %f" % (dstype, epe, px1, px3, px5)) + results[dstype] = np.mean(epe_list) + + return results + + +@torch.no_grad() +def validate_kitti(model, iters=24): + """ Peform validation using the KITTI-2015 (train) split """ + model.eval() + val_dataset = datasets.KITTI(split='training') + + out_list, epe_list = [], [] + for val_id in range(len(val_dataset)): + image1, image2, flow_gt, valid_gt = val_dataset[val_id] + image1 = image1[None].cuda() + image2 = image2[None].cuda() + + padder = InputPadder(image1.shape, mode='kitti') + image1, image2 = padder.pad(image1, image2) + + flow_low, flow_pr = model(image1, image2, iters=iters, test_mode=True) + flow = padder.unpad(flow_pr[0]).cpu() + + epe = torch.sum((flow - flow_gt)**2, dim=0).sqrt() + mag = torch.sum(flow_gt**2, dim=0).sqrt() + + epe = epe.view(-1) + mag = mag.view(-1) + val = valid_gt.view(-1) >= 0.5 + + out = ((epe > 3.0) & ((epe/mag) > 0.05)).float() + epe_list.append(epe[val].mean().item()) + out_list.append(out[val].cpu().numpy()) + + epe_list = np.array(epe_list) + out_list = np.concatenate(out_list) + + epe = np.mean(epe_list) + f1 = 100 * np.mean(out_list) + + print("Validation KITTI: %f, %f" % (epe, f1)) + return {'kitti-epe': epe, 'kitti-f1': f1} + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--model', help="restore checkpoint") + parser.add_argument('--dataset', help="dataset for evaluation") + parser.add_argument('--small', action='store_true', help='use small model') + parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') + parser.add_argument('--alternate_corr', action='store_true', help='use efficent correlation implementation') + args = parser.parse_args() + + model = torch.nn.DataParallel(RAFT(args)) + model.load_state_dict(torch.load(args.model)) + + model.cuda() + model.eval() + + # create_sintel_submission(model.module, warm_start=True) + # create_kitti_submission(model.module) + + with torch.no_grad(): + if args.dataset == 'chairs': + validate_chairs(model.module) + + elif args.dataset == 'sintel': + validate_sintel(model.module) + + elif args.dataset == 'kitti': + validate_kitti(model.module) + + diff --git a/preprocess/RAFT/train.py b/preprocess/RAFT/train.py new file mode 100644 index 0000000000000000000000000000000000000000..307573097f13ee30c67bbe11658f457fdf1ead3c --- /dev/null +++ b/preprocess/RAFT/train.py @@ -0,0 +1,247 @@ +from __future__ import print_function, division +import sys +sys.path.append('core') + +import argparse 
+import os +import cv2 +import time +import numpy as np +import matplotlib.pyplot as plt + +import torch +import torch.nn as nn +import torch.optim as optim +import torch.nn.functional as F + +from torch.utils.data import DataLoader +from raft import RAFT +import evaluate +import datasets + +from torch.utils.tensorboard import SummaryWriter + +try: + from torch.cuda.amp import GradScaler +except: + # dummy GradScaler for PyTorch < 1.6 + class GradScaler: + def __init__(self): + pass + def scale(self, loss): + return loss + def unscale_(self, optimizer): + pass + def step(self, optimizer): + optimizer.step() + def update(self): + pass + + +# exclude extremly large displacements +MAX_FLOW = 400 +SUM_FREQ = 100 +VAL_FREQ = 5000 + + +def sequence_loss(flow_preds, flow_gt, valid, gamma=0.8, max_flow=MAX_FLOW): + """ Loss function defined over sequence of flow predictions """ + + n_predictions = len(flow_preds) + flow_loss = 0.0 + + # exlude invalid pixels and extremely large diplacements + mag = torch.sum(flow_gt**2, dim=1).sqrt() + valid = (valid >= 0.5) & (mag < max_flow) + + for i in range(n_predictions): + i_weight = gamma**(n_predictions - i - 1) + i_loss = (flow_preds[i] - flow_gt).abs() + flow_loss += i_weight * (valid[:, None] * i_loss).mean() + + epe = torch.sum((flow_preds[-1] - flow_gt)**2, dim=1).sqrt() + epe = epe.view(-1)[valid.view(-1)] + + metrics = { + 'epe': epe.mean().item(), + '1px': (epe < 1).float().mean().item(), + '3px': (epe < 3).float().mean().item(), + '5px': (epe < 5).float().mean().item(), + } + + return flow_loss, metrics + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +def fetch_optimizer(args, model): + """ Create the optimizer and learning rate scheduler """ + optimizer = optim.AdamW(model.parameters(), lr=args.lr, weight_decay=args.wdecay, eps=args.epsilon) + + scheduler = optim.lr_scheduler.OneCycleLR(optimizer, args.lr, args.num_steps+100, + pct_start=0.05, cycle_momentum=False, anneal_strategy='linear') + + return optimizer, scheduler + + +class Logger: + def __init__(self, model, scheduler): + self.model = model + self.scheduler = scheduler + self.total_steps = 0 + self.running_loss = {} + self.writer = None + + def _print_training_status(self): + metrics_data = [self.running_loss[k]/SUM_FREQ for k in sorted(self.running_loss.keys())] + training_str = "[{:6d}, {:10.7f}] ".format(self.total_steps+1, self.scheduler.get_last_lr()[0]) + metrics_str = ("{:10.4f}, "*len(metrics_data)).format(*metrics_data) + + # print the training status + print(training_str + metrics_str) + + if self.writer is None: + self.writer = SummaryWriter() + + for k in self.running_loss: + self.writer.add_scalar(k, self.running_loss[k]/SUM_FREQ, self.total_steps) + self.running_loss[k] = 0.0 + + def push(self, metrics): + self.total_steps += 1 + + for key in metrics: + if key not in self.running_loss: + self.running_loss[key] = 0.0 + + self.running_loss[key] += metrics[key] + + if self.total_steps % SUM_FREQ == SUM_FREQ-1: + self._print_training_status() + self.running_loss = {} + + def write_dict(self, results): + if self.writer is None: + self.writer = SummaryWriter() + + for key in results: + self.writer.add_scalar(key, results[key], self.total_steps) + + def close(self): + self.writer.close() + + +def train(args): + + model = nn.DataParallel(RAFT(args), device_ids=args.gpus) + print("Parameter Count: %d" % count_parameters(model)) + + if args.restore_ckpt is not None: + model.load_state_dict(torch.load(args.restore_ckpt), 
strict=False) + + model.cuda() + model.train() + + if args.stage != 'chairs': + model.module.freeze_bn() + + train_loader = datasets.fetch_dataloader(args) + optimizer, scheduler = fetch_optimizer(args, model) + + total_steps = 0 + scaler = GradScaler(enabled=args.mixed_precision) + logger = Logger(model, scheduler) + + VAL_FREQ = 5000 + add_noise = True + + should_keep_training = True + while should_keep_training: + + for i_batch, data_blob in enumerate(train_loader): + optimizer.zero_grad() + image1, image2, flow, valid = [x.cuda() for x in data_blob] + + if args.add_noise: + stdv = np.random.uniform(0.0, 5.0) + image1 = (image1 + stdv * torch.randn(*image1.shape).cuda()).clamp(0.0, 255.0) + image2 = (image2 + stdv * torch.randn(*image2.shape).cuda()).clamp(0.0, 255.0) + + flow_predictions = model(image1, image2, iters=args.iters) + + loss, metrics = sequence_loss(flow_predictions, flow, valid, args.gamma) + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip) + + scaler.step(optimizer) + scheduler.step() + scaler.update() + + logger.push(metrics) + + if total_steps % VAL_FREQ == VAL_FREQ - 1: + PATH = 'checkpoints/%d_%s.pth' % (total_steps+1, args.name) + torch.save(model.state_dict(), PATH) + + results = {} + for val_dataset in args.validation: + if val_dataset == 'chairs': + results.update(evaluate.validate_chairs(model.module)) + elif val_dataset == 'sintel': + results.update(evaluate.validate_sintel(model.module)) + elif val_dataset == 'kitti': + results.update(evaluate.validate_kitti(model.module)) + + logger.write_dict(results) + + model.train() + if args.stage != 'chairs': + model.module.freeze_bn() + + total_steps += 1 + + if total_steps > args.num_steps: + should_keep_training = False + break + + logger.close() + PATH = 'checkpoints/%s.pth' % args.name + torch.save(model.state_dict(), PATH) + + return PATH + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--name', default='raft', help="name your experiment") + parser.add_argument('--stage', help="determines which dataset to use for training") + parser.add_argument('--restore_ckpt', help="restore checkpoint") + parser.add_argument('--small', action='store_true', help='use small model') + parser.add_argument('--validation', type=str, nargs='+') + + parser.add_argument('--lr', type=float, default=0.00002) + parser.add_argument('--num_steps', type=int, default=100000) + parser.add_argument('--batch_size', type=int, default=6) + parser.add_argument('--image_size', type=int, nargs='+', default=[384, 512]) + parser.add_argument('--gpus', type=int, nargs='+', default=[0,1]) + parser.add_argument('--mixed_precision', action='store_true', help='use mixed precision') + + parser.add_argument('--iters', type=int, default=12) + parser.add_argument('--wdecay', type=float, default=.00005) + parser.add_argument('--epsilon', type=float, default=1e-8) + parser.add_argument('--clip', type=float, default=1.0) + parser.add_argument('--dropout', type=float, default=0.0) + parser.add_argument('--gamma', type=float, default=0.8, help='exponential weighting') + parser.add_argument('--add_noise', action='store_true') + args = parser.parse_args() + + torch.manual_seed(1234) + np.random.seed(1234) + + if not os.path.isdir('checkpoints'): + os.mkdir('checkpoints') + + train(args) \ No newline at end of file diff --git a/preprocess/RAFT/train_mixed.sh b/preprocess/RAFT/train_mixed.sh new file mode 100644 index 
0000000000000000000000000000000000000000..d9b979f143902a17a0ba7b0a8f960598b7096e0b --- /dev/null +++ b/preprocess/RAFT/train_mixed.sh @@ -0,0 +1,6 @@ +#!/bin/bash +mkdir -p checkpoints +python -u train.py --name raft-chairs --stage chairs --validation chairs --gpus 0 --num_steps 120000 --batch_size 8 --lr 0.00025 --image_size 368 496 --wdecay 0.0001 --mixed_precision +python -u train.py --name raft-things --stage things --validation sintel --restore_ckpt checkpoints/raft-chairs.pth --gpus 0 --num_steps 120000 --batch_size 5 --lr 0.0001 --image_size 400 720 --wdecay 0.0001 --mixed_precision +python -u train.py --name raft-sintel --stage sintel --validation sintel --restore_ckpt checkpoints/raft-things.pth --gpus 0 --num_steps 120000 --batch_size 5 --lr 0.0001 --image_size 368 768 --wdecay 0.00001 --gamma=0.85 --mixed_precision +python -u train.py --name raft-kitti --stage kitti --validation kitti --restore_ckpt checkpoints/raft-sintel.pth --gpus 0 --num_steps 50000 --batch_size 5 --lr 0.0001 --image_size 288 960 --wdecay 0.00001 --gamma=0.85 --mixed_precision diff --git a/preprocess/RAFT/train_standard.sh b/preprocess/RAFT/train_standard.sh new file mode 100644 index 0000000000000000000000000000000000000000..7f559b386b6b596ec14a94f0d8c13974309b7d80 --- /dev/null +++ b/preprocess/RAFT/train_standard.sh @@ -0,0 +1,6 @@ +#!/bin/bash +mkdir -p checkpoints +python -u train.py --name raft-chairs --stage chairs --validation chairs --gpus 0 1 --num_steps 100000 --batch_size 10 --lr 0.0004 --image_size 368 496 --wdecay 0.0001 +python -u train.py --name raft-things --stage things --validation sintel --restore_ckpt checkpoints/raft-chairs.pth --gpus 0 1 --num_steps 100000 --batch_size 6 --lr 0.000125 --image_size 400 720 --wdecay 0.0001 +python -u train.py --name raft-sintel --stage sintel --validation sintel --restore_ckpt checkpoints/raft-things.pth --gpus 0 1 --num_steps 100000 --batch_size 6 --lr 0.000125 --image_size 368 768 --wdecay 0.00001 --gamma=0.85 +python -u train.py --name raft-kitti --stage kitti --validation kitti --restore_ckpt checkpoints/raft-sintel.pth --gpus 0 1 --num_steps 50000 --batch_size 6 --lr 0.0001 --image_size 288 960 --wdecay 0.00001 --gamma=0.85 diff --git a/preprocess/get_masked_videos.py b/preprocess/get_masked_videos.py new file mode 100644 index 0000000000000000000000000000000000000000..a34cd1d54a87ff3d80a89aca094b61aa51660cc0 --- /dev/null +++ b/preprocess/get_masked_videos.py @@ -0,0 +1,186 @@ +import os +import cv2 +import torch +import numpy as np +from tqdm import tqdm +from torchvision import transforms +import imageio +import argparse +import sys + +sys.path.append("RAFT/core") +from raft import RAFT +from utils.utils import InputPadder + +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + +def load_raft_model(ckpt_path): + args = argparse.Namespace( + small=False, + mixed_precision=False, + alternate_corr=False, + dropout=0.0, + max_depth=8, + depth_network=False, + depth_residual=False, + depth_scale=1.0 + ) + model = torch.nn.DataParallel(RAFT(args)) + model.load_state_dict(torch.load(ckpt_path, map_location=DEVICE)) + return model.module.to(DEVICE).eval() + +def run_masking(video_path, output_path, mask_path, raft): + cap = cv2.VideoCapture(video_path) + if not cap.isOpened(): + print(f"Failed to open video: {video_path}") + return + + fps = cap.get(cv2.CAP_PROP_FPS) + n_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + ok, first = cap.read() + if not ok: + print(f"Failed to read first frame in {video_path}") + return + + resize_to = (720, 480) + 
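    # cv2.resize takes (width, height), so each frame becomes 480 (H) x 720 (W) pixels +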
first = cv2.resize(first, resize_to) + H, W, _ = first.shape + area_thresh = (H * W) // 6 + + grid = np.stack(np.meshgrid(np.arange(W), np.arange(H)), -1).astype(np.float32) + pos = grid.copy() + vis = np.ones((H, W), dtype=bool) + + writer = imageio.get_writer(output_path, fps=int(fps)) + + prev = first.copy() + frames_since_corr = 0 + freeze_mask = False + frozen_mask = None + all_masks = [] + + writer.append_data(first[:, :, ::-1]) + all_masks.append(np.ones((H, W), dtype=bool)) + + def to_tensor(bgr): + return transforms.ToTensor()(bgr).unsqueeze(0).to(DEVICE) + + def raft_flow(img1_bgr, img2_bgr): + t1, t2 = to_tensor(img1_bgr), to_tensor(img2_bgr) + padder = InputPadder(t1.shape) + i1, i2 = padder.pad(t1, t2) + with torch.no_grad(): + _, flow = raft(i1, i2, iters=20, test_mode=True) + return padder.unpad(flow)[0].permute(1, 2, 0).cpu().numpy() + + for _ in range(1, n_frames): + ok, cur = cap.read() + if not ok: + break + cur = cv2.resize(cur, resize_to) + + if not freeze_mask: + flow_fw = raft_flow(prev, cur) + pos += flow_fw + frames_since_corr += 1 + + x_ok = (0 <= pos[..., 0]) & (pos[..., 0] < W) + y_ok = (0 <= pos[..., 1]) & (pos[..., 1] < H) + vis &= x_ok & y_ok + + m = np.zeros((H, W), np.uint8) + + ys, xs = np.where(vis) + px = np.round(pos[ys, xs, 0]).astype(int) + py = np.round(pos[ys, xs, 1]).astype(int) + + inb = (0 <= px) & (px < W) & (0 <= py) & (py < H) + m[py[inb], px[inb]] = 1 + m = cv2.dilate(m, np.ones((2, 2), np.uint8)) + + visible_ratio = m.sum() / (H * W) + if visible_ratio < 0.3: + flow_0t = raft_flow(first, cur) + pos = grid + flow_0t + + vis = np.ones((H, W), dtype=bool) + x_ok = (0 <= pos[..., 0]) & (pos[..., 0] < W) + y_ok = (0 <= pos[..., 1]) & (pos[..., 1] < H) + vis &= x_ok & y_ok + + m.fill(0) + ys, xs = np.where(vis) + px = np.round(pos[ys, xs, 0]).astype(int) + py = np.round(pos[ys, xs, 1]).astype(int) + inb = (0 <= px) & (px < W) & (0 <= py) & (py < H) + m[py[inb], px[inb]] = 1 + m = cv2.dilate(m, np.ones((2, 2), np.uint8)) + + if m.sum() < area_thresh: + freeze_mask = True + frozen_mask = m.copy() + + frames_since_corr = 0 + else: + m = frozen_mask + + effective_mask = m.astype(bool) + all_masks.append(effective_mask) + + out = cur.copy() + out[~effective_mask] = 0 + writer.append_data(out[:, :, ::-1]) + + prev = cur if not freeze_mask else prev + + writer.close() + cap.release() + + all_masks_array = np.stack(all_masks, axis=0) + np.savez_compressed(mask_path, mask=all_masks_array) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--video_path", type=str, required=True) + parser.add_argument("--output_path", type=str, required=True) + parser.add_argument("--mask_path", type=str, required=True) + parser.add_argument("--raft_ckpt", type=str, required=True) + parser.add_argument("--start_idx", type=int, required=True) + parser.add_argument("--end_idx", type=int, required=True) + parser.add_argument("--gpu_id", type=int, required=True) + + args = parser.parse_args() + + os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) + DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + + os.makedirs(args.output_path, exist_ok=True) + os.makedirs(args.mask_path, exist_ok=True) + + video_list = sorted([ + f for f in os.listdir(args.video_path) + if f.endswith(".mp4") + ]) + selected_videos = video_list[args.start_idx : args.end_idx] + + print(f"[GPU {args.gpu_id}] Processing {len(selected_videos)} videos: {args.start_idx} to {args.end_idx}") + model = load_raft_model(args.raft_ckpt) + + for fname in 
tqdm(selected_videos, desc="Batch Processing"): + input_path = os.path.join(args.video_path, fname) + mask_path = os.path.join(args.mask_path, fname.replace(".mp4", ".npz")) + output_path = os.path.join(args.output_path, fname) + + if os.path.exists(mask_path): + try: + np.load(mask_path)["mask"] + continue + except: + print(f"⚠️ Mask corrupt or unreadable: {mask_path} - Regenerating") + + if os.path.exists(output_path): + continue + + run_masking(input_path, output_path, mask_path, model) \ No newline at end of file diff --git a/preprocess/get_prompt_emb.py b/preprocess/get_prompt_emb.py new file mode 100644 index 0000000000000000000000000000000000000000..20a07ed475236450f696e3d4f6edbb31994e5a8a --- /dev/null +++ b/preprocess/get_prompt_emb.py @@ -0,0 +1,69 @@ +#!/usr/bin/env python3 +import argparse +import torch +import os +import tqdm +from transformers import T5Tokenizer, T5EncoderModel, AutoTokenizer + +def compute_prompt_embeddings(tokenizer, text_encoder, prompts, max_sequence_length=226, device=torch.device("cpu"), dtype=torch.float16): + if isinstance(prompts, str): + prompts = [prompts] + + text_inputs = tokenizer( + prompts, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt" + ) + text_input_ids = text_inputs.input_ids.to(device) + + with torch.no_grad(): + prompt_embeds = text_encoder(text_input_ids)[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + return prompt_embeds + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + tokenizer = AutoTokenizer.from_pretrained( + args.pretrained_model_name_or_path, subfolder="tokenizer", revision=None + ) + text_encoder = T5EncoderModel.from_pretrained( + args.pretrained_model_name_or_path, subfolder="text_encoder", revision=None + ).to(device) + + all_files = sorted(os.listdir(args.caption_path)) + chunk = all_files[args.start_idx: args.end_idx] + + os.makedirs(args.output_path, exist_ok=True) + + for name in tqdm.tqdm(chunk, desc=f"GPU {args.gpu_id}"): + with open(os.path.join(args.caption_path, name), 'r') as f: + caption = f.read().strip() + + embeddings = compute_prompt_embeddings( + tokenizer, + text_encoder, + caption, + max_sequence_length=args.max_sequence_length, + device=device, + dtype=torch.bfloat16 + ).cpu() + torch.save(embeddings, os.path.join(args.output_path, name.replace('.txt', '') + '.pt')) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Single-GPU T5 prompt embedding") + parser.add_argument("--pretrained_model_name_or_path", type=str, required=True) + parser.add_argument("--caption_path", type=str, required=True) + parser.add_argument("--output_path", type=str, required=True) + parser.add_argument("--max_sequence_length", type=int, default=226) + parser.add_argument("--gpu_id", type=int, required=True) + parser.add_argument("--start_idx", type=int, required=True) + parser.add_argument("--end_idx", type=int, required=True) + args = parser.parse_args() + + os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) + main(args) diff --git a/preprocess/get_vae_latent.py b/preprocess/get_vae_latent.py new file mode 100644 index 0000000000000000000000000000000000000000..183e45e497e1fbdb2772a08124c2645d30f50c57 --- /dev/null +++ b/preprocess/get_vae_latent.py @@ -0,0 +1,119 @@ +import os +import torch +import numpy as np +import math +import argparse +from decord import VideoReader +from diffusers import AutoencoderKLCogVideoX +from safetensors.torch import save_file 
+import tqdm +import random + +def encode_video(video, vae): + video = video[None].permute(0, 2, 1, 3, 4).contiguous() + video = video.to(vae.device, dtype=vae.dtype) + latent_dist = vae.encode(video).latent_dist + latent = latent_dist.sample() * vae.config.scaling_factor + return latent + +def add_dashed_rays_to_video(video_tensor, num_perp_samples=50, density_decay=0.075): + T, C, H, W = video_tensor.shape + max_length = int((H**2 + W**2) ** 0.5) + 10 + center = torch.tensor([W / 2, H / 2]) + theta = torch.rand(1).item() * 2 * math.pi + direction = torch.tensor([math.cos(theta), math.sin(theta)]) + direction = direction / direction.norm() + d_perp = torch.tensor([-direction[1], direction[0]]) + half_len = max(H, W) // 2 + positions = torch.linspace(-half_len, half_len, num_perp_samples) + perp_coords = center[None, :] + positions[:, None] * d_perp[None, :] + x0, y0 = perp_coords[:, 0], perp_coords[:, 1] + steps = [] + dist = 0 + while dist < max_length: + steps.append(dist) + dist += 1.0 + density_decay * dist + steps = torch.tensor(steps) + S = len(steps) + dxdy = direction[None, :] * steps[:, None] + all_xy = perp_coords[:, None, :] + dxdy[None, :, :] + all_xy = all_xy.reshape(-1, 2) + all_x = all_xy[:, 0].round().long() + all_y = all_xy[:, 1].round().long() + valid = (0 <= all_x) & (all_x < W) & (0 <= all_y) & (all_y < H) + all_x = all_x[valid] + all_y = all_y[valid] + x0r = x0.round().long().clamp(0, W - 1) + y0r = y0.round().long().clamp(0, H - 1) + frame0 = video_tensor[0] + base_colors = frame0[:, y0r, x0r] + base_colors = base_colors.repeat_interleave(S, dim=1)[:, valid] + video_out = video_tensor.clone() + offsets = [(0, 0), (0, 1), (1, 0), (1, 1)] + for dxo, dyo in offsets: + ox = all_x + dxo + oy = all_y + dyo + inside = (0 <= ox) & (ox < W) & (0 <= oy) & (oy < H) + ox = ox[inside] + oy = oy[inside] + colors = base_colors[:, inside] + for c in range(C): + video_out[1:, c, oy, ox] = colors[c][None, :].expand(T - 1, -1) + return video_out + +def main(args): + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + vae = AutoencoderKLCogVideoX.from_pretrained(args.pretrained_model_path, subfolder="vae") + vae.requires_grad_(False) + vae = vae.to(device, dtype=torch.float16) + + masked_video_path = os.path.join(args.video_root, "masked_videos") + source_video_path = os.path.join(args.video_root, "videos") + joint_latent_path = os.path.join(args.video_root, "joint_latents") + os.makedirs(joint_latent_path, exist_ok=True) + + all_video_names = sorted(os.listdir(source_video_path)) + video_names = all_video_names[args.start_idx : args.end_idx] + + for video_name in tqdm.tqdm(video_names, desc=f"GPU {args.gpu_id}"): + masked_video_file = os.path.join(masked_video_path, video_name) + source_video_file = os.path.join(source_video_path, video_name) + output_file = os.path.join(joint_latent_path, video_name.replace('.mp4', '.safetensors')) + + if not os.path.exists(masked_video_file): + print(f"Skipping {video_name}, masked video not found.") + continue + if os.path.exists(output_file): + continue + + try: + vr = VideoReader(source_video_file) + video = torch.from_numpy(vr.get_batch(np.arange(49)).asnumpy()).permute(0, 3, 1, 2).contiguous() + video = (video / 255.0) * 2 - 1 + source_latent = encode_video(video, vae) + + vr = VideoReader(masked_video_file) + video = torch.from_numpy(vr.get_batch(np.arange(49)).asnumpy()).permute(0, 3, 1, 2).contiguous() + video = (video / 255.0) * 2 - 1 + video = add_dashed_rays_to_video(video) + masked_latent = encode_video(video, vae) + + 
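            # move both latents to CPU and stack source + masked along the temporal axis (dim=2); the latent dataset splits them back at shape[2] // 2 +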
source_latent = source_latent.to("cpu") + masked_latent = masked_latent.to("cpu") + cated_latent = torch.cat([source_latent, masked_latent], dim=2) + save_file({'joint_latents': cated_latent}, output_file) + + except Exception as e: + print(f"[GPU {args.gpu_id}] Error processing {video_name}: {e}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--video_root", type=str, required=True) + parser.add_argument("--pretrained_model_path", type=str, required=True) + parser.add_argument("--start_idx", type=int, required=True) + parser.add_argument("--end_idx", type=int, required=True) + parser.add_argument("--gpu_id", type=int, required=True) + args = parser.parse_args() + + os.environ["CUDA_VISIBLE_DEVICES"] = str(args.gpu_id) + main(args) diff --git a/preprocess/preprocess.sh b/preprocess/preprocess.sh new file mode 100644 index 0000000000000000000000000000000000000000..8131ae165481c741ebf6e42edb5e929cd1021ce3 --- /dev/null +++ b/preprocess/preprocess.sh @@ -0,0 +1,108 @@ +#!/bin/bash +trap 'kill 0' SIGINT + +mode=$1 # Options: caption, masking, latent + +video_root="../data/train" +pretrained_model_path="../pretrained/CogVideoX-5b-I2V" +raft_ckpt="../pretrained/RAFT/raft-things.pth" +# gpu_list="0,1,2,3,4,5,6,7" +gpu_list="2,3,6,7" + +gpus=(${gpu_list//,/ }) +num_gpus=${#gpus[@]} + +if [[ "$mode" == "caption" ]]; then + echo "==== Running CAPTION EMBEDDING ====" + + caption_path="$video_root/captions" + caption_emb_path="$video_root/caption_embs" + all_files=($caption_path/*.txt) + total=${#all_files[@]} + chunk_size=$(( (total + num_gpus - 1) / num_gpus )) + + echo "Total caption files: $total" + echo "Using $num_gpus GPUs, chunk size: $chunk_size" + + for ((i=0; i<num_gpus; i++)); do + start_idx=$(( i * chunk_size )) + end_idx=$(( start_idx + chunk_size )) + (( end_idx > total )) && end_idx=$total + + gpu_id=${gpus[$i]} + echo "Launching GPU $gpu_id: captions $start_idx to $end_idx" + + CUDA_VISIBLE_DEVICES=$gpu_id python get_prompt_emb.py \ + --pretrained_model_name_or_path $pretrained_model_path \ + --caption_path $caption_path \ + --output_path $caption_emb_path \ + --gpu_id $gpu_id \ + --start_idx $start_idx \ + --end_idx $end_idx & + done + +elif [[ "$mode" == "masking" ]]; then + echo "==== Running VIDEO MASKING ====" + + source_video_dir="$video_root/videos" + mask_dir="$video_root/masks" + masked_video_dir="$video_root/masked_videos" + all_videos=($source_video_dir/*.mp4) + total=${#all_videos[@]} + chunk_size=$(( (total + num_gpus - 1) / num_gpus )) + + echo "Total videos: $total" + echo "Using $num_gpus GPUs, chunk size: $chunk_size" + + for ((i=0; i<num_gpus; i++)); do + start_idx=$(( i * chunk_size )) + end_idx=$(( start_idx + chunk_size )) + (( end_idx > total )) && end_idx=$total + + gpu_id=${gpus[$i]} + echo "Launching GPU $gpu_id: videos $start_idx to $end_idx" + + CUDA_VISIBLE_DEVICES=$gpu_id python get_masked_videos.py \ + --video_path $source_video_dir \ + --output_path $masked_video_dir \ + --mask_path $mask_dir \ + --raft_ckpt $raft_ckpt \ + --start_idx $start_idx \ + --end_idx $end_idx \ + --gpu_id $gpu_id & + done + +elif [[ "$mode" == "latent" ]]; then + echo "==== Running LATENT ENCODING ====" + + all_videos=($video_root/videos/*.mp4) + total=${#all_videos[@]} + chunk_size=$(( (total + num_gpus - 1) / num_gpus )) + + echo "Total videos: $total" + echo "Using $num_gpus GPUs, chunk size: $chunk_size" + + for ((i=0; i<num_gpus; i++)); do + start_idx=$(( i * chunk_size )) + end_idx=$(( start_idx + chunk_size )) + (( end_idx > total )) && end_idx=$total + + gpu_id=${gpus[$i]} + echo "Launching GPU $gpu_id: videos $start_idx to $end_idx" + + CUDA_VISIBLE_DEVICES=$gpu_id python get_vae_latent.py \ + --video_root $video_root \ + --pretrained_model_path $pretrained_model_path \ + --start_idx $start_idx \ + --end_idx $end_idx \ + --gpu_id $gpu_id & + done + 
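+# each GPU worker runs in the background on a contiguous [start_idx, end_idx) slice of the sorted file list; the 'wait' at the end of the script blocks until all workers finish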
+else + echo "Unknown mode: $mode" + echo "Usage: bash preprocess.sh [caption|masking|latent]" + exit 1 +fi + +wait +echo "All processes completed." \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbaa686b27e106d8d728b06d213605879bcd2099 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,17 @@ +diffusers==0.32.2 +accelerate>=1.1.1 +transformers>=4.46.2 +numpy==1.26.0 +torch>=2.5.0 +torchvision>=0.20.0 +opencv-python>=4.10.0.84 +imageio>=2.35.1 +imageio-ffmpeg>=0.5.1 +sentencepiece>=0.2.0 +einops +decord +protobuf +scipy +scikit-image +qwen_vl_utils +gdown \ No newline at end of file diff --git a/scripts/inference.sh b/scripts/inference.sh new file mode 100644 index 0000000000000000000000000000000000000000..32d7c5c2efeb4da1ea0962e3e47f0fd73df658f2 --- /dev/null +++ b/scripts/inference.sh @@ -0,0 +1,34 @@ +#!/bin/bash +trap 'kill 0' SIGINT + +export MODEL_PATH="./pretrained/CogVideoX-5b-I2V" + +processed_data_name=$1 +ckpt_steps=500 +ckpt_dir=./out/EPiC_pretrained +ckpt_file=checkpoint-${ckpt_steps}.pt +ckpt_path=${ckpt_dir}/${ckpt_file} +video_root_dir="./data/${processed_data_name}" +out_dir=${ckpt_dir}/test/${ckpt_steps}_${processed_data_name} + +CUDA_VISIBLE_DEVICES=0 python inference/cli_demo_camera_i2v_pcd.py \ + --video_root_dir $video_root_dir \ + --base_model_path $MODEL_PATH \ + --controlnet_model_path $ckpt_path \ + --output_path "${out_dir}" \ + --start_camera_idx 0 \ + --end_camera_idx 8 \ + --controlnet_weights 1.0 \ + --controlnet_guidance_start 0.0 \ + --controlnet_guidance_end 0.4 \ + --controlnet_input_channels 3 \ + --controlnet_transformer_num_attn_heads 4 \ + --controlnet_transformer_attention_head_dim 64 \ + --controlnet_transformer_out_proj_dim_factor 64 \ + --controlnet_transformer_out_proj_dim_zero_init \ + --vae_channels 16 \ + --num_frames 49 \ + --controlnet_transformer_num_layers 8 \ + --infer_with_mask \ + --pool_style 'max' \ + --seed 1 diff --git a/scripts/train.sh b/scripts/train.sh new file mode 100644 index 0000000000000000000000000000000000000000..d02145b3d21e443d5fd41c49f5b67f2d937ffe2b --- /dev/null +++ b/scripts/train.sh @@ -0,0 +1,60 @@ +MODEL_PATH="pretrained/CogVideoX-5b-I2V" +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +video_root_dir="data/train" # subfolders: annotations/ pose_files/ video_clips/ + +dir=`pwd` +output_dir=${dir}/out/EPiC +MODEL_PATH=${dir}/${MODEL_PATH} +video_root_dir=${dir}/${video_root_dir} +cd training + +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" accelerate launch --config_file accelerate_config_machine.yaml --multi_gpu --main_process_port 29502 \ + train_controlnet_i2v_pcd_render_mask_aware_add_dash.py \ + --tracker_name "cogvideox-controlnet" \ + --gradient_checkpointing \ + --pretrained_model_name_or_path $MODEL_PATH \ + --enable_tiling \ + --enable_slicing \ + --num_inference_steps 28 \ + --seed 42 \ + --mixed_precision bf16 \ + --output_dir $output_dir \ + --height 480 \ + --width 720 \ + --fps 8 \ + --max_num_frames 49 \ + --video_root_dir $video_root_dir \ + --hflip_p 0.0 \ + --controlnet_transformer_num_layers 8 \ + --controlnet_input_channels 3 \ + --downscale_coef 8 \ + --controlnet_weights 1.0 \ + --train_batch_size 2 \ + --dataloader_num_workers 0 \ + --num_train_epochs 2 \ + --checkpointing_steps 500 \ + --gradient_accumulation_steps 1 \ + --learning_rate 2e-4 \ + --lr_scheduler cosine_with_restarts \ + --lr_warmup_steps 250 \ + --lr_num_cycles 1 \ + --enable_slicing \ + --enable_tiling \ + 
--gradient_checkpointing \ + --optimizer AdamW \ + --adam_beta1 0.9 \ + --adam_beta2 0.95 \ + --max_grad_norm 1.0 \ + --allow_tf32 \ + --enable_time_sampling \ + --time_sampling_type truncated_normal \ + --time_sampling_mean 0.95 \ + --time_sampling_std 0.1 \ + --controlnet_guidance_start 0.0 \ + --controlnet_guidance_end 1.0 \ + --controlnet_transformer_num_attn_heads 4 \ + --controlnet_transformer_attention_head_dim 64 \ + --controlnet_transformer_out_proj_dim_factor 64 \ + --controlnet_transformer_out_proj_dim_zero_init \ + --text_embedding_path "${video_root_dir}/caption_embs" \ No newline at end of file diff --git a/scripts/train_with_latent.sh b/scripts/train_with_latent.sh new file mode 100644 index 0000000000000000000000000000000000000000..75bd9cf3425958bbd0a3a302d53db3b3058c8912 --- /dev/null +++ b/scripts/train_with_latent.sh @@ -0,0 +1,60 @@ +MODEL_PATH="pretrained/CogVideoX-5b-I2V" +export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True + +video_root_dir="data/train" # subfolders: annotations/ pose_files/ video_clips/ + +dir=`pwd` +output_dir=${dir}/out/EPiC +MODEL_PATH=${dir}/${MODEL_PATH} +video_root_dir=${dir}/${video_root_dir} +cd training + +CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" accelerate launch --config_file accelerate_config_machine.yaml --multi_gpu --main_process_port 29502 \ + train_controlnet_i2v_pcd_render_mask_aware_add_dash_use_latent.py \ + --tracker_name "cogvideox-controlnet" \ + --gradient_checkpointing \ + --pretrained_model_name_or_path $MODEL_PATH \ + --enable_tiling \ + --enable_slicing \ + --num_inference_steps 28 \ + --seed 42 \ + --mixed_precision bf16 \ + --output_dir $output_dir \ + --height 480 \ + --width 720 \ + --fps 8 \ + --max_num_frames 49 \ + --video_root_dir $video_root_dir \ + --hflip_p 0.0 \ + --controlnet_transformer_num_layers 8 \ + --controlnet_input_channels 3 \ + --downscale_coef 8 \ + --controlnet_weights 1.0 \ + --train_batch_size 2 \ + --dataloader_num_workers 0 \ + --num_train_epochs 2 \ + --checkpointing_steps 500 \ + --gradient_accumulation_steps 1 \ + --learning_rate 2e-4 \ + --lr_scheduler cosine_with_restarts \ + --lr_warmup_steps 250 \ + --lr_num_cycles 1 \ + --enable_slicing \ + --enable_tiling \ + --gradient_checkpointing \ + --optimizer AdamW \ + --adam_beta1 0.9 \ + --adam_beta2 0.95 \ + --max_grad_norm 1.0 \ + --allow_tf32 \ + --enable_time_sampling \ + --time_sampling_type truncated_normal \ + --time_sampling_mean 0.95 \ + --time_sampling_std 0.1 \ + --controlnet_guidance_start 0.0 \ + --controlnet_guidance_end 1.0 \ + --controlnet_transformer_num_attn_heads 4 \ + --controlnet_transformer_attention_head_dim 64 \ + --controlnet_transformer_out_proj_dim_factor 64 \ + --controlnet_transformer_out_proj_dim_zero_init \ + --text_embedding_path "${video_root_dir}/caption_embs" \ No newline at end of file diff --git a/training/__init__.py b/training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/training/accelerate_config_machine.yaml b/training/accelerate_config_machine.yaml new file mode 100644 index 0000000000000000000000000000000000000000..8f907bccbcddf6db57ffc9248f0ed29c2996bf9b --- /dev/null +++ b/training/accelerate_config_machine.yaml @@ -0,0 +1,25 @@ +compute_environment: LOCAL_MACHINE +main_process_port: 29501 +debug: false +deepspeed_config: + gradient_accumulation_steps: 1 + gradient_clipping: 1.0 + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: false + zero_stage: 2 +distributed_type: DEEPSPEED 
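+# DeepSpeed ZeRO stage 2 with no optimizer/parameter offload; num_processes: 8 below matches the 8 GPUs exported in scripts/train.sh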
+downcast_bf16: 'no' +enable_cpu_affinity: false +machine_rank: 0 +main_training_function: main +dynamo_backend: 'no' +mixed_precision: 'no' +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/training/controlnet_datasets_camera_pcd_mask.py b/training/controlnet_datasets_camera_pcd_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..d33f2e050a7ca74e53af34de1b0036fa28bff155 --- /dev/null +++ b/training/controlnet_datasets_camera_pcd_mask.py @@ -0,0 +1,200 @@ +import os +import random +import json +import torch + +import torch.nn as nn +import torchvision.transforms as transforms +import torchvision.transforms.functional as F +import numpy as np + +from decord import VideoReader +from torch.utils.data.dataset import Dataset +from packaging import version as pver + +class RandomHorizontalFlipWithPose(nn.Module): + def __init__(self, p=0.5): + super(RandomHorizontalFlipWithPose, self).__init__() + self.p = p + + def get_flip_flag(self, n_image): + return torch.rand(n_image) < self.p + + def forward(self, image, flip_flag=None): + n_image = image.shape[0] + if flip_flag is not None: + assert n_image == flip_flag.shape[0] + else: + flip_flag = self.get_flip_flag(n_image) + + ret_images = [] + for fflag, img in zip(flip_flag, image): + if fflag: + ret_images.append(F.hflip(img)) + else: + ret_images.append(img) + return torch.stack(ret_images, dim=0) + +class RealEstate10KPCDRenderDataset(Dataset): + def __init__( + self, + video_root_dir, + sample_n_frames=49, + image_size=[480, 720], + shuffle_frames=False, + hflip_p=0.0, + ): + if hflip_p != 0.0: + use_flip = True + else: + use_flip = False + root_path = video_root_dir + self.root_path = root_path + self.sample_n_frames = sample_n_frames + self.source_video_root = os.path.join(self.root_path, 'videos') + self.mask_video_root = os.path.join(self.root_path, 'masked_videos') + self.captions_root = os.path.join(self.root_path, 'captions') + self.dataset = sorted([n.replace('.mp4','') for n in os.listdir(self.source_video_root)]) + self.length = len(self.dataset) + sample_size = image_size + sample_size = tuple(sample_size) if not isinstance(sample_size, int) else (sample_size, sample_size) + self.sample_size = sample_size + if use_flip: + pixel_transforms = [transforms.Resize(sample_size), + RandomHorizontalFlipWithPose(hflip_p), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)] + else: + pixel_transforms = [transforms.Resize(sample_size), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5], inplace=True)] + self.sample_wh_ratio = sample_size[1] / sample_size[0] + + self.pixel_transforms = pixel_transforms + self.shuffle_frames = shuffle_frames + self.use_flip = use_flip + + def load_video_reader(self, idx): + clip_name = self.dataset[idx] + video_path = os.path.join(self.source_video_root, clip_name + '.mp4') + video_reader = VideoReader(video_path) + mask_video_path = os.path.join(self.mask_video_root, clip_name + '.mp4') + mask_video_reader = VideoReader(mask_video_path) + caption_path = os.path.join(self.captions_root, clip_name + '.txt') + if os.path.exists(caption_path): + caption = open(caption_path, 'r').read().strip() + else: + caption = '' + return clip_name, video_reader, mask_video_reader, caption + + def get_batch(self, idx): + clip_name, video_reader, mask_video_reader, video_caption = self.load_video_reader(idx) + if 
self.use_flip: + flip_flag = self.pixel_transforms[1].get_flip_flag(self.sample_n_frames) + else: + flip_flag = torch.zeros(self.sample_n_frames, dtype=torch.bool) + + indices = np.arange(self.sample_n_frames) + pixel_values = torch.from_numpy(video_reader.get_batch(indices).asnumpy()).permute(0, 3, 1, 2).contiguous() + pixel_values = pixel_values / 255. + + anchor_pixels = torch.from_numpy(mask_video_reader.get_batch(indices).asnumpy()).permute(0, 3, 1, 2).contiguous() + anchor_pixels = anchor_pixels / 255. + + return pixel_values, anchor_pixels, video_caption, flip_flag, clip_name + + def __len__(self): + return self.length + + def __getitem__(self, idx): + while True: + try: + video, anchor_video, video_caption, flip_flag, clip_name = self.get_batch(idx) + break + + except Exception as e: + idx = random.randint(0, self.length - 1) + if self.use_flip: + video = self.pixel_transforms[0](video) + video = self.pixel_transforms[1](video, flip_flag) + video = self.pixel_transforms[2](video) + anchor_video = self.pixel_transforms[0](anchor_video) + anchor_video = self.pixel_transforms[1](anchor_video, flip_flag) + anchor_video = self.pixel_transforms[2](anchor_video) + else: + for transform in self.pixel_transforms: + video = transform(video) + anchor_video = transform(anchor_video) + data = { + 'video': video, + 'anchor_video': anchor_video, + 'caption': video_caption, + } + return data + +class RealEstate10KPCDRenderCapEmbDataset(RealEstate10KPCDRenderDataset): + def __init__( + self, + video_root_dir, + text_embedding_path, + sample_n_frames=49, + image_size=[480, 720], + shuffle_frames=False, + hflip_p=0.0, + ): + super().__init__( + video_root_dir, + sample_n_frames=sample_n_frames, + image_size=image_size, + shuffle_frames=shuffle_frames, + hflip_p=hflip_p, + ) + self.text_embedding_path = text_embedding_path + self.mask_root = os.path.join(self.root_path, 'masks') + + def get_batch(self, idx): + clip_name, video_reader, mask_video_reader, video_caption = self.load_video_reader(idx) + cap_emb_path = os.path.join(self.text_embedding_path, clip_name + '.pt') + video_caption_emb = torch.load(cap_emb_path, weights_only=True) + if self.use_flip: + flip_flag = self.pixel_transforms[1].get_flip_flag(self.sample_n_frames) + else: + flip_flag = torch.zeros(self.sample_n_frames, dtype=torch.bool) + indices = np.arange(self.sample_n_frames) + pixel_values = torch.from_numpy(video_reader.get_batch(indices).asnumpy()).permute(0, 3, 1, 2).contiguous() + pixel_values = pixel_values / 255. + + anchor_pixels = torch.from_numpy(mask_video_reader.get_batch(indices).asnumpy()).permute(0, 3, 1, 2).contiguous() + anchor_pixels = anchor_pixels / 255. 
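+        # prefer the precomputed .npz visibility mask; if it is missing or unreadable, fall back to thresholding near-black pixels of the masked (anchor) video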
+ try: + masks = np.load(os.path.join(self.mask_root, clip_name + '.npz'))['mask']*1.0 + masks = torch.from_numpy(masks).unsqueeze(1) + except: + threshold = 0.1 # you can adjust this value + masks = (anchor_pixels.sum(dim=1, keepdim=True) < threshold).float() + return pixel_values, anchor_pixels, masks, video_caption_emb, flip_flag, clip_name + + def __getitem__(self, idx): + while True: + try: + video, anchor_video, mask, video_caption_emb, flip_flag, clip_name = self.get_batch(idx) + break + + except Exception as e: + idx = random.randint(0, self.length - 1) + if self.use_flip: + video = self.pixel_transforms[0](video) + video = self.pixel_transforms[1](video, flip_flag) + video = self.pixel_transforms[2](video) + anchor_video = self.pixel_transforms[0](anchor_video) + anchor_video = self.pixel_transforms[1](anchor_video, flip_flag) + anchor_video = self.pixel_transforms[2](anchor_video) + else: + for transform in self.pixel_transforms: + video = transform(video) + anchor_video = transform(anchor_video) + data = { + 'video': video, + 'anchor_video': anchor_video, + 'caption_emb': video_caption_emb, + 'mask': mask + } + return data \ No newline at end of file diff --git a/training/controlnet_datasets_camera_pcd_mask_latent.py b/training/controlnet_datasets_camera_pcd_mask_latent.py new file mode 100644 index 0000000000000000000000000000000000000000..ca81be95140df9ebdbe86f6302a5d924349ee204 --- /dev/null +++ b/training/controlnet_datasets_camera_pcd_mask_latent.py @@ -0,0 +1,68 @@ +import os +import random +import json +import torch + +import torch.nn as nn +import torchvision.transforms.functional as F +import numpy as np + +from torch.utils.data.dataset import Dataset +from packaging import version as pver +from decord import VideoReader + +from safetensors.torch import load_file + +class RealEstate10KPCDRenderLatentCapEmbDataset(Dataset): + def __init__( + self, + video_root_dir, + text_embedding_path + ): + root_path = video_root_dir + self.root_path = root_path + self.latent_root = os.path.join(self.root_path, 'joint_latents') + self.source_video_root = os.path.join(self.root_path, 'videos') + self.captions_root = os.path.join(self.root_path, 'captions') + self.dataset = sorted([n.replace('.safetensors','') for n in os.listdir(self.latent_root)]) + self.length = len(self.dataset) + self.text_embedding_path = text_embedding_path + self.mask_root = os.path.join(self.root_path, 'masks') + + def get_batch(self, idx): + clip_name = self.dataset[idx] + cap_emb_path = os.path.join(self.text_embedding_path, clip_name + '.pt') + video_caption_emb = torch.load(cap_emb_path, weights_only=True) + joint_latent_path = os.path.join(self.latent_root, clip_name + '.safetensors') + joint_latent = load_file(joint_latent_path, device='cpu')['joint_latent'] + video_reader = VideoReader(os.path.join(self.source_video_root, clip_name + '.mp4')) + indices = [0] + first_frame = torch.from_numpy(video_reader.get_batch(indices).asnumpy()).permute(0, 3, 1, 2).contiguous() + first_frame = (first_frame / 255.)*2-1 + + T = joint_latent.shape[2] // 2 + source_latent = joint_latent[:, :, :T] + anchor_latent = joint_latent[:, :, T:] + masks = np.load(os.path.join(self.mask_root, clip_name + '.npz'))['mask']*1.0 + masks = torch.from_numpy(masks).unsqueeze(1) + return source_latent, anchor_latent, first_frame, masks, video_caption_emb, clip_name + + def __len__(self): + return self.length + + def __getitem__(self, idx): + while True: + try: + source_latent, anchor_latent, image, mask, video_caption_emb, clip_name = 
self.get_batch(idx) + break + + except Exception as e: + idx = random.randint(0, self.length - 1) + data = { + 'source_latent': source_latent, + 'anchor_latent': anchor_latent, + 'image': image, + 'caption_emb': video_caption_emb, + 'mask': mask + } + return data \ No newline at end of file diff --git a/training/train_controlnet_i2v_pcd_render_mask_aware_add_dash.py b/training/train_controlnet_i2v_pcd_render_mask_aware_add_dash.py new file mode 100644 index 0000000000000000000000000000000000000000..1feac075b9794a156a77e4c130480a93d5ce43f5 --- /dev/null +++ b/training/train_controlnet_i2v_pcd_render_mask_aware_add_dash.py @@ -0,0 +1,1361 @@ +# Copyright 2024 The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append('..') +import argparse +import logging +import math +import os +import shutil +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import torch +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed +from huggingface_hub import create_repo, upload_folder +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms +from tqdm.auto import tqdm +import numpy as np +from decord import VideoReader +from transformers import AutoTokenizer, T5EncoderModel, T5Tokenizer + +import diffusers +from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler +from diffusers.models.embeddings import get_3d_rotary_pos_embed +from diffusers.optimization import get_scheduler +from diffusers.pipelines.cogvideo.pipeline_cogvideox import get_resize_crop_region_for_grid +from diffusers.training_utils import ( + cast_training_params, + # clear_objs_and_retain_memory, +) +from diffusers.utils import check_min_version, export_to_video, is_wandb_available +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card +from diffusers.utils.torch_utils import is_compiled_module + +from controlnet_datasets_camera_pcd_mask import RealEstate10KPCDRenderCapEmbDataset +from controlnet_pipeline import ControlnetCogVideoXImageToVideoPCDPipeline +from cogvideo_transformer import CustomCogVideoXTransformer3DModel +from cogvideo_controlnet_pcd import CogVideoXControlnetPCD + + +if is_wandb_available(): + import wandb + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
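+# (requirements.txt pins diffusers==0.32.2, which satisfies this minimum-version check)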
+check_min_version("0.31.0.dev0") + +logger = get_logger(__name__) + + +def get_args(): + parser = argparse.ArgumentParser(description="Simple example of a training script for CogVideoX.") + + # Model information + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", + ) + parser.add_argument( + "--video_root_dir", + type=str, + default=None, + required=True, + help=("A folder containing the training data."), + ) + parser.add_argument( + "--text_embedding_path", + type=str, + default="/root_path/text_embedding", + required=False, + help=("Relative path to the text embeddings."), + ) + parser.add_argument( + "--csv_path", + type=str, + default=None, + required=False, + help=("A path to csv."), + ) + parser.add_argument( + "--hflip_p", + type=float, + default=0.5, + required=False, + help="Video horizontal flip probability.", + ) + parser.add_argument( + "--use_zero_conv", + action="store_true", + ) + parser.add_argument( + "--controlnet_transformer_num_layers", + type=int, + default=2, + required=False, + help=("Count of controlnet blocks."), + ) + parser.add_argument( + "--downscale_coef", + type=int, + default=8, + required=False, + help=("Downscale coef as encoder decreases resolutio before apply transformer."), + ) + parser.add_argument( + "--controlnet_input_channels", + type=int, + default=3, + required=False, + help=("Controlnet encoder input channels."), + ) + parser.add_argument( + "--controlnet_weights", + type=float, + default=1.0, + required=False, + help=("Controlnet blocks weight."), + ) + parser.add_argument( + "--init_from_transformer", + action="store_true", + help="Whether or not load start controlnet parameters from transformer model.", + ) + parser.add_argument( + "--pretrained_controlnet_path", + type=str, + default=None, + required=False, + help=("Path to controlnet .pt checkpoint."), + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + # Validation + parser.add_argument( + "--num_inference_steps", + type=int, + default=50, + help=( + "Num steps for denoising on validation stage." + ), + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="One or more prompt(s) that is used during validation to verify that the model is learning. Multiple validation prompts should be separated by the '--validation_prompt_seperator' string.", + ) + parser.add_argument( + "--validation_video", + type=str, + default=None, + help="Paths to video for falidation.", + ) + parser.add_argument( + "--validation_prompt_separator", + type=str, + default=":::", + help="String that separates multiple validation prompts", + ) + parser.add_argument( + "--num_validation_videos", + type=int, + default=1, + help="Number of videos that should be generated during validation per `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=50, + help=( + "Run validation every X steps. 
Validation consists of running the prompt `args.validation_prompt` multiple times: `args.num_validation_videos`." + ), + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=6, + help="The guidance scale to use while sampling validation videos.", + ) + parser.add_argument( + "--use_dynamic_cfg", + action="store_true", + default=False, + help="Whether or not to use the default cosine dynamic guidance schedule when sampling validation videos.", + ) + + # Training information + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="cogvideox-controlnet", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--height", + type=int, + default=480, + help="All input videos are resized to this height.", + ) + parser.add_argument( + "--width", + type=int, + default=720, + help="All input videos are resized to this width.", + ) + parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.") + parser.add_argument( + "--max_num_frames", type=int, default=49, help="All input videos will be truncated to these many frames." + ) + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides `--num_train_epochs`.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=("Max number of checkpoints to store."), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. 
Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--enable_slicing", + action="store_true", + default=False, + help="Whether or not to use VAE slicing for saving memory.", + ) + parser.add_argument( + "--enable_tiling", + action="store_true", + default=False, + help="Whether or not to use VAE tiling for saving memory.", + ) + + # Optimizer + parser.add_argument( + "--optimizer", + type=lambda s: s.lower(), + default="adam", + choices=["adam", "adamw", "prodigy"], + help=("The optimizer type to use."), + ) + parser.add_argument( + "--use_8bit_adam", + action="store_true", + help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW", + ) + parser.add_argument( + "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers." + ) + parser.add_argument( + "--adam_beta2", type=float, default=0.95, help="The beta2 parameter for the Adam and Prodigy optimizers." + ) + parser.add_argument( + "--prodigy_beta3", + type=float, + default=None, + help="Coefficients for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the value of square root of beta2.", + ) + parser.add_argument("--prodigy_decouple", action="store_true", help="Use AdamW style decoupled weight decay") + parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params") + parser.add_argument( + "--adam_epsilon", + type=float, + default=1e-08, + help="Epsilon value for the Adam optimizer and Prodigy optimizers.", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--prodigy_use_bias_correction", action="store_true", help="Turn on Adam's bias correction.") + parser.add_argument( + "--prodigy_safeguard_warmup", + action="store_true", + help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage.", + ) + + # Other information + parser.add_argument("--tracker_name", type=str, default=None, help="Project tracker name") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help="Directory where logs are stored.", + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default=None, + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. 
Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--enable_time_sampling", + action="store_true", + default=False, + help="Whether or not to use time_sampling_dict.", + ) + parser.add_argument( + "--time_sampling_type", + type=str, + default="truncated_normal", + choices=["truncated_normal", "truncated_uniform"] + ) + parser.add_argument( + "--time_sampling_mean", + type=float, + default=0.9, + help="Shifted and truncated noise sampling", + ) + parser.add_argument( + "--time_sampling_std", + type=float, + default=0.03, + help="Shifted and truncated noise sampling", + ) + parser.add_argument( + "--controlnet_guidance_end", + type=float, + default=0.2, + help="Shifted and truncated noise sampling", + ) + parser.add_argument( + "--controlnet_guidance_start", + type=float, + default=0.0, + help="Shifted and truncated noise sampling", + ) + parser.add_argument( + "--controlnet_transformer_num_attn_heads", + type=int, + default=None, + required=False, + help=("Count of attention heads in controlnet blocks."), + ) + parser.add_argument( + "--controlnet_transformer_attention_head_dim", + type=int, + default=None, + required=False, + help=("Attention dim in controlnet blocks."), + ) + parser.add_argument( + "--controlnet_transformer_out_proj_dim_factor", + type=int, + default=None, + required=False, + help=("Attention dim for custom controlnet blocks."), + ) + parser.add_argument( + "--controlnet_transformer_out_proj_dim_zero_init", + action="store_true", + default=False, + help=("Init project zero."), + ) + + return parser.parse_args() + + +def read_video(video_path, start_index=0, frames_count=49, stride=1): + video_reader = VideoReader(video_path) + end_index = min(start_index + frames_count * stride, len(video_reader)) - 1 + batch_index = np.linspace(start_index, end_index, frames_count, dtype=int) + numpy_video = video_reader.get_batch(batch_index).asnumpy() + return numpy_video + + +def log_validation( + pipe, + args, + accelerator, + pipeline_args, + epoch, + is_final_validation: bool = False, +): + logger.info( + f"Running validation... \n Generating {args.num_validation_videos} videos with prompt: {pipeline_args['prompt']}." + ) + # We train on the simplified learning objective. 
If we were previously predicting a variance, we need the scheduler to ignore it + scheduler_args = {} + + if "variance_type" in pipe.scheduler.config: + variance_type = pipe.scheduler.config.variance_type + + if variance_type in ["learned", "learned_range"]: + variance_type = "fixed_small" + + scheduler_args["variance_type"] = variance_type + + pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, **scheduler_args) + pipe = pipe.to(accelerator.device) + # pipe.set_progress_bar_config(disable=True) + + # run inference + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None + + videos = [] + for _ in range(args.num_validation_videos): + video = pipe(**pipeline_args, generator=generator, output_type="np").frames[0] + videos.append(video) + + for i, video in enumerate(videos): + prompt = ( + pipeline_args["prompt"][:25] + .replace(" ", "_") + .replace(" ", "_") + .replace("'", "_") + .replace('"', "_") + .replace("/", "_") + ) + filename = os.path.join(args.output_dir, f"{epoch}_video_{i}_{prompt}.mp4") + export_to_video(video, filename, fps=8) + + clear_objs_and_retain_memory([pipe]) + + return videos + + +def _get_t5_prompt_embeds( + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + prompt: Union[str, List[str]], + num_videos_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + text_input_ids=None, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + if tokenizer is not None: + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + else: + if text_input_ids is None: + raise ValueError("`text_input_ids` must be provided when the tokenizer is not specified.") + + prompt_embeds = text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + return prompt_embeds + + +def encode_prompt( + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + prompt: Union[str, List[str]], + num_videos_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + text_input_ids=None, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + prompt_embeds = _get_t5_prompt_embeds( + tokenizer, + text_encoder, + prompt=prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + text_input_ids=text_input_ids, + ) + return prompt_embeds + + +def compute_prompt_embeddings( + tokenizer, text_encoder, prompt, max_sequence_length, device, dtype, requires_grad: bool = False +): + if requires_grad: + prompt_embeds = encode_prompt( + tokenizer, + text_encoder, + prompt, + num_videos_per_prompt=1, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + else: + with torch.no_grad(): + prompt_embeds = encode_prompt( + tokenizer, + text_encoder, + prompt, + num_videos_per_prompt=1, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + return 
prompt_embeds + + +def prepare_rotary_positional_embeddings( + height: int, + width: int, + num_frames: int, + vae_scale_factor_spatial: int = 8, + patch_size: int = 2, + attention_head_dim: int = 64, + device: Optional[torch.device] = None, + base_height: int = 480, + base_width: int = 720, +) -> Tuple[torch.Tensor, torch.Tensor]: + grid_height = height // (vae_scale_factor_spatial * patch_size) + grid_width = width // (vae_scale_factor_spatial * patch_size) + base_size_width = base_width // (vae_scale_factor_spatial * patch_size) + base_size_height = base_height // (vae_scale_factor_spatial * patch_size) + + grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height) + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=attention_head_dim, + crops_coords=grid_crops_coords, + grid_size=(grid_height, grid_width), + temporal_size=num_frames, + ) + + freqs_cos = freqs_cos.to(device=device) + freqs_sin = freqs_sin.to(device=device) + return freqs_cos, freqs_sin + +import numpy as np +import torch +import torch.nn.functional as F + +def avgpool_mask_tensor(mask_tensor): + bs, f, c, h, w = mask_tensor.shape + assert c == 1, "Channel must be 1" + assert f % 12 == 0, "Frame number must be divisible by 12 (e.g., 48)" + assert h % 30 == 0 and w % 45 == 0, "Height and width must be divisible by 30 and 45" + + # Spatial average pooling + x = mask_tensor.float() # (bs, f, 1, h, w) + x = x.view(bs * f, 1, h, w) + x_pooled = F.avg_pool2d(x, kernel_size=(h // 30, w // 45)) # (bs * f, 1, 30, 45) + x_pooled = x_pooled.view(bs, f, 1, 30, 45) + + # Temporal pooling + t_groups = f // 12 + x_pooled = x_pooled.view(bs, 12, t_groups, 1, 30, 45) + pooled_avg = torch.mean(x_pooled, dim=2) # (bs, 12, 1, 30, 45) + + # Threshold + pooled_mask = (pooled_avg > 0.5).int() + + # Add zero frame for each sample + zero_frame = torch.zeros_like(pooled_mask[:, 0:1]) # (bs, 1, 1, 30, 45) + pooled_mask = torch.cat([zero_frame, pooled_mask], dim=1) # (bs, 13, 1, 30, 45) + + return 1 - pooled_mask # invert + + +import torch +import math + +def add_dashed_rays_to_video(video_tensor, num_perp_samples=50, density_decay=0.075): + T, C, H, W = video_tensor.shape + max_length = int((H**2 + W**2) ** 0.5) + 10 + center = torch.tensor([W / 2, H / 2]) + + # Random direction and perpendicular + theta = torch.rand(1).item() * 2 * math.pi + direction = torch.tensor([math.cos(theta), math.sin(theta)]) + direction = direction / direction.norm() + d_perp = torch.tensor([-direction[1], direction[0]]) + + # Ray origins + half_len = max(H, W) // 2 + positions = torch.linspace(-half_len, half_len, num_perp_samples) + perp_coords = center[None, :] + positions[:, None] * d_perp[None, :] + x0, y0 = perp_coords[:, 0], perp_coords[:, 1] + + # Ray steps + steps = [] + dist = 0 + while dist < max_length: + steps.append(dist) + dist += 1.0 + density_decay * dist + steps = torch.tensor(steps) + S = len(steps) + + # All ray endpoints + dxdy = direction[None, :] * steps[:, None] + all_xy = perp_coords[:, None, :] + dxdy[None, :, :] + all_xy = all_xy.reshape(-1, 2) + all_x = all_xy[:, 0].round().long() + all_y = all_xy[:, 1].round().long() + + valid = (0 <= all_x) & (all_x < W) & (0 <= all_y) & (all_y < H) + all_x = all_x[valid] + all_y = all_y[valid] + + # Sample base colors from first frame + x0r = x0.round().long().clamp(0, W - 1) + y0r = y0.round().long().clamp(0, H - 1) + frame0 = video_tensor[0] # (C, H, W) + base_colors = frame0[:, y0r, x0r] + base_colors = base_colors.repeat_interleave(S, 
dim=1)[:, valid] + + # Overlay on all frames + video_out = video_tensor.clone() + offsets = [(0, 0), (0, 1), (1, 0), (1, 1)] + for dxo, dyo in offsets: + ox = all_x + dxo + oy = all_y + dyo + inside = (0 <= ox) & (ox < W) & (0 <= oy) & (oy < H) + ox = ox[inside] + oy = oy[inside] + colors = base_colors[:, inside] # (C, K) + + for c in range(C): + video_out[1:, c, oy, ox] = colors[c][None, :].expand(T - 1, -1) + + return video_out + +def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False): + # Use DeepSpeed optimzer + if use_deepspeed: + from accelerate.utils import DummyOptim + + return DummyOptim( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_epsilon, + weight_decay=args.adam_weight_decay, + ) + + # Optimizer creation + supported_optimizers = ["adam", "adamw", "prodigy"] + if args.optimizer not in supported_optimizers: + logger.warning( + f"Unsupported choice of optimizer: {args.optimizer}. Supported optimizers include {supported_optimizers}. Defaulting to AdamW" + ) + args.optimizer = "adamw" + + if args.use_8bit_adam and not (args.optimizer.lower() not in ["adam", "adamw"]): + logger.warning( + f"use_8bit_adam is ignored when optimizer is not set to 'Adam' or 'AdamW'. Optimizer was " + f"set to {args.optimizer.lower()}" + ) + + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." + ) + + if args.optimizer.lower() == "adamw": + optimizer_class = bnb.optim.AdamW8bit if args.use_8bit_adam else torch.optim.AdamW + + optimizer = optimizer_class( + params_to_optimize, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_epsilon, + weight_decay=args.adam_weight_decay, + ) + elif args.optimizer.lower() == "adam": + optimizer_class = bnb.optim.Adam8bit if args.use_8bit_adam else torch.optim.Adam + + optimizer = optimizer_class( + params_to_optimize, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_epsilon, + weight_decay=args.adam_weight_decay, + ) + elif args.optimizer.lower() == "prodigy": + try: + import prodigyopt + except ImportError: + raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`") + + optimizer_class = prodigyopt.Prodigy + + if args.learning_rate <= 0.1: + logger.warning( + "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0" + ) + + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + beta3=args.prodigy_beta3, + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + decouple=args.prodigy_decouple, + use_bias_correction=args.prodigy_use_bias_correction, + safeguard_warmup=args.prodigy_safeguard_warmup, + ) + + return optimizer + + +def main(args): + if args.report_to == "wandb" and args.hub_token is not None: + raise ValueError( + "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." + " Please use `huggingface-cli login` to authenticate with the Hub." + ) + + if torch.backends.mps.is_available() and args.mixed_precision == "bf16": + # due to pytorch#99272, MPS does not yet support bfloat16. + raise ValueError( + "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead." 
+ ) + + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + kwargs_handlers=[kwargs], + ) + + # Disable AMP for MPS. + if torch.backends.mps.is_available(): + accelerator.native_amp = False + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. + if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if accelerator.is_main_process: + loss_log_path = os.path.join(args.output_dir, "loss_log.csv") + if not os.path.exists(loss_log_path): + with open(loss_log_path, "w") as f: + f.write("step,loss,lr\n") + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, + exist_ok=True, + ).repo_id + + # CogVideoX-2b weights are stored in float16 + # CogVideoX-5b and CogVideoX-5b-I2V weights are stored in bfloat16 + load_dtype = torch.bfloat16 if "5b" in args.pretrained_model_name_or_path.lower() else torch.float16 + transformer = CustomCogVideoXTransformer3DModel.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="transformer", + torch_dtype=load_dtype, + revision=args.revision, + variant=args.variant, + ) + + vae = AutoencoderKLCogVideoX.from_pretrained( + args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant + ) + controlnet_kwargs = {} + num_attention_heads_orig = 48 if "5b" in args.pretrained_model_name_or_path.lower() else 30 + if args.controlnet_transformer_num_attn_heads is not None: + controlnet_kwargs["num_attention_heads"] = args.controlnet_transformer_num_attn_heads + else: + controlnet_kwargs["num_attention_heads"] = num_attention_heads_orig + if args.controlnet_transformer_attention_head_dim is not None: + controlnet_kwargs["attention_head_dim"] = args.controlnet_transformer_attention_head_dim + if args.controlnet_transformer_out_proj_dim_factor is not None: + controlnet_kwargs["out_proj_dim"] = num_attention_heads_orig * args.controlnet_transformer_out_proj_dim_factor + controlnet_kwargs["out_proj_dim_zero_init"] = args.controlnet_transformer_out_proj_dim_zero_init + controlnet = CogVideoXControlnetPCD( + num_layers=args.controlnet_transformer_num_layers, + downscale_coef=args.downscale_coef, + in_channels=args.controlnet_input_channels, + use_zero_conv=args.use_zero_conv, + **controlnet_kwargs, + ) + + if args.init_from_transformer: + controlnet_state_dict = {} + for name, params in 
transformer.state_dict().items(): + if 'patch_embed.proj.weight' in name: + continue + controlnet_state_dict[name] = params + m, u = controlnet.load_state_dict(controlnet_state_dict, strict=False) + print(f'[ Weights from transformer was loaded into controlnet ] [M: {len(m)} | U: {len(u)}]') + + if args.pretrained_controlnet_path: + ckpt = torch.load(args.pretrained_controlnet_path, map_location='cpu', weights_only=False) + controlnet_state_dict = {} + for name, params in ckpt['state_dict'].items(): + controlnet_state_dict[name] = params + m, u = controlnet.load_state_dict(controlnet_state_dict, strict=False) + print(f'[ Weights from pretrained controlnet was loaded into controlnet ] [M: {len(m)} | U: {len(u)}]') + + scheduler = CogVideoXDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + + if args.enable_slicing: + vae.enable_slicing() + if args.enable_tiling: + vae.enable_tiling() + + # We only train the additional adapter controlnet layers + # text_encoder.requires_grad_(False) + transformer.requires_grad_(False) + vae.requires_grad_(False) + controlnet.requires_grad_(True) + + # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.state.deepspeed_plugin: + # DeepSpeed is handling precision, use what's in the DeepSpeed config + if ( + "fp16" in accelerator.state.deepspeed_plugin.deepspeed_config + and accelerator.state.deepspeed_plugin.deepspeed_config["fp16"]["enabled"] + ): + weight_dtype = torch.float16 + if ( + "bf16" in accelerator.state.deepspeed_plugin.deepspeed_config + and accelerator.state.deepspeed_plugin.deepspeed_config["bf16"]["enabled"] + ): + weight_dtype = torch.float16 + else: + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16: + # due to pytorch#99272, MPS does not yet support bfloat16. + raise ValueError( + "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead." + ) + + # text_encoder.to(accelerator.device, dtype=weight_dtype) + transformer.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=weight_dtype) + controlnet.to(accelerator.device, dtype=weight_dtype) + + if args.gradient_checkpointing: + transformer.enable_gradient_checkpointing() + controlnet.enable_gradient_checkpointing() + + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32 and torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Make sure the trainable params are in float32. 
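+ # Under fp16 mixed precision the optimizer should see full-precision trainable weights, so the
+ # controlnet parameters are upcast back to fp32 below while the frozen transformer/VAE stay in
+ # `weight_dtype`. A quick sanity check after the cast (illustrative only, not part of the script):
+ #   assert all(p.dtype == torch.float32 for p in controlnet.parameters() if p.requires_grad)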
+ if args.mixed_precision == "fp16": + # only upcast trainable parameters into fp32 + cast_training_params([controlnet], dtype=torch.float32) + + trainable_parameters = list(filter(lambda p: p.requires_grad, controlnet.parameters())) + + # Optimization parameters + trainable_parameters_with_lr = {"params": trainable_parameters, "lr": args.learning_rate} + params_to_optimize = [trainable_parameters_with_lr] + + use_deepspeed_optimizer = ( + accelerator.state.deepspeed_plugin is not None + and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config + ) + use_deepspeed_scheduler = ( + accelerator.state.deepspeed_plugin is not None + and "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config + ) + + optimizer = get_optimizer(args, params_to_optimize, use_deepspeed=use_deepspeed_optimizer) + + # Dataset and DataLoader + train_dataset = RealEstate10KPCDRenderCapEmbDataset( + video_root_dir=args.video_root_dir, + text_embedding_path=args.text_embedding_path, + hflip_p=args.hflip_p, + image_size=(args.height, args.width), + sample_n_frames=args.max_num_frames, + ) + + def encode_video(video): + video = video.to(accelerator.device, dtype=vae.dtype) + video = video.permute(0, 2, 1, 3, 4) # [B, C, F, H, W] + latent_dist = vae.encode(video).latent_dist.sample() * vae.config.scaling_factor + return latent_dist.permute(0, 2, 1, 3, 4).to(memory_format=torch.contiguous_format) + + def collate_fn(examples): + videos = [example["video"] for example in examples] + anchor_videos = [add_dashed_rays_to_video(example["anchor_video"]) for example in examples] + caption_embs = [example["caption_emb"] for example in examples] + masks = [example["mask"] for example in examples] + + caption_embs = torch.concat(caption_embs) + + videos = torch.stack(videos) + videos = videos.to(memory_format=torch.contiguous_format).float() + + anchor_videos = torch.stack(anchor_videos) + anchor_videos = anchor_videos.to(memory_format=torch.contiguous_format).float() + + masks = torch.stack(masks) + masks = masks.to(memory_format=torch.contiguous_format).float() + + # found average pool works better than max pool + masks = avgpool_mask_tensor(1-masks[:,1:]) + # masks = maxpool_mask_tensor(1-masks[:,1:]) # [B, F, 1, 30, 45] + masks = masks.flatten(start_dim=1).unsqueeze(-1) + + return { + "videos": videos, + "anchor_videos": anchor_videos, + "caption_embs": caption_embs, + "controlnet_masks": masks + } + + train_dataloader = DataLoader( + train_dataset, + batch_size=args.train_batch_size, + shuffle=True, + collate_fn=collate_fn, + num_workers=args.dataloader_num_workers, + ) + + # Scheduler and math around the number of training steps. 
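+ # Worked example with hypothetical numbers: 20000 clips and train_batch_size=4 give
+ # len(train_dataloader) = 5000 on a single process; with gradient_accumulation_steps=2 that is
+ # ceil(5000 / 2) = 2500 optimizer updates per epoch, so leaving --max_train_steps unset with
+ # num_train_epochs=2 resolves it to 2 * 2500 = 5000 below (and it is refined again after
+ # accelerator.prepare, since distributed sharding can change the dataloader length).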
+ overrode_max_train_steps = False + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if args.max_train_steps is None: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + overrode_max_train_steps = True + + if use_deepspeed_scheduler: + from accelerate.utils import DummyScheduler + + lr_scheduler = DummyScheduler( + name=args.lr_scheduler, + optimizer=optimizer, + total_num_steps=args.max_train_steps * accelerator.num_processes, + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + ) + else: + lr_scheduler = get_scheduler( + args.lr_scheduler, + optimizer=optimizer, + num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes, + num_training_steps=args.max_train_steps * accelerator.num_processes, + num_cycles=args.lr_num_cycles, + power=args.lr_power, + ) + + # Prepare everything with our `accelerator`. + controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare( + controlnet, optimizer, train_dataloader, lr_scheduler + ) + + # We need to recalculate our total training steps as the size of the training dataloader may have changed. + num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps) + if overrode_max_train_steps: + args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch + # Afterwards we recalculate our number of training epochs + args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch) + + # We need to initialize the trackers we use, and also store our configuration. + # The trackers initializes automatically on the main process. + if accelerator.is_main_process: + tracker_name = args.tracker_name or "cogvideox-controlnet" + accelerator.init_trackers(tracker_name, config=vars(args)) + + # Train! + total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps + num_trainable_parameters = sum(param.numel() for model in params_to_optimize for param in model["params"]) + + logger.info("***** Running training *****") + logger.info(f" Num trainable parameters = {num_trainable_parameters}") + logger.info(f" Num examples = {len(train_dataset)}") + logger.info(f" Num batches each epoch = {len(train_dataloader)}") + logger.info(f" Num epochs = {args.num_train_epochs}") + logger.info(f" Instantaneous batch size per device = {args.train_batch_size}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}") + logger.info(f" Gradient accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {args.max_train_steps}") + global_step = 0 + first_epoch = 0 + initial_global_step = 0 + + progress_bar = tqdm( + range(0, args.max_train_steps), + initial=initial_global_step, + desc="Steps", + # Only show the progress bar once on each machine. 
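+ # initial_global_step is fixed at 0 above; this script saves controlnet checkpoints but does not
+ # resume training state from them, so the bar always starts at step 0.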
+ disable=not accelerator.is_local_main_process, + ) + vae_scale_factor_spatial = 2 ** (len(vae.config.block_out_channels) - 1) + + # For DeepSpeed training + model_config = transformer.module.config if hasattr(transformer, "module") else transformer.config + + for epoch in range(first_epoch, args.num_train_epochs): + controlnet.train() + + for step, batch in enumerate(train_dataloader): + models_to_accumulate = [controlnet] + + with accelerator.accumulate(models_to_accumulate): + model_input = encode_video(batch["videos"]).to(dtype=weight_dtype) # [B, F, C, H, W] + masks = batch["controlnet_masks"].to(dtype=weight_dtype) # [B, F, 1, H, W] + prompt_embeds = batch["caption_embs"].to(weight_dtype) + + # Sample noise that will be added to the latents + noise = torch.randn_like(model_input) + batch_size, num_frames, num_channels, height, width = model_input.shape + + # Sample a random timestep for each image + if args.enable_time_sampling: + if args.time_sampling_type == "truncated_normal": + time_sampling_dict = { + 'mean': args.time_sampling_mean, + 'std': args.time_sampling_std, + 'a': 1 - args.controlnet_guidance_end, + 'b': 1 - args.controlnet_guidance_start, + } + timesteps = torch.nn.init.trunc_normal_( + torch.empty(batch_size, device=model_input.device), **time_sampling_dict + ) * scheduler.config.num_train_timesteps + elif args.time_sampling_type == "truncated_uniform": + timesteps = torch.randint( + int((1- args.controlnet_guidance_end) * scheduler.config.num_train_timesteps), + int((1 - args.controlnet_guidance_start) * scheduler.config.num_train_timesteps), + (batch_size,), device=model_input.device + ) + else: + timesteps = torch.randint( + 0, scheduler.config.num_train_timesteps, (batch_size,), device=model_input.device + ) + timesteps = timesteps.long() + + # Prepare rotary embeds + image_rotary_emb = ( + prepare_rotary_positional_embeddings( + height=args.height, + width=args.width, + num_frames=num_frames, + vae_scale_factor_spatial=vae_scale_factor_spatial, + patch_size=model_config.patch_size, + attention_head_dim=model_config.attention_head_dim, + device=accelerator.device, + ) + if model_config.use_rotary_positional_embeddings + else None + ) + + # Add noise to the model input according to the noise magnitude at each timestep + # (this is the forward diffusion process) + noisy_model_input = scheduler.add_noise(model_input, noise, timesteps) + + images = batch["videos"][:,0].unsqueeze(2) + # Add noise to images + image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=accelerator.device) + image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=images.dtype) + noisy_images = images + torch.randn_like(images) * image_noise_sigma[:, None, None, None, None] + image_latent_dist = vae.encode(noisy_images.to(dtype=vae.dtype)).latent_dist + image_latents = image_latent_dist.sample() * vae.config.scaling_factor + + + # from [B, C, F, H, W] to [B, F, C, H, W] + latent = model_input + image_latents = image_latents.permute(0, 2, 1, 3, 4) + assert (latent.shape[0], *latent.shape[2:]) == (image_latents.shape[0], *image_latents.shape[2:]) + + # Padding image_latents to the same frame number as latent + padding_shape = (latent.shape[0], latent.shape[1] - 1, *latent.shape[2:]) + latent_padding = image_latents.new_zeros(padding_shape) + image_latents = torch.cat([image_latents, latent_padding], dim=1) + + # Concatenate latent and image_latents in the channel dimension + latent_img_noisy = torch.cat([noisy_model_input, image_latents], dim=2) + + anchor_videos = 
batch["anchor_videos"] + if not args.use_zero_conv: + anchor_states = encode_video(anchor_videos).to(dtype=weight_dtype) # [B, F, C, H, W] + else: + anchor_states = anchor_videos.to(dtype=weight_dtype) # [B, F, C, H, W] + + controlnet_input_states = anchor_states + controlnet_states = controlnet( + hidden_states=noisy_model_input, + encoder_hidden_states=prompt_embeds, + image_rotary_emb=image_rotary_emb, + controlnet_states=controlnet_input_states, + timestep=timesteps, + return_dict=False, + controlnet_output_mask=masks + )[0] + if isinstance(controlnet_states, (tuple, list)): + controlnet_states = [x.to(dtype=weight_dtype) for x in controlnet_states] + else: + controlnet_states = controlnet_states.to(dtype=weight_dtype) + # Predict the noise residual + model_output = transformer( + hidden_states=latent_img_noisy, + encoder_hidden_states=prompt_embeds, + timestep=timesteps, + image_rotary_emb=image_rotary_emb, + controlnet_states=controlnet_states, + controlnet_weights=args.controlnet_weights, + return_dict=False, + )[0] + model_pred = scheduler.get_velocity(model_output, noisy_model_input, timesteps) + + alphas_cumprod = scheduler.alphas_cumprod[timesteps] + weights = 1 / (1 - alphas_cumprod) + while len(weights.shape) < len(model_pred.shape): + weights = weights.unsqueeze(-1) + + target = model_input + + loss = torch.mean((weights * (model_pred - target) ** 2).reshape(batch_size, -1), dim=1) + loss = loss.mean() + accelerator.backward(loss) + + if accelerator.sync_gradients: + params_to_clip = controlnet.parameters() + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + if accelerator.state.deepspeed_plugin is None: + optimizer.step() + optimizer.zero_grad() + + lr_scheduler.step() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + if accelerator.is_main_process: + if global_step % args.checkpointing_steps == 0: + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}.pt") + torch.save({'state_dict': unwrap_model(controlnet).state_dict()}, save_path) + logger.info(f"Saved state to {save_path}") + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if accelerator.is_main_process: + with open(loss_log_path, "a") as f: + f.write(f"{global_step},{logs['loss']},{logs['lr']}\n") + + if global_step >= args.max_train_steps: + break + + if accelerator.is_main_process: + if args.validation_prompt is not None and (step + 1) % args.validation_steps == 0: + # Create pipeline + pipe = ControlnetCogVideoXPipeline.from_pretrained( + args.pretrained_model_name_or_path, + transformer=unwrap_model(transformer), + text_encoder=unwrap_model(text_encoder), + vae=unwrap_model(vae), + controlnet=unwrap_model(controlnet), + scheduler=scheduler, + torch_dtype=weight_dtype, + ) + + validation_prompts = args.validation_prompt.split(args.validation_prompt_separator) + validation_videos = args.validation_video.split(args.validation_prompt_separator) + for validation_prompt, validation_video in zip(validation_prompts, validation_videos): + numpy_frames = read_video(validation_video, frames_count=args.max_num_frames) + controlnet_frames = np.stack([train_dataset.controlnet_processor(x) for x in numpy_frames]) + pipeline_args = { + "prompt": validation_prompt, + "controlnet_frames": controlnet_frames, + "guidance_scale": args.guidance_scale, + "use_dynamic_cfg": 
args.use_dynamic_cfg, + "height": args.height, + "width": args.width, + "num_frames": args.max_num_frames, + "num_inference_steps": args.num_inference_steps, + "controlnet_weights": args.controlnet_weights, + } + + validation_outputs = log_validation( + pipe=pipe, + args=args, + accelerator=accelerator, + pipeline_args=pipeline_args, + epoch=epoch, + ) + + accelerator.wait_for_everyone() + accelerator.end_training() + + +if __name__ == "__main__": + args = get_args() + main(args) \ No newline at end of file diff --git a/training/train_controlnet_i2v_pcd_render_mask_aware_add_dash_use_latent.py b/training/train_controlnet_i2v_pcd_render_mask_aware_add_dash_use_latent.py new file mode 100644 index 0000000000000000000000000000000000000000..004ace89fe2a090b4f62929bacfffb41bd34f06d --- /dev/null +++ b/training/train_controlnet_i2v_pcd_render_mask_aware_add_dash_use_latent.py @@ -0,0 +1,1304 @@ +# Copyright 2024 The HuggingFace Team. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +sys.path.append('..') +import argparse +import logging +import math +import os +import shutil +from pathlib import Path +from typing import List, Optional, Tuple, Union + +import torch +import transformers +from accelerate import Accelerator +from accelerate.logging import get_logger +from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed +from huggingface_hub import create_repo, upload_folder +from torch.utils.data import DataLoader, Dataset +from torchvision import transforms +from tqdm.auto import tqdm +import numpy as np +from decord import VideoReader +from transformers import AutoTokenizer, T5EncoderModel, T5Tokenizer + +import diffusers +from diffusers import AutoencoderKLCogVideoX, CogVideoXDPMScheduler +from diffusers.models.embeddings import get_3d_rotary_pos_embed +from diffusers.optimization import get_scheduler +from diffusers.pipelines.cogvideo.pipeline_cogvideox import get_resize_crop_region_for_grid +from diffusers.training_utils import ( + cast_training_params, + # clear_objs_and_retain_memory, +) +from diffusers.utils import check_min_version, export_to_video, is_wandb_available +from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card +from diffusers.utils.torch_utils import is_compiled_module + +from controlnet_datasets_camera_pcd_mask_latent import RealEstate10KPCDRenderLatentCapEmbDataset +from controlnet_pipeline import ControlnetCogVideoXImageToVideoPCDPipeline +from cogvideo_transformer import CustomCogVideoXTransformer3DModel +from cogvideo_controlnet_pcd import CogVideoXControlnetPCD + + +if is_wandb_available(): + import wandb + +# Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
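+# `log_validation` further down calls `clear_objs_and_retain_memory`, whose import from
+# diffusers.training_utils is commented out above (newer diffusers releases expose a similar
+# `free_memory` helper instead). A minimal local stand-in, a rough sketch rather than the
+# diffusers implementation, could look like:
+#
+#   import gc
+#   def clear_objs_and_retain_memory(objs):
+#       del objs                      # drop this function's references to the objects
+#       gc.collect()                  # reclaim Python-side memory
+#       if torch.cuda.is_available():
+#           torch.cuda.empty_cache()  # release cached CUDA blocks back to the driver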
+check_min_version("0.31.0.dev0") + +logger = get_logger(__name__) + + +def get_args(): + parser = argparse.ArgumentParser(description="Simple example of a training script for CogVideoX.") + + # Model information + parser.add_argument( + "--pretrained_model_name_or_path", + type=str, + default=None, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--revision", + type=str, + default=None, + required=False, + help="Revision of pretrained model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--variant", + type=str, + default=None, + help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16", + ) + parser.add_argument( + "--video_root_dir", + type=str, + default=None, + required=True, + help=("A folder containing the training data."), + ) + parser.add_argument( + "--text_embedding_path", + type=str, + default="./data/train/caption_embs", + required=False, + help=("Relative path to the text embeddings."), + ) + parser.add_argument( + "--csv_path", + type=str, + default=None, + required=False, + help=("A path to csv."), + ) + parser.add_argument( + "--hflip_p", + type=float, + default=0.5, + required=False, + help="Video horizontal flip probability.", + ) + parser.add_argument( + "--use_zero_conv", + action="store_true", + ) + parser.add_argument( + "--controlnet_transformer_num_layers", + type=int, + default=2, + required=False, + help=("Count of controlnet blocks."), + ) + parser.add_argument( + "--downscale_coef", + type=int, + default=8, + required=False, + help=("Downscale coef as encoder decreases resolutio before apply transformer."), + ) + parser.add_argument( + "--controlnet_input_channels", + type=int, + default=3, + required=False, + help=("Controlnet encoder input channels."), + ) + parser.add_argument( + "--controlnet_weights", + type=float, + default=1.0, + required=False, + help=("Controlnet blocks weight."), + ) + parser.add_argument( + "--init_from_transformer", + action="store_true", + help="Whether or not load start controlnet parameters from transformer model.", + ) + parser.add_argument( + "--pretrained_controlnet_path", + type=str, + default=None, + required=False, + help=("Path to controlnet .pt checkpoint."), + ) + parser.add_argument( + "--dataloader_num_workers", + type=int, + default=0, + help=( + "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process." + ), + ) + # Validation + parser.add_argument( + "--num_inference_steps", + type=int, + default=50, + help=( + "Num steps for denoising on validation stage." + ), + ) + parser.add_argument( + "--validation_prompt", + type=str, + default=None, + help="One or more prompt(s) that is used during validation to verify that the model is learning. Multiple validation prompts should be separated by the '--validation_prompt_seperator' string.", + ) + parser.add_argument( + "--validation_video", + type=str, + default=None, + help="Paths to video for falidation.", + ) + parser.add_argument( + "--validation_prompt_separator", + type=str, + default=":::", + help="String that separates multiple validation prompts", + ) + parser.add_argument( + "--num_validation_videos", + type=int, + default=1, + help="Number of videos that should be generated during validation per `validation_prompt`.", + ) + parser.add_argument( + "--validation_steps", + type=int, + default=50, + help=( + "Run validation every X steps. 
Validation consists of running the prompt `args.validation_prompt` multiple times: `args.num_validation_videos`." + ), + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=6, + help="The guidance scale to use while sampling validation videos.", + ) + parser.add_argument( + "--use_dynamic_cfg", + action="store_true", + default=False, + help="Whether or not to use the default cosine dynamic guidance schedule when sampling validation videos.", + ) + + # Training information + parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.") + parser.add_argument( + "--mixed_precision", + type=str, + default=None, + choices=["no", "fp16", "bf16"], + help=( + "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >=" + " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the" + " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config." + ), + ) + parser.add_argument( + "--output_dir", + type=str, + default="cogvideox-controlnet", + help="The output directory where the model predictions and checkpoints will be written.", + ) + parser.add_argument( + "--height", + type=int, + default=480, + help="All input videos are resized to this height.", + ) + parser.add_argument( + "--width", + type=int, + default=720, + help="All input videos are resized to this width.", + ) + parser.add_argument("--fps", type=int, default=8, help="All input videos will be used at this FPS.") + parser.add_argument( + "--max_num_frames", type=int, default=49, help="All input videos will be truncated to these many frames." + ) + parser.add_argument( + "--train_batch_size", type=int, default=4, help="Batch size (per device) for the training dataloader." + ) + parser.add_argument("--num_train_epochs", type=int, default=1) + parser.add_argument( + "--max_train_steps", + type=int, + default=None, + help="Total number of training steps to perform. If provided, overrides `--num_train_epochs`.", + ) + parser.add_argument( + "--checkpointing_steps", + type=int, + default=500, + help=( + "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final" + " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming" + " training using `--resume_from_checkpoint`." + ), + ) + parser.add_argument( + "--checkpoints_total_limit", + type=int, + default=None, + help=("Max number of checkpoints to store."), + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--gradient_checkpointing", + action="store_true", + help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=1e-4, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--scale_lr", + action="store_true", + default=False, + help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.", + ) + parser.add_argument( + "--lr_scheduler", + type=str, + default="constant", + help=( + 'The scheduler type to use. 
Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",' + ' "constant", "constant_with_warmup"]' + ), + ) + parser.add_argument( + "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler." + ) + parser.add_argument( + "--lr_num_cycles", + type=int, + default=1, + help="Number of hard resets of the lr in cosine_with_restarts scheduler.", + ) + parser.add_argument("--lr_power", type=float, default=1.0, help="Power factor of the polynomial scheduler.") + parser.add_argument( + "--enable_slicing", + action="store_true", + default=False, + help="Whether or not to use VAE slicing for saving memory.", + ) + parser.add_argument( + "--enable_tiling", + action="store_true", + default=False, + help="Whether or not to use VAE tiling for saving memory.", + ) + + # Optimizer + parser.add_argument( + "--optimizer", + type=lambda s: s.lower(), + default="adam", + choices=["adam", "adamw", "prodigy"], + help=("The optimizer type to use."), + ) + parser.add_argument( + "--use_8bit_adam", + action="store_true", + help="Whether or not to use 8-bit Adam from bitsandbytes. Ignored if optimizer is not set to AdamW", + ) + parser.add_argument( + "--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam and Prodigy optimizers." + ) + parser.add_argument( + "--adam_beta2", type=float, default=0.95, help="The beta2 parameter for the Adam and Prodigy optimizers." + ) + parser.add_argument( + "--prodigy_beta3", + type=float, + default=None, + help="Coefficients for computing the Prodigy optimizer's stepsize using running averages. If set to None, uses the value of square root of beta2.", + ) + parser.add_argument("--prodigy_decouple", action="store_true", help="Use AdamW style decoupled weight decay") + parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params") + parser.add_argument( + "--adam_epsilon", + type=float, + default=1e-08, + help="Epsilon value for the Adam optimizer and Prodigy optimizers.", + ) + parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") + parser.add_argument("--prodigy_use_bias_correction", action="store_true", help="Turn on Adam's bias correction.") + parser.add_argument( + "--prodigy_safeguard_warmup", + action="store_true", + help="Remove lr from the denominator of D estimate to avoid issues during warm-up stage.", + ) + + # Other information + parser.add_argument("--tracker_name", type=str, default=None, help="Project tracker name") + parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.") + parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.") + parser.add_argument( + "--hub_model_id", + type=str, + default=None, + help="The name of the repository to keep in sync with the local `output_dir`.", + ) + parser.add_argument( + "--logging_dir", + type=str, + default="logs", + help="Directory where logs are stored.", + ) + parser.add_argument( + "--allow_tf32", + action="store_true", + help=( + "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see" + " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices" + ), + ) + parser.add_argument( + "--report_to", + type=str, + default=None, + help=( + 'The integration to report the results and logs to. Supported platforms are `"tensorboard"`' + ' (default), `"wandb"` and `"comet_ml"`. 
Use `"all"` to report to all integrations.' + ), + ) + parser.add_argument( + "--enable_time_sampling", + action="store_true", + default=False, + help="Whether or not to use time_sampling_dict.", + ) + parser.add_argument( + "--time_sampling_type", + type=str, + default="truncated_normal", + choices=["truncated_normal", "truncated_uniform"] + ) + parser.add_argument( + "--time_sampling_mean", + type=float, + default=0.9, + help="Shifted and truncated noise sampling", + ) + parser.add_argument( + "--time_sampling_std", + type=float, + default=0.03, + help="Shifted and truncated noise sampling", + ) + parser.add_argument( + "--controlnet_guidance_end", + type=float, + default=0.2, + help="Shifted and truncated noise sampling", + ) + parser.add_argument( + "--controlnet_guidance_start", + type=float, + default=0.0, + help="Shifted and truncated noise sampling", + ) + parser.add_argument( + "--controlnet_transformer_num_attn_heads", + type=int, + default=None, + required=False, + help=("Count of attention heads in controlnet blocks."), + ) + parser.add_argument( + "--controlnet_transformer_attention_head_dim", + type=int, + default=None, + required=False, + help=("Attention dim in controlnet blocks."), + ) + parser.add_argument( + "--controlnet_transformer_out_proj_dim_factor", + type=int, + default=None, + required=False, + help=("Attention dim for custom controlnet blocks."), + ) + parser.add_argument( + "--controlnet_transformer_out_proj_dim_zero_init", + action="store_true", + default=False, + help=("Init project zero."), + ) + + return parser.parse_args() + + +def read_video(video_path, start_index=0, frames_count=49, stride=1): + video_reader = VideoReader(video_path) + end_index = min(start_index + frames_count * stride, len(video_reader)) - 1 + batch_index = np.linspace(start_index, end_index, frames_count, dtype=int) + numpy_video = video_reader.get_batch(batch_index).asnumpy() + return numpy_video + + +def log_validation( + pipe, + args, + accelerator, + pipeline_args, + epoch, + is_final_validation: bool = False, +): + logger.info( + f"Running validation... \n Generating {args.num_validation_videos} videos with prompt: {pipeline_args['prompt']}." + ) + # We train on the simplified learning objective. 
If we were previously predicting a variance, we need the scheduler to ignore it + scheduler_args = {} + + if "variance_type" in pipe.scheduler.config: + variance_type = pipe.scheduler.config.variance_type + + if variance_type in ["learned", "learned_range"]: + variance_type = "fixed_small" + + scheduler_args["variance_type"] = variance_type + + pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, **scheduler_args) + pipe = pipe.to(accelerator.device) + # pipe.set_progress_bar_config(disable=True) + + # run inference + generator = torch.Generator(device=accelerator.device).manual_seed(args.seed) if args.seed else None + + videos = [] + for _ in range(args.num_validation_videos): + video = pipe(**pipeline_args, generator=generator, output_type="np").frames[0] + videos.append(video) + + for i, video in enumerate(videos): + prompt = ( + pipeline_args["prompt"][:25] + .replace(" ", "_") + .replace(" ", "_") + .replace("'", "_") + .replace('"', "_") + .replace("/", "_") + ) + filename = os.path.join(args.output_dir, f"{epoch}_video_{i}_{prompt}.mp4") + export_to_video(video, filename, fps=8) + + clear_objs_and_retain_memory([pipe]) + + return videos + + +def _get_t5_prompt_embeds( + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + prompt: Union[str, List[str]], + num_videos_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + text_input_ids=None, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + batch_size = len(prompt) + + if tokenizer is not None: + text_inputs = tokenizer( + prompt, + padding="max_length", + max_length=max_sequence_length, + truncation=True, + add_special_tokens=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + else: + if text_input_ids is None: + raise ValueError("`text_input_ids` must be provided when the tokenizer is not specified.") + + prompt_embeds = text_encoder(text_input_ids.to(device))[0] + prompt_embeds = prompt_embeds.to(dtype=dtype, device=device) + + # duplicate text embeddings for each generation per prompt, using mps friendly method + _, seq_len, _ = prompt_embeds.shape + prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1) + prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1) + + return prompt_embeds + + +def encode_prompt( + tokenizer: T5Tokenizer, + text_encoder: T5EncoderModel, + prompt: Union[str, List[str]], + num_videos_per_prompt: int = 1, + max_sequence_length: int = 226, + device: Optional[torch.device] = None, + dtype: Optional[torch.dtype] = None, + text_input_ids=None, +): + prompt = [prompt] if isinstance(prompt, str) else prompt + prompt_embeds = _get_t5_prompt_embeds( + tokenizer, + text_encoder, + prompt=prompt, + num_videos_per_prompt=num_videos_per_prompt, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + text_input_ids=text_input_ids, + ) + return prompt_embeds + + +def compute_prompt_embeddings( + tokenizer, text_encoder, prompt, max_sequence_length, device, dtype, requires_grad: bool = False +): + if requires_grad: + prompt_embeds = encode_prompt( + tokenizer, + text_encoder, + prompt, + num_videos_per_prompt=1, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + else: + with torch.no_grad(): + prompt_embeds = encode_prompt( + tokenizer, + text_encoder, + prompt, + num_videos_per_prompt=1, + max_sequence_length=max_sequence_length, + device=device, + dtype=dtype, + ) + return 
prompt_embeds + + +def prepare_rotary_positional_embeddings( + height: int, + width: int, + num_frames: int, + vae_scale_factor_spatial: int = 8, + patch_size: int = 2, + attention_head_dim: int = 64, + device: Optional[torch.device] = None, + base_height: int = 480, + base_width: int = 720, +) -> Tuple[torch.Tensor, torch.Tensor]: + grid_height = height // (vae_scale_factor_spatial * patch_size) + grid_width = width // (vae_scale_factor_spatial * patch_size) + base_size_width = base_width // (vae_scale_factor_spatial * patch_size) + base_size_height = base_height // (vae_scale_factor_spatial * patch_size) + + grid_crops_coords = get_resize_crop_region_for_grid((grid_height, grid_width), base_size_width, base_size_height) + freqs_cos, freqs_sin = get_3d_rotary_pos_embed( + embed_dim=attention_head_dim, + crops_coords=grid_crops_coords, + grid_size=(grid_height, grid_width), + temporal_size=num_frames, + ) + + freqs_cos = freqs_cos.to(device=device) + freqs_sin = freqs_sin.to(device=device) + return freqs_cos, freqs_sin + +import numpy as np +import torch +import torch.nn.functional as F + +def avgpool_mask_tensor(mask_tensor): + bs, f, c, h, w = mask_tensor.shape + assert c == 1, "Channel must be 1" + assert f % 12 == 0, "Frame number must be divisible by 12 (e.g., 48)" + assert h % 30 == 0 and w % 45 == 0, "Height and width must be divisible by 30 and 45" + + # Spatial average pooling + x = mask_tensor.float() # (bs, f, 1, h, w) + x = x.view(bs * f, 1, h, w) + x_pooled = F.avg_pool2d(x, kernel_size=(h // 30, w // 45)) # (bs * f, 1, 30, 45) + x_pooled = x_pooled.view(bs, f, 1, 30, 45) + + # Temporal pooling + t_groups = f // 12 + x_pooled = x_pooled.view(bs, 12, t_groups, 1, 30, 45) + pooled_avg = torch.mean(x_pooled, dim=2) # (bs, 12, 1, 30, 45) + + # Threshold + pooled_mask = (pooled_avg > 0.5).int() + + # Add zero frame for each sample + zero_frame = torch.zeros_like(pooled_mask[:, 0:1]) # (bs, 1, 1, 30, 45) + pooled_mask = torch.cat([zero_frame, pooled_mask], dim=1) # (bs, 13, 1, 30, 45) + + return 1 - pooled_mask # invert + +def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False): + # Use DeepSpeed optimzer + if use_deepspeed: + from accelerate.utils import DummyOptim + + return DummyOptim( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_epsilon, + weight_decay=args.adam_weight_decay, + ) + + # Optimizer creation + supported_optimizers = ["adam", "adamw", "prodigy"] + if args.optimizer not in supported_optimizers: + logger.warning( + f"Unsupported choice of optimizer: {args.optimizer}. Supported optimizers include {supported_optimizers}. Defaulting to AdamW" + ) + args.optimizer = "adamw" + + if args.use_8bit_adam and not (args.optimizer.lower() not in ["adam", "adamw"]): + logger.warning( + f"use_8bit_adam is ignored when optimizer is not set to 'Adam' or 'AdamW'. Optimizer was " + f"set to {args.optimizer.lower()}" + ) + + if args.use_8bit_adam: + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError( + "To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`." 
+ ) + + if args.optimizer.lower() == "adamw": + optimizer_class = bnb.optim.AdamW8bit if args.use_8bit_adam else torch.optim.AdamW + + optimizer = optimizer_class( + params_to_optimize, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_epsilon, + weight_decay=args.adam_weight_decay, + ) + elif args.optimizer.lower() == "adam": + optimizer_class = bnb.optim.Adam8bit if args.use_8bit_adam else torch.optim.Adam + + optimizer = optimizer_class( + params_to_optimize, + betas=(args.adam_beta1, args.adam_beta2), + eps=args.adam_epsilon, + weight_decay=args.adam_weight_decay, + ) + elif args.optimizer.lower() == "prodigy": + try: + import prodigyopt + except ImportError: + raise ImportError("To use Prodigy, please install the prodigyopt library: `pip install prodigyopt`") + + optimizer_class = prodigyopt.Prodigy + + if args.learning_rate <= 0.1: + logger.warning( + "Learning rate is too low. When using prodigy, it's generally better to set learning rate around 1.0" + ) + + optimizer = optimizer_class( + params_to_optimize, + lr=args.learning_rate, + betas=(args.adam_beta1, args.adam_beta2), + beta3=args.prodigy_beta3, + weight_decay=args.adam_weight_decay, + eps=args.adam_epsilon, + decouple=args.prodigy_decouple, + use_bias_correction=args.prodigy_use_bias_correction, + safeguard_warmup=args.prodigy_safeguard_warmup, + ) + + return optimizer + + +def main(args): + if args.report_to == "wandb" and args.hub_token is not None: + raise ValueError( + "You cannot use both --report_to=wandb and --hub_token due to a security risk of exposing your token." + " Please use `huggingface-cli login` to authenticate with the Hub." + ) + + if torch.backends.mps.is_available() and args.mixed_precision == "bf16": + # due to pytorch#99272, MPS does not yet support bfloat16. + raise ValueError( + "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead." + ) + + logging_dir = Path(args.output_dir, args.logging_dir) + + accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir) + kwargs = DistributedDataParallelKwargs(find_unused_parameters=True) + accelerator = Accelerator( + gradient_accumulation_steps=args.gradient_accumulation_steps, + mixed_precision=args.mixed_precision, + log_with=args.report_to, + project_config=accelerator_project_config, + kwargs_handlers=[kwargs], + ) + + # Disable AMP for MPS. + if torch.backends.mps.is_available(): + accelerator.native_amp = False + + if args.report_to == "wandb": + if not is_wandb_available(): + raise ImportError("Make sure to install wandb if you want to use it for logging during training.") + + # Make one log on every process with the configuration for debugging. + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + logger.info(accelerator.state, main_process_only=False) + if accelerator.is_local_main_process: + transformers.utils.logging.set_verbosity_warning() + diffusers.utils.logging.set_verbosity_info() + else: + transformers.utils.logging.set_verbosity_error() + diffusers.utils.logging.set_verbosity_error() + + # If passed along, set the training seed now. 
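+ # set_seed seeds Python's `random`, NumPy and PyTorch on every process, so noise sampling and
+ # dataloader shuffling start from a reproducible RNG state; leaving --seed unset keeps them random.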
+ if args.seed is not None: + set_seed(args.seed) + + # Handle the repository creation + if accelerator.is_main_process: + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + + if accelerator.is_main_process: + loss_log_path = os.path.join(args.output_dir, "loss_log.csv") + if not os.path.exists(loss_log_path): + with open(loss_log_path, "w") as f: + f.write("step,loss,lr\n") + + if args.push_to_hub: + repo_id = create_repo( + repo_id=args.hub_model_id or Path(args.output_dir).name, + exist_ok=True, + ).repo_id + + # # Prepare models and scheduler + # tokenizer = AutoTokenizer.from_pretrained( + # args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision + # ) + + # text_encoder = T5EncoderModel.from_pretrained( + # args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision + # ) + + # CogVideoX-2b weights are stored in float16 + # CogVideoX-5b and CogVideoX-5b-I2V weights are stored in bfloat16 + load_dtype = torch.bfloat16 if "5b" in args.pretrained_model_name_or_path.lower() else torch.float16 + transformer = CustomCogVideoXTransformer3DModel.from_pretrained( + args.pretrained_model_name_or_path, + subfolder="transformer", + torch_dtype=load_dtype, + revision=args.revision, + variant=args.variant, + ) + + vae = AutoencoderKLCogVideoX.from_pretrained( + args.pretrained_model_name_or_path, subfolder="vae", revision=args.revision, variant=args.variant + ) + controlnet_kwargs = {} + num_attention_heads_orig = 48 if "5b" in args.pretrained_model_name_or_path.lower() else 30 + if args.controlnet_transformer_num_attn_heads is not None: + controlnet_kwargs["num_attention_heads"] = args.controlnet_transformer_num_attn_heads + else: + controlnet_kwargs["num_attention_heads"] = num_attention_heads_orig + if args.controlnet_transformer_attention_head_dim is not None: + controlnet_kwargs["attention_head_dim"] = args.controlnet_transformer_attention_head_dim + if args.controlnet_transformer_out_proj_dim_factor is not None: + controlnet_kwargs["out_proj_dim"] = num_attention_heads_orig * args.controlnet_transformer_out_proj_dim_factor + controlnet_kwargs["out_proj_dim_zero_init"] = args.controlnet_transformer_out_proj_dim_zero_init + controlnet = CogVideoXControlnetPCD( + num_layers=args.controlnet_transformer_num_layers, + downscale_coef=args.downscale_coef, + in_channels=args.controlnet_input_channels, + use_zero_conv=args.use_zero_conv, + **controlnet_kwargs, + ) + + if args.init_from_transformer: + controlnet_state_dict = {} + for name, params in transformer.state_dict().items(): + if 'patch_embed.proj.weight' in name: + continue + controlnet_state_dict[name] = params + m, u = controlnet.load_state_dict(controlnet_state_dict, strict=False) + print(f'[ Weights from transformer was loaded into controlnet ] [M: {len(m)} | U: {len(u)}]') + + if args.pretrained_controlnet_path: + ckpt = torch.load(args.pretrained_controlnet_path, map_location='cpu', weights_only=False) + controlnet_state_dict = {} + for name, params in ckpt['state_dict'].items(): + controlnet_state_dict[name] = params + m, u = controlnet.load_state_dict(controlnet_state_dict, strict=False) + print(f'[ Weights from pretrained controlnet was loaded into controlnet ] [M: {len(m)} | U: {len(u)}]') + + scheduler = CogVideoXDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler") + + if args.enable_slicing: + vae.enable_slicing() + if args.enable_tiling: + vae.enable_tiling() + + # We only train the additional adapter controlnet 
layers + # text_encoder.requires_grad_(False) + transformer.requires_grad_(False) + vae.requires_grad_(False) + controlnet.requires_grad_(True) + + # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision + # as these weights are only used for inference, keeping weights in full precision is not required. + weight_dtype = torch.float32 + if accelerator.state.deepspeed_plugin: + # DeepSpeed is handling precision, use what's in the DeepSpeed config + if ( + "fp16" in accelerator.state.deepspeed_plugin.deepspeed_config + and accelerator.state.deepspeed_plugin.deepspeed_config["fp16"]["enabled"] + ): + weight_dtype = torch.float16 + if ( + "bf16" in accelerator.state.deepspeed_plugin.deepspeed_config + and accelerator.state.deepspeed_plugin.deepspeed_config["bf16"]["enabled"] + ): + weight_dtype = torch.float16 + else: + if accelerator.mixed_precision == "fp16": + weight_dtype = torch.float16 + elif accelerator.mixed_precision == "bf16": + weight_dtype = torch.bfloat16 + + if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16: + # due to pytorch#99272, MPS does not yet support bfloat16. + raise ValueError( + "Mixed precision training with bfloat16 is not supported on MPS. Please use fp16 (recommended) or fp32 instead." + ) + + # text_encoder.to(accelerator.device, dtype=weight_dtype) + transformer.to(accelerator.device, dtype=weight_dtype) + vae.to(accelerator.device, dtype=weight_dtype) + controlnet.to(accelerator.device, dtype=weight_dtype) + + if args.gradient_checkpointing: + transformer.enable_gradient_checkpointing() + controlnet.enable_gradient_checkpointing() + + def unwrap_model(model): + model = accelerator.unwrap_model(model) + model = model._orig_mod if is_compiled_module(model) else model + return model + + # Enable TF32 for faster training on Ampere GPUs, + # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices + if args.allow_tf32 and torch.cuda.is_available(): + torch.backends.cuda.matmul.allow_tf32 = True + + if args.scale_lr: + args.learning_rate = ( + args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes + ) + + # Make sure the trainable params are in float32. 
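+ # Only the fp16 branch needs the upcast below: keeping master weights in fp16 risks precision
+ # loss during optimizer updates, so the trainable controlnet parameters go back to fp32, whereas
+ # under bf16 (or plain fp32) they are simply left in the `weight_dtype` applied above.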
+    # Make sure the trainable params are in float32.
+    if args.mixed_precision == "fp16":
+        # only upcast trainable parameters into fp32
+        cast_training_params([controlnet], dtype=torch.float32)
+
+    trainable_parameters = list(filter(lambda p: p.requires_grad, controlnet.parameters()))
+
+    # Optimization parameters
+    trainable_parameters_with_lr = {"params": trainable_parameters, "lr": args.learning_rate}
+    params_to_optimize = [trainable_parameters_with_lr]
+
+    use_deepspeed_optimizer = (
+        accelerator.state.deepspeed_plugin is not None
+        and "optimizer" in accelerator.state.deepspeed_plugin.deepspeed_config
+    )
+    use_deepspeed_scheduler = (
+        accelerator.state.deepspeed_plugin is not None
+        and "scheduler" not in accelerator.state.deepspeed_plugin.deepspeed_config
+    )
+
+    optimizer = get_optimizer(args, params_to_optimize, use_deepspeed=use_deepspeed_optimizer)
+
+    # Dataset and DataLoader
+    train_dataset = RealEstate10KPCDRenderLatentCapEmbDataset(
+        video_root_dir=args.video_root_dir,
+        text_embedding_path=args.text_embedding_path
+    )
+
+    def encode_video(video):
+        video = video.to(accelerator.device, dtype=vae.dtype)
+        video = video.permute(0, 2, 1, 3, 4)  # [B, C, F, H, W]
+        latent_dist = vae.encode(video).latent_dist.sample() * vae.config.scaling_factor
+        return latent_dist.permute(0, 2, 1, 3, 4).to(memory_format=torch.contiguous_format)
+
+    def collate_fn(examples):
+        source_latents = [example["source_latent"] for example in examples]
+        anchor_latents = [example["anchor_latent"] for example in examples]
+        caption_embs = [example["caption_emb"] for example in examples]
+
+        images = [example["image"] for example in examples]
+        masks = [example["mask"] for example in examples]
+
+        caption_embs = torch.concat(caption_embs)
+
+        source_latents = torch.cat(source_latents).permute(0, 2, 1, 3, 4)
+        source_latents = source_latents.to(memory_format=torch.contiguous_format).float()
+
+        anchor_latents = torch.cat(anchor_latents).permute(0, 2, 1, 3, 4)
+        anchor_latents = anchor_latents.to(memory_format=torch.contiguous_format).float()
+
+        images = torch.stack(images).to(memory_format=torch.contiguous_format).float()  # [B, F, C, H, W]
+
+        masks = torch.stack(masks)
+        masks = masks.to(memory_format=torch.contiguous_format).float()
+
+        # found average pool works better than max pool
+        masks = avgpool_mask_tensor(1 - masks[:, 1:])
+        # masks = maxpool_mask_tensor(1 - masks[:, 1:])  # [B, F, 1, 30, 45]
+        masks = masks.flatten(start_dim=1).unsqueeze(-1)
+
+        return {
+            "source_latents": source_latents,
+            "anchor_latents": anchor_latents,
+            "images": images,
+            "caption_embs": caption_embs,
+            "controlnet_masks": masks
+        }
+
+    train_dataloader = DataLoader(
+        train_dataset,
+        batch_size=args.train_batch_size,
+        shuffle=True,
+        collate_fn=collate_fn,
+        num_workers=args.dataloader_num_workers,
+    )
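The collate_fn above relies on `avgpool_mask_tensor`, which is defined elsewhere in this repo; the sketch below is a stand-in that assumes it average-pools each per-frame mask down to the latent token grid (the `# [B, F, 1, 30, 45]` comment suggests 30x45) before the flatten/unsqueeze. The real collate_fn additionally drops the first frame (`masks[:, 1:]`) and inverts the mask first.

    import torch
    import torch.nn.functional as F

    def avgpool_mask_sketch(masks: torch.Tensor, grid_hw=(30, 45)) -> torch.Tensor:
        # masks: [B, F, 1, H, W]
        b, f, c, h, w = masks.shape
        pooled = F.adaptive_avg_pool2d(masks.reshape(b * f, c, h, w), grid_hw)
        return pooled.reshape(b, f, c, *grid_hw)

    masks = (torch.rand(2, 12, 1, 480, 720) > 0.5).float()
    pooled = avgpool_mask_sketch(1 - masks)              # inverted, as in collate_fn
    tokens = pooled.flatten(start_dim=1).unsqueeze(-1)   # one weight per latent token
    print(tokens.shape)                                  # torch.Size([2, 16200, 1])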
+    # Scheduler and math around the number of training steps.
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if args.max_train_steps is None:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True
+
+    if use_deepspeed_scheduler:
+        from accelerate.utils import DummyScheduler
+
+        lr_scheduler = DummyScheduler(
+            name=args.lr_scheduler,
+            optimizer=optimizer,
+            total_num_steps=args.max_train_steps * accelerator.num_processes,
+            num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        )
+    else:
+        lr_scheduler = get_scheduler(
+            args.lr_scheduler,
+            optimizer=optimizer,
+            num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+            num_training_steps=args.max_train_steps * accelerator.num_processes,
+            num_cycles=args.lr_num_cycles,
+            power=args.lr_power,
+        )
+
+    # Prepare everything with our `accelerator`.
+    controlnet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        controlnet, optimizer, train_dataloader, lr_scheduler
+    )
+
+    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
+    if overrode_max_train_steps:
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+    # Afterwards we recalculate our number of training epochs
+    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)
+
+    # We need to initialize the trackers we use, and also store our configuration.
+    # The trackers initialize automatically on the main process.
+    if accelerator.is_main_process:
+        tracker_name = args.tracker_name or "cogvideox-controlnet"
+        accelerator.init_trackers(tracker_name, config=vars(args))
+
+    # Train!
+    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps
+    num_trainable_parameters = sum(param.numel() for model in params_to_optimize for param in model["params"])
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num trainable parameters = {num_trainable_parameters}")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num batches each epoch = {len(train_dataloader)}")
+    logger.info(f"  Num epochs = {args.num_train_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
+    logger.info(f"  Gradient accumulation steps = {args.gradient_accumulation_steps}")
+    logger.info(f"  Total optimization steps = {args.max_train_steps}")
+    global_step = 0
+    first_epoch = 0
+    initial_global_step = 0
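The step bookkeeping above is easier to follow with concrete (illustrative) numbers: `max_train_steps` counts optimizer updates, not dataloader batches, and it is recomputed after `accelerator.prepare` because the per-process dataloader may have changed size (e.g. after sharding across processes).

    import math

    len_dataloader = 1000                 # batches per epoch on one process
    gradient_accumulation_steps = 4
    num_train_epochs = 3

    num_update_steps_per_epoch = math.ceil(len_dataloader / gradient_accumulation_steps)  # 250
    max_train_steps = num_train_epochs * num_update_steps_per_epoch                       # 750
    num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)            # back to 3
    print(num_update_steps_per_epoch, max_train_steps, num_train_epochs)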
+    progress_bar = tqdm(
+        range(0, args.max_train_steps),
+        initial=initial_global_step,
+        desc="Steps",
+        # Only show the progress bar once on each machine.
+        disable=not accelerator.is_local_main_process,
+    )
+    vae_scale_factor_spatial = 2 ** (len(vae.config.block_out_channels) - 1)
+
+    # For DeepSpeed training
+    model_config = transformer.module.config if hasattr(transformer, "module") else transformer.config
+
+    for epoch in range(first_epoch, args.num_train_epochs):
+        controlnet.train()
+
+        for step, batch in enumerate(train_dataloader):
+            models_to_accumulate = [controlnet]
+
+            with accelerator.accumulate(models_to_accumulate):
+                model_input = batch['source_latents'].to(dtype=weight_dtype)  # [B, F, C, H, W]
+                masks = batch["controlnet_masks"].to(dtype=weight_dtype)  # [B, seq_len, 1] (pooled and flattened in collate_fn)
+                prompt_embeds = batch["caption_embs"].to(weight_dtype)
+
+                # Sample noise that will be added to the latents
+                noise = torch.randn_like(model_input)
+                batch_size, num_frames, num_channels, height, width = model_input.shape
+
+                # Sample a random timestep for each image
+                if args.enable_time_sampling:
+                    if args.time_sampling_type == "truncated_normal":
+                        time_sampling_dict = {
+                            'mean': args.time_sampling_mean,
+                            'std': args.time_sampling_std,
+                            'a': 1 - args.controlnet_guidance_end,
+                            'b': 1 - args.controlnet_guidance_start,
+                        }
+                        timesteps = torch.nn.init.trunc_normal_(
+                            torch.empty(batch_size, device=model_input.device), **time_sampling_dict
+                        ) * scheduler.config.num_train_timesteps
+                    elif args.time_sampling_type == "truncated_uniform":
+                        timesteps = torch.randint(
+                            int((1 - args.controlnet_guidance_end) * scheduler.config.num_train_timesteps),
+                            int((1 - args.controlnet_guidance_start) * scheduler.config.num_train_timesteps),
+                            (batch_size,), device=model_input.device
+                        )
+                else:
+                    timesteps = torch.randint(
+                        0, scheduler.config.num_train_timesteps, (batch_size,), device=model_input.device
+                    )
+                timesteps = timesteps.long()
+
+                # Prepare rotary embeds
+                image_rotary_emb = (
+                    prepare_rotary_positional_embeddings(
+                        height=args.height,
+                        width=args.width,
+                        num_frames=num_frames,
+                        vae_scale_factor_spatial=vae_scale_factor_spatial,
+                        patch_size=model_config.patch_size,
+                        attention_head_dim=model_config.attention_head_dim,
+                        device=accelerator.device,
+                    )
+                    if model_config.use_rotary_positional_embeddings
+                    else None
+                )
+
+                # Add noise to the model input according to the noise magnitude at each timestep
+                # (this is the forward diffusion process)
+                noisy_model_input = scheduler.add_noise(model_input, noise, timesteps)
+
+                images = batch["images"][:, 0].unsqueeze(2)
+                # Add noise to images
+                image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=accelerator.device)
+                image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=images.dtype)
+                noisy_images = images + torch.randn_like(images) * image_noise_sigma[:, None, None, None, None]
+                image_latent_dist = vae.encode(noisy_images.to(dtype=vae.dtype)).latent_dist
+                image_latents = image_latent_dist.sample() * vae.config.scaling_factor
+
+                # from [B, C, F, H, W] to [B, F, C, H, W]
+                latent = model_input
+                image_latents = image_latents.permute(0, 2, 1, 3, 4)
+                assert (latent.shape[0], *latent.shape[2:]) == (image_latents.shape[0], *image_latents.shape[2:])
+
+                # Padding image_latents to the same frame number as latent
+                padding_shape = (latent.shape[0], latent.shape[1] - 1, *latent.shape[2:])
+                latent_padding = image_latents.new_zeros(padding_shape)
+                image_latents = torch.cat([image_latents, latent_padding], dim=1)
+
+                # Concatenate latent and image_latents in the channel dimension
+                latent_img_noisy = torch.cat([noisy_model_input, image_latents], dim=2)
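Shape-level sketch of the image-conditioning step that ends just above: the encoded first-frame latent is zero-padded along the frame axis and then concatenated with the noisy video latent along the channel axis. The sizes below are illustrative toy values, not taken from the script.

    import torch

    B, F, C, H, W = 1, 13, 16, 60, 90
    noisy_model_input = torch.randn(B, F, C, H, W)
    image_latents = torch.randn(B, 1, C, H, W)                   # encoded first frame only

    padding = image_latents.new_zeros(B, F - 1, C, H, W)
    image_latents = torch.cat([image_latents, padding], dim=1)   # [B, F, C, H, W]
    latent_img_noisy = torch.cat([noisy_model_input, image_latents], dim=2)
    print(latent_img_noisy.shape)                                # torch.Size([1, 13, 32, 60, 90])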
batch["anchor_latents"].to(dtype=weight_dtype) + + controlnet_input_states = anchor_states + controlnet_states = controlnet( + hidden_states=noisy_model_input, + encoder_hidden_states=prompt_embeds, + image_rotary_emb=image_rotary_emb, + controlnet_states=controlnet_input_states, + timestep=timesteps, + return_dict=False, + controlnet_output_mask=masks + )[0] + if isinstance(controlnet_states, (tuple, list)): + controlnet_states = [x.to(dtype=weight_dtype) for x in controlnet_states] + else: + controlnet_states = controlnet_states.to(dtype=weight_dtype) + # Predict the noise residual + model_output = transformer( + hidden_states=latent_img_noisy, + encoder_hidden_states=prompt_embeds, + timestep=timesteps, + image_rotary_emb=image_rotary_emb, + controlnet_states=controlnet_states, + controlnet_weights=args.controlnet_weights, + return_dict=False, + )[0] + model_pred = scheduler.get_velocity(model_output, noisy_model_input, timesteps) + + alphas_cumprod = scheduler.alphas_cumprod[timesteps] + weights = 1 / (1 - alphas_cumprod) + while len(weights.shape) < len(model_pred.shape): + weights = weights.unsqueeze(-1) + + target = model_input + + loss = torch.mean((weights * (model_pred - target) ** 2).reshape(batch_size, -1), dim=1) + loss = loss.mean() + accelerator.backward(loss) + + if accelerator.sync_gradients: + params_to_clip = controlnet.parameters() + accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm) + + if accelerator.state.deepspeed_plugin is None: + optimizer.step() + optimizer.zero_grad() + + lr_scheduler.step() + + # Checks if the accelerator has performed an optimization step behind the scenes + if accelerator.sync_gradients: + progress_bar.update(1) + global_step += 1 + + if accelerator.is_main_process: + if global_step % args.checkpointing_steps == 0: + save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}.pt") + torch.save({'state_dict': unwrap_model(controlnet).state_dict()}, save_path) + logger.info(f"Saved state to {save_path}") + + logs = {"loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]} + progress_bar.set_postfix(**logs) + accelerator.log(logs, step=global_step) + + if accelerator.is_main_process: + with open(loss_log_path, "a") as f: + f.write(f"{global_step},{logs['loss']},{logs['lr']}\n") + + if global_step >= args.max_train_steps: + break + + if accelerator.is_main_process: + if args.validation_prompt is not None and (step + 1) % args.validation_steps == 0: + # Create pipeline + pipe = ControlnetCogVideoXPipeline.from_pretrained( + args.pretrained_model_name_or_path, + transformer=unwrap_model(transformer), + text_encoder=unwrap_model(text_encoder), + vae=unwrap_model(vae), + controlnet=unwrap_model(controlnet), + scheduler=scheduler, + torch_dtype=weight_dtype, + ) + + validation_prompts = args.validation_prompt.split(args.validation_prompt_separator) + validation_videos = args.validation_video.split(args.validation_prompt_separator) + for validation_prompt, validation_video in zip(validation_prompts, validation_videos): + numpy_frames = read_video(validation_video, frames_count=args.max_num_frames) + controlnet_frames = np.stack([train_dataset.controlnet_processor(x) for x in numpy_frames]) + pipeline_args = { + "prompt": validation_prompt, + "controlnet_frames": controlnet_frames, + "guidance_scale": args.guidance_scale, + "use_dynamic_cfg": args.use_dynamic_cfg, + "height": args.height, + "width": args.width, + "num_frames": args.max_num_frames, + "num_inference_steps": args.num_inference_steps, + "controlnet_weights": 
+            if accelerator.is_main_process:
+                if args.validation_prompt is not None and (step + 1) % args.validation_steps == 0:
+                    # Create pipeline
+                    # NOTE: this block expects a loaded text_encoder, which is commented out above.
+                    pipe = ControlnetCogVideoXPipeline.from_pretrained(
+                        args.pretrained_model_name_or_path,
+                        transformer=unwrap_model(transformer),
+                        text_encoder=unwrap_model(text_encoder),
+                        vae=unwrap_model(vae),
+                        controlnet=unwrap_model(controlnet),
+                        scheduler=scheduler,
+                        torch_dtype=weight_dtype,
+                    )
+
+                    validation_prompts = args.validation_prompt.split(args.validation_prompt_separator)
+                    validation_videos = args.validation_video.split(args.validation_prompt_separator)
+                    for validation_prompt, validation_video in zip(validation_prompts, validation_videos):
+                        numpy_frames = read_video(validation_video, frames_count=args.max_num_frames)
+                        controlnet_frames = np.stack([train_dataset.controlnet_processor(x) for x in numpy_frames])
+                        pipeline_args = {
+                            "prompt": validation_prompt,
+                            "controlnet_frames": controlnet_frames,
+                            "guidance_scale": args.guidance_scale,
+                            "use_dynamic_cfg": args.use_dynamic_cfg,
+                            "height": args.height,
+                            "width": args.width,
+                            "num_frames": args.max_num_frames,
+                            "num_inference_steps": args.num_inference_steps,
+                            "controlnet_weights": args.controlnet_weights,
+                        }
+
+                        validation_outputs = log_validation(
+                            pipe=pipe,
+                            args=args,
+                            accelerator=accelerator,
+                            pipeline_args=pipeline_args,
+                            epoch=epoch,
+                        )
+
+    accelerator.wait_for_everyone()
+    accelerator.end_training()
+
+
+if __name__ == "__main__":
+    args = get_args()
+    main(args)
\ No newline at end of file
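Since the script appends every logged step to loss_log.csv (header `step,loss,lr`, written at the top of `main()`), the file can be inspected during or after a run. A minimal sketch; the path below assumes the output directory was set to `outputs`.

    import csv

    with open("outputs/loss_log.csv") as f:
        rows = list(csv.DictReader(f))

    steps = [int(r["step"]) for r in rows]
    losses = [float(r["loss"]) for r in rows]
    print(f"{len(rows)} logged steps, last loss {losses[-1]:.4f} at step {steps[-1]}")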