roll-ai committed
Commit 59d751c · verified · 1 parent: 70e2d21

Upload 177 files

This view is limited to 50 files because the commit contains too many changes.

Files changed (50):
  1. .gitattributes +4 -0
  2. finetune/accelerate_config.yaml +21 -0
  3. finetune/configs/zero2.yaml +38 -0
  4. finetune/configs/zero2_controlnet.yaml +38 -0
  5. finetune/configs/zero2_offload.yaml +42 -0
  6. finetune/configs/zero3.yaml +43 -0
  7. finetune/configs/zero3_offload.yaml +51 -0
  8. finetune/constants.py +2 -0
  9. finetune/datasets/__init__.py +14 -0
  10. finetune/datasets/bucket_sampler.py +71 -0
  11. finetune/datasets/i2v_dataset.py +311 -0
  12. finetune/datasets/i2v_flow_dataset.py +188 -0
  13. finetune/datasets/t2v_dataset.py +251 -0
  14. finetune/datasets/utils.py +211 -0
  15. finetune/models/__init__.py +12 -0
  16. finetune/models/cogvideox_i2v/flovd_OMSM_lora_trainer.py +748 -0
  17. finetune/models/cogvideox_i2v/flovd_controlnet_trainer.py +814 -0
  18. finetune/models/cogvideox_i2v/lora_trainer.py +246 -0
  19. finetune/models/cogvideox_i2v/sft_trainer.py +9 -0
  20. finetune/models/utils.py +57 -0
  21. finetune/modules/__init__.py +0 -0
  22. finetune/modules/camera_flow_generator.py +46 -0
  23. finetune/modules/camera_sampler.py +52 -0
  24. finetune/modules/cogvideox_controlnet.py +353 -0
  25. finetune/modules/cogvideox_custom_model.py +109 -0
  26. finetune/modules/cogvideox_custom_modules.py +357 -0
  27. finetune/modules/depth_warping/__init__.py +0 -0
  28. finetune/modules/depth_warping/camera/Camera.py +70 -0
  29. finetune/modules/depth_warping/camera/WarperPytorch.py +416 -0
  30. finetune/modules/depth_warping/depth_anything_v2/depth_anything_wrapper.py +12 -0
  31. finetune/modules/depth_warping/depth_anything_v2/dinov2.py +415 -0
  32. finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/__init__.py +11 -0
  33. finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/attention.py +83 -0
  34. finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/block.py +252 -0
  35. finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/drop_path.py +35 -0
  36. finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/layer_scale.py +28 -0
  37. finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/mlp.py +41 -0
  38. finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/patch_embed.py +89 -0
  39. finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/swiglu_ffn.py +63 -0
  40. finetune/modules/depth_warping/depth_anything_v2/dpt.py +235 -0
  41. finetune/modules/depth_warping/depth_anything_v2/util/blocks.py +148 -0
  42. finetune/modules/depth_warping/depth_anything_v2/util/transform.py +158 -0
  43. finetune/modules/depth_warping/depth_pro/__init__.py +5 -0
  44. finetune/modules/depth_warping/depth_pro/cli/__init__.py +4 -0
  45. finetune/modules/depth_warping/depth_pro/cli/run.py +154 -0
  46. finetune/modules/depth_warping/depth_pro/depth_pro.py +298 -0
  47. finetune/modules/depth_warping/depth_pro/eval/boundary_metrics.py +332 -0
  48. finetune/modules/depth_warping/depth_pro/eval/dis5k_sample_list.txt +200 -0
  49. finetune/modules/depth_warping/depth_pro/network/__init__.py +2 -0
  50. finetune/modules/depth_warping/depth_pro/network/decoder.py +206 -0
.gitattributes CHANGED
@@ -73,3 +73,7 @@ assets/pages/res1.mp4 filter=lfs diff=lfs merge=lfs -text
  assets/pages/res2.mp4 filter=lfs diff=lfs merge=lfs -text
  assets/pages/res3.mp4 filter=lfs diff=lfs merge=lfs -text
  assets/pages/teaser.png filter=lfs diff=lfs merge=lfs -text
+ results/generated_videos/A_chef_in_a_white_coat_and_gla_1593596b99e2dde9.txt.mp4 filter=lfs diff=lfs merge=lfs -text
+ results/generated_videos/A_stunning_and_untouched_coast_6b6d20c6a46b9fe9.txt.mp4 filter=lfs diff=lfs merge=lfs -text
+ tools/caption/assests/CogVLM2-Caption-example.png filter=lfs diff=lfs merge=lfs -text
+ tools/caption/assests/cogvlm2-video-example.png filter=lfs diff=lfs merge=lfs -text
finetune/accelerate_config.yaml ADDED
@@ -0,0 +1,21 @@
+ compute_environment: LOCAL_MACHINE
+
+ gpu_ids: "0,1,2,3,4,5,6,7"
+ num_processes: 8  # should be the same as the number of GPUs
+
+ debug: false
+ deepspeed_config:
+   deepspeed_config_file: configs/zero2_controlnet.yaml  # e.g. configs/zero2.yaml; use an absolute path
+   zero3_init_flag: false
+ distributed_type: DEEPSPEED
+ downcast_bf16: 'no'
+ enable_cpu_affinity: false
+ machine_rank: 0
+ main_training_function: main
+ num_machines: 1
+ rdzv_backend: static
+ same_network: true
+ tpu_env: []
+ tpu_use_cluster: false
+ tpu_use_sudo: false
+ use_cpu: false
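
Note: with this launcher config and the DeepSpeed configs below, the "auto" value of train_batch_size resolves, by DeepSpeed's convention, to micro-batch size × gradient-accumulation steps × number of processes. A minimal sketch of that arithmetic using the values above (the helper name is illustrative, not part of this commit):

    # Hypothetical helper reproducing DeepSpeed's batch-size identity:
    # train_batch_size = train_micro_batch_size_per_gpu
    #                    * gradient_accumulation_steps * world_size
    def effective_batch_size(micro_batch_per_gpu: int,
                             gradient_accumulation_steps: int,
                             num_processes: int) -> int:
        return micro_batch_per_gpu * gradient_accumulation_steps * num_processes

    print(effective_batch_size(1, 1, 8))  # -> 8 with the values in these configs
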
finetune/configs/zero2.yaml ADDED
@@ -0,0 +1,38 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "optimizer": {
+         "type": "AdamW",
+         "params": {
+             "lr": "auto",
+             "weight_decay": "auto",
+             "torch_adam": true,
+             "adam_w_mode": true
+         }
+     },
+     "scheduler": {
+         "type": "WarmupDecayLR",
+         "params": {
+             "warmup_min_lr": "auto",
+             "warmup_max_lr": "auto",
+             "warmup_num_steps": "auto",
+             "total_num_steps": "auto"
+         }
+     },
+     "zero_optimization": {
+         "stage": 2,
+         "allgather_partitions": true,
+         "allgather_bucket_size": 2e8,
+         "overlap_comm": true,
+         "reduce_scatter": true,
+         "reduce_bucket_size": 5e8,
+         "contiguous_gradients": true
+     },
+     "gradient_accumulation_steps": 1,
+     "train_micro_batch_size_per_gpu": 1,
+     "train_batch_size": "auto",
+     "gradient_clipping": "auto",
+     "steps_per_print": 2000,
+     "wall_clock_breakdown": false
+ }
finetune/configs/zero2_controlnet.yaml ADDED
@@ -0,0 +1,38 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "optimizer": {
+         "type": "AdamW",
+         "params": {
+             "lr": "auto",
+             "weight_decay": "auto",
+             "torch_adam": true,
+             "adam_w_mode": true
+         }
+     },
+     "scheduler": {
+         "type": "WarmupCosineLR",
+         "params": {
+             "warmup_min_ratio": 0.0,
+             "cos_min_ratio": 0.0001,
+             "warmup_num_steps": 250,
+             "total_num_steps": "auto"
+         }
+     },
+     "zero_optimization": {
+         "stage": 2,
+         "allgather_partitions": true,
+         "allgather_bucket_size": 2e8,
+         "overlap_comm": true,
+         "reduce_scatter": true,
+         "reduce_bucket_size": 5e8,
+         "contiguous_gradients": true
+     },
+     "gradient_accumulation_steps": 1,
+     "train_micro_batch_size_per_gpu": 1,
+     "train_batch_size": "auto",
+     "gradient_clipping": "auto",
+     "steps_per_print": 2000,
+     "wall_clock_breakdown": false
+ }
finetune/configs/zero2_offload.yaml ADDED
@@ -0,0 +1,42 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "optimizer": {
+         "type": "AdamW",
+         "params": {
+             "lr": "auto",
+             "weight_decay": "auto",
+             "torch_adam": true,
+             "adam_w_mode": true
+         }
+     },
+     "scheduler": {
+         "type": "WarmupDecayLR",
+         "params": {
+             "warmup_min_lr": "auto",
+             "warmup_max_lr": "auto",
+             "warmup_num_steps": "auto",
+             "total_num_steps": "auto"
+         }
+     },
+     "zero_optimization": {
+         "stage": 2,
+         "allgather_partitions": true,
+         "allgather_bucket_size": 2e8,
+         "overlap_comm": true,
+         "reduce_scatter": true,
+         "reduce_bucket_size": 5e8,
+         "contiguous_gradients": true,
+         "offload_optimizer": {
+             "device": "cpu",
+             "pin_memory": true
+         }
+     },
+     "gradient_accumulation_steps": 1,
+     "train_micro_batch_size_per_gpu": 1,
+     "train_batch_size": "auto",
+     "gradient_clipping": "auto",
+     "steps_per_print": 2000,
+     "wall_clock_breakdown": false
+ }
finetune/configs/zero3.yaml ADDED
@@ -0,0 +1,41 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "optimizer": {
+         "type": "AdamW",
+         "params": {
+             "lr": "auto",
+             "weight_decay": "auto",
+             "torch_adam": true,
+             "adam_w_mode": true
+         }
+     },
+     "scheduler": {
+         "type": "WarmupDecayLR",
+         "params": {
+             "warmup_min_lr": "auto",
+             "warmup_max_lr": "auto",
+             "warmup_num_steps": "auto",
+             "total_num_steps": "auto"
+         }
+     },
+     "zero_optimization": {
+         "stage": 3,
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "reduce_bucket_size": 5e8,
+         "sub_group_size": 1e9,
+         "stage3_max_live_parameters": 1e9,
+         "stage3_max_reuse_distance": 1e9,
+         "stage3_gather_16bit_weights_on_model_save": "auto",
+         "stage3_prefetch_bucket_size": 5e8,
+         "stage3_param_persistence_threshold": 1e5
+     },
+     "gradient_accumulation_steps": 1,
+     "train_micro_batch_size_per_gpu": 1,
+     "train_batch_size": "auto",
+     "gradient_clipping": "auto",
+     "steps_per_print": 2000,
+     "wall_clock_breakdown": false
+ }
finetune/configs/zero3_offload.yaml ADDED
@@ -0,0 +1,49 @@
+ {
+     "bf16": {
+         "enabled": true
+     },
+     "optimizer": {
+         "type": "AdamW",
+         "params": {
+             "lr": "auto",
+             "weight_decay": "auto",
+             "torch_adam": true,
+             "adam_w_mode": true
+         }
+     },
+     "scheduler": {
+         "type": "WarmupDecayLR",
+         "params": {
+             "warmup_min_lr": "auto",
+             "warmup_max_lr": "auto",
+             "warmup_num_steps": "auto",
+             "total_num_steps": "auto"
+         }
+     },
+     "zero_optimization": {
+         "stage": 3,
+         "offload_optimizer": {
+             "device": "cpu",
+             "pin_memory": true
+         },
+         "offload_param": {
+             "device": "cpu",
+             "pin_memory": true
+         },
+         "overlap_comm": true,
+         "contiguous_gradients": true,
+         "reduce_bucket_size": 5e8,
+         "sub_group_size": 1e9,
+         "stage3_max_live_parameters": 1e9,
+         "stage3_max_reuse_distance": 1e9,
+         "stage3_gather_16bit_weights_on_model_save": "auto",
+         "stage3_prefetch_bucket_size": 5e8,
+         "stage3_param_persistence_threshold": 1e6
+     },
+     "gradient_accumulation_steps": 1,
+     "train_micro_batch_size_per_gpu": 1,
+     "train_batch_size": "auto",
+     "gradient_clipping": "auto",
+     "steps_per_print": 2000,
+     "wall_clock_breakdown": false
+ }
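
Note: the configs above are JSON documents stored with a .yaml extension (valid, since JSON is a subset of YAML). JSON parsers keep only the last occurrence of a repeated key, so duplicated entries such as the stage3_* keys originally listed twice in the ZeRO-3 files silently override one another. A small hedged sketch (hypothetical helper, not part of this commit) that loads a config and fails loudly on duplicates:

    import json
    from pathlib import Path

    def load_ds_config(path: str) -> dict:
        """Load a DeepSpeed JSON config and raise if any key is duplicated."""
        def check_dupes(pairs):
            keys = [k for k, _ in pairs]
            dupes = sorted({k for k in keys if keys.count(k) > 1})
            if dupes:
                raise ValueError(f"duplicate keys in {path}: {dupes}")
            return dict(pairs)
        return json.loads(Path(path).read_text(), object_pairs_hook=check_dupes)

    # Example: load_ds_config("finetune/configs/zero3.yaml")
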
finetune/constants.py ADDED
@@ -0,0 +1,2 @@
+ LOG_NAME = "trainer"
+ LOG_LEVEL = "INFO"
finetune/datasets/__init__.py ADDED
@@ -0,0 +1,14 @@
+ from .bucket_sampler import BucketSampler
+ from .i2v_dataset import I2VDatasetWithBuckets, I2VDatasetWithResize
+ from .t2v_dataset import T2VDatasetWithBuckets, T2VDatasetWithResize
+ from .i2v_flow_dataset import I2VFlowDataset
+
+
+ __all__ = [
+     "I2VDatasetWithResize",
+     "I2VDatasetWithBuckets",
+     "T2VDatasetWithResize",
+     "T2VDatasetWithBuckets",
+     "BucketSampler",
+     "I2VFlowDataset",
+ ]
finetune/datasets/bucket_sampler.py ADDED
@@ -0,0 +1,71 @@
+ import logging
+ import random
+
+ from torch.utils.data import Dataset, Sampler
+
+
+ logger = logging.getLogger(__name__)
+
+
+ class BucketSampler(Sampler):
+     r"""
+     PyTorch Sampler that groups 3D data by height, width and frames.
+
+     Args:
+         data_source (`VideoDataset`):
+             A PyTorch dataset object that is an instance of `VideoDataset`.
+         batch_size (`int`, defaults to `8`):
+             The batch size to use for training.
+         shuffle (`bool`, defaults to `True`):
+             Whether or not to shuffle the data in each batch before dispatching to dataloader.
+         drop_last (`bool`, defaults to `False`):
+             Whether or not to drop incomplete buckets of data after completely iterating over all data
+             in the dataset. If set to True, only batches that have `batch_size` number of entries will
+             be yielded. If set to False, it is guaranteed that all data in the dataset will be processed
+             and batches that do not have `batch_size` number of entries will also be yielded.
+     """
+
+     def __init__(
+         self, data_source: Dataset, batch_size: int = 8, shuffle: bool = True, drop_last: bool = False
+     ) -> None:
+         self.data_source = data_source
+         self.batch_size = batch_size
+         self.shuffle = shuffle
+         self.drop_last = drop_last
+
+         self.buckets = {resolution: [] for resolution in data_source.video_resolution_buckets}
+
+         self._raised_warning_for_drop_last = False
+
+     def __len__(self):
+         if self.drop_last and not self._raised_warning_for_drop_last:
+             self._raised_warning_for_drop_last = True
+             logger.warning(
+                 "Calculating the length for bucket sampler is not possible when `drop_last` is set to True. This may cause problems when setting the number of epochs used for training."
+             )
+         return (len(self.data_source) + self.batch_size - 1) // self.batch_size
+
+     def __iter__(self):
+         for index, data in enumerate(self.data_source):
+             video_metadata = data["video_metadata"]
+             f, h, w = video_metadata["num_frames"], video_metadata["height"], video_metadata["width"]
+
+             self.buckets[(f, h, w)].append(data)
+             if len(self.buckets[(f, h, w)]) == self.batch_size:
+                 if self.shuffle:
+                     random.shuffle(self.buckets[(f, h, w)])
+                 yield self.buckets[(f, h, w)]
+                 del self.buckets[(f, h, w)]
+                 self.buckets[(f, h, w)] = []
+
+         if self.drop_last:
+             return
+
+         for fhw, bucket in list(self.buckets.items()):
+             if len(bucket) == 0:
+                 continue
+             if self.shuffle:
+                 random.shuffle(bucket)
+             yield bucket
+             del self.buckets[fhw]
+             self.buckets[fhw] = []
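
Note: BucketSampler yields whole buckets (lists of already-loaded samples) rather than indices, which is why the dataset __getitem__ methods below simply return the argument when it is a list. One plausible way to wire it into a DataLoader, under that assumption:

    from torch.utils.data import DataLoader

    from finetune.datasets import BucketSampler

    # `dataset` is assumed to be an I2VDatasetWithBuckets / T2VDatasetWithBuckets
    # instance; BucketSampler yields lists of samples, and the dataset's
    # __getitem__ passes such a list straight through.
    def build_bucket_dataloader(dataset, batch_size: int = 8) -> DataLoader:
        sampler = BucketSampler(dataset, batch_size=batch_size, shuffle=True)
        return DataLoader(
            dataset,
            sampler=sampler,
            batch_size=1,                       # each "item" is already a full bucket
            collate_fn=lambda batch: batch[0],  # unwrap the singleton list
        )
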
finetune/datasets/i2v_dataset.py ADDED
@@ -0,0 +1,311 @@
1
+ import hashlib
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING, Any, Dict, List, Tuple
4
+
5
+ import torch
6
+ from accelerate.logging import get_logger
7
+ from safetensors.torch import load_file, save_file
8
+ from torch.utils.data import Dataset
9
+ from torchvision import transforms
10
+ from typing_extensions import override
11
+
12
+ from finetune.constants import LOG_LEVEL, LOG_NAME
13
+
14
+ from .utils import (
15
+ load_images,
16
+ load_images_from_videos,
17
+ load_prompts,
18
+ load_videos,
19
+ preprocess_image_with_resize,
20
+ preprocess_video_with_buckets,
21
+ preprocess_video_with_resize,
22
+ )
23
+
24
+
25
+ if TYPE_CHECKING:
26
+ from finetune.trainer import Trainer
27
+
28
+ # Must import after torch because this can sometimes lead to a nasty segmentation fault, or stack smashing error
29
+ # Very few bug reports but it happens. Look in decord Github issues for more relevant information.
30
+ import decord # isort:skip
31
+
32
+ decord.bridge.set_bridge("torch")
33
+
34
+ logger = get_logger(LOG_NAME, LOG_LEVEL)
35
+
36
+
37
+ class BaseI2VDataset(Dataset):
38
+ """
39
+ Base dataset class for Image-to-Video (I2V) training.
40
+
41
+ This dataset loads prompts, videos and corresponding conditioning images for I2V training.
42
+
43
+ Args:
44
+ data_root (str): Root directory containing the dataset files
45
+ caption_column (str): Path to file containing text prompts/captions
46
+ video_column (str): Path to file containing video paths
47
+ image_column (str): Path to file containing image paths
48
+ device (torch.device): Device to load the data on
49
+ encode_video_fn (Callable[[torch.Tensor], torch.Tensor], optional): Function to encode videos
50
+ """
51
+
52
+ def __init__(
53
+ self,
54
+ data_root: str,
55
+ caption_column: str,
56
+ video_column: str,
57
+ image_column: str | None,
58
+ device: torch.device,
59
+ trainer: "Trainer" = None,
60
+ *args,
61
+ **kwargs,
62
+ ) -> None:
63
+ super().__init__()
64
+
65
+ data_root = Path(data_root)
66
+ self.prompts = load_prompts(data_root / caption_column)
67
+ self.videos = load_videos(data_root / video_column)
68
+ if image_column is not None:
69
+ self.images = load_images(data_root / image_column)
70
+ else:
71
+ self.images = load_images_from_videos(self.videos)
72
+ self.trainer = trainer
73
+
74
+ self.device = device
75
+ self.encode_video = trainer.encode_video
76
+ self.encode_text = trainer.encode_text
77
+
78
+ # Check if number of prompts matches number of videos and images
79
+ if not (len(self.videos) == len(self.prompts) == len(self.images)):
80
+ raise ValueError(
81
+ f"Expected length of prompts, videos and images to be the same but found {len(self.prompts)=}, {len(self.videos)=} and {len(self.images)=}. Please ensure that the number of caption prompts, videos and images match in your dataset."
82
+ )
83
+
84
+ # Check if all video files exist
85
+ if any(not path.is_file() for path in self.videos):
86
+ raise ValueError(
87
+ f"Some video files were not found. Please ensure that all video files exist in the dataset directory. Missing file: {next(path for path in self.videos if not path.is_file())}"
88
+ )
89
+
90
+ # Check if all image files exist
91
+ if any(not path.is_file() for path in self.images):
92
+ raise ValueError(
93
+ f"Some image files were not found. Please ensure that all image files exist in the dataset directory. Missing file: {next(path for path in self.images if not path.is_file())}"
94
+ )
95
+
96
+ def __len__(self) -> int:
97
+ return len(self.videos)
98
+
99
+ def __getitem__(self, index: int) -> Dict[str, Any]:
100
+ if isinstance(index, list):
101
+ # Here, index is actually a list of data objects that we need to return.
102
+ # The BucketSampler should ideally return indices. But, in the sampler, we'd like
103
+ # to have information about num_frames, height and width. Since this is not stored
104
+ # as metadata, we need to read the video to get this information. You could read this
105
+ # information without loading the full video in memory, but we do it anyway. In order
106
+ # to not load the video twice (once to get the metadata, and once to return the loaded video
107
+ # based on sampled indices), we cache it in the BucketSampler. When the sampler is
108
+ # to yield, we yield the cache data instead of indices. So, this special check ensures
109
+ # that data is not loaded a second time. PRs are welcome for improvements.
110
+ return index
111
+
112
+ prompt = self.prompts[index]
113
+ video = self.videos[index]
114
+ image = self.images[index]
115
+ train_resolution_str = "x".join(str(x) for x in self.trainer.args.train_resolution)
116
+
117
+ cache_dir = self.trainer.args.data_root / "cache"
118
+ video_latent_dir = cache_dir / "video_latent" / self.trainer.args.model_name / train_resolution_str
119
+ prompt_embeddings_dir = cache_dir / "prompt_embeddings"
120
+ video_latent_dir.mkdir(parents=True, exist_ok=True)
121
+ prompt_embeddings_dir.mkdir(parents=True, exist_ok=True)
122
+
123
+ prompt_hash = str(hashlib.sha256(prompt.encode()).hexdigest())
124
+ prompt_embedding_path = prompt_embeddings_dir / (prompt_hash + ".safetensors")
125
+ encoded_video_path = video_latent_dir / (video.stem + ".safetensors")
126
+
127
+ if prompt_embedding_path.exists():
128
+ prompt_embedding = load_file(prompt_embedding_path)["prompt_embedding"]
129
+ logger.debug(
130
+ f"process {self.trainer.accelerator.process_index}: Loaded prompt embedding from {prompt_embedding_path}",
131
+ main_process_only=False,
132
+ )
133
+ else:
134
+ prompt_embedding = self.encode_text(prompt)
135
+ prompt_embedding = prompt_embedding.to("cpu")
136
+ # [1, seq_len, hidden_size] -> [seq_len, hidden_size]
137
+ prompt_embedding = prompt_embedding[0]
138
+ save_file({"prompt_embedding": prompt_embedding}, prompt_embedding_path)
139
+ logger.info(f"Saved prompt embedding to {prompt_embedding_path}", main_process_only=False)
140
+
141
+ if encoded_video_path.exists():
142
+ encoded_video = load_file(encoded_video_path)["encoded_video"]
143
+ logger.debug(f"Loaded encoded video from {encoded_video_path}", main_process_only=False)
144
+ # shape of image: [C, H, W]
145
+ _, image = self.preprocess(None, self.images[index])
146
+ image = self.image_transform(image)
147
+ else:
148
+ frames, image = self.preprocess(video, image)
149
+ frames = frames.to(self.device)
150
+ image = image.to(self.device)
151
+ image = self.image_transform(image)
152
+ # Current shape of frames: [F, C, H, W]
153
+ frames = self.video_transform(frames)
154
+
155
+ # Convert to [B, C, F, H, W]
156
+ frames = frames.unsqueeze(0)
157
+ frames = frames.permute(0, 2, 1, 3, 4).contiguous()
158
+ encoded_video = self.encode_video(frames)
159
+
160
+ # [1, C, F, H, W] -> [C, F, H, W]
161
+ encoded_video = encoded_video[0]
162
+ encoded_video = encoded_video.to("cpu")
163
+ image = image.to("cpu")
164
+ save_file({"encoded_video": encoded_video}, encoded_video_path)
165
+ logger.info(f"Saved encoded video to {encoded_video_path}", main_process_only=False)
166
+
167
+ # shape of encoded_video: [C, F, H, W]
168
+ # shape of image: [C, H, W]
169
+ return {
170
+ "image": image,
171
+ "prompt_embedding": prompt_embedding,
172
+ "encoded_video": encoded_video,
173
+ "video_metadata": {
174
+ "num_frames": encoded_video.shape[1],
175
+ "height": encoded_video.shape[2],
176
+ "width": encoded_video.shape[3],
177
+ },
178
+ }
179
+
180
+ def preprocess(self, video_path: Path | None, image_path: Path | None) -> Tuple[torch.Tensor, torch.Tensor]:
181
+ """
182
+ Loads and preprocesses a video and an image.
183
+ If either path is None, no preprocessing will be done for that input.
184
+
185
+ Args:
186
+ video_path: Path to the video file to load
187
+ image_path: Path to the image file to load
188
+
189
+ Returns:
190
+ A tuple containing:
191
+ - video(torch.Tensor) of shape [F, C, H, W] where F is number of frames,
192
+ C is number of channels, H is height and W is width
193
+ - image(torch.Tensor) of shape [C, H, W]
194
+ """
195
+ raise NotImplementedError("Subclass must implement this method")
196
+
197
+ def video_transform(self, frames: torch.Tensor) -> torch.Tensor:
198
+ """
199
+ Applies transformations to a video.
200
+
201
+ Args:
202
+ frames (torch.Tensor): A 4D tensor representing a video
203
+ with shape [F, C, H, W] where:
204
+ - F is number of frames
205
+ - C is number of channels (3 for RGB)
206
+ - H is height
207
+ - W is width
208
+
209
+ Returns:
210
+ torch.Tensor: The transformed video tensor
211
+ """
212
+ raise NotImplementedError("Subclass must implement this method")
213
+
214
+ def image_transform(self, image: torch.Tensor) -> torch.Tensor:
215
+ """
216
+ Applies transformations to an image.
217
+
218
+ Args:
219
+ image (torch.Tensor): A 3D tensor representing an image
220
+ with shape [C, H, W] where:
221
+ - C is number of channels (3 for RGB)
222
+ - H is height
223
+ - W is width
224
+
225
+ Returns:
226
+ torch.Tensor: The transformed image tensor
227
+ """
228
+ raise NotImplementedError("Subclass must implement this method")
229
+
230
+
231
+ class I2VDatasetWithResize(BaseI2VDataset):
232
+ """
233
+ A dataset class for image-to-video generation that resizes inputs to fixed dimensions.
234
+
235
+ This class preprocesses videos and images by resizing them to specified dimensions:
236
+ - Videos are resized to max_num_frames x height x width
237
+ - Images are resized to height x width
238
+
239
+ Args:
240
+ max_num_frames (int): Maximum number of frames to extract from videos
241
+ height (int): Target height for resizing videos and images
242
+ width (int): Target width for resizing videos and images
243
+ """
244
+
245
+ def __init__(self, max_num_frames: int, height: int, width: int, *args, **kwargs) -> None:
246
+ super().__init__(*args, **kwargs)
247
+
248
+ self.max_num_frames = max_num_frames
249
+ self.height = height
250
+ self.width = width
251
+
252
+ self.__frame_transforms = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)])
253
+ self.__image_transforms = self.__frame_transforms
254
+
255
+ @override
256
+ def preprocess(self, video_path: Path | None, image_path: Path | None) -> Tuple[torch.Tensor, torch.Tensor]:
257
+ if video_path is not None:
258
+ video = preprocess_video_with_resize(video_path, self.max_num_frames, self.height, self.width)
259
+ else:
260
+ video = None
261
+ if image_path is not None:
262
+ image = preprocess_image_with_resize(image_path, self.height, self.width)
263
+ else:
264
+ image = None
265
+ return video, image
266
+
267
+ @override
268
+ def video_transform(self, frames: torch.Tensor) -> torch.Tensor:
269
+ return torch.stack([self.__frame_transforms(f) for f in frames], dim=0)
270
+
271
+ @override
272
+ def image_transform(self, image: torch.Tensor) -> torch.Tensor:
273
+ return self.__image_transforms(image)
274
+
275
+
276
+ class I2VDatasetWithBuckets(BaseI2VDataset):
277
+ def __init__(
278
+ self,
279
+ video_resolution_buckets: List[Tuple[int, int, int]],
280
+ vae_temporal_compression_ratio: int,
281
+ vae_height_compression_ratio: int,
282
+ vae_width_compression_ratio: int,
283
+ *args,
284
+ **kwargs,
285
+ ) -> None:
286
+ super().__init__(*args, **kwargs)
287
+
288
+ self.video_resolution_buckets = [
289
+ (
290
+ int(b[0] / vae_temporal_compression_ratio),
291
+ int(b[1] / vae_height_compression_ratio),
292
+ int(b[2] / vae_width_compression_ratio),
293
+ )
294
+ for b in video_resolution_buckets
295
+ ]
296
+ self.__frame_transforms = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)])
297
+ self.__image_transforms = self.__frame_transforms
298
+
299
+ @override
300
+ def preprocess(self, video_path: Path, image_path: Path) -> Tuple[torch.Tensor, torch.Tensor]:
301
+ video = preprocess_video_with_buckets(video_path, self.video_resolution_buckets)
302
+ image = preprocess_image_with_resize(image_path, video.shape[2], video.shape[3])
303
+ return video, image
304
+
305
+ @override
306
+ def video_transform(self, frames: torch.Tensor) -> torch.Tensor:
307
+ return torch.stack([self.__frame_transforms(f) for f in frames], dim=0)
308
+
309
+ @override
310
+ def image_transform(self, image: torch.Tensor) -> torch.Tensor:
311
+ return self.__image_transforms(image)
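
Note: BaseI2VDataset caches text embeddings under cache/prompt_embeddings/, keyed by the SHA-256 of the prompt, and video latents under cache/video_latent/<model_name>/<train_resolution>/, keyed by the video file stem. A small sketch that mirrors that layout (helper name and example arguments are illustrative only) can be handy for inspecting or pre-populating the cache:

    import hashlib
    from pathlib import Path

    # Mirrors the cache layout used by BaseI2VDataset.__getitem__ above.
    def cache_paths(data_root, model_name, train_resolution, prompt, video_path):
        cache_dir = Path(data_root) / "cache"
        res = "x".join(str(x) for x in train_resolution)
        prompt_hash = hashlib.sha256(prompt.encode()).hexdigest()
        prompt_embedding_path = cache_dir / "prompt_embeddings" / f"{prompt_hash}.safetensors"
        video_latent_path = cache_dir / "video_latent" / model_name / res / f"{Path(video_path).stem}.safetensors"
        return prompt_embedding_path, video_latent_path

    # Illustrative arguments, not taken from the repo's defaults:
    print(cache_paths("data", "cogvideox-i2v", (49, 480, 720),
                      "A chef in a white coat plates a dish.", "videos/clip_0001.mp4"))
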
finetune/datasets/i2v_flow_dataset.py ADDED
@@ -0,0 +1,188 @@
1
+ import hashlib
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING, Any, Dict, List, Tuple
4
+ import json
5
+ import random
6
+
7
+ import torch
8
+ from accelerate.logging import get_logger
9
+ from safetensors.torch import load_file, save_file
10
+ from torch.utils.data import Dataset
11
+ from torchvision import transforms
12
+ from typing_extensions import override
13
+
14
+ from finetune.constants import LOG_LEVEL, LOG_NAME
15
+
16
+ from .utils import (
17
+ load_images,
18
+ load_images_from_videos,
19
+ load_prompts,
20
+ load_videos,
21
+ preprocess_image_with_resize,
22
+ preprocess_video_with_buckets,
23
+ preprocess_video_with_resize,
24
+ load_binary_mask_compressed,
25
+ )
26
+
29
+ if TYPE_CHECKING:
30
+ from finetune.trainer import Trainer
31
+
32
+ # Must import after torch because this can sometimes lead to a nasty segmentation fault, or stack smashing error
33
+ # Very few bug reports but it happens. Look in decord Github issues for more relevant information.
34
+ import decord # isort:skip
35
+
36
+ decord.bridge.set_bridge("torch")
37
+
38
+ logger = get_logger(LOG_NAME, LOG_LEVEL)
39
+
40
+
41
+ class I2VFlowDataset(Dataset):
42
+ """
43
+ A dataset class for (image,flow)-to-video generation or image-to-flow_video that resizes inputs to fixed dimensions.
44
+
45
+ This class preprocesses videos and images by resizing them to specified dimensions:
46
+ - Videos are resized to max_num_frames x height x width
47
+ - Images are resized to height x width
48
+
49
+ Args:
50
+ max_num_frames (int): Maximum number of frames to extract from videos
51
+ height (int): Target height for resizing videos and images
52
+ width (int): Target width for resizing videos and images
53
+ """
54
+
55
+ def __init__(
56
+ self,
57
+ max_num_frames: int,
58
+ height: int,
59
+ width: int,
60
+ data_root: str,
61
+ caption_column: str,
62
+ video_column: str,
63
+ image_column: str | None,
64
+ device: torch.device,
65
+ trainer: "Trainer" = None,
66
+ *args,
67
+ **kwargs
68
+ ) -> None:
69
+ data_root = Path(data_root)
70
+ metadata_path = data_root / "metadata_revised.jsonl"
71
+ assert metadata_path.is_file(), "For this dataset type, you need metadata_revised.jsonl in the root path"
72
+
73
+ # Load metadata
74
+ # metadata = {
75
+ # "video_path": ...,
76
+ # "hash_code": ...,
77
+ # "prompt": ...,
78
+ # }
79
+ metadata = []
80
+ with open(metadata_path, "r") as f:
81
+ for line in f:
82
+ metadata.append( json.loads(line) )
83
+
84
+ self.prompts = [x["prompt"] for x in metadata]
85
+ if 'curated' in str(data_root).lower():
86
+ self.prompt_embeddings = [data_root / "prompt_embeddings" / (x["hash_code"] + '.safetensors') for x in metadata]
87
+ else:
88
+ self.prompt_embeddings = [data_root / "prompt_embeddings_revised" / (x["hash_code"] + '.safetensors') for x in metadata]
89
+ self.videos = [data_root / "video_latent" / "x".join(str(x) for x in trainer.args.train_resolution) / (x["hash_code"] + '.safetensors') for x in metadata]
90
+ self.images = [data_root / "first_frames" / (x["hash_code"] + '.png') for x in metadata]
91
+ self.flows = [data_root / "flow_direct_f_latent" / (x["hash_code"] + '.safetensors') for x in metadata]
92
+
93
+
94
+ # data_root = Path(data_root)
95
+ # self.prompts = load_prompts(data_root / caption_column)
96
+ # self.videos = load_videos(data_root / video_column)
97
+
98
+ self.trainer = trainer
99
+
100
+ self.device = device
101
+ self.encode_video = trainer.encode_video
102
+ self.encode_text = trainer.encode_text
103
+
104
+ # Check if number of prompts matches number of videos and images
105
+ if not (len(self.videos) == len(self.prompts) == len(self.images) == len(self.flows)):
106
+ raise ValueError(
107
+ f"Expected length of prompts, videos and images to be the same but found {len(self.prompts)=}, {len(self.videos)=}, {len(self.images)=} and {len(self.flows)=}. Please ensure that the number of caption prompts, videos and images match in your dataset."
108
+ )
109
+
110
+ self.max_num_frames = max_num_frames
111
+ self.height = height
112
+ self.width = width
113
+
114
+ self.__frame_transforms = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)])
115
+ self.__image_transforms = self.__frame_transforms
116
+
117
+ self.length = len(self.videos)
118
+
119
+ print(f"Dataset size: {self.length}")
120
+
121
+ def __len__(self) -> int:
122
+ return self.length
123
+
124
+ def load_data_pair(self, index):
125
+ # prompt = self.prompts[index]
126
+ prompt_embedding_path = self.prompt_embeddings[index]
127
+ encoded_video_path = self.videos[index]
128
+ encoded_flow_path = self.flows[index]
129
+ # mask_path = self.masks[index]
130
+ # image_path = self.images[index]
131
+ # train_resolution_str = "x".join(str(x) for x in self.trainer.args.train_resolution)
132
+
133
+ prompt_embedding = load_file(prompt_embedding_path)["prompt_embedding"]
134
+ encoded_video = load_file(encoded_video_path)["encoded_video"] # CFHW
135
+ encoded_flow = load_file(encoded_flow_path)["encoded_flow_f"] # CFHW
136
+
137
+ return prompt_embedding, encoded_video, encoded_flow
138
+
139
+ def __getitem__(self, index: int) -> Dict[str, Any]:
140
+ while True:
141
+ try:
142
+ prompt_embedding, encoded_video, encoded_flow = self.load_data_pair(index)
143
+ break
144
+ except Exception as e:
145
+ print(f"Error loading {self.prompt_embeddings[index]}: {str(e)}")
146
+ index = random.randint(0, self.length - 1)
147
+
148
+ image_path = self.images[index]
149
+ prompt = self.prompts[index]
150
+ train_resolution_str = "x".join(str(x) for x in self.trainer.args.train_resolution)
151
+
152
+ _, image = self.preprocess(None, image_path)
153
+ image = self.image_transform(image)
154
+
155
+
156
+ # shape of encoded_video: [C, F, H, W]
157
+ # shape and scale of image: [C, H, W], [-1,1]
158
+ return {
159
+ "image": image,
160
+ "prompt_embedding": prompt_embedding,
161
+ "encoded_video": encoded_video,
162
+ "encoded_flow": encoded_flow,
163
+ "video_metadata": {
164
+ "num_frames": encoded_video.shape[1],
165
+ "height": encoded_video.shape[2],
166
+ "width": encoded_video.shape[3],
167
+ },
168
+ }
169
+
170
+ @override
171
+ def preprocess(self, video_path: Path | None, image_path: Path | None) -> Tuple[torch.Tensor, torch.Tensor]:
172
+ if video_path is not None:
173
+ video = preprocess_video_with_resize(video_path, self.max_num_frames, self.height, self.width)
174
+ else:
175
+ video = None
176
+ if image_path is not None:
177
+ image = preprocess_image_with_resize(image_path, self.height, self.width)
178
+ else:
179
+ image = None
180
+ return video, image
181
+
182
+ @override
183
+ def video_transform(self, frames: torch.Tensor) -> torch.Tensor:
184
+ return torch.stack([self.__frame_transforms(f) for f in frames], dim=0)
185
+
186
+ @override
187
+ def image_transform(self, image: torch.Tensor) -> torch.Tensor:
188
+ return self.__image_transforms(image)
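
Note: unlike the datasets above, I2VFlowDataset reads only precomputed artifacts, indexed by data_root/metadata_revised.jsonl with one JSON object per line carrying at least the "hash_code" and "prompt" fields used above (plus "video_path", per the inline comment); the matching .safetensors/.png files are expected to already exist under prompt_embeddings*/, video_latent/<train_resolution>/, first_frames/ and flow_direct_f_latent/. A sketch of writing such a metadata file (the paths, hash and prompt below are made up):

    import json
    from pathlib import Path

    records = [
        {
            "video_path": "videos/clip_0001.mp4",   # illustrative
            "hash_code": "0123456789abcdef",        # illustrative
            "prompt": "A chef in a white coat plates a dish.",
        },
    ]

    data_root = Path("data_root")
    data_root.mkdir(exist_ok=True)
    with open(data_root / "metadata_revised.jsonl", "w") as f:
        for rec in records:
            f.write(json.dumps(rec) + "\n")
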
finetune/datasets/t2v_dataset.py ADDED
@@ -0,0 +1,251 @@
1
+ import hashlib
2
+ from pathlib import Path
3
+ from typing import TYPE_CHECKING, Any, Dict, List, Tuple
4
+
5
+ import torch
6
+ from accelerate.logging import get_logger
7
+ from safetensors.torch import load_file, save_file
8
+ from torch.utils.data import Dataset
9
+ from torchvision import transforms
10
+ from typing_extensions import override
11
+
12
+ from finetune.constants import LOG_LEVEL, LOG_NAME
13
+
14
+ from .utils import load_prompts, load_videos, preprocess_video_with_buckets, preprocess_video_with_resize
15
+
16
+
17
+ if TYPE_CHECKING:
18
+ from finetune.trainer import Trainer
19
+
20
+ # Must import after torch because this can sometimes lead to a nasty segmentation fault, or stack smashing error
21
+ # Very few bug reports but it happens. Look in decord Github issues for more relevant information.
22
+ import decord # isort:skip
23
+
24
+ decord.bridge.set_bridge("torch")
25
+
26
+ logger = get_logger(LOG_NAME, LOG_LEVEL)
27
+
28
+
29
+ class BaseT2VDataset(Dataset):
30
+ """
31
+ Base dataset class for Text-to-Video (T2V) training.
32
+
33
+ This dataset loads prompts and videos for T2V training.
34
+
35
+ Args:
36
+ data_root (str): Root directory containing the dataset files
37
+ caption_column (str): Path to file containing text prompts/captions
38
+ video_column (str): Path to file containing video paths
39
+ device (torch.device): Device to load the data on
40
+ encode_video_fn (Callable[[torch.Tensor], torch.Tensor], optional): Function to encode videos
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ data_root: str,
46
+ caption_column: str,
47
+ video_column: str,
48
+ device: torch.device = None,
49
+ trainer: "Trainer" = None,
50
+ *args,
51
+ **kwargs,
52
+ ) -> None:
53
+ super().__init__()
54
+
55
+ data_root = Path(data_root)
56
+ self.prompts = load_prompts(data_root / caption_column)
57
+ self.videos = load_videos(data_root / video_column)
58
+ self.device = device
59
+ self.encode_video = trainer.encode_video
60
+ self.encode_text = trainer.encode_text
61
+ self.trainer = trainer
62
+
63
+ # Check if all video files exist
64
+ if any(not path.is_file() for path in self.videos):
65
+ raise ValueError(
66
+ f"Some video files were not found. Please ensure that all video files exist in the dataset directory. Missing file: {next(path for path in self.videos if not path.is_file())}"
67
+ )
68
+
69
+ # Check if number of prompts matches number of videos
70
+ if len(self.videos) != len(self.prompts):
71
+ raise ValueError(
72
+ f"Expected length of prompts and videos to be the same but found {len(self.prompts)=} and {len(self.videos)=}. Please ensure that the number of caption prompts and videos match in your dataset."
73
+ )
74
+
75
+ def __len__(self) -> int:
76
+ return len(self.videos)
77
+
78
+ def __getitem__(self, index: int) -> Dict[str, Any]:
79
+ if isinstance(index, list):
80
+ # Here, index is actually a list of data objects that we need to return.
81
+ # The BucketSampler should ideally return indices. But, in the sampler, we'd like
82
+ # to have information about num_frames, height and width. Since this is not stored
83
+ # as metadata, we need to read the video to get this information. You could read this
84
+ # information without loading the full video in memory, but we do it anyway. In order
85
+ # to not load the video twice (once to get the metadata, and once to return the loaded video
86
+ # based on sampled indices), we cache it in the BucketSampler. When the sampler is
87
+ # to yield, we yield the cache data instead of indices. So, this special check ensures
88
+ # that data is not loaded a second time. PRs are welcome for improvements.
89
+ return index
90
+
91
+ prompt = self.prompts[index]
92
+ video = self.videos[index]
93
+ train_resolution_str = "x".join(str(x) for x in self.trainer.args.train_resolution)
94
+
95
+ cache_dir = self.trainer.args.data_root / "cache"
96
+ video_latent_dir = cache_dir / "video_latent" / self.trainer.args.model_name / train_resolution_str
97
+ prompt_embeddings_dir = cache_dir / "prompt_embeddings"
98
+ video_latent_dir.mkdir(parents=True, exist_ok=True)
99
+ prompt_embeddings_dir.mkdir(parents=True, exist_ok=True)
100
+
101
+ prompt_hash = str(hashlib.sha256(prompt.encode()).hexdigest())
102
+ prompt_embedding_path = prompt_embeddings_dir / (prompt_hash + ".safetensors")
103
+ encoded_video_path = video_latent_dir / (video.stem + ".safetensors")
104
+
105
+ if prompt_embedding_path.exists():
106
+ prompt_embedding = load_file(prompt_embedding_path)["prompt_embedding"]
107
+ logger.debug(
108
+ f"process {self.trainer.accelerator.process_index}: Loaded prompt embedding from {prompt_embedding_path}",
109
+ main_process_only=False,
110
+ )
111
+ else:
112
+ prompt_embedding = self.encode_text(prompt)
113
+ prompt_embedding = prompt_embedding.to("cpu")
114
+ # [1, seq_len, hidden_size] -> [seq_len, hidden_size]
115
+ prompt_embedding = prompt_embedding[0]
116
+ save_file({"prompt_embedding": prompt_embedding}, prompt_embedding_path)
117
+ logger.info(f"Saved prompt embedding to {prompt_embedding_path}", main_process_only=False)
118
+
119
+ if encoded_video_path.exists():
120
+ # encoded_video = torch.load(encoded_video_path, weights_only=True)
121
+ encoded_video = load_file(encoded_video_path)["encoded_video"]
122
+ logger.debug(f"Loaded encoded video from {encoded_video_path}", main_process_only=False)
124
+ else:
125
+ frames = self.preprocess(video)
126
+ frames = frames.to(self.device)
127
+ # Current shape of frames: [F, C, H, W]
128
+ frames = self.video_transform(frames)
129
+ # Convert to [B, C, F, H, W]
130
+ frames = frames.unsqueeze(0)
131
+ frames = frames.permute(0, 2, 1, 3, 4).contiguous()
132
+ encoded_video = self.encode_video(frames)
133
+
134
+ # [1, C, F, H, W] -> [C, F, H, W]
135
+ encoded_video = encoded_video[0]
136
+ encoded_video = encoded_video.to("cpu")
137
+ save_file({"encoded_video": encoded_video}, encoded_video_path)
138
+ logger.info(f"Saved encoded video to {encoded_video_path}", main_process_only=False)
139
+
140
+ # shape of encoded_video: [C, F, H, W]
141
+ return {
142
+ "prompt_embedding": prompt_embedding,
143
+ "encoded_video": encoded_video,
144
+ "video_metadata": {
145
+ "num_frames": encoded_video.shape[1],
146
+ "height": encoded_video.shape[2],
147
+ "width": encoded_video.shape[3],
148
+ },
149
+ }
150
+
151
+ def preprocess(self, video_path: Path) -> torch.Tensor:
152
+ """
153
+ Loads and preprocesses a video.
154
+
155
+ Args:
156
+ video_path: Path to the video file to load.
157
+
158
+ Returns:
159
+ torch.Tensor: Video tensor of shape [F, C, H, W] where:
160
+ - F is number of frames
161
+ - C is number of channels (3 for RGB)
162
+ - H is height
163
+ - W is width
164
+ """
165
+ raise NotImplementedError("Subclass must implement this method")
166
+
167
+ def video_transform(self, frames: torch.Tensor) -> torch.Tensor:
168
+ """
169
+ Applies transformations to a video.
170
+
171
+ Args:
172
+ frames (torch.Tensor): A 4D tensor representing a video
173
+ with shape [F, C, H, W] where:
174
+ - F is number of frames
175
+ - C is number of channels (3 for RGB)
176
+ - H is height
177
+ - W is width
178
+
179
+ Returns:
180
+ torch.Tensor: The transformed video tensor with the same shape as the input
181
+ """
182
+ raise NotImplementedError("Subclass must implement this method")
183
+
184
+
185
+ class T2VDatasetWithResize(BaseT2VDataset):
186
+ """
187
+ A dataset class for text-to-video generation that resizes inputs to fixed dimensions.
188
+
189
+ This class preprocesses videos by resizing them to specified dimensions:
190
+ - Videos are resized to max_num_frames x height x width
191
+
192
+ Args:
193
+ max_num_frames (int): Maximum number of frames to extract from videos
194
+ height (int): Target height for resizing videos
195
+ width (int): Target width for resizing videos
196
+ """
197
+
198
+ def __init__(self, max_num_frames: int, height: int, width: int, *args, **kwargs) -> None:
199
+ super().__init__(*args, **kwargs)
200
+
201
+ self.max_num_frames = max_num_frames
202
+ self.height = height
203
+ self.width = width
204
+
205
+ self.__frame_transform = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)])
206
+
207
+ @override
208
+ def preprocess(self, video_path: Path) -> torch.Tensor:
209
+ return preprocess_video_with_resize(
210
+ video_path,
211
+ self.max_num_frames,
212
+ self.height,
213
+ self.width,
214
+ )
215
+
216
+ @override
217
+ def video_transform(self, frames: torch.Tensor) -> torch.Tensor:
218
+ return torch.stack([self.__frame_transform(f) for f in frames], dim=0)
219
+
220
+
221
+ class T2VDatasetWithBuckets(BaseT2VDataset):
222
+ def __init__(
223
+ self,
224
+ video_resolution_buckets: List[Tuple[int, int, int]],
225
+ vae_temporal_compression_ratio: int,
226
+ vae_height_compression_ratio: int,
227
+ vae_width_compression_ratio: int,
228
+ *args,
229
+ **kwargs,
230
+ ) -> None:
231
+ """ """
232
+ super().__init__(*args, **kwargs)
233
+
234
+ self.video_resolution_buckets = [
235
+ (
236
+ int(b[0] / vae_temporal_compression_ratio),
237
+ int(b[1] / vae_height_compression_ratio),
238
+ int(b[2] / vae_width_compression_ratio),
239
+ )
240
+ for b in video_resolution_buckets
241
+ ]
242
+
243
+ self.__frame_transform = transforms.Compose([transforms.Lambda(lambda x: x / 255.0 * 2.0 - 1.0)])
244
+
245
+ @override
246
+ def preprocess(self, video_path: Path) -> torch.Tensor:
247
+ return preprocess_video_with_buckets(video_path, self.video_resolution_buckets)
248
+
249
+ @override
250
+ def video_transform(self, frames: torch.Tensor) -> torch.Tensor:
251
+ return torch.stack([self.__frame_transform(f) for f in frames], dim=0)
finetune/datasets/utils.py ADDED
@@ -0,0 +1,211 @@
1
+ import logging
2
+ from pathlib import Path
3
+ from typing import List, Tuple
4
+
5
+ import cv2
6
+ import torch
7
+ from torchvision.transforms.functional import resize
8
+ from einops import repeat, rearrange
9
+
10
+ # Must import after torch because this can sometimes lead to a nasty segmentation fault, or stack smashing error
11
+ # Very few bug reports but it happens. Look in decord Github issues for more relevant information.
12
+ import decord # isort:skip
13
+
14
+ decord.bridge.set_bridge("torch")
15
+
16
+ from PIL import Image
17
+ import numpy as np
19
+
20
+ ########## loaders ##########
21
+
22
+
23
+ def load_prompts(prompt_path: Path) -> List[str]:
24
+ with open(prompt_path, "r", encoding="utf-8") as file:
25
+ return [line.strip() for line in file.readlines() if len(line.strip()) > 0]
26
+
27
+
28
+ def load_videos(video_path: Path) -> List[Path]:
29
+ with open(video_path, "r", encoding="utf-8") as file:
30
+ return [video_path.parent / line.strip() for line in file.readlines() if len(line.strip()) > 0]
31
+
32
+
33
+ def load_images(image_path: Path) -> List[Path]:
34
+ with open(image_path, "r", encoding="utf-8") as file:
35
+ return [image_path.parent / line.strip() for line in file.readlines() if len(line.strip()) > 0]
36
+
37
+
38
+ def load_images_from_videos(videos_path: List[Path]) -> List[Path]:
39
+ first_frames_dir = videos_path[0].parent.parent / "first_frames"
40
+ first_frames_dir.mkdir(exist_ok=True)
41
+
42
+ first_frame_paths = []
43
+ for video_path in videos_path:
44
+ frame_path = first_frames_dir / f"{video_path.stem}.png"
45
+ if frame_path.exists():
46
+ first_frame_paths.append(frame_path)
47
+ continue
48
+
49
+ # Open video
50
+ cap = cv2.VideoCapture(str(video_path))
51
+
52
+ # Read first frame
53
+ ret, frame = cap.read()
54
+ if not ret:
55
+ raise RuntimeError(f"Failed to read video: {video_path}")
56
+
57
+ # Save frame as PNG with same name as video
58
+ cv2.imwrite(str(frame_path), frame)
59
+ logging.info(f"Saved first frame to {frame_path}")
60
+
61
+ # Release video capture
62
+ cap.release()
63
+
64
+ first_frame_paths.append(frame_path)
65
+
66
+ return first_frame_paths
67
+
68
+
69
+ def load_binary_mask_compressed(path, shape, device, dtype):
70
+ # shape: (F,C,H,W), C=1
71
+ with open(path, 'rb') as f:
72
+ packed = np.frombuffer(f.read(), dtype=np.uint8)
73
+ unpacked = np.unpackbits(packed)[:np.prod(shape)]
74
+ mask_loaded = torch.from_numpy(unpacked).to(device, dtype).reshape(shape)
75
+
76
+ mask_interp = torch.nn.functional.interpolate(rearrange(mask_loaded, 'f c h w -> c f h w').unsqueeze(0), size=(shape[0]//4+1, shape[2]//8, shape[3]//8), mode='trilinear', align_corners=False).squeeze(0) # CFHW
77
+ mask_interp[mask_interp>=0.5] = 1.0
78
+ mask_interp[mask_interp<0.5] = 0.0
79
+
80
+ return rearrange(mask_loaded, 'f c h w -> c f h w'), mask_interp
81
+
82
+ ########## preprocessors ##########
83
+
84
+
85
+ def preprocess_image_with_resize(
86
+ image_path: Path | str,
87
+ height: int,
88
+ width: int,
89
+ ) -> torch.Tensor:
90
+ """
91
+ Loads and resizes a single image.
92
+
93
+ Args:
94
+ image_path: Path to the image file.
95
+ height: Target height for resizing.
96
+ width: Target width for resizing.
97
+
98
+ Returns:
99
+ torch.Tensor: Image tensor with shape [C, H, W] where:
100
+ C = number of channels (3 for RGB)
101
+ H = height
102
+ W = width
103
+ """
104
+ if isinstance(image_path, str):
105
+ image_path = Path(image_path)
106
+ # image = cv2.imread(image_path.as_posix())
107
+ # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
108
+ # image = cv2.resize(image, (width, height))
109
+ # image = torch.from_numpy(image).float()
110
+ # image = image.permute(2, 0, 1).contiguous()
111
+
112
+ image = np.array(Image.open(image_path.as_posix()).resize((width, height)))
113
+ image = torch.from_numpy(image).float()
114
+ image = image.permute(2, 0, 1).contiguous()
115
+
116
+ return image
117
+
118
+
119
+ def preprocess_video_with_resize(
120
+ video_path: Path | str,
121
+ max_num_frames: int,
122
+ height: int,
123
+ width: int,
124
+ ) -> torch.Tensor:
125
+ """
126
+ Loads and resizes a single video.
127
+
128
+ The function processes the video through these steps:
129
+ 1. If video frame count > max_num_frames, downsample frames evenly
130
+ 2. If video dimensions don't match (height, width), resize frames
131
+
132
+ Args:
133
+ video_path: Path to the video file.
134
+ max_num_frames: Maximum number of frames to keep.
135
+ height: Target height for resizing.
136
+ width: Target width for resizing.
137
+
138
+ Returns:
139
+ A torch.Tensor with shape [F, C, H, W] where:
140
+ F = number of frames
141
+ C = number of channels (3 for RGB)
142
+ H = height
143
+ W = width
144
+ """
145
+ if isinstance(video_path, str):
146
+ video_path = Path(video_path)
147
+ video_reader = decord.VideoReader(uri=video_path.as_posix(), width=width, height=height)
148
+ video_num_frames = len(video_reader)
149
+ if video_num_frames < max_num_frames:
150
+ # Get all frames first
151
+ frames = video_reader.get_batch(list(range(video_num_frames)))
152
+ # Repeat the last frame until we reach max_num_frames
153
+ last_frame = frames[-1:]
154
+ num_repeats = max_num_frames - video_num_frames
155
+ repeated_frames = last_frame.repeat(num_repeats, 1, 1, 1)
156
+ frames = torch.cat([frames, repeated_frames], dim=0)
157
+ return frames.float().permute(0, 3, 1, 2).contiguous()
158
+ else:
159
+ indices = list(range(0, video_num_frames, video_num_frames // max_num_frames))
160
+ frames = video_reader.get_batch(indices)
163
+ frames = frames[:max_num_frames].float()
164
+ frames = frames.permute(0, 3, 1, 2).contiguous()
165
+ return frames
166
+
167
+
168
+ def preprocess_video_with_buckets(
169
+ video_path: Path,
170
+ resolution_buckets: List[Tuple[int, int, int]],
171
+ ) -> torch.Tensor:
172
+ """
173
+ Args:
174
+ video_path: Path to the video file.
175
+ resolution_buckets: List of tuples (num_frames, height, width) representing
176
+ available resolution buckets.
177
+
178
+ Returns:
179
+ torch.Tensor: Video tensor with shape [F, C, H, W] where:
180
+ F = number of frames
181
+ C = number of channels (3 for RGB)
182
+ H = height
183
+ W = width
184
+
185
+ The function processes the video through these steps:
186
+ 1. Finds nearest frame bucket <= video frame count
187
+ 2. Downsamples frames evenly to match bucket size
188
+ 3. Finds nearest resolution bucket based on dimensions
189
+ 4. Resizes frames to match bucket resolution
190
+ """
191
+ video_reader = decord.VideoReader(uri=video_path.as_posix())
192
+ video_num_frames = len(video_reader)
193
+ resolution_buckets = [bucket for bucket in resolution_buckets if bucket[0] <= video_num_frames]
194
+ if len(resolution_buckets) == 0:
195
+ raise ValueError(f"video frame count in {video_path} is less than all frame buckets {resolution_buckets}")
196
+
197
+ nearest_frame_bucket = min(
198
+ resolution_buckets,
199
+ key=lambda bucket: video_num_frames - bucket[0],
200
+ default=1,
201
+ )[0]
202
+ frame_indices = list(range(0, video_num_frames, video_num_frames // nearest_frame_bucket))
203
+ frames = video_reader.get_batch(frame_indices)
204
+ frames = frames[:nearest_frame_bucket].float()
205
+ frames = frames.permute(0, 3, 1, 2).contiguous()
206
+
207
+ nearest_res = min(resolution_buckets, key=lambda x: abs(x[1] - frames.shape[2]) + abs(x[2] - frames.shape[3]))
208
+ nearest_res = (nearest_res[1], nearest_res[2])
209
+ frames = torch.stack([resize(f, nearest_res) for f in frames], dim=0)
210
+
211
+ return frames
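
Note: load_binary_mask_compressed above assumes masks were bit-packed with numpy's packbits and written as raw bytes. A round-trip sketch of that on-disk format (the save helper is hypothetical; only the load side exists in this commit):

    import numpy as np

    # A binary mask of shape (F, C=1, H, W) is flattened, bit-packed with
    # np.packbits, and written as raw bytes; loading unpacks and reshapes it.
    def save_binary_mask_compressed(path, mask):
        packed = np.packbits(mask.astype(np.uint8).ravel())
        with open(path, "wb") as f:
            f.write(packed.tobytes())

    mask = np.random.rand(8, 1, 32, 32) > 0.5
    save_binary_mask_compressed("mask.bin", mask)

    with open("mask.bin", "rb") as f:
        packed = np.frombuffer(f.read(), dtype=np.uint8)
    restored = np.unpackbits(packed)[: mask.size].reshape(mask.shape)
    assert (restored == mask.astype(np.uint8)).all()
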
finetune/models/__init__.py ADDED
@@ -0,0 +1,12 @@
+ import importlib
+ from pathlib import Path
+
+
+ package_dir = Path(__file__).parent
+
+ for subdir in package_dir.iterdir():
+     if subdir.is_dir() and not subdir.name.startswith("_"):
+         for module_path in subdir.glob("*.py"):
+             module_name = module_path.stem
+             full_module_name = f".{subdir.name}.{module_name}"
+             importlib.import_module(full_module_name, package=__name__)
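
Note: this __init__.py eagerly imports every model submodule so that trainer classes can register themselves at import time (the trainer below imports `from ..utils import register`). A minimal sketch of that decorator-based registry pattern; the real register() in finetune/models/utils.py may differ in signature:

    # Self-contained illustration of import-time registration.
    _TRAINER_REGISTRY = {}

    def register(model_name, training_type):
        def decorator(cls):
            _TRAINER_REGISTRY[(model_name, training_type)] = cls
            return cls
        return decorator

    @register("cogvideox-i2v", "lora")
    class DummyTrainer:
        pass

    # Once all submodules have been imported, trainers can be looked up by key.
    print(_TRAINER_REGISTRY[("cogvideox-i2v", "lora")])
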
finetune/models/cogvideox_i2v/flovd_OMSM_lora_trainer.py ADDED
@@ -0,0 +1,748 @@
1
+ from typing import Any, Dict, List, Tuple
2
+ from pathlib import Path
3
+ import os
4
+ import hashlib
5
+ import json
6
+ import random
7
+ import wandb
8
+ import math
9
+ import numpy as np
10
+ from einops import rearrange, repeat
11
+ from safetensors.torch import load_file, save_file
12
+ from accelerate.logging import get_logger
13
+
14
+ import torch
15
+
16
+ from accelerate.utils import gather_object
17
+
18
+ from diffusers import (
19
+ AutoencoderKLCogVideoX,
20
+ CogVideoXDPMScheduler,
21
+ CogVideoXImageToVideoPipeline,
22
+ CogVideoXTransformer3DModel,
23
+ )
24
+ from diffusers.utils.export_utils import export_to_video
25
+
26
+ from finetune.pipeline.flovd_OMSM_cogvideox_pipeline import FloVDOMSMCogVideoXImageToVideoPipeline
27
+ from finetune.constants import LOG_LEVEL, LOG_NAME
28
+
29
+ from diffusers.models.embeddings import get_3d_rotary_pos_embed
30
+ from PIL import Image
31
+ from numpy import dtype
32
+ from transformers import AutoTokenizer, T5EncoderModel
33
+ from typing_extensions import override
34
+
35
+ from finetune.schemas import Args, Components, State
36
+ from finetune.trainer import Trainer
37
+ from finetune.utils import (
38
+ cast_training_params,
39
+ free_memory,
40
+ get_memory_statistics,
41
+ string_to_filename,
42
+ unwrap_model,
43
+ )
44
+ from finetune.datasets.utils import (
45
+ preprocess_image_with_resize,
46
+ load_binary_mask_compressed,
47
+ )
48
+ from finetune.modules.camera_sampler import SampleManualCam
49
+ from finetune.modules.camera_flow_generator import CameraFlowGenerator
50
+ from finetune.modules.utils import get_camera_flow_generator_input, forward_bilinear_splatting, flow_to_color
51
+
52
+ from ..utils import register
53
+
54
+ import sys
55
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
56
+
57
+ import pdb
58
+
59
+ logger = get_logger(LOG_NAME, LOG_LEVEL)
60
+
61
+ class FloVDOMSMCogVideoXI2VLoraTrainer(Trainer):
62
+ UNLOAD_LIST = ["text_encoder"]
63
+
64
+ @override
65
+ def __init__(self, args: Args) -> None:
66
+ super().__init__(args)
67
+
68
+
69
+ @override
70
+ def load_components(self) -> Dict[str, Any]:
71
+ # TODO. Change the pipeline and ...
72
+ components = Components()
73
+ model_path = str(self.args.model_path)
74
+
75
+ components.pipeline_cls = FloVDOMSMCogVideoXImageToVideoPipeline
76
+
77
+ components.tokenizer = AutoTokenizer.from_pretrained(model_path, subfolder="tokenizer")
78
+
79
+ components.text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder")
80
+
81
+ components.transformer = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer")
82
+
83
+ components.vae = AutoencoderKLCogVideoX.from_pretrained(model_path, subfolder="vae")
84
+
85
+ components.scheduler = CogVideoXDPMScheduler.from_pretrained(model_path, subfolder="scheduler")
86
+
87
+ return components
88
+
89
+
90
+ @override
91
+ def initialize_pipeline(self) -> FloVDOMSMCogVideoXImageToVideoPipeline:
92
+ # TODO. Change the pipeline and ...
93
+ pipe = FloVDOMSMCogVideoXImageToVideoPipeline(
94
+ tokenizer=self.components.tokenizer,
95
+ text_encoder=self.components.text_encoder,
96
+ vae=self.components.vae,
97
+ transformer=unwrap_model(self.accelerator, self.components.transformer),
98
+ scheduler=self.components.scheduler,
99
+ )
100
+ return pipe
101
+
102
+ def initialize_flow_generator(self):
103
+ depth_estimator_kwargs = {
104
+ "target": 'modules.depth_warping.depth_warping.DepthWarping_wrapper',
105
+ "kwargs": {
106
+ "ckpt_path": '/workspace/workspace/checkpoints/depth_anything/depth_anything_v2_metric_hypersim_vitb.pth',
107
+ "model_config": {
108
+ "max_depth": 20,
109
+ "encoder": 'vitb',
110
+ "features": 128,
111
+ "out_channels": [96, 192, 384, 768],
112
+ }
113
+
114
+ }
115
+ }
116
+
117
+ return CameraFlowGenerator(depth_estimator_kwargs)
118
+
119
+ @override
120
+ def collate_fn(self, samples: List[Dict[str, Any]]) -> Dict[str, Any]:
121
+ ret = {"encoded_videos": [], "prompt_embedding": [], "images": [], "encoded_flow": []}
122
+
123
+ for sample in samples:
124
+ encoded_video = sample["encoded_video"]
125
+ prompt_embedding = sample["prompt_embedding"]
126
+ image = sample["image"]
127
+ encoded_flow = sample["encoded_flow"]
128
+
129
+ ret["encoded_videos"].append(encoded_video)
130
+ ret["prompt_embedding"].append(prompt_embedding)
131
+ ret["images"].append(image)
132
+ ret["encoded_flow"].append(encoded_flow)
133
+
134
+ ret["encoded_videos"] = torch.stack(ret["encoded_videos"])
135
+ ret["prompt_embedding"] = torch.stack(ret["prompt_embedding"])
136
+ ret["images"] = torch.stack(ret["images"])
137
+ ret["encoded_flow"] = torch.stack(ret["encoded_flow"])
138
+
139
+ return ret
140
+
141
+
142
+ @override
143
+ def compute_loss(self, batch) -> torch.Tensor:
144
+ prompt_embedding = batch["prompt_embedding"]
145
+ images = batch["images"]
146
+ latent_flow = batch["encoded_flow"]
147
+
148
+ # Shape of prompt_embedding: [B, seq_len, hidden_size]
149
+ # Shape of images: [B, C, H, W]
150
+ # Shape of latent_flow: [B, C, F, H, W]
151
+
152
+ patch_size_t = self.state.transformer_config.patch_size_t # WJ: None in i2v setting...
153
+ if patch_size_t is not None:
154
+ # ncopy = latent.shape[2] % patch_size_t
155
+ # # Copy the first frame ncopy times to match patch_size_t
156
+ # first_frame = latent[:, :, :1, :, :] # Get first frame [B, C, 1, H, W]
157
+ # latent = torch.cat([first_frame.repeat(1, 1, ncopy, 1, 1), latent], dim=2)
158
+ # assert latent.shape[2] % patch_size_t == 0
159
+ raise NotImplementedError("Do not use the case whose patch_size_t is not None")
160
+
161
+ batch_size, num_channels, num_frames, height, width = latent_flow.shape
162
+
163
+ # Get prompt embeddings
164
+ _, seq_len, _ = prompt_embedding.shape
165
+ prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1).to(dtype=latent_flow.dtype)
166
+
167
+ # Add frame dimension to images [B,C,H,W] -> [B,C,F,H,W]
168
+ images = images.unsqueeze(2)
169
+ # Add noise to images
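+ # (The noise level sigma is drawn log-normally as exp(N(-3.0, 0.5)), lightly corrupting the
+ # conditioning image before VAE encoding, the same noise-augmentation scheme as the base
+ # CogVideoX I2V trainers.)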
170
+ image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=self.accelerator.device)
171
+ image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=images.dtype)
172
+ noisy_images = images + torch.randn_like(images) * image_noise_sigma[:, None, None, None, None]
173
+ image_latent_dist = self.components.vae.encode(noisy_images.to(dtype=self.components.vae.dtype)).latent_dist
174
+ image_latents = image_latent_dist.sample() * self.components.vae.config.scaling_factor
175
+
176
+ # Sample a random timestep for each sample
177
+ timesteps = torch.randint(
178
+ 0, self.components.scheduler.config.num_train_timesteps, (batch_size,), device=self.accelerator.device
179
+ )
180
+ timesteps = timesteps.long()
181
+
182
+ # from [B, C, F, H, W] to [B, F, C, H, W]
183
+ latent_flow = latent_flow.permute(0, 2, 1, 3, 4)
184
+ image_latents = image_latents.permute(0, 2, 1, 3, 4)
185
+ assert (image_latents.shape[0], *image_latents.shape[2:]) == (latent_flow.shape[0], *latent_flow.shape[2:])
186
+
187
+ # Padding image_latents to the same frame number as latent
188
+ padding_shape = (latent_flow.shape[0], latent_flow.shape[1] - 1, *latent_flow.shape[2:])
189
+ latent_padding = image_latents.new_zeros(padding_shape)
190
+ image_latents = torch.cat([image_latents, latent_padding], dim=1)
191
+
192
+ # Add noise to latent
193
+ noise = torch.randn_like(latent_flow)
194
+ latent_flow_noisy = self.components.scheduler.add_noise(latent_flow, noise, timesteps)
195
+
196
+
197
+ # Concatenate latent and image_latents in the channel dimension
198
+ latent_flow_img_noisy = torch.cat([latent_flow_noisy, image_latents], dim=2)
199
+
200
+ # Prepare rotary embeds
201
+ vae_scale_factor_spatial = 2 ** (len(self.components.vae.config.block_out_channels) - 1)
202
+ transformer_config = self.state.transformer_config
203
+ rotary_emb = (
204
+ self.prepare_rotary_positional_embeddings(
205
+ height=height * vae_scale_factor_spatial,
206
+ width=width * vae_scale_factor_spatial,
207
+ num_frames=num_frames,
208
+ transformer_config=transformer_config,
209
+ vae_scale_factor_spatial=vae_scale_factor_spatial,
210
+ device=self.accelerator.device,
211
+ )
212
+ if transformer_config.use_rotary_positional_embeddings
213
+ else None
214
+ )
215
+
216
+ # Predict noise, For CogVideoX1.5 Only.
217
+ ofs_emb = (
218
+ None if self.state.transformer_config.ofs_embed_dim is None else latent_flow.new_full((1,), fill_value=2.0)
219
+ )
220
+
221
+ predicted_noise = self.components.transformer(
222
+ hidden_states=latent_flow_img_noisy,
223
+ encoder_hidden_states=prompt_embedding,
224
+ timestep=timesteps,
225
+ ofs=ofs_emb,
226
+ image_rotary_emb=rotary_emb,
227
+ return_dict=False,
228
+ )[0]
229
+
230
+ # Denoise
231
+ latent_pred = self.components.scheduler.get_velocity(predicted_noise, latent_flow_noisy, timesteps)
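+ # get_velocity(v_pred, x_t, t) evaluates sqrt(alpha_bar_t) * x_t - sqrt(1 - alpha_bar_t) * v_pred,
+ # converting the model's v-prediction back into a predicted clean flow latent for the weighted MSE below.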
232
+
233
+ alphas_cumprod = self.components.scheduler.alphas_cumprod[timesteps]
234
+ weights = 1 / (1 - alphas_cumprod)
235
+ while len(weights.shape) < len(latent_pred.shape):
236
+ weights = weights.unsqueeze(-1)
237
+
238
+ loss = torch.mean((weights * (latent_pred - latent_flow) ** 2).reshape(batch_size, -1), dim=1)
239
+ loss = loss.mean()
240
+
241
+ return loss
242
+
243
+ def prepare_rotary_positional_embeddings(
244
+ self,
245
+ height: int,
246
+ width: int,
247
+ num_frames: int,
248
+ transformer_config: Dict,
249
+ vae_scale_factor_spatial: int,
250
+ device: torch.device,
251
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
252
+ grid_height = height // (vae_scale_factor_spatial * transformer_config.patch_size)
253
+ grid_width = width // (vae_scale_factor_spatial * transformer_config.patch_size)
254
+
255
+ if transformer_config.patch_size_t is None:
256
+ base_num_frames = num_frames
257
+ else:
258
+ base_num_frames = (num_frames + transformer_config.patch_size_t - 1) // transformer_config.patch_size_t
259
+
260
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
261
+ embed_dim=transformer_config.attention_head_dim,
262
+ crops_coords=None,
263
+ grid_size=(grid_height, grid_width),
264
+ temporal_size=base_num_frames,
265
+ grid_type="slice",
266
+ max_size=(grid_height, grid_width),
267
+ device=device,
268
+ )
269
+
270
+ return freqs_cos, freqs_sin
271
+
272
+ # Validation
273
+
274
+ @override
275
+ def prepare_for_validation(self):
276
+ # Load from dataset?
277
+ # Data_root
278
+ # - metadata.jsonl
279
+ # - video_latent / args.resolution /
280
+ # - prompt_embeddings /
281
+ # - first_frames /
282
+ # - flow_direct_f_latent /
283
+
284
+ data_root = self.args.data_root
285
+ metadata_path = data_root / "metadata_revised.jsonl"
286
+ assert metadata_path.is_file(), "For this dataset type, you need metadata_revised.jsonl in the data root"
287
+
288
+ # Load metadata
289
+ # metadata = {
290
+ # "video_path": ...,
291
+ # "hash_code": ...,
292
+ # "prompt": ...,
293
+ # }
294
+ metadata = []
295
+ with open(metadata_path, "r") as f:
296
+ for line in f:
297
+ metadata.append( json.loads(line) )
298
+
299
+ metadata = random.sample(metadata, self.args.max_scene)
300
+
301
+ prompts = [x["prompt"] for x in metadata]
302
+ if 'curated' in str(data_root).lower():
303
+ prompt_embeddings = [data_root / "prompt_embeddings" / (x["hash_code"] + '.safetensors') for x in metadata]  # local list; consumed by the zip() below
304
+ else:
305
+ prompt_embeddings = [data_root / "prompt_embeddings_revised" / (x["hash_code"] + '.safetensors') for x in metadata]
306
+ videos = [data_root / "video_latent" / "x".join(str(x) for x in self.args.train_resolution) / (x["hash_code"] + '.safetensors') for x in metadata]
307
+ images = [data_root / "first_frames" / (x["hash_code"] + '.png') for x in metadata]
308
+ flows = [data_root / "flow_direct_f_latent" / (x["hash_code"] + '.safetensors') for x in metadata]
309
+
310
+ # load prompt embedding
311
+ validation_prompts = []
312
+ validation_prompt_embeddings = []
313
+ validation_video_latents = []
314
+ validation_images = []
315
+ validation_flow_latents = []
316
+ for prompt, prompt_embedding, video_latent, image, flow_latent in zip(prompts, prompt_embeddings, videos, images, flows):
317
+ validation_prompts.append(prompt)
318
+ validation_prompt_embeddings.append(load_file(prompt_embedding)["prompt_embedding"].unsqueeze(0))
319
+ validation_video_latents.append(load_file(video_latent)["encoded_video"].unsqueeze(0))
320
+ validation_flow_latents.append(load_file(flow_latent)["encoded_flow_f"].unsqueeze(0))
321
+ # validation_images.append(preprocess_image_with_resize(image, self.args.train_resolution[1], self.args.train_resolution[2]))
322
+ validation_images.append(image)
323
+
324
+
325
+ validation_videos = [None] * len(validation_prompts)
326
+
327
+
328
+ self.state.validation_prompts = validation_prompts
329
+ self.state.validation_prompt_embeddings = validation_prompt_embeddings
330
+ self.state.validation_images = validation_images
331
+ self.state.validation_videos = validation_videos
332
+ self.state.validation_video_latents = validation_video_latents
333
+ self.state.validation_flow_latents = validation_flow_latents
334
+
335
+ # Debug..
336
+ self.validate(0)
337
+
338
+
339
+ @override
340
+ def validation_step(
341
+ self, eval_data: Dict[str, Any], pipe: FloVDOMSMCogVideoXImageToVideoPipeline
342
+ ) -> List[Tuple[str, Image.Image | List[Image.Image]]]:
343
+ """
344
+ Return the data that needs to be saved. For videos, the data format is List[PIL],
345
+ and for images, the data format is PIL
346
+ """
347
+
348
+ prompt_embedding, image = eval_data["prompt_embedding"], eval_data["image"]
349
+
350
+ flow_latent_generate = pipe(
351
+ num_frames=self.state.train_frames,
352
+ height=self.state.train_height,
353
+ width=self.state.train_width,
354
+ prompt=None,
355
+ prompt_embeds=prompt_embedding,
356
+ image=image,
357
+ generator=self.state.generator,
358
+ num_inference_steps=50,
359
+ output_type='latent'
360
+ ).frames[0]
361
+
362
+ flow_generate = decode_flow(flow_latent_generate.unsqueeze(0).to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36]) # BF,C,H,W
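+ # flow_scale_factor=[60, 36] has to match the factors used when the flow latents were encoded,
+ # since decode_flow inverts the same adaptive normalization.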
363
+
364
+ return [("synthesized_flow", flow_generate)]
365
+
366
+
367
+ @override
368
+ def validate(self, step: int) -> None:
369
+ #TODO. Fix the codes!!!!
370
+ logger.info("Starting validation")
371
+
372
+ accelerator = self.accelerator
373
+ num_validation_samples = len(self.state.validation_prompts)
374
+
375
+ if num_validation_samples == 0:
376
+ logger.warning("No validation samples found. Skipping validation.")
377
+ return
378
+
379
+ self.components.transformer.eval()
380
+ torch.set_grad_enabled(False)
381
+
382
+ memory_statistics = get_memory_statistics()
383
+ logger.info(f"Memory before validation start: {json.dumps(memory_statistics, indent=4)}")
384
+
385
+ ##### Initialize pipeline #####
386
+ pipe = self.initialize_pipeline()
387
+ camera_flow_generator = self.initialize_flow_generator().to(device=self.accelerator.device, dtype=self.state.weight_dtype)
388
+
389
+ if self.state.using_deepspeed:
390
+ # Can't use model_cpu_offload in deepspeed,
391
+ # so we need to move all components in pipe to device
392
+ # pipe.to(self.accelerator.device, dtype=self.state.weight_dtype)
393
+ self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=["transformer"])
394
+ else:
395
+ # if not using deepspeed, use model_cpu_offload to further reduce memory usage
396
+ # Or use pipe.enable_sequential_cpu_offload() to further reduce memory usage
397
+ pipe.enable_model_cpu_offload(device=self.accelerator.device)
398
+
399
+ # Convert all model weights to training dtype
400
+ # Note, this will change LoRA weights in self.components.transformer to training dtype, rather than keep them in fp32
401
+ pipe = pipe.to(dtype=self.state.weight_dtype)
402
+
403
+ #################################
404
+ all_processes_artifacts = []
405
+ for i in range(num_validation_samples):
406
+ if self.state.using_deepspeed and self.accelerator.deepspeed_plugin.zero_stage != 3:
407
+ # Skip current validation on all processes but one
408
+ if i % accelerator.num_processes != accelerator.process_index:
409
+ continue
410
+
411
+ prompt = self.state.validation_prompts[i]
412
+ image = self.state.validation_images[i]
413
+ video = self.state.validation_videos[i]
414
+ video_latent = self.state.validation_video_latents[i].permute(0,2,1,3,4) # [B,F,C,H,W] (e.g., [B, 13, 16, 60, 90])
415
+ prompt_embedding = self.state.validation_prompt_embeddings[i]
416
+ flow_latent = self.state.validation_flow_latents[i].permute(0,2,1,3,4) # [B,F,C,H,W] (e.g., [B, 13, 16, 60, 90])
417
+
418
+
419
+ if image is not None:
420
+ image = preprocess_image_with_resize(image, self.state.train_height, self.state.train_width)
421
+ image_torch = image.detach().clone()
422
+ # Convert image tensor (C, H, W) to PIL images
423
+ image = image.to(torch.uint8)
424
+ image = image.permute(1, 2, 0).cpu().numpy()
425
+ image = Image.fromarray(image)
426
+
427
+ if video is not None:
428
+ video = preprocess_video_with_resize(
429
+ video, self.state.train_frames, self.state.train_height, self.state.train_width
430
+ )
431
+ # Convert video tensor (F, C, H, W) to list of PIL images
432
+ video = video.round().clamp(0, 255).to(torch.uint8)
433
+ video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video]
434
+ else:
435
+ with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype):
436
+ try:
437
+ video_decoded = decode_latents(video_latent.to(self.accelerator.device), self.components.vae)
438
+ except:
439
+ pass
440
+ video_decoded = decode_latents(video_latent.to(self.accelerator.device), self.components.vae)
441
+ video = ((video_decoded + 1.) / 2. * 255.)[0].permute(1,0,2,3).float().clip(0., 255.).to(torch.uint8)
442
+ video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video]
443
+
444
+ with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype):
445
+ try:
446
+ flow_decoded = decode_flow(flow_latent.to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36])
447
+ except:
448
+ pass
449
+ flow_decoded = decode_flow(flow_latent.to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36]) # (BF)CHW (C=2)
450
+
451
+
452
+ logger.debug(
453
+ f"Validating sample {i + 1}/{num_validation_samples} on process {accelerator.process_index}. Prompt: {prompt}",
454
+ main_process_only=False,
455
+ )
456
+ # validation_artifacts = self.validation_step({"prompt": prompt, "image": image, "video": video}, pipe)
457
+ validation_artifacts = self.validation_step({"prompt_embedding": prompt_embedding, "image": image}, pipe)
458
+
459
+ if (
460
+ self.state.using_deepspeed
461
+ and self.accelerator.deepspeed_plugin.zero_stage == 3
462
+ and not accelerator.is_main_process
463
+ ):
464
+ continue
465
+
466
+ prompt_filename = string_to_filename(prompt)[:25]
467
+ # Calculate hash of reversed prompt as a unique identifier
468
+ reversed_prompt = prompt[::-1]
469
+ hash_suffix = hashlib.md5(reversed_prompt.encode()).hexdigest()[:5]
470
+
471
+ artifacts = {
472
+ "image": {"type": "image", "value": image},
473
+ "video": {"type": "video", "value": video},
474
+ }
475
+ for i, (artifact_type, artifact_value) in enumerate(validation_artifacts):
476
+ artifacts.update({f"artifact_{i}": {"type": artifact_type, "value": artifact_value}})
477
+
478
+ # Log flow
479
+ artifacts.update({f"artifact_flow_{i}": {"type": 'flow', "value": flow_decoded}})
480
+
481
+ # Log flow_warped_frames
482
+ image_tensor = repeat(rearrange(torch.tensor(np.array(image)).to(flow_decoded.device, torch.float), 'h w c -> 1 c h w'), 'b c h w -> (b f) c h w', f=flow_decoded.size(0)) # scale~(0,255) (BF) C H W
483
+ warped_video = forward_bilinear_splatting(image_tensor, flow_decoded.to(torch.float)) # if we have an occlusion mask from dataset, we can use it.
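+ # Splatting the first frame with the decoded ground-truth flow is a visual sanity check that
+ # the flow latents decode into a usable motion field.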
484
+ frame_list = []
485
+ for frame in warped_video:
486
+ frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).clip(0, 255).astype(np.uint8)  # clip before the uint8 cast to avoid wrap-around
487
+ frame_list.append(Image.fromarray(frame))
488
+
489
+ artifacts.update({f"artifact_warped_video_{i}": {"type": 'warped_video', "value": frame_list}})
490
+
491
+ # Log synthesized_flow_wraped_frames
492
+ # artifact_value: synthesized optical flow
493
+ warped_video2 = forward_bilinear_splatting(image_tensor, artifact_value.to(torch.float)) # if we have an occlusion mask from dataset, we can use it. For OMSM, do not use.
494
+ frame_list2 = []
495
+ for frame in warped_video2:
496
+ frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).clip(0, 255).astype(np.uint8)
497
+ frame_list2.append(Image.fromarray(frame))
498
+
499
+ artifacts.update({f"artifact_synthesized_flow_warped_video_{i}": {"type": 'synthesized_flow_warped_video', "value": frame_list2}})
500
+
501
+
502
+ logger.debug(
503
+ f"Validation artifacts on process {accelerator.process_index}: {list(artifacts.keys())}",
504
+ main_process_only=False,
505
+ )
506
+
507
+ for key, value in list(artifacts.items()):
508
+ artifact_type = value["type"]
509
+ artifact_value = value["value"]
510
+ if artifact_type not in ["image", "video", "flow", "warped_video", "synthesized_flow", "synthesized_flow_warped_video"] or artifact_value is None:
511
+ continue
512
+
513
+ extension = "png" if artifact_type == "image" else "mp4"
514
+ if artifact_type == "warped_video" or artifact_type == "synthesized_flow_warped_video":
515
+ filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}_{artifact_type}.{extension}"
516
+ elif artifact_type == "synthesized_flow":
517
+ filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}_synthesized_flow.{extension}"
518
+ elif artifact_type == "flow":
519
+ filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}_original_flow.{extension}"
520
+ else:
521
+ filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}.{extension}"
522
+ validation_path = self.args.output_dir / "validation_res"
523
+ validation_path.mkdir(parents=True, exist_ok=True)
524
+ filename = str(validation_path / filename)
525
+
526
+ if artifact_type == "image":
527
+ logger.debug(f"Saving image to {filename}")
528
+ artifact_value.save(filename)
529
+ artifact_value = wandb.Image(filename)
530
+ elif artifact_type == "video" or artifact_type == "warped_video" or artifact_type == "synthesized_flow_warped_video":
531
+ logger.debug(f"Saving video to {filename}")
532
+ export_to_video(artifact_value, filename, fps=self.args.gen_fps)
533
+ artifact_value = wandb.Video(filename, caption=f"[{artifact_type}]--{prompt}")
534
+ elif artifact_type == "synthesized_flow" or artifact_type == "flow":
535
+ # TODO. RGB Visualization of optical flow. (F,2,H,W)
536
+ artifact_value_RGB = flow_to_color(artifact_value) # BF,C,H,W (B=1)
537
+
538
+ frame_list = []
539
+ for frame in artifact_value_RGB:
540
+ frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).clip(0, 255).astype(np.uint8)
541
+ frame_list.append(Image.fromarray(frame))
542
+
543
+ logger.debug(f"Saving video to {filename}")
544
+ export_to_video(frame_list, filename, fps=self.args.gen_fps)
545
+ artifact_value = wandb.Video(filename, caption=f"[{artifact_type}]--{prompt}")
546
+
547
+ all_processes_artifacts.append(artifact_value)
548
+
549
+ all_artifacts = gather_object(all_processes_artifacts)
550
+
551
+ if accelerator.is_main_process:
552
+ tracker_key = "validation"
553
+ for tracker in accelerator.trackers:
554
+ if tracker.name == "wandb":
555
+ image_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Image)]
556
+ video_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Video)]
557
+ tracker.log(
558
+ {
559
+ tracker_key: {f"images": image_artifacts, f"videos": video_artifacts},
560
+ },
561
+ step=step,
562
+ )
563
+
564
+ ########## Clean up ##########
565
+ if self.state.using_deepspeed:
566
+ del pipe
567
+ # Unload models except those needed for training
568
+ self.__move_components_to_cpu(unload_list=self.UNLOAD_LIST)
569
+ else:
570
+ pipe.remove_all_hooks()
571
+ del pipe
572
+ # Load models except those not needed for training
573
+ self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=self.UNLOAD_LIST)
574
+ self.components.transformer.to(self.accelerator.device, dtype=self.state.weight_dtype)
575
+
576
+ # Change trainable weights back to fp32 to keep with dtype after prepare the model
577
+ cast_training_params([self.components.transformer], dtype=torch.float32)
578
+
579
+ del camera_flow_generator
580
+
581
+ free_memory()
582
+ accelerator.wait_for_everyone()
583
+ ################################
584
+
585
+ memory_statistics = get_memory_statistics()
586
+ logger.info(f"Memory after validation end: {json.dumps(memory_statistics, indent=4)}")
587
+ torch.cuda.reset_peak_memory_stats(accelerator.device)
588
+
589
+ torch.set_grad_enabled(True)
590
+ self.components.transformer.train()
591
+
592
+
593
+ # mangling
594
+ def __move_components_to_device(self, dtype, ignore_list: List[str] = []):
595
+ ignore_list = set(ignore_list)
596
+ components = self.components.model_dump()
597
+ for name, component in components.items():
598
+ if not isinstance(component, type) and hasattr(component, "to"):
599
+ if name not in ignore_list:
600
+ setattr(self.components, name, component.to(self.accelerator.device, dtype=dtype))
601
+
602
+ # mangling
603
+ def __move_components_to_cpu(self, unload_list: List[str] = []):
604
+ unload_list = set(unload_list)
605
+ components = self.components.model_dump()
606
+ for name, component in components.items():
607
+ if not isinstance(component, type) and hasattr(component, "to"):
608
+ if name in unload_list:
609
+ setattr(self.components, name, component.to("cpu"))
610
+
611
+
612
+ register("cogvideox-flovd-omsm", "lora", FloVDOMSMCogVideoXI2VLoraTrainer)
613
+
614
+
615
+ #--------------------------------------------------------------------------------------------------
616
+ # Extract function
617
+ def encode_text(prompt: str, components, device) -> torch.Tensor:
618
+ prompt_token_ids = components.tokenizer(
619
+ prompt,
620
+ padding="max_length",
621
+ max_length=components.transformer.config.max_text_seq_length,
622
+ truncation=True,
623
+ add_special_tokens=True,
624
+ return_tensors="pt",
625
+ )
626
+ prompt_token_ids = prompt_token_ids.input_ids
627
+ prompt_embedding = components.text_encoder(prompt_token_ids.to(device))[0]
628
+ return prompt_embedding
629
+
630
+ def encode_video(video: torch.Tensor, vae) -> torch.Tensor:
631
+ # shape of input video: [B, C, F, H, W]
632
+ video = video.to(vae.device, dtype=vae.dtype)
633
+ latent_dist = vae.encode(video).latent_dist
634
+ latent = latent_dist.sample() * vae.config.scaling_factor
635
+ return latent
636
+
637
+ def decode_latents(latents: torch.Tensor, vae) -> torch.Tensor:
638
+ latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
639
+ latents = 1 / vae.config.scaling_factor * latents
640
+
641
+ frames = vae.decode(latents).sample
642
+ return frames
643
+
644
+ def compute_optical_flow(raft, ctxt, trgt, raft_iter=20, chunk=2, only_forward=True):
645
+ num_frames = ctxt.shape[0]
646
+ chunk_size = (num_frames // chunk) + 1
647
+
648
+ flow_f_list = []
649
+ if not only_forward:
650
+ flow_b_list = []
651
+ for i in range(chunk):
652
+ start = chunk_size * i
653
+ end = chunk_size * (i+1)
654
+
655
+ with torch.no_grad():
656
+ flow_f = raft(ctxt[start:end], trgt[start:end], num_flow_updates=raft_iter)[-1]
657
+ if not only_forward:
658
+ flow_b = raft(trgt[start:end], ctxt[start:end], num_flow_updates=raft_iter)[-1]
659
+
660
+ flow_f_list.append(flow_f)
661
+ if not only_forward:
662
+ flow_b_list.append(flow_b)
663
+
664
+ flow_f = torch.cat(flow_f_list)
665
+ if not only_forward:
666
+ flow_b = torch.cat(flow_b_list)
667
+
668
+ if not only_forward:
669
+ return flow_f, flow_b
670
+ else:
671
+ return flow_f, None
672
+
673
+ def encode_flow(flow, vae, flow_scale_factor):
674
+ # flow: BF,C,H,W
675
+ # flow_scale_factor [sf_x, sf_y]
676
+ assert flow.ndim == 4
677
+ num_frames, _, height, width = flow.shape
678
+
679
+ # Normalize optical flow
680
+ # ndim: 4 -> 5
681
+ flow = rearrange(flow, '(b f) c h w -> b f c h w', b=1)
682
+ flow_norm = adaptive_normalize(flow, flow_scale_factor[0], flow_scale_factor[1])
683
+
684
+ # ndim: 5 -> 4
685
+ flow_norm = rearrange(flow_norm, 'b f c h w -> (b f) c h w', b=1)
686
+
687
+ # Duplicate mean value for third channel
688
+ num_frames, _, H, W = flow_norm.shape
689
+ flow_norm_extended = torch.empty((num_frames, 3, height, width)).to(flow_norm)
690
+ flow_norm_extended[:,:2] = flow_norm
691
+ flow_norm_extended[:,-1:] = flow_norm.mean(dim=1, keepdim=True)
692
+ flow_norm_extended = rearrange(flow_norm_extended, '(b f) c h w -> b c f h w', f=num_frames)
693
+
694
+ return encode_video(flow_norm_extended, vae)
695
+
696
+ def decode_flow(flow_latent, vae, flow_scale_factor):
697
+ flow_latent = flow_latent.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
698
+ flow_latent = 1 / vae.config.scaling_factor * flow_latent
699
+
700
+ flow = vae.decode(flow_latent).sample # BCFHW
701
+
702
+ # discard third channel (which is a mean value of f_x and f_y)
703
+ flow = flow[:,:2].detach().clone()
704
+
705
+ # Unnormalize optical flow
706
+ flow = rearrange(flow, 'b c f h w -> b f c h w')
707
+ flow = adaptive_unnormalize(flow, flow_scale_factor[0], flow_scale_factor[1])
708
+
709
+ flow = rearrange(flow, 'b f c h w -> (b f) c h w')
710
+ return flow # BF,C,H,W
711
+
712
+ def adaptive_normalize(flow, sf_x, sf_y):
713
+ # x: BFCHW, optical flow
714
+ assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)'
715
+ assert sf_x is not None and sf_y is not None
716
+ b, f, c, h, w = flow.shape
717
+
718
+ max_clip_x = math.sqrt(w/sf_x) * 1.0
719
+ max_clip_y = math.sqrt(h/sf_y) * 1.0
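+ # After the signed-sqrt compression below, |flow_x| <= w maps to at most sqrt(w/sf_x)
+ # (likewise for y), so the clamp only trims extreme outliers.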
720
+
721
+ flow_norm = flow.detach().clone()
722
+ flow_x = flow[:, :, 0].detach().clone()
723
+ flow_y = flow[:, :, 1].detach().clone()
724
+
725
+ flow_x_norm = torch.sign(flow_x) * torch.sqrt(torch.abs(flow_x)/sf_x + 1e-7)
726
+ flow_y_norm = torch.sign(flow_y) * torch.sqrt(torch.abs(flow_y)/sf_y + 1e-7)
727
+
728
+ flow_norm[:, :, 0] = torch.clamp(flow_x_norm, min=-max_clip_x, max=max_clip_x)
729
+ flow_norm[:, :, 1] = torch.clamp(flow_y_norm, min=-max_clip_y, max=max_clip_y)
730
+
731
+ return flow_norm
732
+
733
+
734
+ def adaptive_unnormalize(flow, sf_x, sf_y):
735
+ # x: BFCHW, optical flow
736
+ assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)'
737
+ assert sf_x is not None and sf_y is not None
738
+
739
+ flow_orig = flow.detach().clone()
740
+ flow_x = flow[:, :, 0].detach().clone()
741
+ flow_y = flow[:, :, 1].detach().clone()
742
+
743
+ flow_orig[:, :, 0] = torch.sign(flow_x) * sf_x * (flow_x**2 - 1e-7)
744
+ flow_orig[:, :, 1] = torch.sign(flow_y) * sf_y * (flow_y**2 - 1e-7)
745
+
746
+ return flow_orig
747
+
748
+ #--------------------------------------------------------------------------------------------------
finetune/models/cogvideox_i2v/flovd_controlnet_trainer.py ADDED
@@ -0,0 +1,814 @@
1
+ from typing import Any, Dict, List, Tuple
2
+ from pathlib import Path
3
+ import os
4
+ import hashlib
5
+ import json
6
+ import random
7
+ import wandb
8
+ import math
9
+ import numpy as np
10
+ from einops import rearrange, repeat
11
+ from safetensors.torch import load_file, save_file
12
+ from accelerate.logging import get_logger
13
+
14
+ import torch
15
+
16
+ from accelerate.utils import gather_object
17
+
18
+ from diffusers import (
19
+ AutoencoderKLCogVideoX,
20
+ CogVideoXDPMScheduler,
21
+ CogVideoXImageToVideoPipeline,
22
+ CogVideoXTransformer3DModel,
23
+ )
24
+ from diffusers.utils.export_utils import export_to_video
25
+
26
+ from finetune.pipeline.flovd_FVSM_cogvideox_controlnet_pipeline import FloVDCogVideoXControlnetImageToVideoPipeline
27
+ from finetune.constants import LOG_LEVEL, LOG_NAME
28
+
29
+ from diffusers.models.embeddings import get_3d_rotary_pos_embed
30
+ from PIL import Image
31
+ from numpy import dtype
32
+ from transformers import AutoTokenizer, T5EncoderModel
33
+ from typing_extensions import override
34
+
35
+ from finetune.schemas import Args, Components, State
36
+ from finetune.trainer import Trainer
37
+ from finetune.utils import (
38
+ cast_training_params,
39
+ free_memory,
40
+ get_memory_statistics,
41
+ string_to_filename,
42
+ unwrap_model,
43
+ )
44
+ from finetune.datasets.utils import (
45
+ preprocess_image_with_resize,
46
+ load_binary_mask_compressed,
47
+ )
48
+
49
+ from finetune.modules.cogvideox_controlnet import CogVideoXControlnet
50
+ from finetune.modules.cogvideox_custom_model import CustomCogVideoXTransformer3DModel
51
+ from finetune.modules.camera_sampler import SampleManualCam
52
+ from finetune.modules.camera_flow_generator import CameraFlowGenerator
53
+ from finetune.modules.utils import get_camera_flow_generator_input, forward_bilinear_splatting
54
+
55
+ from ..utils import register
56
+
57
+ import sys
58
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
59
+
60
+ import pdb
61
+
62
+ logger = get_logger(LOG_NAME, LOG_LEVEL)
63
+
64
+ class FloVDCogVideoXI2VControlnetTrainer(Trainer):
65
+ UNLOAD_LIST = ["text_encoder"]
66
+
67
+ @override
68
+ def __init__(self, args: Args) -> None:
69
+ super().__init__(args)
70
+
71
+ # For validation
72
+ self.CameraSampler = SampleManualCam()
73
+
74
+
75
+
76
+ @override
77
+ def load_components(self) -> Dict[str, Any]:
78
+ # TODO. Change the pipeline and ...
79
+ components = Components()
80
+ model_path = str(self.args.model_path)
81
+
82
+ components.pipeline_cls = FloVDCogVideoXControlnetImageToVideoPipeline
83
+
84
+ components.tokenizer = AutoTokenizer.from_pretrained(model_path, subfolder="tokenizer")
85
+
86
+ components.text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder")
87
+
88
+ # components.transformer = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer")
89
+
90
+ components.transformer = CustomCogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer")
91
+
92
+ additional_kwargs = {
93
+ 'num_layers': self.args.controlnet_transformer_num_layers,
94
+ 'out_proj_dim_factor': self.args.controlnet_out_proj_dim_factor,
95
+ 'out_proj_dim_zero_init': self.args.controlnet_out_proj_zero_init,
96
+ 'notextinflow': self.args.notextinflow,
97
+ }
98
+ components.controlnet = CogVideoXControlnet.from_pretrained(model_path, subfolder="transformer", **additional_kwargs)
99
+
100
+ components.vae = AutoencoderKLCogVideoX.from_pretrained(model_path, subfolder="vae")
101
+
102
+ components.scheduler = CogVideoXDPMScheduler.from_pretrained(model_path, subfolder="scheduler")
103
+
104
+ return components
105
+
106
+
107
+ @override
108
+ def initialize_pipeline(self) -> FloVDCogVideoXControlnetImageToVideoPipeline:
109
+ # TODO. Change the pipeline and ...
110
+ pipe = FloVDCogVideoXControlnetImageToVideoPipeline(
111
+ tokenizer=self.components.tokenizer,
112
+ text_encoder=unwrap_model(self.accelerator, self.components.text_encoder),
113
+ vae=unwrap_model(self.accelerator, self.components.vae),
114
+ transformer=unwrap_model(self.accelerator, self.components.transformer),
115
+ controlnet=unwrap_model(self.accelerator, self.components.controlnet),
116
+ scheduler=self.components.scheduler,
117
+ )
118
+ return pipe
119
+
120
+ def initialize_flow_generator(self, ckpt_path):
121
+ depth_estimator_kwargs = {
122
+ "target": 'modules.depth_warping.depth_warping.DepthWarping_wrapper',
123
+ "kwargs": {
124
+ "ckpt_path": ckpt_path,
125
+ "model_config": {
126
+ "max_depth": 20,
127
+ "encoder": 'vitb',
128
+ "features": 128,
129
+ "out_channels": [96, 192, 384, 768],
130
+ }
131
+
132
+ }
133
+ }
134
+
135
+ return CameraFlowGenerator(depth_estimator_kwargs)
136
+
137
+ @override
138
+ def collate_fn(self, samples: List[Dict[str, Any]]) -> Dict[str, Any]:
139
+ ret = {"encoded_videos": [], "prompt_embedding": [], "images": [], "encoded_flow": []}
140
+
141
+ for sample in samples:
142
+ encoded_video = sample["encoded_video"]
143
+ prompt_embedding = sample["prompt_embedding"]
144
+ image = sample["image"]
145
+ encoded_flow = sample["encoded_flow"]
146
+
147
+ ret["encoded_videos"].append(encoded_video)
148
+ ret["prompt_embedding"].append(prompt_embedding)
149
+ ret["images"].append(image)
150
+ ret["encoded_flow"].append(encoded_flow)
151
+
152
+
153
+ ret["encoded_videos"] = torch.stack(ret["encoded_videos"])
154
+ ret["prompt_embedding"] = torch.stack(ret["prompt_embedding"])
155
+ ret["images"] = torch.stack(ret["images"])
156
+ ret["encoded_flow"] = torch.stack(ret["encoded_flow"])
157
+
158
+ return ret
159
+
160
+
161
+ @override
162
+ def compute_loss(self, batch) -> torch.Tensor:
163
+ prompt_embedding = batch["prompt_embedding"]
164
+ latent = batch["encoded_videos"]
165
+ images = batch["images"]
166
+ latent_flow = batch["encoded_flow"]
167
+
168
+ # Shape of prompt_embedding: [B, seq_len, hidden_size]
169
+ # Shape of latent: [B, C, F, H, W]
170
+ # Shape of images: [B, C, H, W]
171
+ # Shape of latent_flow: [B, C, F, H, W]
172
+
173
+ patch_size_t = self.state.transformer_config.patch_size_t # WJ: None in i2v setting...
174
+ if patch_size_t is not None:
175
+ ncopy = latent.shape[2] % patch_size_t
176
+ # Copy the first frame ncopy times to match patch_size_t
177
+ first_frame = latent[:, :, :1, :, :] # Get first frame [B, C, 1, H, W]
178
+ latent = torch.cat([first_frame.repeat(1, 1, ncopy, 1, 1), latent], dim=2)
179
+ assert latent.shape[2] % patch_size_t == 0
180
+
181
+ batch_size, num_channels, num_frames, height, width = latent.shape
182
+
183
+ # Get prompt embeddings
184
+ _, seq_len, _ = prompt_embedding.shape
185
+ prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1).to(dtype=latent.dtype)
186
+
187
+ # Add frame dimension to images [B,C,H,W] -> [B,C,F,H,W]
188
+ images = images.unsqueeze(2)
189
+ # Add noise to images
190
+ image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=self.accelerator.device)
191
+ image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=images.dtype)
192
+ noisy_images = images + torch.randn_like(images) * image_noise_sigma[:, None, None, None, None]
193
+ image_latent_dist = self.components.vae.encode(noisy_images.to(dtype=self.components.vae.dtype)).latent_dist
194
+ image_latents = image_latent_dist.sample() * self.components.vae.config.scaling_factor
195
+
196
+ """
197
+ Modify below
198
+ """
199
+ # Sample a random timestep for each sample
200
+ # timesteps = torch.randint(
201
+ # 0, self.components.scheduler.config.num_train_timesteps, (batch_size,), device=self.accelerator.device
202
+ # )
203
+ if self.args.enable_time_sampling:
204
+ if self.args.time_sampling_type == "truncated_normal":
205
+ time_sampling_dict = {
206
+ 'mean': self.args.time_sampling_mean,
207
+ 'std': self.args.time_sampling_std,
208
+ 'a': 1 - self.args.controlnet_guidance_end,
209
+ 'b': 1 - self.args.controlnet_guidance_start,
210
+ }
211
+ timesteps = torch.nn.init.trunc_normal_(
212
+ torch.empty(batch_size, device=latent.device), **time_sampling_dict
213
+ ) * self.components.scheduler.config.num_train_timesteps
214
+ elif self.args.time_sampling_type == "truncated_uniform":
215
+ timesteps = torch.randint(
216
+ int((1- self.args.controlnet_guidance_end) * self.components.scheduler.config.num_train_timesteps),
217
+ int((1 - self.args.controlnet_guidance_start) * self.components.scheduler.config.num_train_timesteps),
218
+ (batch_size,), device=latent.device
219
+ )
220
+ else:
221
+ timesteps = torch.randint(
222
+ 0, self.components.scheduler.config.num_train_timesteps, (batch_size,), device=self.accelerator.device
223
+ )
224
+ timesteps = timesteps.long()
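+ # With enable_time_sampling, training timesteps are restricted to the part of the schedule where
+ # the controlnet is active at inference ([1 - guidance_end, 1 - guidance_start] of the range),
+ # drawn from a truncated normal or uniformly; otherwise the full schedule is sampled.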
225
+
226
+ # from [B, C, F, H, W] to [B, F, C, H, W]
227
+ latent = latent.permute(0, 2, 1, 3, 4)
228
+ latent_flow = latent_flow.permute(0, 2, 1, 3, 4)
229
+ image_latents = image_latents.permute(0, 2, 1, 3, 4)
230
+ assert (latent.shape[0], *latent.shape[2:]) == (image_latents.shape[0], *image_latents.shape[2:]) == (latent_flow.shape[0], *latent_flow.shape[2:])
231
+
232
+ # Padding image_latents to the same frame number as latent
233
+ padding_shape = (latent.shape[0], latent.shape[1] - 1, *latent.shape[2:])
234
+ latent_padding = image_latents.new_zeros(padding_shape)
235
+ image_latents = torch.cat([image_latents, latent_padding], dim=1)
236
+
237
+ # Add noise to latent
238
+ noise = torch.randn_like(latent)
239
+ latent_noisy = self.components.scheduler.add_noise(latent, noise, timesteps)
240
+
241
+
242
+ # Concatenate latent and image_latents in the channel dimension
243
+ # latent_img_flow_noisy = torch.cat([latent_noisy, image_latents, latent_flow], dim=2)
244
+ latent_img_noisy = torch.cat([latent_noisy, image_latents], dim=2)
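+ # Note: the flow latent is not concatenated into the transformer input here (see the commented
+ # alternative above); it conditions generation only through the controlnet branch.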
245
+
246
+ # Prepare rotary embeds
247
+ vae_scale_factor_spatial = 2 ** (len(self.components.vae.config.block_out_channels) - 1)
248
+ transformer_config = self.state.transformer_config
249
+ rotary_emb = (
250
+ self.prepare_rotary_positional_embeddings(
251
+ height=height * vae_scale_factor_spatial,
252
+ width=width * vae_scale_factor_spatial,
253
+ num_frames=num_frames,
254
+ transformer_config=transformer_config,
255
+ vae_scale_factor_spatial=vae_scale_factor_spatial,
256
+ device=self.accelerator.device,
257
+ )
258
+ if transformer_config.use_rotary_positional_embeddings
259
+ else None
260
+ )
261
+
262
+ # Predict noise, For CogVideoX1.5 Only.
263
+ ofs_emb = (
264
+ None if self.state.transformer_config.ofs_embed_dim is None else latent.new_full((1,), fill_value=2.0)
265
+ )
266
+
267
+ # Controlnet feedforward
268
+ controlnet_states = self.components.controlnet(
269
+ hidden_states=latent_noisy,
270
+ encoder_hidden_states=prompt_embedding,
271
+ image_rotary_emb=rotary_emb,
272
+ controlnet_hidden_states=latent_flow,
273
+ timestep=timesteps,
274
+ return_dict=False,
275
+ )[0]
276
+ if isinstance(controlnet_states, (tuple, list)):
277
+ controlnet_states = [x.to(dtype=self.state.weight_dtype) for x in controlnet_states]
278
+ else:
279
+ controlnet_states = controlnet_states.to(dtype=self.state.weight_dtype)
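+ # The controlnet sees the noisy video latent together with the flow latent and returns
+ # intermediate features that the custom transformer injects below, scaled by controlnet_weights.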
280
+
281
+
282
+ # Transformer feedforward
283
+ predicted_noise = self.components.transformer(
284
+ hidden_states=latent_img_noisy,
285
+ encoder_hidden_states=prompt_embedding,
286
+ controlnet_states=controlnet_states,
287
+ controlnet_weights=self.args.controlnet_weights,
288
+ timestep=timesteps,
289
+ # ofs=ofs_emb,
290
+ image_rotary_emb=rotary_emb,
291
+ return_dict=False,
292
+ )[0]
293
+
294
+
295
+ # Denoise
296
+ latent_pred = self.components.scheduler.get_velocity(predicted_noise, latent_noisy, timesteps)
297
+
298
+ alphas_cumprod = self.components.scheduler.alphas_cumprod[timesteps]
299
+ weights = 1 / (1 - alphas_cumprod)
300
+ while len(weights.shape) < len(latent_pred.shape):
301
+ weights = weights.unsqueeze(-1)
302
+
303
+ loss = torch.mean((weights * (latent_pred - latent) ** 2).reshape(batch_size, -1), dim=1)
304
+ loss = loss.mean()
305
+
306
+ return loss
307
+
308
+ def prepare_rotary_positional_embeddings(
309
+ self,
310
+ height: int,
311
+ width: int,
312
+ num_frames: int,
313
+ transformer_config: Dict,
314
+ vae_scale_factor_spatial: int,
315
+ device: torch.device,
316
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
317
+ grid_height = height // (vae_scale_factor_spatial * transformer_config.patch_size)
318
+ grid_width = width // (vae_scale_factor_spatial * transformer_config.patch_size)
319
+
320
+ if transformer_config.patch_size_t is None:
321
+ base_num_frames = num_frames
322
+ else:
323
+ base_num_frames = (num_frames + transformer_config.patch_size_t - 1) // transformer_config.patch_size_t
324
+
325
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
326
+ embed_dim=transformer_config.attention_head_dim,
327
+ crops_coords=None,
328
+ grid_size=(grid_height, grid_width),
329
+ temporal_size=base_num_frames,
330
+ grid_type="slice",
331
+ max_size=(grid_height, grid_width),
332
+ device=device,
333
+ )
334
+
335
+ return freqs_cos, freqs_sin
336
+
337
+ # Validation
338
+
339
+ @override
340
+ def prepare_for_validation(self):
341
+ # Load from dataset?
342
+ # Data_root
343
+ # - metadata.jsonl
344
+ # - video_latent / args.resolution /
345
+ # - prompt_embeddings /
346
+ # - first_frames /
347
+ # - flow_direct_f_latent /
348
+
349
+ data_root = self.args.data_root
350
+ metadata_path = data_root / "metadata_revised.jsonl"
351
+ assert metadata_path.is_file(), "For this dataset type, you need metadata.jsonl or metadata_revised.jsonl in the root path"
352
+
353
+ # Load metadata
354
+ # metadata = {
355
+ # "video_path": ...,
356
+ # "hash_code": ...,
357
+ # "prompt": ...,
358
+ # }
359
+ metadata = []
360
+ with open(metadata_path, "r") as f:
361
+ for line in f:
362
+ metadata.append( json.loads(line) )
363
+
364
+ metadata = random.sample(metadata, self.args.max_scene)
365
+
366
+ prompts = [x["prompt"] for x in metadata]
367
+ prompt_embeddings = [data_root / "prompt_embeddings_revised" / (x["hash_code"] + '.safetensors') for x in metadata]
368
+ videos = [data_root / "video_latent" / "x".join(str(x) for x in self.args.train_resolution) / (x["hash_code"] + '.safetensors') for x in metadata]
369
+ images = [data_root / "first_frames" / (x["hash_code"] + '.png') for x in metadata]
370
+ flows = [data_root / "flow_direct_f_latent" / (x["hash_code"] + '.safetensors') for x in metadata]
371
+
372
+ # load prompt embedding
373
+ validation_prompts = []
374
+ validation_prompt_embeddings = []
375
+ validation_video_latents = []
376
+ validation_images = []
377
+ validation_flow_latents = []
378
+ for prompt, prompt_embedding, video_latent, image, flow_latent in zip(prompts, prompt_embeddings, videos, images, flows):
379
+ validation_prompts.append(prompt)
380
+ validation_prompt_embeddings.append(load_file(prompt_embedding)["prompt_embedding"].unsqueeze(0))
381
+ validation_video_latents.append(load_file(video_latent)["encoded_video"].unsqueeze(0))
382
+ validation_flow_latents.append(load_file(flow_latent)["encoded_flow_f"].unsqueeze(0))
383
+ # validation_images.append(preprocess_image_with_resize(image, self.args.train_resolution[1], self.args.train_resolution[2]))
384
+ validation_images.append(image)
385
+
386
+
387
+ validation_videos = [None] * len(validation_prompts)
388
+
389
+
390
+ self.state.validation_prompts = validation_prompts
391
+ self.state.validation_prompt_embeddings = validation_prompt_embeddings
392
+ self.state.validation_images = validation_images
393
+ self.state.validation_videos = validation_videos
394
+ self.state.validation_video_latents = validation_video_latents
395
+ self.state.validation_flow_latents = validation_flow_latents
396
+
397
+ # Debug..
398
+ # self.validate(0)
399
+
400
+
401
+ @override
402
+ def validation_step(
403
+ self, eval_data: Dict[str, Any], pipe: FloVDCogVideoXControlnetImageToVideoPipeline
404
+ ) -> List[Tuple[str, Image.Image | List[Image.Image]]]:
405
+ """
406
+ Return the data that needs to be saved. For videos, the data format is List[PIL],
407
+ and for images, the data format is PIL
408
+ """
409
+
410
+ prompt_embedding, image, flow_latent = eval_data["prompt_embedding"], eval_data["image"], eval_data["flow_latent"]
411
+
412
+ video_generate = pipe(
413
+ num_frames=self.state.train_frames,
414
+ height=self.state.train_height,
415
+ width=self.state.train_width,
416
+ prompt=None,
417
+ prompt_embeds=prompt_embedding,
418
+ image=image,
419
+ flow_latent=flow_latent,
420
+ generator=self.state.generator,
421
+ num_inference_steps=50,
422
+ controlnet_guidance_start = self.args.controlnet_guidance_start,
423
+ controlnet_guidance_end = self.args.controlnet_guidance_end,
424
+ ).frames[0]
425
+ return [("synthesized_video", video_generate)]
426
+
427
+
428
+ @override
429
+ def validate(self, step: int) -> None:
430
+ #TODO. Fix the codes!!!!
431
+ logger.info("Starting validation")
432
+
433
+ accelerator = self.accelerator
434
+ num_validation_samples = len(self.state.validation_prompts)
435
+
436
+ if num_validation_samples == 0:
437
+ logger.warning("No validation samples found. Skipping validation.")
438
+ return
439
+
440
+ self.components.controlnet.eval()
441
+ torch.set_grad_enabled(False)
442
+
443
+ memory_statistics = get_memory_statistics()
444
+ logger.info(f"Memory before validation start: {json.dumps(memory_statistics, indent=4)}")
445
+
446
+ ##### Initialize pipeline #####
447
+ pipe = self.initialize_pipeline()
448
+ camera_flow_generator = self.initialize_flow_generator(ckpt_path=self.args.depth_ckpt_path).to(device=self.accelerator.device, dtype=self.state.weight_dtype)
449
+
450
+ if self.state.using_deepspeed:
451
+ # Can't using model_cpu_offload in deepspeed,
452
+ # so we need to move all components in pipe to device
453
+ # pipe.to(self.accelerator.device, dtype=self.state.weight_dtype)
454
+ self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=["controlnet"])
455
+ # self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=["transformer", "controlnet"])
456
+ else:
457
+ # if not using deepspeed, use model_cpu_offload to further reduce memory usage
458
+ # Or use pipe.enable_sequential_cpu_offload() to further reduce memory usage
459
+ pipe.enable_model_cpu_offload(device=self.accelerator.device)
460
+
461
+ # Convert all model weights to training dtype
462
+ # Note, this will change LoRA weights in self.components.transformer to training dtype, rather than keep them in fp32
463
+ pipe = pipe.to(dtype=self.state.weight_dtype)
464
+
465
+
466
+ #################################
467
+ inference_type = ['training', 'inference']
468
+ # inference_type = ['inference']
469
+ for infer_type in inference_type:
470
+
471
+
472
+ all_processes_artifacts = []
473
+ for i in range(num_validation_samples):
474
+ if self.state.using_deepspeed and self.accelerator.deepspeed_plugin.zero_stage != 3:
475
+ # Skip current validation on all processes but one
476
+ if i % accelerator.num_processes != accelerator.process_index:
477
+ continue
478
+
479
+ prompt = self.state.validation_prompts[i]
480
+ image = self.state.validation_images[i]
481
+ video = self.state.validation_videos[i]
482
+ video_latent = self.state.validation_video_latents[i].permute(0,2,1,3,4) # [B,F,C,H,W] (e.g., [B, 13, 16, 60, 90])
483
+ prompt_embedding = self.state.validation_prompt_embeddings[i]
484
+ flow_latent = self.state.validation_flow_latents[i].permute(0,2,1,3,4) # [B,F,C,H,W] (e.g., [B, 13, 16, 60, 90])
485
+
486
+
487
+ if image is not None:
488
+ image = preprocess_image_with_resize(image, self.state.train_height, self.state.train_width)
489
+ image_torch = image.detach().clone()
490
+ # Convert image tensor (C, H, W) to PIL images
491
+ image = image.to(torch.uint8)
492
+ image = image.permute(1, 2, 0).cpu().numpy()
493
+ image = Image.fromarray(image)
494
+
495
+ if video is not None:
496
+ video = preprocess_video_with_resize(
497
+ video, self.state.train_frames, self.state.train_height, self.state.train_width
498
+ )
499
+ # Convert video tensor (F, C, H, W) to list of PIL images
500
+ video = video.round().clamp(0, 255).to(torch.uint8)
501
+ video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video]
502
+ else:
503
+ if infer_type == 'training':
504
+ with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype):
505
+ try:
506
+ video_decoded = decode_latents(video_latent.to(self.accelerator.device), self.components.vae)
507
+ except:
508
+ pass
509
+ video_decoded = decode_latents(video_latent.to(self.accelerator.device), self.components.vae)
510
+ video = ((video_decoded + 1.) / 2. * 255.)[0].permute(1,0,2,3).float().clip(0., 255.).to(torch.uint8)
511
+ video = [Image.fromarray(frame.permute(1, 2, 0).cpu().numpy()) for frame in video]
512
+
513
+ with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype):
514
+ try:
515
+ flow_decoded = decode_flow(flow_latent.to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36])
516
+ except:
517
+ pass
518
+ flow_decoded = decode_flow(flow_latent.to(self.accelerator.device), self.components.vae, flow_scale_factor=[60, 36]) # (BF)CHW (C=2)
519
+
520
+
521
+ # Prepare camera flow
522
+ if infer_type == 'inference':
523
+ with torch.cuda.amp.autocast(enabled=True, dtype=self.state.weight_dtype):
524
+ camparam, cam_name = self.CameraSampler.sample()
525
+ camera_flow_generator_input = get_camera_flow_generator_input(image_torch, camparam, device=self.accelerator.device, speed=0.5)
526
+ image_torch = ((image_torch.unsqueeze(0) / 255.) * 2. - 1.).to(self.accelerator.device)
527
+ camera_flow, log_dict = camera_flow_generator(image_torch, camera_flow_generator_input)
528
+ camera_flow = camera_flow.to(self.accelerator.device)
529
+ # Unknown bug: the first VAE encode of the camera flow can fail, so do a warm-up attempt first.
530
+ try:
531
+ flow_latent = rearrange(encode_flow(camera_flow, self.components.vae, flow_scale_factor=[60, 36]), 'b c f h w -> b f c h w').to(self.accelerator.device, self.state.weight_dtype)
532
+ except:
533
+ pass
534
+ flow_latent = rearrange(encode_flow(camera_flow, self.components.vae, flow_scale_factor=[60, 36]), 'b c f h w -> b f c h w').to(self.accelerator.device, self.state.weight_dtype)
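+ # For the 'inference' pass, a camera trajectory is sampled, a depth-based camera flow is
+ # synthesized from the first frame, and its encoded latent replaces the dataset flow latent
+ # as the controlnet condition.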
535
+
536
+
537
+ logger.debug(
538
+ f"Validating sample {i + 1}/{num_validation_samples} on process {accelerator.process_index}. Prompt: {prompt}",
539
+ main_process_only=False,
540
+ )
541
+ # validation_artifacts = self.validation_step({"prompt": prompt, "image": image, "video": video}, pipe)
542
+ validation_artifacts = self.validation_step({"prompt_embedding": prompt_embedding, "image": image, "flow_latent": flow_latent}, pipe)
543
+
544
+ if (
545
+ self.state.using_deepspeed
546
+ and self.accelerator.deepspeed_plugin.zero_stage == 3
547
+ and not accelerator.is_main_process
548
+ ):
549
+ continue
550
+
551
+ prompt_filename = string_to_filename(prompt)[:25]
552
+ # Calculate hash of reversed prompt as a unique identifier
553
+ reversed_prompt = prompt[::-1]
554
+ hash_suffix = hashlib.md5(reversed_prompt.encode()).hexdigest()[:5]
555
+
556
+ artifacts = {
557
+ "image": {"type": "image", "value": image},
558
+ "video": {"type": "video", "value": video},
559
+ }
560
+ for i, (artifact_type, artifact_value) in enumerate(validation_artifacts):
561
+ artifacts.update({f"artifact_{i}": {"type": artifact_type, "value": artifact_value}})
562
+ if infer_type == 'training':
563
+ # Log flow_warped_frames
564
+ image_tensor = repeat(rearrange(torch.tensor(np.array(image)).to(flow_decoded.device, torch.float), 'h w c -> 1 c h w'), 'b c h w -> (b f) c h w', f=flow_decoded.size(0)) # scale~(0,255) (BF) C H W
565
+ warped_video = forward_bilinear_splatting(image_tensor, flow_decoded.to(torch.float)) # if we have an occlusion mask from dataset, we can use it.
566
+ frame_list = []
567
+ for frame in warped_video:
568
+ frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).clip(0, 255).astype(np.uint8)  # clip before the uint8 cast to avoid wrap-around
569
+ frame_list.append(Image.fromarray(frame))
570
+
571
+ artifacts.update({f"artifact_warped_video_{i}": {"type": 'warped_video', "value": frame_list}})
572
+
573
+ if infer_type == 'inference':
574
+ warped_video = log_dict['depth_warped_frames']
575
+ frame_list = []
576
+ for frame in warped_video:
577
+ frame = (frame + 1.)/2. * 255.
578
+ frame = (frame.permute(1,2,0).float().detach().cpu().numpy()).clip(0, 255).astype(np.uint8)
579
+ frame_list.append(Image.fromarray(frame))
580
+
581
+ artifacts.update({f"artifact_warped_video_{i}": {"type": 'warped_video', "value": frame_list}})
582
+ logger.debug(
583
+ f"Validation artifacts on process {accelerator.process_index}: {list(artifacts.keys())}",
584
+ main_process_only=False,
585
+ )
586
+
587
+ for key, value in list(artifacts.items()):
588
+ artifact_type = value["type"]
589
+ artifact_value = value["value"]
590
+ if artifact_type not in ["image", "video", "warped_video", "synthesized_video"] or artifact_value is None:
591
+ continue
592
+
593
+ extension = "png" if artifact_type == "image" else "mp4"
594
+ if artifact_type == "warped_video":
595
+ filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}-{infer_type}_warped_video.{extension}"
596
+ elif artifact_type == "synthesized_video":
597
+ filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}-{infer_type}_synthesized_video.{extension}"
598
+ else:
599
+ filename = f"validation-{step}-{accelerator.process_index}-{prompt_filename}-{hash_suffix}-{infer_type}.{extension}"
600
+ validation_path = self.args.output_dir / "validation_res"
601
+ validation_path.mkdir(parents=True, exist_ok=True)
602
+ filename = str(validation_path / filename)
603
+
604
+ if artifact_type == "image":
605
+ logger.debug(f"Saving image to {filename}")
606
+ artifact_value.save(filename)
607
+ artifact_value = wandb.Image(filename)
608
+ elif artifact_type == "video" or artifact_type == "warped_video" or artifact_type == "synthesized_video":
609
+ logger.debug(f"Saving video to {filename}")
610
+ export_to_video(artifact_value, filename, fps=self.args.gen_fps)
611
+ artifact_value = wandb.Video(filename, caption=prompt)
612
+
613
+ all_processes_artifacts.append(artifact_value)
614
+
615
+ all_artifacts = gather_object(all_processes_artifacts)
616
+
617
+ if accelerator.is_main_process:
618
+ tracker_key = "validation"
619
+ for tracker in accelerator.trackers:
620
+ if tracker.name == "wandb":
621
+ image_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Image)]
622
+ video_artifacts = [artifact for artifact in all_artifacts if isinstance(artifact, wandb.Video)]
623
+ tracker.log(
624
+ {
625
+ tracker_key: {f"images_{infer_type}": image_artifacts, f"videos_{infer_type}": video_artifacts},
626
+ },
627
+ step=step,
628
+ )
629
+
630
+ ########## Clean up ##########
631
+ if self.state.using_deepspeed:
632
+ del pipe
633
+ # Unload models except those needed for training
634
+ self.__move_components_to_cpu(unload_list=self.UNLOAD_LIST)
635
+ else:
636
+ pipe.remove_all_hooks()
637
+ del pipe
638
+ # Move the components needed for training back onto the device (the unloaded ones stay on CPU)
639
+ self.__move_components_to_device(dtype=self.state.weight_dtype, ignore_list=self.UNLOAD_LIST)
640
+ self.components.controlnet.to(self.accelerator.device, dtype=self.state.weight_dtype)
641
+
642
+ # Cast the trainable weights back to fp32 to match the dtype they had after the model was prepared
643
+ cast_training_params([self.components.controlnet], dtype=torch.float32)
644
+
645
+ del camera_flow_generator
646
+
647
+ free_memory()
648
+ accelerator.wait_for_everyone()
649
+ ################################
650
+
651
+ memory_statistics = get_memory_statistics()
652
+ logger.info(f"Memory after validation end: {json.dumps(memory_statistics, indent=4)}")
653
+ torch.cuda.reset_peak_memory_stats(accelerator.device)
654
+
655
+ torch.set_grad_enabled(True)
656
+ self.components.controlnet.train()
657
+
658
+
659
+ # mangling
660
+ def __move_components_to_device(self, dtype, ignore_list: List[str] = []):
661
+ ignore_list = set(ignore_list)
662
+ components = self.components.model_dump()
663
+ for name, component in components.items():
664
+ if not isinstance(component, type) and hasattr(component, "to"):
665
+ if name not in ignore_list:
666
+ setattr(self.components, name, component.to(self.accelerator.device, dtype=dtype))
667
+
668
+ # mangling
669
+ def __move_components_to_cpu(self, unload_list: List[str] = []):
670
+ unload_list = set(unload_list)
671
+ components = self.components.model_dump()
672
+ for name, component in components.items():
673
+ if not isinstance(component, type) and hasattr(component, "to"):
674
+ if name in unload_list:
675
+ setattr(self.components, name, component.to("cpu"))
676
+
677
+
678
+ register("cogvideox-flovd", "controlnet", FloVDCogVideoXI2VControlnetTrainer)
679
+
680
+
681
+ #--------------------------------------------------------------------------------------------------
682
+ # Extract function
683
+ def encode_text(prompt: str, components, device) -> torch.Tensor:
684
+ prompt_token_ids = components.tokenizer(
685
+ prompt,
686
+ padding="max_length",
687
+ max_length=components.transformer.config.max_text_seq_length,
688
+ truncation=True,
689
+ add_special_tokens=True,
690
+ return_tensors="pt",
691
+ )
692
+ prompt_token_ids = prompt_token_ids.input_ids
693
+ prompt_embedding = components.text_encoder(prompt_token_ids.to(device))[0]
694
+ return prompt_embedding
695
+
696
+ def encode_video(video: torch.Tensor, vae) -> torch.Tensor:
697
+ # shape of input video: [B, C, F, H, W]
698
+ video = video.to(vae.device, dtype=vae.dtype)
699
+ latent_dist = vae.encode(video).latent_dist
700
+ latent = latent_dist.sample() * vae.config.scaling_factor
701
+ return latent
702
+
703
+ def decode_latents(latents: torch.Tensor, vae) -> torch.Tensor:
704
+ latents = latents.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
705
+ latents = 1 / vae.config.scaling_factor * latents
706
+
707
+ frames = vae.decode(latents).sample
708
+ return frames
709
+
710
+ def compute_optical_flow(raft, ctxt, trgt, raft_iter=20, chunk=2, only_forward=True):
711
+ num_frames = ctxt.shape[0]
712
+ chunk_size = (num_frames // chunk) + 1
713
+
714
+ flow_f_list = []
715
+ if not only_forward:
716
+ flow_b_list = []
717
+ for i in range(chunk):
718
+ start = chunk_size * i
719
+ end = chunk_size * (i+1)
720
+
721
+ with torch.no_grad():
722
+ flow_f = raft(ctxt[start:end], trgt[start:end], num_flow_updates=raft_iter)[-1]
723
+ if not only_forward:
724
+ flow_b = raft(trgt[start:end], ctxt[start:end], num_flow_updates=raft_iter)[-1]
725
+
726
+ flow_f_list.append(flow_f)
727
+ if not only_forward:
728
+ flow_b_list.append(flow_b)
729
+
730
+ flow_f = torch.cat(flow_f_list)
731
+ if not only_forward:
732
+ flow_b = torch.cat(flow_b_list)
733
+
734
+ if not only_forward:
735
+ return flow_f, flow_b
736
+ else:
737
+ return flow_f, None
738
+
739
+ def encode_flow(flow, vae, flow_scale_factor):
740
+ # flow: BF,C,H,W
741
+ # flow_scale_factor [sf_x, sf_y]
742
+ assert flow.ndim == 4
743
+ num_frames, _, height, width = flow.shape
744
+
745
+ # Normalize optical flow
746
+ # ndim: 4 -> 5
747
+ flow = rearrange(flow, '(b f) c h w -> b f c h w', b=1)
748
+ flow_norm = adaptive_normalize(flow, flow_scale_factor[0], flow_scale_factor[1])
749
+
750
+ # ndim: 5 -> 4
751
+ flow_norm = rearrange(flow_norm, 'b f c h w -> (b f) c h w', b=1)
752
+
753
+ # Duplicate mean value for third channel
754
+ num_frames, _, H, W = flow_norm.shape
755
+ flow_norm_extended = torch.empty((num_frames, 3, height, width)).to(flow_norm)
756
+ flow_norm_extended[:,:2] = flow_norm
757
+ flow_norm_extended[:,-1:] = flow_norm.mean(dim=1, keepdim=True)
758
+ flow_norm_extended = rearrange(flow_norm_extended, '(b f) c h w -> b c f h w', f=num_frames)
759
+
760
+ return encode_video(flow_norm_extended, vae)
761
+
762
+ def decode_flow(flow_latent, vae, flow_scale_factor):
763
+ flow_latent = flow_latent.permute(0, 2, 1, 3, 4) # [batch_size, num_channels, num_frames, height, width]
764
+ flow_latent = 1 / vae.config.scaling_factor * flow_latent
765
+
766
+ flow = vae.decode(flow_latent).sample # BCFHW
767
+
768
+ # discard third channel (which is a mean value of f_x and f_y)
769
+ flow = flow[:,:2].detach().clone()
770
+
771
+ # Unnormalize optical flow
772
+ flow = rearrange(flow, 'b c f h w -> b f c h w')
773
+ flow = adaptive_unnormalize(flow, flow_scale_factor[0], flow_scale_factor[1])
774
+
775
+ flow = rearrange(flow, 'b f c h w -> (b f) c h w')
776
+ return flow # BF,C,H,W
777
+
778
+ def adaptive_normalize(flow, sf_x, sf_y):
779
+ # flow: (B, F, C, H, W) optical flow
780
+ assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)'
781
+ assert sf_x is not None and sf_y is not None
782
+ b, f, c, h, w = flow.shape
783
+
784
+ max_clip_x = math.sqrt(w/sf_x) * 1.0
785
+ max_clip_y = math.sqrt(h/sf_y) * 1.0
786
+
787
+ flow_norm = flow.detach().clone()
788
+ flow_x = flow[:, :, 0].detach().clone()
789
+ flow_y = flow[:, :, 1].detach().clone()
790
+
791
+ flow_x_norm = torch.sign(flow_x) * torch.sqrt(torch.abs(flow_x)/sf_x + 1e-7)
792
+ flow_y_norm = torch.sign(flow_y) * torch.sqrt(torch.abs(flow_y)/sf_y + 1e-7)
793
+
794
+ flow_norm[:, :, 0] = torch.clamp(flow_x_norm, min=-max_clip_x, max=max_clip_x)
795
+ flow_norm[:, :, 1] = torch.clamp(flow_y_norm, min=-max_clip_y, max=max_clip_y)
796
+
797
+ return flow_norm
798
+
799
+
800
+ def adaptive_unnormalize(flow, sf_x, sf_y):
801
+ # flow: (B, F, C, H, W) optical flow
802
+ assert flow.ndim == 5, 'Set the shape of the flow input as (B, F, C, H, W)'
803
+ assert sf_x is not None and sf_y is not None
804
+
805
+ flow_orig = flow.detach().clone()
806
+ flow_x = flow[:, :, 0].detach().clone()
807
+ flow_y = flow[:, :, 1].detach().clone()
808
+
809
+ flow_orig[:, :, 0] = torch.sign(flow_x) * sf_x * (flow_x**2 - 1e-7)
810
+ flow_orig[:, :, 1] = torch.sign(flow_y) * sf_y * (flow_y**2 - 1e-7)
811
+
812
+ return flow_orig
813
+
814
+ #--------------------------------------------------------------------------------------------------
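The adaptive normalization above compresses raw flow with a signed square root scaled by sf_x / sf_y and clamps it, and adaptive_unnormalize inverts it exactly for values inside the clamp range. A minimal standalone sketch of that round trip (the two helper functions re-state the formulas above; the sample values and scale factor are illustrative):

import torch

def normalize(v, sf):
    # signed square-root compression, as in adaptive_normalize
    return torch.sign(v) * torch.sqrt(torch.abs(v) / sf + 1e-7)

def unnormalize(v, sf):
    # exact inverse inside the clamp range, as in adaptive_unnormalize
    return torch.sign(v) * sf * (v ** 2 - 1e-7)

flow_x = torch.tensor([-40.0, 0.0, 25.0])   # raw horizontal flow in pixels (illustrative)
sf_x = 60.0                                  # same x scale factor the trainer passes
n = normalize(flow_x, sf_x)
print(n)                                                           # compressed values, roughly in [-0.82, 0.65]
print(torch.allclose(unnormalize(n, sf_x), flow_x, atol=1e-4))     # True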
finetune/models/cogvideox_i2v/lora_trainer.py ADDED
@@ -0,0 +1,246 @@
1
+ from typing import Any, Dict, List, Tuple
2
+
3
+ import torch
4
+ from diffusers import (
5
+ AutoencoderKLCogVideoX,
6
+ CogVideoXDPMScheduler,
7
+ CogVideoXImageToVideoPipeline,
8
+ CogVideoXTransformer3DModel,
9
+ )
10
+ from diffusers.models.embeddings import get_3d_rotary_pos_embed
11
+ from PIL import Image
12
+ from numpy import dtype
13
+ from transformers import AutoTokenizer, T5EncoderModel
14
+ from typing_extensions import override
15
+
16
+ from finetune.schemas import Components
17
+ from finetune.trainer import Trainer
18
+ from finetune.utils import unwrap_model
19
+
20
+ from ..utils import register
21
+
22
+
23
+ class CogVideoXI2VLoraTrainer(Trainer):
24
+ UNLOAD_LIST = ["text_encoder"]
25
+
26
+ @override
27
+ def load_components(self) -> Dict[str, Any]:
28
+ components = Components()
29
+ model_path = str(self.args.model_path)
30
+
31
+ components.pipeline_cls = CogVideoXImageToVideoPipeline
32
+
33
+ components.tokenizer = AutoTokenizer.from_pretrained(model_path, subfolder="tokenizer")
34
+
35
+ components.text_encoder = T5EncoderModel.from_pretrained(model_path, subfolder="text_encoder")
36
+
37
+ components.transformer = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder="transformer")
38
+
39
+ components.vae = AutoencoderKLCogVideoX.from_pretrained(model_path, subfolder="vae")
40
+
41
+ components.scheduler = CogVideoXDPMScheduler.from_pretrained(model_path, subfolder="scheduler")
42
+
43
+ return components
44
+
45
+ @override
46
+ def initialize_pipeline(self) -> CogVideoXImageToVideoPipeline:
47
+ pipe = CogVideoXImageToVideoPipeline(
48
+ tokenizer=self.components.tokenizer,
49
+ text_encoder=self.components.text_encoder,
50
+ vae=self.components.vae,
51
+ transformer=unwrap_model(self.accelerator, self.components.transformer),
52
+ scheduler=self.components.scheduler,
53
+ )
54
+ return pipe
55
+
56
+ @override
57
+ def encode_video(self, video: torch.Tensor) -> torch.Tensor:
58
+ # shape of input video: [B, C, F, H, W]
59
+ vae = self.components.vae
60
+ video = video.to(vae.device, dtype=vae.dtype)
61
+ latent_dist = vae.encode(video).latent_dist
62
+ latent = latent_dist.sample() * vae.config.scaling_factor
63
+ return latent
64
+
65
+ @override
66
+ def encode_text(self, prompt: str) -> torch.Tensor:
67
+ prompt_token_ids = self.components.tokenizer(
68
+ prompt,
69
+ padding="max_length",
70
+ max_length=self.state.transformer_config.max_text_seq_length,
71
+ truncation=True,
72
+ add_special_tokens=True,
73
+ return_tensors="pt",
74
+ )
75
+ prompt_token_ids = prompt_token_ids.input_ids
76
+ prompt_embedding = self.components.text_encoder(prompt_token_ids.to(self.accelerator.device))[0]
77
+ return prompt_embedding
78
+
79
+ @override
80
+ def collate_fn(self, samples: List[Dict[str, Any]]) -> Dict[str, Any]:
81
+ ret = {"encoded_videos": [], "prompt_embedding": [], "images": []}
82
+
83
+ for sample in samples:
84
+ encoded_video = sample["encoded_video"]
85
+ prompt_embedding = sample["prompt_embedding"]
86
+ image = sample["image"]
87
+
88
+ ret["encoded_videos"].append(encoded_video)
89
+ ret["prompt_embedding"].append(prompt_embedding)
90
+ ret["images"].append(image)
91
+
92
+ ret["encoded_videos"] = torch.stack(ret["encoded_videos"])
93
+ ret["prompt_embedding"] = torch.stack(ret["prompt_embedding"])
94
+ ret["images"] = torch.stack(ret["images"])
95
+
96
+ return ret
97
+
98
+ @override
99
+ def compute_loss(self, batch) -> torch.Tensor:
100
+ prompt_embedding = batch["prompt_embedding"]
101
+ latent = batch["encoded_videos"]
102
+ images = batch["images"]
103
+
104
+ # Shape of prompt_embedding: [B, seq_len, hidden_size]
105
+ # Shape of latent: [B, C, F, H, W]
106
+ # Shape of images: [B, C, H, W]
107
+
108
+ patch_size_t = self.state.transformer_config.patch_size_t
109
+ if patch_size_t is not None:
110
+ ncopy = latent.shape[2] % patch_size_t
111
+ # Copy the first frame ncopy times to match patch_size_t
112
+ first_frame = latent[:, :, :1, :, :] # Get first frame [B, C, 1, H, W]
113
+ latent = torch.cat([first_frame.repeat(1, 1, ncopy, 1, 1), latent], dim=2)
114
+ assert latent.shape[2] % patch_size_t == 0
115
+
116
+ batch_size, num_channels, num_frames, height, width = latent.shape
117
+
118
+ # Get prompt embeddings
119
+ _, seq_len, _ = prompt_embedding.shape
120
+ prompt_embedding = prompt_embedding.view(batch_size, seq_len, -1).to(dtype=latent.dtype)
121
+
122
+ # Add frame dimension to images [B,C,H,W] -> [B,C,F,H,W]
123
+ images = images.unsqueeze(2)
124
+ # Add noise to images
125
+ image_noise_sigma = torch.normal(mean=-3.0, std=0.5, size=(1,), device=self.accelerator.device)
126
+ image_noise_sigma = torch.exp(image_noise_sigma).to(dtype=images.dtype)
127
+ noisy_images = images + torch.randn_like(images) * image_noise_sigma[:, None, None, None, None]
128
+ image_latent_dist = self.components.vae.encode(noisy_images.to(dtype=self.components.vae.dtype)).latent_dist
129
+ image_latents = image_latent_dist.sample() * self.components.vae.config.scaling_factor
130
+
131
+ # Sample a random timestep for each sample
132
+ timesteps = torch.randint(
133
+ 0, self.components.scheduler.config.num_train_timesteps, (batch_size,), device=self.accelerator.device
134
+ )
135
+ timesteps = timesteps.long()
136
+
137
+ # from [B, C, F, H, W] to [B, F, C, H, W]
138
+ latent = latent.permute(0, 2, 1, 3, 4)
139
+ image_latents = image_latents.permute(0, 2, 1, 3, 4)
140
+ assert (latent.shape[0], *latent.shape[2:]) == (image_latents.shape[0], *image_latents.shape[2:])
141
+
142
+ # Padding image_latents to the same frame number as latent
143
+ padding_shape = (latent.shape[0], latent.shape[1] - 1, *latent.shape[2:])
144
+ latent_padding = image_latents.new_zeros(padding_shape)
145
+ image_latents = torch.cat([image_latents, latent_padding], dim=1)
146
+
147
+ # Add noise to latent
148
+ noise = torch.randn_like(latent)
149
+ latent_noisy = self.components.scheduler.add_noise(latent, noise, timesteps)
150
+
151
+ # Concatenate latent and image_latents in the channel dimension
152
+ latent_img_noisy = torch.cat([latent_noisy, image_latents], dim=2)
153
+
154
+ # Prepare rotary embeds
155
+ vae_scale_factor_spatial = 2 ** (len(self.components.vae.config.block_out_channels) - 1)
156
+ transformer_config = self.state.transformer_config
157
+ rotary_emb = (
158
+ self.prepare_rotary_positional_embeddings(
159
+ height=height * vae_scale_factor_spatial,
160
+ width=width * vae_scale_factor_spatial,
161
+ num_frames=num_frames,
162
+ transformer_config=transformer_config,
163
+ vae_scale_factor_spatial=vae_scale_factor_spatial,
164
+ device=self.accelerator.device,
165
+ )
166
+ if transformer_config.use_rotary_positional_embeddings
167
+ else None
168
+ )
169
+
170
+ # Predict noise, For CogVideoX1.5 Only.
171
+ ofs_emb = (
172
+ None if self.state.transformer_config.ofs_embed_dim is None else latent.new_full((1,), fill_value=2.0)
173
+ )
174
+ predicted_noise = self.components.transformer(
175
+ hidden_states=latent_img_noisy,
176
+ encoder_hidden_states=prompt_embedding,
177
+ timestep=timesteps,
178
+ ofs=ofs_emb,
179
+ image_rotary_emb=rotary_emb,
180
+ return_dict=False,
181
+ )[0]
182
+
183
+ # Denoise
184
+ latent_pred = self.components.scheduler.get_velocity(predicted_noise, latent_noisy, timesteps)
185
+
186
+ alphas_cumprod = self.components.scheduler.alphas_cumprod[timesteps]
187
+ weights = 1 / (1 - alphas_cumprod)
188
+ while len(weights.shape) < len(latent_pred.shape):
189
+ weights = weights.unsqueeze(-1)
190
+
191
+ loss = torch.mean((weights * (latent_pred - latent) ** 2).reshape(batch_size, -1), dim=1)
192
+ loss = loss.mean()
193
+
194
+ return loss
195
+
196
+ @override
197
+ def validation_step(
198
+ self, eval_data: Dict[str, Any], pipe: CogVideoXImageToVideoPipeline
199
+ ) -> List[Tuple[str, Image.Image | List[Image.Image]]]:
200
+ """
201
+ Return the data that needs to be saved. For videos, the data format is List[PIL],
202
+ and for images, the data format is PIL
203
+ """
204
+ prompt, image, video = eval_data["prompt"], eval_data["image"], eval_data["video"]
205
+
206
+ video_generate = pipe(
207
+ num_frames=self.state.train_frames,
208
+ height=self.state.train_height,
209
+ width=self.state.train_width,
210
+ prompt=prompt,
211
+ image=image,
212
+ generator=self.state.generator,
213
+ ).frames[0]
214
+ return [("video", video_generate)]
215
+
216
+ def prepare_rotary_positional_embeddings(
217
+ self,
218
+ height: int,
219
+ width: int,
220
+ num_frames: int,
221
+ transformer_config: Dict,
222
+ vae_scale_factor_spatial: int,
223
+ device: torch.device,
224
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
225
+ grid_height = height // (vae_scale_factor_spatial * transformer_config.patch_size)
226
+ grid_width = width // (vae_scale_factor_spatial * transformer_config.patch_size)
227
+
228
+ if transformer_config.patch_size_t is None:
229
+ base_num_frames = num_frames
230
+ else:
231
+ base_num_frames = (num_frames + transformer_config.patch_size_t - 1) // transformer_config.patch_size_t
232
+
233
+ freqs_cos, freqs_sin = get_3d_rotary_pos_embed(
234
+ embed_dim=transformer_config.attention_head_dim,
235
+ crops_coords=None,
236
+ grid_size=(grid_height, grid_width),
237
+ temporal_size=base_num_frames,
238
+ grid_type="slice",
239
+ max_size=(grid_height, grid_width),
240
+ device=device,
241
+ )
242
+
243
+ return freqs_cos, freqs_sin
244
+
245
+
246
+ register("cogvideox-i2v", "lora", CogVideoXI2VLoraTrainer)
finetune/models/cogvideox_i2v/sft_trainer.py ADDED
@@ -0,0 +1,9 @@
1
+ from ..cogvideox_i2v.lora_trainer import CogVideoXI2VLoraTrainer
2
+ from ..utils import register
3
+
4
+
5
+ class CogVideoXI2VSftTrainer(CogVideoXI2VLoraTrainer):
6
+ pass
7
+
8
+
9
+ register("cogvideox-i2v", "sft", CogVideoXI2VSftTrainer)
finetune/models/utils.py ADDED
@@ -0,0 +1,57 @@
1
+ from typing import Dict, Literal
2
+
3
+ from finetune.trainer import Trainer
4
+
5
+
6
+ SUPPORTED_MODELS: Dict[str, Dict[str, Trainer]] = {}
7
+
8
+
9
+ def register(model_name: str, training_type: Literal["lora", "sft", "controlnet"], trainer_cls: Trainer):
10
+ """Register a model and its associated functions for a specific training type.
11
+
12
+ Args:
13
+ model_name (str): Name of the model to register (e.g. "cogvideox-5b")
14
+ training_type (Literal["lora", "sft", "controlnet"]): Type of training: "lora", "sft", or "controlnet"
15
+ trainer_cls (Trainer): Trainer class to register.
16
+ """
17
+
18
+ # Check if model_name and training_type exists in SUPPORTED_MODELS
19
+ if model_name not in SUPPORTED_MODELS:
20
+ SUPPORTED_MODELS[model_name] = {}
21
+ else:
22
+ if training_type in SUPPORTED_MODELS[model_name]:
23
+ raise ValueError(f"Training type {training_type} already exists for model {model_name}")
24
+
25
+ SUPPORTED_MODELS[model_name][training_type] = trainer_cls
26
+
27
+
28
+ def show_supported_models():
29
+ """Print all currently supported models and their training types."""
30
+
31
+ print("\nSupported Models:")
32
+ print("================")
33
+
34
+ for model_name, training_types in SUPPORTED_MODELS.items():
35
+ print(f"\n{model_name}")
36
+ print("-" * len(model_name))
37
+ for training_type in training_types:
38
+ print(f" • {training_type}")
39
+
40
+
41
+ def get_model_cls(model_type: str, training_type: Literal["lora", "sft", "controlnet"]) -> Trainer:
42
+ """Get the trainer class for a specific model and training type."""
43
+ if model_type not in SUPPORTED_MODELS:
44
+ print(f"\nModel '{model_type}' is not supported.")
45
+ print("\nSupported models are:")
46
+ for supported_model in SUPPORTED_MODELS:
47
+ print(f" • {supported_model}")
48
+ raise ValueError(f"Model '{model_type}' is not supported")
49
+
50
+ if training_type not in SUPPORTED_MODELS[model_type]:
51
+ print(f"\nTraining type '{training_type}' is not supported for model '{model_type}'.")
52
+ print(f"\nSupported training types for '{model_type}' are:")
53
+ for supported_type in SUPPORTED_MODELS[model_type]:
54
+ print(f" • {supported_type}")
55
+ raise ValueError(f"Training type '{training_type}' is not supported for model '{model_type}'")
56
+
57
+ return SUPPORTED_MODELS[model_type][training_type]
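The registry is a two-level dict keyed by model name and training type, and registering the same pair twice raises. A self-contained illustration of the same pattern (DummyTrainer and register_trainer are stand-ins for this sketch, not names from the repo):

from typing import Dict

REGISTRY: Dict[str, Dict[str, type]] = {}

def register_trainer(model_name: str, training_type: str, trainer_cls: type) -> None:
    slot = REGISTRY.setdefault(model_name, {})
    if training_type in slot:
        raise ValueError(f"Training type {training_type} already exists for model {model_name}")
    slot[training_type] = trainer_cls

class DummyTrainer:          # illustrative stand-in for a Trainer subclass
    pass

register_trainer("cogvideox-flovd", "controlnet", DummyTrainer)
print(REGISTRY["cogvideox-flovd"]["controlnet"].__name__)    # DummyTrainer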
finetune/modules/__init__.py ADDED
File without changes
finetune/modules/camera_flow_generator.py ADDED
@@ -0,0 +1,46 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from einops import rearrange, repeat
6
+
7
+ from .utils import instantiate_from_config, get_camera_flow_generator_input, warp_image
8
+
9
+ import pdb
10
+
11
+ class CameraFlowGenerator(nn.Module):
12
+ def __init__(
13
+ self,
14
+ depth_estimator_kwargs,
15
+ use_observed_mask=False,
16
+ cycle_th=3.,
17
+ ):
18
+ super().__init__()
19
+
20
+ self.depth_warping_module = instantiate_from_config(depth_estimator_kwargs)
21
+ self.use_observed_mask = use_observed_mask
22
+ self.cycle_th = cycle_th
23
+
24
+ def forward(self, condition_image, camera_flow_generator_input):
25
+ # NOTE. camera_flow_generator_input is a dict of network inputs!
26
+ # camera_flow_generator_input: Dict
27
+ # - image
28
+ # - intrinsics
29
+ # - extrinsics
30
+ with torch.no_grad():
31
+ flow_f, flow_b, depth_warped_frames, depth_ctxt, depth_trgt = self.depth_warping_module(camera_flow_generator_input)
32
+ image_ctxt = repeat(condition_image, "b c h w -> (b v) c h w", v=(depth_warped_frames.shape[0]//condition_image.shape[0]))
33
+
34
+ log_dict = {
35
+ 'depth_warped_frames': depth_warped_frames,
36
+ 'depth_ctxt': depth_ctxt,
37
+ 'depth_trgt': depth_trgt,
38
+ }
39
+
40
+ # if self.use_observed_mask:
41
+ # observed_mask = run_filtering(flow_f, flow_b, cycle_th=self.cycle_th)
42
+ # log_dict[
43
+ # 'observed_mask': observed_mask
44
+ # ]
45
+
46
+ return flow_f, log_dict
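The repeat call above tiles the single conditioning image so it lines up with the batch of depth-warped target views returned by the warping module. A tiny standalone demo of that reshape (shapes are illustrative):

import torch
from einops import repeat

condition_image = torch.randn(1, 3, 60, 90)        # [B, C, H, W], one conditioning frame
depth_warped_frames = torch.randn(49, 3, 60, 90)    # [(B*V), C, H, W], e.g. 49 warped views

v = depth_warped_frames.shape[0] // condition_image.shape[0]
image_ctxt = repeat(condition_image, "b c h w -> (b v) c h w", v=v)
print(image_ctxt.shape)                              # torch.Size([49, 3, 60, 90])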
finetune/modules/camera_sampler.py ADDED
@@ -0,0 +1,52 @@
1
+ import numpy as np
2
+ from glob import glob
3
+ import random
4
+ import os
5
+ import pdb
6
+ random.seed(7777)
7
+
8
+ class SampleManualCam:
9
+ def __init__(
10
+ self,
11
+ pose_type = 'manual',
12
+ root_path = '../assets/manual_poses',
13
+ ):
14
+ self.root_path = root_path
15
+ if pose_type == 'manual':
16
+ self.MANUAL_CAM = ['I', 'D', 'L', 'O', 'R', 'U']
17
+ elif pose_type == 're10k':
18
+ self.RE10K_CAM = os.listdir(root_path)
19
+ # self.pose_path = glob(root_path, "*.txt")
20
+
21
+ self.pose_type = pose_type
22
+
23
+ def sample(self, order=None, name=None):
24
+ # Sample camera parameters (W2C)
25
+
26
+ if self.pose_type == 'manual':
27
+ if name is not None:
28
+ assert name in self.MANUAL_CAM
29
+ cam_name = name
30
+ elif order is not None:
31
+ order = order % len(self.MANUAL_CAM)
32
+ cam_name = self.MANUAL_CAM[order]
33
+ else:
34
+ cam_name = random.choice(self.MANUAL_CAM)
35
+ path = os.path.join(self.root_path, f"camera_{cam_name}.txt")
36
+ elif self.pose_type == 're10k':
37
+ if name is not None:
38
+ assert name in self.RE10K_CAM
39
+ cam_name = name
40
+ elif order is not None:
41
+ order = order % len(self.RE10K_CAM)
42
+ cam_name = self.RE10K_CAM[order]
43
+ else:
44
+ cam_name = random.choice(self.RE10K_CAM)
45
+ path = os.path.join(self.root_path, cam_name)
46
+ with open(path, 'r') as f:
47
+ poses = f.readlines()
48
+
49
+ poses = [pose.strip().split(' ') for pose in poses]
50
+ poses = [[float(x) for x in pose] for pose in poses]
51
+
52
+ return poses, cam_name
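Each pose file row is parsed into a list of floats; downstream, the Camera class added later in this commit reads entry[1:5] as fx, fy, cx, cy and entry[7:] as a flattened 3x4 world-to-camera matrix. A small sketch of that layout (the identity pose below is made up for illustration, and the two entries at indices 5 and 6 are simply skipped):

import numpy as np

# One parsed pose row: [timestamp, fx, fy, cx, cy, <2 unused>, r00 r01 r02 t0, r10 r11 r12 t1, r20 r21 r22 t2]
entry = [0.0, 0.5, 0.89, 0.5, 0.5, 0.0, 0.0,
         1.0, 0.0, 0.0, 0.0,
         0.0, 1.0, 0.0, 0.0,
         0.0, 0.0, 1.0, 0.0]

fx, fy, cx, cy = entry[1:5]
w2c = np.eye(4)
w2c[:3, :] = np.array(entry[7:], dtype=np.float64).reshape(3, 4)
print((fx, fy, cx, cy))
print(w2c)                                           # 4x4 W2C with identity rotation and zero translation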
finetune/modules/cogvideox_controlnet.py ADDED
@@ -0,0 +1,353 @@
1
+ from typing import Any, Dict, Optional, Tuple, Union
2
+
3
+ import torch
4
+ from torch import nn
5
+ from einops import rearrange
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.configuration_utils import FrozenDict
9
+
10
+ from diffusers import CogVideoXTransformer3DModel
11
+ from diffusers.models.transformers.cogvideox_transformer_3d import Transformer2DModelOutput, CogVideoXBlock
12
+ from diffusers.utils import is_torch_version
13
+ from diffusers.loaders import PeftAdapterMixin
14
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
15
+ from diffusers.models.embeddings import CogVideoXPatchEmbed, TimestepEmbedding, Timesteps, get_3d_sincos_pos_embed
16
+ from diffusers.models.modeling_utils import ModelMixin
17
+ from diffusers.models.attention import Attention, FeedForward
18
+ from diffusers.models.attention_processor import AttentionProcessor, AttnProcessor2_0
19
+ from diffusers.models.normalization import AdaLayerNorm, CogVideoXLayerNormZero, AdaLayerNormZeroSingle
20
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
21
+
22
+ from .cogvideox_custom_modules import CustomCogVideoXPatchEmbed, CustomCogVideoXBlock
23
+
24
+ import pdb
25
+
26
+ class CogVideoXControlnet(ModelMixin, ConfigMixin, PeftAdapterMixin):
27
+ _supports_gradient_checkpointing = True
28
+
29
+ @register_to_config
30
+ def __init__(
31
+ self,
32
+ num_attention_heads: int = 30, # 48 for 5B, 30 for 2B.
33
+ attention_head_dim: int = 64,
34
+ # in_channels: int = 3,
35
+ in_channels: int = 16,
36
+ out_channels: Optional[int] = 16, # Not used
37
+ flip_sin_to_cos: bool = True,
38
+ freq_shift: int = 0,
39
+ time_embed_dim: int = 512,
40
+ ofs_embed_dim: Optional[int] = None,
41
+ text_embed_dim: int = 4096,
42
+ num_layers: int = 30,
43
+ dropout: float = 0.0,
44
+ attention_bias: bool = True,
45
+ sample_width: int = 90,
46
+ sample_height: int = 60,
47
+ sample_frames: int = 49,
48
+ patch_size: int = 2,
49
+ patch_size_t: Optional[int] = None,
50
+ temporal_compression_ratio: int = 4,
51
+ max_text_seq_length: int = 226,
52
+ activation_fn: str = "gelu-approximate",
53
+ timestep_activation_fn: str = "silu",
54
+ norm_elementwise_affine: bool = True,
55
+ norm_eps: float = 1e-5,
56
+ spatial_interpolation_scale: float = 1.875,
57
+ temporal_interpolation_scale: float = 1.0,
58
+ use_rotary_positional_embeddings: bool = False,
59
+ use_learned_positional_embeddings: bool = False,
60
+ patch_bias: bool = True,
61
+ out_proj_dim_factor: int = 8,
62
+ out_proj_dim_zero_init: bool = True,
63
+ notextinflow: bool = False,
64
+ ):
65
+ super().__init__()
66
+ inner_dim = num_attention_heads * attention_head_dim
67
+
68
+ self.notextinflow = notextinflow
69
+
70
+ if not use_rotary_positional_embeddings and use_learned_positional_embeddings:
71
+ raise ValueError(
72
+ "There are no CogVideoX checkpoints available with disable rotary embeddings and learned positional "
73
+ "embeddings. If you're using a custom model and/or believe this should be supported, please open an "
74
+ "issue at https://github.com/huggingface/diffusers/issues."
75
+ )
76
+
77
+ """
78
+ Removed: the pixel-space ControlNet encoder below is not needed here.
79
+ In FloVD, controlnet_hidden_states are flow latents that have already been encoded by the 3D causal VAE.
80
+ """
81
+ # start_channels = in_channels * (downscale_coef ** 2)
82
+ # input_channels = [start_channels, start_channels // 2, start_channels // 4]
83
+ # self.unshuffle = nn.PixelUnshuffle(downscale_coef)
84
+
85
+ # self.controlnet_encode_first = nn.Sequential(
86
+ # nn.Conv2d(input_channels[0], input_channels[1], kernel_size=1, stride=1, padding=0),
87
+ # nn.GroupNorm(2, input_channels[1]),
88
+ # nn.ReLU(),
89
+ # )
90
+
91
+ # self.controlnet_encode_second = nn.Sequential(
92
+ # nn.Conv2d(input_channels[1], input_channels[2], kernel_size=1, stride=1, padding=0),
93
+ # nn.GroupNorm(2, input_channels[2]),
94
+ # nn.ReLU(),
95
+ # )
96
+
97
+ # """
98
+ # Modify below.
99
+ # In our case, patch_embed takes encoder_hidden_states, hidden_states, controlnet_hidden_states (flow)
100
+ # """
101
+ # 1. Patch embedding
102
+ self.patch_embed = CogVideoXPatchEmbed(
103
+ patch_size=patch_size,
104
+ in_channels=in_channels,
105
+ embed_dim=inner_dim,
106
+ bias=True,
107
+ sample_width=sample_width,
108
+ sample_height=sample_height,
109
+ sample_frames=sample_frames,
110
+ temporal_compression_ratio=temporal_compression_ratio,
111
+ spatial_interpolation_scale=spatial_interpolation_scale,
112
+ temporal_interpolation_scale=temporal_interpolation_scale,
113
+ use_positional_embeddings=not use_rotary_positional_embeddings,
114
+ use_learned_positional_embeddings=use_learned_positional_embeddings,
115
+ )
116
+ # self.patch_embed = CustomCogVideoXPatchEmbed(
117
+ # patch_size=patch_size,
118
+ # patch_size_t=patch_size_t,
119
+ # in_channels=in_channels,
120
+ # embed_dim=inner_dim,
121
+ # text_embed_dim=text_embed_dim,
122
+ # bias=patch_bias,
123
+ # sample_width=sample_width,
124
+ # sample_height=sample_height,
125
+ # sample_frames=sample_frames,
126
+ # temporal_compression_ratio=temporal_compression_ratio,
127
+ # max_text_seq_length=max_text_seq_length,
128
+ # spatial_interpolation_scale=spatial_interpolation_scale,
129
+ # temporal_interpolation_scale=temporal_interpolation_scale,
130
+ # use_positional_embeddings=not use_rotary_positional_embeddings,
131
+ # use_learned_positional_embeddings=use_learned_positional_embeddings,
132
+ # )
133
+
134
+ self.embedding_dropout = nn.Dropout(dropout)
135
+
136
+ # 2. Time embeddings
137
+ self.time_proj = Timesteps(inner_dim, flip_sin_to_cos, freq_shift)
138
+ self.time_embedding = TimestepEmbedding(inner_dim, time_embed_dim, timestep_activation_fn)
139
+
140
+ # 3. Define spatio-temporal transformers blocks
141
+ # self.transformer_blocks = nn.ModuleList(
142
+ # [
143
+ # CogVideoXBlock(
144
+ # dim=inner_dim,
145
+ # num_attention_heads=num_attention_heads,
146
+ # attention_head_dim=attention_head_dim,
147
+ # time_embed_dim=time_embed_dim,
148
+ # dropout=dropout,
149
+ # activation_fn=activation_fn,
150
+ # attention_bias=attention_bias,
151
+ # norm_elementwise_affine=norm_elementwise_affine,
152
+ # norm_eps=norm_eps,
153
+ # )
154
+ # for _ in range(num_layers)
155
+ # ]
156
+ # )
157
+ self.transformer_blocks = nn.ModuleList(
158
+ [
159
+ CustomCogVideoXBlock(
160
+ dim=inner_dim,
161
+ num_attention_heads=num_attention_heads,
162
+ attention_head_dim=attention_head_dim,
163
+ time_embed_dim=time_embed_dim,
164
+ dropout=dropout,
165
+ activation_fn=activation_fn,
166
+ attention_bias=attention_bias,
167
+ norm_elementwise_affine=norm_elementwise_affine,
168
+ norm_eps=norm_eps,
169
+ )
170
+ for _ in range(num_layers)
171
+ ]
172
+ )
173
+
174
+ self.out_projectors = None
175
+ if out_proj_dim_factor is not None:
176
+ out_proj_dim = num_attention_heads * out_proj_dim_factor
177
+ self.out_projectors = nn.ModuleList(
178
+ [nn.Linear(inner_dim, out_proj_dim) for _ in range(num_layers)]
179
+ )
180
+ if out_proj_dim_zero_init:
181
+ for out_projector in self.out_projectors:
182
+ self.zeros_init_linear(out_projector)
183
+
184
+ self.gradient_checkpointing = False
185
+
186
+ def zeros_init_linear(self, linear: nn.Module):
187
+ if isinstance(linear, (nn.Linear, nn.Conv1d)):
188
+ if hasattr(linear, "weight"):
189
+ nn.init.zeros_(linear.weight)
190
+ if hasattr(linear, "bias"):
191
+ nn.init.zeros_(linear.bias)
192
+
193
+ def _set_gradient_checkpointing(self, module, value=False):
194
+ self.gradient_checkpointing = value
195
+
196
+ def compress_time(self, x, num_frames):
197
+ x = rearrange(x, '(b f) c h w -> b f c h w', f=num_frames)
198
+ batch_size, frames, channels, height, width = x.shape
199
+ x = rearrange(x, 'b f c h w -> (b h w) c f')
200
+
201
+ if x.shape[-1] % 2 == 1:
202
+ x_first, x_rest = x[..., 0], x[..., 1:]
203
+ if x_rest.shape[-1] > 0:
204
+ x_rest = F.avg_pool1d(x_rest, kernel_size=2, stride=2)
205
+
206
+ x = torch.cat([x_first[..., None], x_rest], dim=-1)
207
+ else:
208
+ x = F.avg_pool1d(x, kernel_size=2, stride=2)
209
+ x = rearrange(x, '(b h w) c f -> (b f) c h w', b=batch_size, h=height, w=width)
210
+ return x
211
+
212
+ # """
213
+ # Add below.
214
+ # Load pre-trained weight from Diffusers
215
+ # For patch_embed, copy a projection layer for controlnet_states
216
+ # """
217
+ @classmethod
218
+ def from_pretrained(cls, model_path, subfolder, **additional_kwargs):
219
+ base = CogVideoXTransformer3DModel.from_pretrained(model_path, subfolder=subfolder)
220
+ controlnet_config = FrozenDict({**base.config, **additional_kwargs})
221
+ model = cls(**controlnet_config)
222
+
223
+ missing, unexpected = model.load_state_dict(base.state_dict(), strict=False)
224
+ print(f"Load CogVideoXTransformer3DModel.")
225
+ # if len(missing) != 0 or len(unexpected) != 0:
226
+ # print(f"Missing keys: {missing}")
227
+ # print(f"Unexpected keys: {unexpected}")
228
+
229
+ del base
230
+ torch.cuda.empty_cache()
231
+
232
+
233
+ return model
234
+
235
+ def forward(
236
+ self,
237
+ hidden_states: torch.Tensor,
238
+ encoder_hidden_states: torch.Tensor,
239
+ controlnet_hidden_states: torch.Tensor,
240
+ timestep: Union[int, float, torch.LongTensor],
241
+ controlnet_valid_mask: torch.Tensor = None,
242
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
243
+ timestep_cond: Optional[torch.Tensor] = None,
244
+ return_dict: bool = True,
245
+ ):
246
+ """
247
+ Removed: the pixel-space ControlNet encoder below is not needed here.
248
+ In FloVD, controlnet_hidden_states are flow latents that have already been encoded by the 3D causal VAE.
249
+ """
250
+ # batch_size, num_frames, channels, height, width = controlnet_states.shape
251
+ # # 0. Controlnet encoder
252
+ # controlnet_states = rearrange(controlnet_states, 'b f c h w -> (b f) c h w')
253
+ # controlnet_states = self.unshuffle(controlnet_states)
254
+ # controlnet_states = self.controlnet_encode_first(controlnet_states)
255
+ # controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames)
256
+ # num_frames = controlnet_states.shape[0] // batch_size
257
+
258
+ # controlnet_states = self.controlnet_encode_second(controlnet_states)
259
+ # controlnet_states = self.compress_time(controlnet_states, num_frames=num_frames)
260
+ # controlnet_states = rearrange(controlnet_states, '(b f) c h w -> b f c h w', b=batch_size)
261
+
262
+ batch_size, num_frames, channels, height, width = hidden_states.shape
263
+
264
+
265
+ # """
266
+ # Modify below.
267
+ # Distinguish hidden_states and controlnet_states (i.e., flow_hidden_states)
268
+ # """
269
+ hidden_states = torch.cat([hidden_states, controlnet_hidden_states], dim=2)  # flow latents (instead of image latents) are concatenated as the condition
270
+
271
+ # controlnet_states = self.controlnext_encoder(controlnet_states, timestep=timestep)
272
+ # 1. Time embedding
273
+ timesteps = timestep
274
+ t_emb = self.time_proj(timesteps)
275
+
276
+ # timesteps does not contain any weights and will always return f32 tensors
277
+ # but time_embedding might actually be running in fp16. so we need to cast here.
278
+ # there might be better ways to encapsulate this.
279
+ t_emb = t_emb.to(dtype=hidden_states.dtype)
280
+ emb = self.time_embedding(t_emb, timestep_cond)
281
+
282
+ # """
283
+ # Modify below.
284
+ # patch_embed takes encoder_hidden_states and hidden_states (a custom variant that also takes controlnet_hidden_states is left commented out below)
285
+ # """
286
+ hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
287
+ # hidden_states = self.patch_embed(encoder_hidden_states, hidden_states, controlnet_hidden_states) # output: [text_embeds, image_embeds, flow_embeds] [B, 35326, 3072]
288
+ hidden_states = self.embedding_dropout(hidden_states)
289
+
290
+ """
291
+ Not modified below.
292
+ hidden_states now carries both the video tokens and the flow (controlnet) tokens
293
+ """
294
+ text_seq_length = encoder_hidden_states.shape[1]
295
+ encoder_hidden_states = hidden_states[:, :text_seq_length] # [text_embeds] [B, 226, 3072]
296
+ hidden_states = hidden_states[:, text_seq_length:] # [image_embeds, flow_embeds] [B, 35100, 3072]
297
+
298
+ # attention mask
299
+ if controlnet_valid_mask is not None:
300
+ mask_shape = controlnet_valid_mask.shape
301
+ attention_mask = torch.nn.functional.interpolate(controlnet_valid_mask, size=(mask_shape[2], mask_shape[3]//2, mask_shape[4]//2), mode='trilinear', align_corners=False) # CFHW
302
+ attention_mask[attention_mask>=0.5] = 1
303
+ attention_mask[attention_mask<0.5] = 0
304
+ attention_mask = attention_mask.to(torch.bool)
305
+ attention_mask = rearrange(attention_mask.squeeze(1), 'b f h w -> b (f h w)') # (B, N=(fxhxw))
306
+
307
+ # TODO: decide whether the text tokens should be included in the validity mask; for now they are padded in with zeros
308
+ if not self.notextinflow:
309
+ attention_mask = F.pad(attention_mask, (text_seq_length, 0), value=0.0)
310
+
311
+ attention_kwargs = {
312
+ 'attention_mask': attention_mask if controlnet_valid_mask is not None else None,
313
+ 'notextinflow': self.notextinflow,
314
+ }
315
+
316
+ controlnet_hidden_states = ()
317
+ # 3. Transformer blocks
318
+ for i, block in enumerate(self.transformer_blocks):
319
+ if self.training and self.gradient_checkpointing:
320
+
321
+ def create_custom_forward(module):
322
+ def custom_forward(*inputs):
323
+ return module(*inputs)
324
+
325
+ return custom_forward
326
+
327
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
328
+ hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
329
+ create_custom_forward(block),
330
+ hidden_states,
331
+ encoder_hidden_states,
332
+ emb,
333
+ image_rotary_emb,
334
+ attention_kwargs,
335
+ **ckpt_kwargs,
336
+ )
337
+ else:
338
+ hidden_states, encoder_hidden_states = block(
339
+ hidden_states=hidden_states,
340
+ encoder_hidden_states=encoder_hidden_states,
341
+ temb=emb,
342
+ image_rotary_emb=image_rotary_emb,
343
+ attention_kwargs=attention_kwargs,
344
+ )
345
+
346
+ if self.out_projectors is not None:
347
+ controlnet_hidden_states += (self.out_projectors[i](hidden_states),)
348
+ else:
349
+ controlnet_hidden_states += (hidden_states,)
350
+
351
+ if not return_dict:
352
+ return (controlnet_hidden_states,)
353
+ return Transformer2DModelOutput(sample=controlnet_hidden_states)
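With out_proj_dim_zero_init=True, every per-block output projector starts as an all-zero linear layer, so the residuals handed to the base transformer are exactly zero at the start of training. A minimal sketch of that initialization (the dimensions below are illustrative):

import torch
from torch import nn

inner_dim = 1920                       # e.g. 30 heads x 64 dims
out_proj_dim = 30 * 8                  # num_attention_heads * out_proj_dim_factor

proj = nn.Linear(inner_dim, out_proj_dim)
nn.init.zeros_(proj.weight)
nn.init.zeros_(proj.bias)

hidden_states = torch.randn(1, 128, inner_dim)
print(proj(hidden_states).abs().max().item())        # 0.0 -> the injected residual starts as a no-op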
finetune/modules/cogvideox_custom_model.py ADDED
@@ -0,0 +1,109 @@
1
+ from typing import Any, Dict, Optional, Tuple, Union
2
+
3
+ import torch
4
+ import numpy as np
5
+ from diffusers.utils import is_torch_version
6
+ from diffusers.models.transformers.cogvideox_transformer_3d import CogVideoXTransformer3DModel, Transformer2DModelOutput
7
+
8
+ import pdb
9
+
10
+ class CustomCogVideoXTransformer3DModel(CogVideoXTransformer3DModel):
11
+ def forward(
12
+ self,
13
+ hidden_states: torch.Tensor,
14
+ encoder_hidden_states: torch.Tensor,
15
+ timestep: Union[int, float, torch.LongTensor],
16
+ start_frame = None,
17
+ timestep_cond: Optional[torch.Tensor] = None,
18
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
19
+ controlnet_states: torch.Tensor = None,
20
+ controlnet_weights: Optional[Union[float, int, list, np.ndarray, torch.FloatTensor]] = 1.0,
21
+ return_dict: bool = True,
22
+ ):
23
+ batch_size, num_frames, channels, height, width = hidden_states.shape
24
+
25
+ if start_frame is not None:
26
+ hidden_states = torch.cat([start_frame, hidden_states], dim=2)
27
+ # 1. Time embedding
28
+ timesteps = timestep
29
+ t_emb = self.time_proj(timesteps)
30
+
31
+ # timesteps does not contain any weights and will always return f32 tensors
32
+ # but time_embedding might actually be running in fp16. so we need to cast here.
33
+ # there might be better ways to encapsulate this.
34
+ t_emb = t_emb.to(dtype=hidden_states.dtype)
35
+ emb = self.time_embedding(t_emb, timestep_cond)
36
+
37
+ # 2. Patch embedding
38
+ hidden_states = self.patch_embed(encoder_hidden_states, hidden_states)
39
+ hidden_states = self.embedding_dropout(hidden_states)
40
+
41
+ text_seq_length = encoder_hidden_states.shape[1]
42
+ encoder_hidden_states = hidden_states[:, :text_seq_length]
43
+ hidden_states = hidden_states[:, text_seq_length:]
44
+
45
+ # 3. Transformer blocks
46
+ for i, block in enumerate(self.transformer_blocks):
47
+ if self.training and self.gradient_checkpointing:
48
+
49
+ def create_custom_forward(module):
50
+ def custom_forward(*inputs):
51
+ return module(*inputs)
52
+
53
+ return custom_forward
54
+
55
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
56
+ hidden_states, encoder_hidden_states = torch.utils.checkpoint.checkpoint(
57
+ create_custom_forward(block),
58
+ hidden_states,
59
+ encoder_hidden_states,
60
+ emb,
61
+ image_rotary_emb,
62
+ **ckpt_kwargs,
63
+ )
64
+ else:
65
+ hidden_states, encoder_hidden_states = block(
66
+ hidden_states=hidden_states,
67
+ encoder_hidden_states=encoder_hidden_states,
68
+ temb=emb,
69
+ image_rotary_emb=image_rotary_emb,
70
+ )
71
+
72
+ if (controlnet_states is not None) and (i < len(controlnet_states)):
73
+ controlnet_states_block = controlnet_states[i]
74
+ controlnet_block_weight = 1.0
75
+ if isinstance(controlnet_weights, (list, np.ndarray)) or torch.is_tensor(controlnet_weights):
76
+ controlnet_block_weight = controlnet_weights[i]
77
+ elif isinstance(controlnet_weights, (float, int)):
78
+ controlnet_block_weight = controlnet_weights
79
+ hidden_states = hidden_states + controlnet_states_block * controlnet_block_weight
80
+
81
+ if not self.config.use_rotary_positional_embeddings:
82
+ # CogVideoX-2B
83
+ hidden_states = self.norm_final(hidden_states)
84
+ else:
85
+ # CogVideoX-5B
86
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
87
+ hidden_states = self.norm_final(hidden_states)
88
+ hidden_states = hidden_states[:, text_seq_length:]
89
+
90
+ # 4. Final block
91
+ hidden_states = self.norm_out(hidden_states, temb=emb)
92
+ hidden_states = self.proj_out(hidden_states)
93
+
94
+ # 5. Unpatchify
95
+ p = self.config.patch_size
96
+ p_t = self.config.patch_size_t
97
+
98
+ if p_t is None:
99
+ output = hidden_states.reshape(batch_size, num_frames, height // p, width // p, -1, p, p)
100
+ output = output.permute(0, 1, 4, 2, 5, 3, 6).flatten(5, 6).flatten(3, 4)
101
+ else:
102
+ output = hidden_states.reshape(
103
+ batch_size, (num_frames + p_t - 1) // p_t, height // p, width // p, -1, p_t, p, p
104
+ )
105
+ output = output.permute(0, 1, 5, 4, 2, 6, 3, 7).flatten(6, 7).flatten(4, 5).flatten(1, 2)
106
+
107
+ if not return_dict:
108
+ return (output,)
109
+ return Transformer2DModelOutput(sample=output)
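controlnet_weights can be a single scalar shared by every block or a per-block sequence; the loop above picks one weight per injected residual. A standalone sketch of that selection with a depth-decaying schedule (all tensors are stand-ins):

import numpy as np
import torch

num_layers = 30
controlnet_states = [torch.randn(1, 4, 8) for _ in range(num_layers)]   # stand-in per-block residuals
hidden_states = torch.zeros(1, 4, 8)

controlnet_weights = np.linspace(1.0, 0.2, num_layers)   # or simply a float such as 1.0

for i in range(num_layers):
    if isinstance(controlnet_weights, (list, np.ndarray)) or torch.is_tensor(controlnet_weights):
        w = controlnet_weights[i]
    else:
        w = controlnet_weights
    hidden_states = hidden_states + controlnet_states[i] * float(w)
print(hidden_states.shape)                                # torch.Size([1, 4, 8])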
finetune/modules/cogvideox_custom_modules.py ADDED
@@ -0,0 +1,357 @@
1
+ import math
2
+ from typing import List, Optional, Tuple, Union, Dict, Any
3
+ import copy
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torch.nn.functional as F
8
+ from torch import nn
9
+ from diffusers import CogVideoXTransformer3DModel
10
+ from diffusers.models.transformers.cogvideox_transformer_3d import CogVideoXBlock
11
+ from diffusers.models.normalization import CogVideoXLayerNormZero
12
+ from diffusers.models.attention import FeedForward
13
+ from diffusers.models.attention_processor import CogVideoXAttnProcessor2_0, Attention
14
+ from diffusers.models.embeddings import CogVideoXPatchEmbed
15
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
16
+ from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
17
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
18
+
19
+ from contextlib import contextmanager
20
+ from peft.tuners.lora.layer import LoraLayer  # base class of PEFT LoRA layers
21
+
22
+ import pdb
23
+
24
+ # Code heavily borrowed from https://github.com/huggingface/diffusers
25
+
26
+
27
+ class enable_lora:
28
+ def __init__(self, modules, enable=True):
29
+ self.modules = modules
30
+ self.enable = enable
31
+ self.prev_states = {}
32
+
33
+ def __enter__(self):
34
+ for module in self.modules:
35
+ self.prev_states[module] = getattr(module, "lora_enabled", True)
36
+ setattr(module, "lora_enabled", self.enable)
37
+ return self
38
+
39
+ def __exit__(self, exc_type, exc_val, exc_tb):
40
+ for module in self.modules:
41
+ setattr(module, "lora_enabled", self.prev_states[module])
42
+ return False
43
+
44
+
45
+
46
+ class CustomCogVideoXPatchEmbed(CogVideoXPatchEmbed):
47
+ def __init__(self, **kwargs):
48
+ super().__init__(**kwargs)
49
+
50
+ patch_size = kwargs['patch_size']
51
+ patch_size_t = kwargs['patch_size_t']
52
+ bias = kwargs['bias']
53
+ in_channels = kwargs['in_channels']
54
+ embed_dim = kwargs['embed_dim']
55
+
56
+ # projection layer for flow latents
57
+ if patch_size_t is None:
58
+ # CogVideoX 1.0 checkpoints
59
+ self.flow_proj = nn.Conv2d(in_channels//2, embed_dim, kernel_size=(patch_size, patch_size), stride=patch_size, bias=bias)
60
+ else:
61
+ # CogVideoX 1.5 checkpoints
62
+ self.flow_proj = nn.Linear(in_channels//2 * patch_size * patch_size * patch_size_t, embed_dim)
63
+
64
+ # Add positional embedding for flow_embeds
65
+ if self.use_positional_embeddings or self.use_learned_positional_embeddings:
66
+ flow_pos_embedding = self._get_positional_embeddings(self.sample_height, self.sample_width, self.sample_frames)[:,self.max_text_seq_length:] # shape: [1, 17550, 3072]
67
+ self.flow_pos_embedding = nn.Parameter(flow_pos_embedding)
68
+
69
+ def forward(self, text_embeds: torch.Tensor, image_embeds: torch.Tensor, flow_embeds: torch.Tensor):
70
+ r"""
71
+ Args:
72
+ text_embeds (`torch.Tensor`):
73
+ Input text embeddings. Expected shape: (batch_size, seq_length, embedding_dim).
74
+ image_embeds (`torch.Tensor`):
75
+ Input image embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
76
+ flow_embeds (`torch.Tensor`):
77
+ Input flow embeddings. Expected shape: (batch_size, num_frames, channels, height, width).
78
+ """
79
+ text_embeds = self.text_proj(text_embeds)
80
+
81
+ batch_size, num_frames, channels, height, width = image_embeds.shape
82
+
83
+ if self.patch_size_t is None:
84
+ # embed video latents
85
+ image_embeds = image_embeds.reshape(-1, channels, height, width)
86
+ image_embeds = self.proj(image_embeds)
87
+ image_embeds = image_embeds.view(batch_size, num_frames, *image_embeds.shape[1:])
88
+ image_embeds = image_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels]
89
+ image_embeds = image_embeds.flatten(1, 2) # [batch, num_frames x height x width, channels]
90
+
91
+ # embed flow latents
92
+ flow_embeds = flow_embeds.reshape(-1, channels//2, height, width)
93
+ flow_embeds = self.flow_proj(flow_embeds)
94
+ flow_embeds = flow_embeds.view(batch_size, num_frames, *flow_embeds.shape[1:])
95
+ flow_embeds = flow_embeds.flatten(3).transpose(2, 3) # [batch, num_frames, height x width, channels]
96
+ flow_embeds = flow_embeds.flatten(1, 2) # [batch, num_frames x height x width, channels]
97
+ else:
98
+ p = self.patch_size
99
+ p_t = self.patch_size_t
100
+
101
+ # embed video latents
102
+ image_embeds = image_embeds.permute(0, 1, 3, 4, 2)
103
+ image_embeds = image_embeds.reshape(
104
+ batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels
105
+ )
106
+ image_embeds = image_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
107
+ image_embeds = self.proj(image_embeds)
108
+
109
+ # embed flow latents
110
+ flow_embeds = flow_embeds.permute(0, 1, 3, 4, 2)
111
+ flow_embeds = flow_embeds.reshape(
112
+ batch_size, num_frames // p_t, p_t, height // p, p, width // p, p, channels//2
113
+ )
114
+ flow_embeds = flow_embeds.permute(0, 1, 3, 5, 7, 2, 4, 6).flatten(4, 7).flatten(1, 3)
115
+ flow_embeds = self.flow_proj(flow_embeds)
116
+
117
+ # Curriculum learning of flow token
118
+ # flow_embeds = self.flow_scale * flow_embeds
119
+
120
+
121
+ embeds = torch.cat(
122
+ [text_embeds, image_embeds, flow_embeds], dim=1
123
+ ).contiguous() # [batch, num_frames x height x width + seq_length + num_frames x height x width, channels]
124
+
125
+ if self.use_positional_embeddings or self.use_learned_positional_embeddings:
126
+ if self.use_learned_positional_embeddings and (self.sample_width != width or self.sample_height != height):
127
+ raise ValueError(
128
+ "It is currently not possible to generate videos at a different resolution that the defaults. This should only be the case with 'THUDM/CogVideoX-5b-I2V'."
129
+ "If you think this is incorrect, please open an issue at https://github.com/huggingface/diffusers/issues."
130
+ )
131
+
132
+ pre_time_compression_frames = (num_frames - 1) * self.temporal_compression_ratio + 1
133
+
134
+ if (
135
+ self.sample_height != height
136
+ or self.sample_width != width
137
+ or self.sample_frames != pre_time_compression_frames
138
+ ):
139
+ pos_embedding = self._get_positional_embeddings(
140
+ height, width, pre_time_compression_frames, device=embeds.device
141
+ )
142
+ else:
143
+ pos_embedding = self.pos_embedding
144
+
145
+ # Original behavior (positional embedding without flow tokens):
146
+ # pos_embedding = pos_embedding.to(dtype=embeds.dtype)
147
+ # embeds = embeds + pos_embedding
148
+
149
+ # Append the positional embedding for the flow tokens
150
+ # flow_pos_embedding = self.flow_pos_scale * self.flow_pos_embedding
151
+ flow_pos_embedding = self.flow_pos_embedding
152
+ pos_embedding_total = torch.cat([pos_embedding, flow_pos_embedding], dim=1).to(dtype=embeds.dtype)
153
+ embeds = embeds + pos_embedding_total
154
+
155
+ return embeds
156
+
157
+
158
+
159
+ @maybe_allow_in_graph
160
+ class CustomCogVideoXBlock(nn.Module):
161
+ r"""
162
+ Transformer block used in [CogVideoX](https://github.com/THUDM/CogVideo) model.
163
+
164
+ Parameters:
165
+ dim (`int`):
166
+ The number of channels in the input and output.
167
+ num_attention_heads (`int`):
168
+ The number of heads to use for multi-head attention.
169
+ attention_head_dim (`int`):
170
+ The number of channels in each head.
171
+ time_embed_dim (`int`):
172
+ The number of channels in timestep embedding.
173
+ dropout (`float`, defaults to `0.0`):
174
+ The dropout probability to use.
175
+ activation_fn (`str`, defaults to `"gelu-approximate"`):
176
+ Activation function to be used in feed-forward.
177
+ attention_bias (`bool`, defaults to `False`):
178
+ Whether or not to use bias in attention projection layers.
179
+ qk_norm (`bool`, defaults to `True`):
180
+ Whether or not to use normalization after query and key projections in Attention.
181
+ norm_elementwise_affine (`bool`, defaults to `True`):
182
+ Whether to use learnable elementwise affine parameters for normalization.
183
+ norm_eps (`float`, defaults to `1e-5`):
184
+ Epsilon value for normalization layers.
185
+ final_dropout (`bool` defaults to `False`):
186
+ Whether to apply a final dropout after the last feed-forward layer.
187
+ ff_inner_dim (`int`, *optional*, defaults to `None`):
188
+ Custom hidden dimension of Feed-forward layer. If not provided, `4 * dim` is used.
189
+ ff_bias (`bool`, defaults to `True`):
190
+ Whether or not to use bias in Feed-forward layer.
191
+ attention_out_bias (`bool`, defaults to `True`):
192
+ Whether or not to use bias in Attention output projection layer.
193
+ """
194
+
195
+ def __init__(
196
+ self,
197
+ dim: int,
198
+ num_attention_heads: int,
199
+ attention_head_dim: int,
200
+ time_embed_dim: int,
201
+ dropout: float = 0.0,
202
+ activation_fn: str = "gelu-approximate",
203
+ attention_bias: bool = False,
204
+ qk_norm: bool = True,
205
+ norm_elementwise_affine: bool = True,
206
+ norm_eps: float = 1e-5,
207
+ final_dropout: bool = True,
208
+ ff_inner_dim: Optional[int] = None,
209
+ ff_bias: bool = True,
210
+ attention_out_bias: bool = True,
211
+ ):
212
+ super().__init__()
213
+
214
+ # 1. Self Attention
215
+ self.norm1 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
216
+
217
+ self.attn1 = Attention(
218
+ query_dim=dim,
219
+ dim_head=attention_head_dim,
220
+ heads=num_attention_heads,
221
+ qk_norm="layer_norm" if qk_norm else None,
222
+ eps=1e-6,
223
+ bias=attention_bias,
224
+ out_bias=attention_out_bias,
225
+ processor=CustomCogVideoXAttnProcessor2_0(),
226
+ )
227
+
228
+ # 2. Feed Forward
229
+ self.norm2 = CogVideoXLayerNormZero(time_embed_dim, dim, norm_elementwise_affine, norm_eps, bias=True)
230
+
231
+ self.ff = FeedForward(
232
+ dim,
233
+ dropout=dropout,
234
+ activation_fn=activation_fn,
235
+ final_dropout=final_dropout,
236
+ inner_dim=ff_inner_dim,
237
+ bias=ff_bias,
238
+ )
239
+
240
+ def forward(
241
+ self,
242
+ hidden_states: torch.Tensor,
243
+ encoder_hidden_states: torch.Tensor,
244
+ temb: torch.Tensor,
245
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
246
+ attention_kwargs: Optional[Dict[str, Any]] = None,
247
+ ) -> torch.Tensor:
248
+ text_seq_length = encoder_hidden_states.size(1)
249
+ attention_kwargs = attention_kwargs or {}
250
+
251
+ # norm & modulate
252
+ norm_hidden_states, norm_encoder_hidden_states, gate_msa, enc_gate_msa = self.norm1(
253
+ hidden_states, encoder_hidden_states, temb
254
+ )
255
+
256
+ # attention
257
+ attn_hidden_states, attn_encoder_hidden_states = self.attn1(
258
+ hidden_states=norm_hidden_states,
259
+ encoder_hidden_states=norm_encoder_hidden_states,
260
+ image_rotary_emb=image_rotary_emb,
261
+ **attention_kwargs,
262
+ )
263
+
264
+ hidden_states = hidden_states + gate_msa * attn_hidden_states
265
+ encoder_hidden_states = encoder_hidden_states + enc_gate_msa * attn_encoder_hidden_states
266
+
267
+ # norm & modulate
268
+ norm_hidden_states, norm_encoder_hidden_states, gate_ff, enc_gate_ff = self.norm2(
269
+ hidden_states, encoder_hidden_states, temb
270
+ )
271
+
272
+ # feed-forward
273
+ norm_hidden_states = torch.cat([norm_encoder_hidden_states, norm_hidden_states], dim=1)
274
+ ff_output = self.ff(norm_hidden_states)
275
+
276
+ hidden_states = hidden_states + gate_ff * ff_output[:, text_seq_length:]
277
+ encoder_hidden_states = encoder_hidden_states + enc_gate_ff * ff_output[:, :text_seq_length]
278
+
279
+ return hidden_states, encoder_hidden_states
280
+
281
+
282
+ class CustomCogVideoXAttnProcessor2_0:
283
+ r"""
284
+ Processor for implementing scaled dot-product attention for the CogVideoX model. It applies a rotary embedding on
285
+ query and key vectors, but does not include spatial normalization.
286
+ """
287
+
288
+ def __init__(self):
289
+ if not hasattr(F, "scaled_dot_product_attention"):
290
+ raise ImportError("CustomCogVideoXAttnProcessor2_0 requires PyTorch 2.0; please upgrade PyTorch.")
291
+
292
+ def __call__(
293
+ self,
294
+ attn: Attention,
295
+ hidden_states: torch.Tensor,
296
+ encoder_hidden_states: torch.Tensor,
297
+ attention_mask: Optional[torch.Tensor] = None,
298
+ image_rotary_emb: Optional[torch.Tensor] = None,
299
+ notextinflow: Optional[bool] = False,
300
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
301
+ text_seq_length = encoder_hidden_states.size(1)
302
+
303
+ if not notextinflow:
304
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
305
+
306
+ batch_size, sequence_length, _ = hidden_states.shape
307
+
308
+ if attention_mask is not None:
309
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
310
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
311
+
312
+ query = attn.to_q(hidden_states)
313
+ key = attn.to_k(hidden_states)
314
+ value = attn.to_v(hidden_states)
315
+
316
+ inner_dim = key.shape[-1]
317
+ head_dim = inner_dim // attn.heads
318
+
319
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
320
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
321
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
322
+
323
+ if attn.norm_q is not None:
324
+ query = attn.norm_q(query)
325
+ if attn.norm_k is not None:
326
+ key = attn.norm_k(key)
327
+
328
+ # Apply RoPE if needed
329
+ if image_rotary_emb is not None:
330
+ from diffusers.models.embeddings import apply_rotary_emb
331
+
332
+ if not notextinflow:
333
+ query[:, :, text_seq_length:] = apply_rotary_emb(query[:, :, text_seq_length:], image_rotary_emb)
334
+ if not attn.is_cross_attention:
335
+ key[:, :, text_seq_length:] = apply_rotary_emb(key[:, :, text_seq_length:], image_rotary_emb)
336
+ else:
337
+ query[:, :, :] = apply_rotary_emb(query[:, :, :], image_rotary_emb)
338
+ if not attn.is_cross_attention:
339
+ key[:, :, :] = apply_rotary_emb(key[:, :, :], image_rotary_emb)
340
+
341
+ hidden_states = F.scaled_dot_product_attention(
342
+ query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
343
+ )
344
+
345
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
346
+
347
+ # linear proj
348
+ hidden_states = attn.to_out[0](hidden_states)
349
+ # dropout
350
+ hidden_states = attn.to_out[1](hidden_states)
351
+
352
+ if not notextinflow:
353
+ encoder_hidden_states, hidden_states = hidden_states.split(
354
+ [text_seq_length, hidden_states.size(1) - text_seq_length], dim=1
355
+ )
356
+
357
+ return hidden_states, encoder_hidden_states
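The processor above is the piece that decides whether text tokens take part in attention. A minimal sketch of that token layout in plain torch (the sequence lengths and feature dim below are illustrative assumptions, not values from this commit):

import torch

# Assumed sizes, for illustration only.
text_seq_length, vision_seq_length, dim = 226, 1350, 64
encoder_hidden_states = torch.randn(1, text_seq_length, dim)   # text tokens
hidden_states = torch.randn(1, vision_seq_length, dim)         # vision/flow tokens

notextinflow = False
if not notextinflow:
    # Joint sequence [text | vision]; RoPE is applied to the vision slice only.
    tokens = torch.cat([encoder_hidden_states, hidden_states], dim=1)
else:
    # Flow-only mode: attention runs over vision tokens alone and RoPE covers the whole sequence.
    tokens = hidden_states

# ... q/k/v projection and scaled dot-product attention over `tokens` ...

if not notextinflow:
    # Split the joint sequence back into text and vision streams, as in __call__ above.
    encoder_hidden_states, hidden_states = tokens.split(
        [text_seq_length, tokens.size(1) - text_seq_length], dim=1
    )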
finetune/modules/depth_warping/__init__.py ADDED
File without changes
finetune/modules/depth_warping/camera/Camera.py ADDED
@@ -0,0 +1,70 @@
1
+ import os
2
+ import random
3
+ import json
4
+ import torch
5
+
6
+ import torch.nn as nn
7
+ import torchvision.transforms as transforms
8
+ import torchvision.transforms.functional as F
9
+ import numpy as np
10
+ from einops import rearrange, repeat
11
+
12
+ class Camera(object):
13
+ def __init__(self, entry):
14
+ fx, fy, cx, cy = entry[1:5]
15
+ self.fx = fx
16
+ self.fy = fy
17
+ self.cx = cx
18
+ self.cy = cy
19
+ w2c_mat = np.array(entry[7:]).reshape(3, 4)
20
+ w2c_mat_4x4 = np.eye(4)
21
+ w2c_mat_4x4[:3, :] = w2c_mat
22
+ self.w2c_mat = w2c_mat_4x4
23
+ self.c2w_mat = np.linalg.inv(w2c_mat_4x4)
24
+
25
+ def load_cameras(path):
26
+ with open(path, 'r') as f:
27
+ poses = f.readlines()
28
+ poses = [pose.strip().split(' ') for pose in poses[1:]]
29
+ cam_params = [[float(x) for x in pose] for pose in poses]
30
+ cam_params = [Camera(cam_param) for cam_param in cam_params]
31
+ return cam_params
32
+
33
+ def get_relative_pose(cam_params):
34
+ abs_w2cs = [cam_param.w2c_mat for cam_param in cam_params]
35
+ abs_c2ws = [cam_param.c2w_mat for cam_param in cam_params]
36
+ source_cam_c2w = abs_c2ws[0]
37
+ cam_to_origin = 0
38
+ target_cam_c2w = np.array([
39
+ [1, 0, 0, 0],
40
+ [0, 1, 0, -cam_to_origin],
41
+ [0, 0, 1, 0],
42
+ [0, 0, 0, 1]
43
+ ])
44
+ abs2rel = target_cam_c2w @ abs_w2cs[0]
45
+ ret_poses = [target_cam_c2w, ] + [abs2rel @ abs_c2w for abs_c2w in abs_c2ws[1:]]
46
+ ret_poses = np.array(ret_poses, dtype=np.float32)
47
+ return ret_poses
48
+
49
+ def get_K(intrinsics, size, do_normalize=False):
50
+ def normalize_intrinsic(x, size):
51
+ h, w = size
52
+ x[:,:,0:1] = x[:,:,0:1] / w
53
+ x[:,:,1:2] = x[:,:,1:2] / h
54
+ return x
55
+
56
+ b, _, t, _ = intrinsics.shape
57
+ K = torch.zeros((b, t, 9), dtype=intrinsics.dtype, device=intrinsics.device)
58
+ fx, fy, cx, cy = intrinsics.squeeze(1).chunk(4, dim=-1)
59
+
60
+ K[:,:,0:1] = fx
61
+ K[:,:,2:3] = cx
62
+ K[:,:,4:5] = fy
63
+ K[:,:,5:6] = cy
64
+ K[:,:,8:9] = 1.0
65
+
66
+ K = rearrange(K, "b t (h w) -> b t h w", h=3, w=3)
67
+ if do_normalize:
68
+ K = normalize_intrinsic(K, size)
69
+
70
+ return K
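A small usage sketch for these helpers, assuming a RealEstate10K-style pose entry (timestamp, fx, fy, cx, cy, two unused fields, then a flattened 3x4 world-to-camera matrix); all values and the import path are illustrative assumptions:

import torch
# assumed import path for this file:
# from finetune.modules.depth_warping.camera.Camera import Camera, get_relative_pose, get_K

entry = [0.0, 0.5, 0.9, 0.5, 0.5, 0.0, 0.0,       # timestamp, fx, fy, cx, cy, unused, unused
         1, 0, 0, 0,
         0, 1, 0, 0,
         0, 0, 1, 1]                              # flattened 3x4 world-to-camera matrix
cams = [Camera(entry), Camera(entry)]
rel_poses = get_relative_pose(cams)               # (2, 4, 4); the first pose is the canonical target
print(rel_poses.shape)

# Intrinsics packed as (b, 1, t, 4) = (fx, fy, cx, cy) per frame.
intrinsics = torch.tensor([[[[0.5, 0.9, 0.5, 0.5]] * 2]])    # b=1, t=2
K = get_K(intrinsics, size=(480, 720), do_normalize=False)   # (1, 2, 3, 3)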
finetune/modules/depth_warping/camera/WarperPytorch.py ADDED
@@ -0,0 +1,416 @@
1
+ # Shree KRISHNAya Namaha
2
+ # Differentiable warper implemented in PyTorch. Warping is done on batches.
3
+ # Tested on PyTorch 1.8.1
4
+ # Author: Nagabhushan S N
5
+ # Last Modified: 27/09/2021
6
+ # Code from https://github.com/NagabhushanSN95/Pose-Warping
7
+
8
+ import datetime
9
+ import time
10
+ import traceback
11
+ from pathlib import Path
12
+ from typing import Tuple, Optional
13
+
14
+ import numpy
15
+ # import skimage.io
16
+ import torch
17
+ import torch.nn.functional as F
18
+ from einops import rearrange, repeat
19
+ # import Imath
20
+ # import OpenEXR
21
+
22
+ import pdb
23
+
24
+ class Warper:
25
+ def __init__(self, resolution: tuple = None):
26
+ self.resolution = resolution
27
+
28
+ def forward_warp(self, frame1: torch.Tensor, mask1: Optional[torch.Tensor], depth1: torch.Tensor,
29
+ transformation1: torch.Tensor, transformation2: torch.Tensor, intrinsic1: torch.Tensor,
30
+ intrinsic2: Optional[torch.Tensor], is_image=True) -> \
31
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
32
+ """
33
+ Given a frame1 and global transformations transformation1 and transformation2, warps frame1 to next view using
34
+ bilinear splatting.
35
+ All arrays should be torch tensors with batch dimension and channel first
36
+ :param frame1: (b, 3, h, w). If frame1 is not in the range [-1, 1], either set is_image=False when calling
37
+ bilinear_splatting on frame within this function, or modify clipping in bilinear_splatting()
38
+ method accordingly.
39
+ :param mask1: (b, 1, h, w) - 1 for known, 0 for unknown. Optional
40
+ :param depth1: (b, 1, h, w)
41
+ :param transformation1: (b, 4, 4) extrinsic transformation matrix of first view: [R, t; 0, 1]
42
+ :param transformation2: (b, 4, 4) extrinsic transformation matrix of second view: [R, t; 0, 1]
43
+ :param intrinsic1: (b, 3, 3) camera intrinsic matrix
44
+ :param intrinsic2: (b, 3, 3) camera intrinsic matrix. Optional
45
+ """
46
+ self.device = frame1.device
47
+
48
+ if self.resolution is not None:
49
+ assert frame1.shape[2:4] == self.resolution
50
+ b, c, h, w = frame1.shape
51
+ if mask1 is None:
52
+ mask1 = torch.ones(size=(b, 1, h, w)).to(frame1)
53
+ if intrinsic2 is None:
54
+ intrinsic2 = intrinsic1.clone()
55
+
56
+ assert frame1.shape == (b, 3, h, w) or frame1.shape == (b, 2, h, w) # flow b2hw
57
+ assert mask1.shape == (b, 1, h, w)
58
+ assert depth1.shape == (b, 1, h, w)
59
+ assert transformation1.shape == (b, 4, 4)
60
+ assert transformation2.shape == (b, 4, 4)
61
+ assert intrinsic1.shape == (b, 3, 3)
62
+ assert intrinsic2.shape == (b, 3, 3)
63
+
64
+ frame1 = frame1.to(self.device)
65
+ mask1 = mask1.to(self.device)
66
+ depth1 = depth1.to(self.device)
67
+ transformation1 = transformation1.to(self.device)
68
+ transformation2 = transformation2.to(self.device)
69
+ intrinsic1 = intrinsic1.to(self.device)
70
+ intrinsic2 = intrinsic2.to(self.device)
71
+
72
+ trans_points1 = self.compute_transformed_points(depth1, transformation1, transformation2, intrinsic1,
73
+ intrinsic2)
74
+ # trans_coordinates = trans_points1[:, :, :2, 0] / trans_points1[:, :, 2:3, 0]
75
+ trans_coordinates = trans_points1[:, :, :, :2, 0] / (trans_points1[:, :, :, 2:3, 0]+1e-7)
76
+ trans_depth1 = rearrange(trans_points1[:, :, :, 2:3, 0], "b h w c -> b c h w")
77
+
78
+ grid = self.create_grid(b, h, w).to(trans_coordinates)
79
+ flow12 = rearrange(trans_coordinates, "b h w c -> b c h w") - grid
80
+
81
+ warped_frame2, mask2 = self.bilinear_splatting(frame1, mask1, trans_depth1, flow12, None, is_image=is_image)
82
+ warped_depth2 = self.bilinear_splatting(trans_depth1, mask1, trans_depth1, flow12, None, is_image=False)[0] # [0][:, :, 0]
83
+
84
+ return warped_frame2, mask2, warped_depth2, flow12
85
+
86
+ def forward_warp_displacement(self, depth1: torch.Tensor, flow1: torch.Tensor,
87
+ transformation1: torch.Tensor, transformation2: torch.Tensor, intrinsic1: torch.Tensor, intrinsic2: Optional[torch.Tensor],):
88
+ """
89
+ Reprojects the pixel grid of view 1 into view 2 twice, once from the original grid and once from the grid
+ offset by flow1, and returns the difference of the two reprojected coordinates (the flow displacement).
91
+ All arrays should be torch tensors with batch dimension and channel first
92
+ :param depth1: (b, 1, h, w)
93
+ :param flow1: (b, 2, h, w)
94
+ :param transformation1: (b, 4, 4) extrinsic transformation matrix of first view: [R, t; 0, 1]
95
+ :param transformation2: (b, 4, 4) extrinsic transformation matrix of second view: [R, t; 0, 1]
96
+ :param intrinsic1: (b, 3, 3) camera intrinsic matrix
97
+ :param intrinsic2: (b, 3, 3) camera intrinsic matrix. Optional
98
+ """
99
+ self.device = flow1.device
100
+
101
+ if self.resolution is not None:
102
+ assert flow1.shape[2:4] == self.resolution
103
+ b, c, h, w = flow1.shape
104
+ if intrinsic2 is None:
105
+ intrinsic2 = intrinsic1.clone()
106
+
107
+ assert flow1.shape == (b, 2, h, w)
108
+ assert depth1.shape == (b, 1, h, w)
109
+ assert transformation1.shape == (b, 4, 4)
110
+ assert transformation2.shape == (b, 4, 4)
111
+ assert intrinsic1.shape == (b, 3, 3)
112
+ assert intrinsic2.shape == (b, 3, 3)
113
+
114
+ depth1 = depth1.to(self.device)
115
+ flow1 = flow1.to(self.device)
116
+ transformation1 = transformation1.to(self.device)
117
+ transformation2 = transformation2.to(self.device)
118
+ intrinsic1 = intrinsic1.to(self.device)
119
+ intrinsic2 = intrinsic2.to(self.device)
120
+
121
+ trans_points1 = self.compute_transformed_points(depth1, transformation1, transformation2, intrinsic1, intrinsic2)
122
+ trans_coordinates1 = trans_points1[:, :, :, :2, 0] / (trans_points1[:, :, :, 2:3, 0]+1e-7)
123
+
124
+ trans_points2 = self.compute_transformed_points(depth1, transformation1, transformation2, intrinsic1, intrinsic2, flow1)
125
+ trans_coordinates2 = trans_points2[:, :, :, :2, 0] / (trans_points2[:, :, :, 2:3, 0]+1e-7)
126
+
127
+ flow12_displacement = rearrange(trans_coordinates2 - trans_coordinates1, "b h w c -> b c h w")
128
+
129
+ return flow12_displacement
130
+
131
+ def compute_transformed_points(self, depth1: torch.Tensor, transformation1: torch.Tensor, transformation2: torch.Tensor,
132
+ intrinsic1: torch.Tensor, intrinsic2: Optional[torch.Tensor], flow1: Optional[torch.Tensor]=None):
133
+ """
134
+ Computes transformed position for each pixel location
135
+ """
136
+ if self.resolution is not None:
137
+ assert depth1.shape[2:4] == self.resolution
138
+ b, _, h, w = depth1.shape
139
+ if intrinsic2 is None:
140
+ intrinsic2 = intrinsic1.clone()
141
+ transformation = torch.bmm(transformation2, torch.linalg.inv(transformation1)).to(transformation1.dtype) # (b, 4, 4)
142
+
143
+ x1d = torch.arange(0, w)[None]
144
+ y1d = torch.arange(0, h)[:, None]
145
+ x2d = x1d.repeat([h, 1]).to(depth1) # (h, w)
146
+ y2d = y1d.repeat([1, w]).to(depth1) # (h, w)
147
+
148
+ ones_2d = torch.ones(size=(h, w)).to(depth1) # (h, w)
149
+ ones_4d = ones_2d[None, :, :, None, None].repeat([b, 1, 1, 1, 1]) # (b, h, w, 1, 1)
150
+
151
+
152
+ if flow1 is not None:
153
+ x4d = repeat(x2d[None, :, :, None], '1 h w c -> b h w c', b=b)
154
+ y4d = repeat(y2d[None, :, :, None], '1 h w c -> b h w c', b=b)
155
+ flow1_x4d = rearrange(flow1[:,:1].detach().clone(), "b c h w -> b h w c")
156
+ flow1_y4d = rearrange(flow1[:,1:].detach().clone(), "b c h w -> b h w c")
157
+
158
+ x4d = x4d + flow1_x4d
159
+ y4d = y4d + flow1_y4d
160
+
161
+ pos_vectors_homo = torch.stack([x4d, y4d, ones_4d.squeeze(-1)], dim=3) # (b, h, w, 3, 1)
162
+ else:
163
+ pos_vectors_homo = torch.stack([x2d, y2d, ones_2d], dim=2)[None, :, :, :, None] # (1, h, w, 3, 1)
164
+
165
+ intrinsic1_inv = torch.linalg.inv(intrinsic1) # (b, 3, 3)
166
+ intrinsic1_inv_4d = intrinsic1_inv[:, None, None] # (b, 1, 1, 3, 3)
167
+ intrinsic2_4d = intrinsic2[:, None, None] # (b, 1, 1, 3, 3)
168
+ depth_4d = depth1[:, 0][:, :, :, None, None] # (b, h, w, 1, 1)
169
+ trans_4d = transformation[:, None, None] # (b, 1, 1, 4, 4)
170
+
171
+ unnormalized_pos = torch.matmul(intrinsic1_inv_4d, pos_vectors_homo).to(transformation1.dtype) # (b, h, w, 3, 1)
172
+ world_points = depth_4d * unnormalized_pos # (b, h, w, 3, 1)
173
+ world_points_homo = torch.cat([world_points, ones_4d], dim=3) # (b, h, w, 4, 1)
174
+ trans_world_homo = torch.matmul(trans_4d, world_points_homo).to(transformation1.dtype) # (b, h, w, 4, 1)
175
+ trans_world = trans_world_homo[:, :, :, :3] # (b, h, w, 3, 1)
176
+ trans_norm_points = torch.matmul(intrinsic2_4d, trans_world).to(transformation1.dtype) # (b, h, w, 3, 1)
177
+ return trans_norm_points
178
+
179
+ def bilinear_splatting(self, frame1: torch.Tensor, mask1: Optional[torch.Tensor], depth1: torch.Tensor,
180
+ flow12: torch.Tensor, flow12_mask: Optional[torch.Tensor], is_image: bool = False) -> \
181
+ Tuple[torch.Tensor, torch.Tensor]:
182
+ """
183
+ Bilinear splatting
184
+ :param frame1: (b,c,h,w)
185
+ :param mask1: (b,1,h,w): 1 for known, 0 for unknown. Optional
186
+ :param depth1: (b,1,h,w)
187
+ :param flow12: (b,2,h,w)
188
+ :param flow12_mask: (b,1,h,w): 1 for valid flow, 0 for invalid flow. Optional
189
+ :param is_image: if true, output will be clipped to (-1,1) range
190
+ :return: warped_frame2: (b,c,h,w)
191
+ mask2: (b,1,h,w): 1 for known and 0 for unknown
192
+ """
193
+ if self.resolution is not None:
194
+ assert frame1.shape[2:4] == self.resolution
195
+ b, c, h, w = frame1.shape
196
+ if mask1 is None:
197
+ mask1 = torch.ones(size=(b, 1, h, w)).to(frame1)
198
+ if flow12_mask is None:
199
+ flow12_mask = torch.ones(size=(b, 1, h, w)).to(flow12)
200
+ grid = self.create_grid(b, h, w).to(frame1)
201
+ trans_pos = flow12 + grid
202
+
203
+ trans_pos_offset = trans_pos + 1
204
+ trans_pos_floor = torch.floor(trans_pos_offset).long()
205
+ trans_pos_ceil = torch.ceil(trans_pos_offset).long()
206
+ trans_pos_offset = torch.stack([
207
+ torch.clamp(trans_pos_offset[:, 0], min=0, max=w + 1),
208
+ torch.clamp(trans_pos_offset[:, 1], min=0, max=h + 1)], dim=1)
209
+ trans_pos_floor = torch.stack([
210
+ torch.clamp(trans_pos_floor[:, 0], min=0, max=w + 1),
211
+ torch.clamp(trans_pos_floor[:, 1], min=0, max=h + 1)], dim=1)
212
+ trans_pos_ceil = torch.stack([
213
+ torch.clamp(trans_pos_ceil[:, 0], min=0, max=w + 1),
214
+ torch.clamp(trans_pos_ceil[:, 1], min=0, max=h + 1)], dim=1)
215
+
216
+ prox_weight_nw = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * \
217
+ (1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1]))
218
+ prox_weight_sw = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * \
219
+ (1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1]))
220
+ prox_weight_ne = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * \
221
+ (1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1]))
222
+ prox_weight_se = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * \
223
+ (1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1]))
224
+
225
+ sat_depth1 = torch.clamp(depth1, min=0, max=1000)
226
+ log_depth1 = torch.log(1 + sat_depth1)
227
+ depth_weights = torch.exp(log_depth1 / log_depth1.max() * 50)
228
+
229
+ weight_nw = torch.moveaxis(prox_weight_nw * mask1 * flow12_mask / depth_weights, [0, 1, 2, 3], [0, 3, 1, 2])
230
+ weight_sw = torch.moveaxis(prox_weight_sw * mask1 * flow12_mask / depth_weights, [0, 1, 2, 3], [0, 3, 1, 2])
231
+ weight_ne = torch.moveaxis(prox_weight_ne * mask1 * flow12_mask / depth_weights, [0, 1, 2, 3], [0, 3, 1, 2])
232
+ weight_se = torch.moveaxis(prox_weight_se * mask1 * flow12_mask / depth_weights, [0, 1, 2, 3], [0, 3, 1, 2])
233
+
234
+ warped_frame = torch.zeros(size=(b, h + 2, w + 2, c), dtype=torch.float32).to(frame1)
235
+ warped_weights = torch.zeros(size=(b, h + 2, w + 2, 1), dtype=torch.float32).to(frame1)
236
+
237
+ frame1_cl = torch.moveaxis(frame1, [0, 1, 2, 3], [0, 3, 1, 2])
238
+ batch_indices = torch.arange(b)[:, None, None].to(frame1.device)
239
+ warped_frame.index_put_((batch_indices, trans_pos_floor[:, 1], trans_pos_floor[:, 0]),
240
+ frame1_cl * weight_nw, accumulate=True)
241
+ warped_frame.index_put_((batch_indices, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]),
242
+ frame1_cl * weight_sw, accumulate=True)
243
+ warped_frame.index_put_((batch_indices, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]),
244
+ frame1_cl * weight_ne, accumulate=True)
245
+ warped_frame.index_put_((batch_indices, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]),
246
+ frame1_cl * weight_se, accumulate=True)
247
+
248
+ warped_weights.index_put_((batch_indices, trans_pos_floor[:, 1], trans_pos_floor[:, 0]),
249
+ weight_nw, accumulate=True)
250
+ warped_weights.index_put_((batch_indices, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]),
251
+ weight_sw, accumulate=True)
252
+ warped_weights.index_put_((batch_indices, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]),
253
+ weight_ne, accumulate=True)
254
+ warped_weights.index_put_((batch_indices, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]),
255
+ weight_se, accumulate=True)
256
+
257
+ warped_frame_cf = torch.moveaxis(warped_frame, [0, 1, 2, 3], [0, 2, 3, 1])
258
+ warped_weights_cf = torch.moveaxis(warped_weights, [0, 1, 2, 3], [0, 2, 3, 1])
259
+ cropped_warped_frame = warped_frame_cf[:, :, 1:-1, 1:-1]
260
+ cropped_weights = warped_weights_cf[:, :, 1:-1, 1:-1]
261
+
262
+ mask = cropped_weights > 0
263
+ zero_value = -1 if is_image else 0
264
+ zero_tensor = torch.tensor(zero_value, dtype=frame1.dtype, device=frame1.device)
265
+ warped_frame2 = torch.where(mask, cropped_warped_frame / cropped_weights, zero_tensor)
266
+ mask2 = mask.to(frame1)
267
+
268
+ if is_image:
269
+ assert warped_frame2.min() >= -1.1 # Allow for rounding errors
270
+ assert warped_frame2.max() <= 1.1
271
+ warped_frame2 = torch.clamp(warped_frame2, min=-1, max=1)
272
+ return warped_frame2, mask2
273
+
274
+ def bilinear_interpolation(self, frame2: torch.Tensor, mask2: Optional[torch.Tensor], flow12: torch.Tensor,
275
+ flow12_mask: Optional[torch.Tensor], is_image: bool = False) -> \
276
+ Tuple[torch.Tensor, torch.Tensor]:
277
+ """
278
+ Bilinear interpolation
279
+ :param frame2: (b, c, h, w)
280
+ :param mask2: (b, 1, h, w): 1 for known, 0 for unknown. Optional
281
+ :param flow12: (b, 2, h, w)
282
+ :param flow12_mask: (b, 1, h, w): 1 for valid flow, 0 for invalid flow. Optional
283
+ :param is_image: if true, output will be clipped to (-1,1) range
284
+ :return: warped_frame1: (b, c, h, w)
285
+ mask1: (b, 1, h, w): 1 for known and 0 for unknown
286
+ """
287
+ if self.resolution is not None:
288
+ assert frame2.shape[2:4] == self.resolution
289
+ b, c, h, w = frame2.shape
290
+ if mask2 is None:
291
+ mask2 = torch.ones(size=(b, 1, h, w)).to(frame2)
292
+ if flow12_mask is None:
293
+ flow12_mask = torch.ones(size=(b, 1, h, w)).to(flow12)
294
+ grid = self.create_grid(b, h, w).to(frame2)
295
+ trans_pos = flow12 + grid
296
+
297
+ trans_pos_offset = trans_pos + 1
298
+ trans_pos_floor = torch.floor(trans_pos_offset).long()
299
+ trans_pos_ceil = torch.ceil(trans_pos_offset).long()
300
+ trans_pos_offset = torch.stack([
301
+ torch.clamp(trans_pos_offset[:, 0], min=0, max=w + 1),
302
+ torch.clamp(trans_pos_offset[:, 1], min=0, max=h + 1)], dim=1)
303
+ trans_pos_floor = torch.stack([
304
+ torch.clamp(trans_pos_floor[:, 0], min=0, max=w + 1),
305
+ torch.clamp(trans_pos_floor[:, 1], min=0, max=h + 1)], dim=1)
306
+ trans_pos_ceil = torch.stack([
307
+ torch.clamp(trans_pos_ceil[:, 0], min=0, max=w + 1),
308
+ torch.clamp(trans_pos_ceil[:, 1], min=0, max=h + 1)], dim=1)
309
+
310
+ prox_weight_nw = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * \
311
+ (1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1]))
312
+ prox_weight_sw = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * \
313
+ (1 - (trans_pos_offset[:, 0:1] - trans_pos_floor[:, 0:1]))
314
+ prox_weight_ne = (1 - (trans_pos_offset[:, 1:2] - trans_pos_floor[:, 1:2])) * \
315
+ (1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1]))
316
+ prox_weight_se = (1 - (trans_pos_ceil[:, 1:2] - trans_pos_offset[:, 1:2])) * \
317
+ (1 - (trans_pos_ceil[:, 0:1] - trans_pos_offset[:, 0:1]))
318
+
319
+ weight_nw = torch.moveaxis(prox_weight_nw * flow12_mask, [0, 1, 2, 3], [0, 3, 1, 2])
320
+ weight_sw = torch.moveaxis(prox_weight_sw * flow12_mask, [0, 1, 2, 3], [0, 3, 1, 2])
321
+ weight_ne = torch.moveaxis(prox_weight_ne * flow12_mask, [0, 1, 2, 3], [0, 3, 1, 2])
322
+ weight_se = torch.moveaxis(prox_weight_se * flow12_mask, [0, 1, 2, 3], [0, 3, 1, 2])
323
+
324
+ frame2_offset = F.pad(frame2, [1, 1, 1, 1])
325
+ mask2_offset = F.pad(mask2, [1, 1, 1, 1])
326
+ bi = torch.arange(b)[:, None, None]
327
+
328
+ f2_nw = frame2_offset[bi, :, trans_pos_floor[:, 1], trans_pos_floor[:, 0]]
329
+ f2_sw = frame2_offset[bi, :, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]]
330
+ f2_ne = frame2_offset[bi, :, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]]
331
+ f2_se = frame2_offset[bi, :, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]]
332
+
333
+ m2_nw = mask2_offset[bi, :, trans_pos_floor[:, 1], trans_pos_floor[:, 0]]
334
+ m2_sw = mask2_offset[bi, :, trans_pos_ceil[:, 1], trans_pos_floor[:, 0]]
335
+ m2_ne = mask2_offset[bi, :, trans_pos_floor[:, 1], trans_pos_ceil[:, 0]]
336
+ m2_se = mask2_offset[bi, :, trans_pos_ceil[:, 1], trans_pos_ceil[:, 0]]
337
+
338
+ nr = weight_nw * f2_nw * m2_nw + weight_sw * f2_sw * m2_sw + \
339
+ weight_ne * f2_ne * m2_ne + weight_se * f2_se * m2_se
340
+ dr = weight_nw * m2_nw + weight_sw * m2_sw + weight_ne * m2_ne + weight_se * m2_se
341
+
342
+ zero_value = -1 if is_image else 0
343
+ zero_tensor = torch.tensor(zero_value, dtype=nr.dtype, device=nr.device)
344
+ warped_frame1 = torch.where(dr > 0, nr / dr, zero_tensor)
345
+ mask1 = (dr > 0).to(frame2)
346
+
347
+ # Convert to channel first
348
+ warped_frame1 = torch.moveaxis(warped_frame1, [0, 1, 2, 3], [0, 2, 3, 1])
349
+ mask1 = torch.moveaxis(mask1, [0, 1, 2, 3], [0, 2, 3, 1])
350
+
351
+ if is_image:
352
+ assert warped_frame1.min() >= -1.1 # Allow for rounding errors
353
+ assert warped_frame1.max() <= 1.1
354
+ warped_frame1 = torch.clamp(warped_frame1, min=-1, max=1)
355
+ return warped_frame1, mask1
356
+
357
+ @staticmethod
358
+ def create_grid(b, h, w):
359
+ x_1d = torch.arange(0, w)[None]
360
+ y_1d = torch.arange(0, h)[:, None]
361
+ x_2d = x_1d.repeat([h, 1])
362
+ y_2d = y_1d.repeat([1, w])
363
+ grid = torch.stack([x_2d, y_2d], dim=0)
364
+ batch_grid = grid[None].repeat([b, 1, 1, 1])
365
+ return batch_grid
366
+
367
+ # @staticmethod
368
+ # def read_image(path: Path) -> torch.Tensor:
369
+ # image = skimage.io.imread(path.as_posix())
370
+ # return image
371
+
372
+ # @staticmethod
373
+ # def read_depth(path: Path) -> torch.Tensor:
374
+ # if path.suffix == '.png':
375
+ # depth = skimage.io.imread(path.as_posix())
376
+ # elif path.suffix == '.npy':
377
+ # depth = numpy.load(path.as_posix())
378
+ # elif path.suffix == '.npz':
379
+ # with numpy.load(path.as_posix()) as depth_data:
380
+ # depth = depth_data['depth']
381
+ # elif path.suffix == '.exr':
382
+ # exr_file = OpenEXR.InputFile(path.as_posix())
383
+ # raw_bytes = exr_file.channel('B', Imath.PixelType(Imath.PixelType.FLOAT))
384
+ # depth_vector = numpy.frombuffer(raw_bytes, dtype=numpy.float32)
385
+ # height = exr_file.header()['displayWindow'].max.y + 1 - exr_file.header()['displayWindow'].min.y
386
+ # width = exr_file.header()['displayWindow'].max.x + 1 - exr_file.header()['displayWindow'].min.x
387
+ # depth = numpy.reshape(depth_vector, (height, width))
388
+ # else:
389
+ # raise RuntimeError(f'Unknown depth format: {path.suffix}')
390
+ # return depth
391
+
392
+ # @staticmethod
393
+ # def camera_intrinsic_transform(capture_width=1920, capture_height=1080, patch_start_point: tuple = (0, 0)):
394
+ # start_y, start_x = patch_start_point
395
+ # camera_intrinsics = numpy.eye(4)
396
+ # camera_intrinsics[0, 0] = 2100
397
+ # camera_intrinsics[0, 2] = capture_width / 2.0 - start_x
398
+ # camera_intrinsics[1, 1] = 2100
399
+ # camera_intrinsics[1, 2] = capture_height / 2.0 - start_y
400
+ # return camera_intrinsics
401
+
402
+ # @staticmethod
403
+ # def get_device(device: str):
404
+ # """
405
+ # Returns torch device object
406
+ # :param device: cpu/gpu0/gpu1
407
+ # :return:
408
+ # """
409
+ # if device == 'cpu':
410
+ # device = torch.device('cpu')
411
+ # elif device.startswith('gpu') and torch.cuda.is_available():
412
+ # gpu_num = int(device[3:])
413
+ # device = torch.device(f'cuda:{gpu_num}')
414
+ # else:
415
+ # device = torch.device('cpu')
416
+ # return device
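A minimal forward-warp sketch with dummy tensors (shapes, depth, and camera values are illustrative assumptions): the frame lies in [-1, 1], depth is positive, intrinsics are in pixel units, and the extrinsics are 4x4 world-to-camera matrices.

import torch

warper = Warper()
b, h, w = 1, 32, 48
frame1 = torch.rand(b, 3, h, w) * 2 - 1                # image in [-1, 1]
depth1 = torch.full((b, 1, h, w), 2.0)                 # constant positive depth
intrinsic = torch.tensor([[[w / 2, 0.0, w / 2],
                           [0.0, w / 2, h / 2],
                           [0.0, 0.0, 1.0]]])          # (b, 3, 3), pixel units
pose1 = torch.eye(4)[None]                             # reference view
pose2 = torch.eye(4)[None].clone()
pose2[:, 0, 3] = 0.1                                   # small sideways translation

warped_frame2, mask2, warped_depth2, flow12 = warper.forward_warp(
    frame1, None, depth1, pose1, pose2, intrinsic, None, is_image=True
)
print(warped_frame2.shape, mask2.shape, flow12.shape)  # (1,3,32,48) (1,1,32,48) (1,2,32,48)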
finetune/modules/depth_warping/depth_anything_v2/depth_anything_wrapper.py ADDED
@@ -0,0 +1,12 @@
+ import torch
+ import torch.nn as nn
+ 
+ from .dpt import DepthAnythingV2
+ 
+ class MVSplat_wrapper(nn.Module):
+     def __init__(
+         self,
+         model_configs,
+         ckpt_path,
+     ):
+         super().__init__()
+ 
+         # Build Depth-Anything-V2 from its config dict and load the checkpoint weights.
+         self.depth_anything = DepthAnythingV2(**model_configs)
+         self.depth_anything.load_state_dict(torch.load(ckpt_path, map_location="cpu"))
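For reference, a hypothetical configuration for the wrapper above; the keys follow dpt.py in this folder and the values mirror the commonly published Depth-Anything-V2 ViT-L preset, so treat them as an assumption to verify rather than values taken from this commit:

# Assumed preset and checkpoint path, for illustration only.
vitl_config = {"encoder": "vitl", "features": 256, "out_channels": [256, 512, 1024, 1024]}
wrapper = MVSplat_wrapper(model_configs=vitl_config,
                          ckpt_path="checkpoints/depth_anything_v2_vitl.pth")  # hypothetical path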
finetune/modules/depth_warping/depth_anything_v2/dinov2.py ADDED
@@ -0,0 +1,415 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .dinov2_layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
27
+ if not depth_first and include_root:
28
+ fn(module=module, name=name)
29
+ for child_name, child_module in module.named_children():
30
+ child_name = ".".join((name, child_name)) if name else child_name
31
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
32
+ if depth_first and include_root:
33
+ fn(module=module, name=name)
34
+ return module
35
+
36
+
37
+ class BlockChunk(nn.ModuleList):
38
+ def forward(self, x):
39
+ for b in self:
40
+ x = b(x)
41
+ return x
42
+
43
+
44
+ class DinoVisionTransformer(nn.Module):
45
+ def __init__(
46
+ self,
47
+ img_size=224,
48
+ patch_size=16,
49
+ in_chans=3,
50
+ embed_dim=768,
51
+ depth=12,
52
+ num_heads=12,
53
+ mlp_ratio=4.0,
54
+ qkv_bias=True,
55
+ ffn_bias=True,
56
+ proj_bias=True,
57
+ drop_path_rate=0.0,
58
+ drop_path_uniform=False,
59
+ init_values=None, # for layerscale: None or 0 => no layerscale
60
+ embed_layer=PatchEmbed,
61
+ act_layer=nn.GELU,
62
+ block_fn=Block,
63
+ ffn_layer="mlp",
64
+ block_chunks=1,
65
+ num_register_tokens=0,
66
+ interpolate_antialias=False,
67
+ interpolate_offset=0.1,
68
+ ):
69
+ """
70
+ Args:
71
+ img_size (int, tuple): input image size
72
+ patch_size (int, tuple): patch size
73
+ in_chans (int): number of input channels
74
+ embed_dim (int): embedding dimension
75
+ depth (int): depth of transformer
76
+ num_heads (int): number of attention heads
77
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
78
+ qkv_bias (bool): enable bias for qkv if True
79
+ proj_bias (bool): enable bias for proj in attn if True
80
+ ffn_bias (bool): enable bias for ffn if True
81
+ drop_path_rate (float): stochastic depth rate
82
+ drop_path_uniform (bool): apply uniform drop rate across blocks
83
+ weight_init (str): weight init scheme
84
+ init_values (float): layer-scale init values
85
+ embed_layer (nn.Module): patch embedding layer
86
+ act_layer (nn.Module): MLP activation layer
87
+ block_fn (nn.Module): transformer block class
88
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
89
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
90
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
91
+ interpolate_antialias: (bool) flag to apply anti-aliasing when interpolating positional embeddings
92
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
93
+ """
94
+ super().__init__()
95
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
96
+
97
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
98
+ self.num_tokens = 1
99
+ self.n_blocks = depth
100
+ self.num_heads = num_heads
101
+ self.patch_size = patch_size
102
+ self.num_register_tokens = num_register_tokens
103
+ self.interpolate_antialias = interpolate_antialias
104
+ self.interpolate_offset = interpolate_offset
105
+
106
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
107
+ num_patches = self.patch_embed.num_patches
108
+
109
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
110
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
111
+ assert num_register_tokens >= 0
112
+ self.register_tokens = (
113
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
114
+ )
115
+
116
+ if drop_path_uniform is True:
117
+ dpr = [drop_path_rate] * depth
118
+ else:
119
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
120
+
121
+ if ffn_layer == "mlp":
122
+ logger.info("using MLP layer as FFN")
123
+ ffn_layer = Mlp
124
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
125
+ logger.info("using SwiGLU layer as FFN")
126
+ ffn_layer = SwiGLUFFNFused
127
+ elif ffn_layer == "identity":
128
+ logger.info("using Identity layer as FFN")
129
+
130
+ def f(*args, **kwargs):
131
+ return nn.Identity()
132
+
133
+ ffn_layer = f
134
+ else:
135
+ raise NotImplementedError
136
+
137
+ blocks_list = [
138
+ block_fn(
139
+ dim=embed_dim,
140
+ num_heads=num_heads,
141
+ mlp_ratio=mlp_ratio,
142
+ qkv_bias=qkv_bias,
143
+ proj_bias=proj_bias,
144
+ ffn_bias=ffn_bias,
145
+ drop_path=dpr[i],
146
+ norm_layer=norm_layer,
147
+ act_layer=act_layer,
148
+ ffn_layer=ffn_layer,
149
+ init_values=init_values,
150
+ )
151
+ for i in range(depth)
152
+ ]
153
+ if block_chunks > 0:
154
+ self.chunked_blocks = True
155
+ chunked_blocks = []
156
+ chunksize = depth // block_chunks
157
+ for i in range(0, depth, chunksize):
158
+ # this is to keep the block index consistent if we chunk the block list
159
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i : i + chunksize])
160
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
161
+ else:
162
+ self.chunked_blocks = False
163
+ self.blocks = nn.ModuleList(blocks_list)
164
+
165
+ self.norm = norm_layer(embed_dim)
166
+ self.head = nn.Identity()
167
+
168
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
169
+
170
+ self.init_weights()
171
+
172
+ def init_weights(self):
173
+ trunc_normal_(self.pos_embed, std=0.02)
174
+ nn.init.normal_(self.cls_token, std=1e-6)
175
+ if self.register_tokens is not None:
176
+ nn.init.normal_(self.register_tokens, std=1e-6)
177
+ named_apply(init_weights_vit_timm, self)
178
+
179
+ def interpolate_pos_encoding(self, x, w, h):
180
+ previous_dtype = x.dtype
181
+ npatch = x.shape[1] - 1
182
+ N = self.pos_embed.shape[1] - 1
183
+ if npatch == N and w == h:
184
+ return self.pos_embed
185
+ pos_embed = self.pos_embed.float()
186
+ class_pos_embed = pos_embed[:, 0]
187
+ patch_pos_embed = pos_embed[:, 1:]
188
+ dim = x.shape[-1]
189
+ w0 = w // self.patch_size
190
+ h0 = h // self.patch_size
191
+ # we add a small number to avoid floating point error in the interpolation
192
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
193
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
194
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
195
+ # w0, h0 = w0 + 0.1, h0 + 0.1
196
+
197
+ sqrt_N = math.sqrt(N)
198
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
199
+ patch_pos_embed = nn.functional.interpolate(
200
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
201
+ scale_factor=(sx, sy),
202
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
203
+ mode="bicubic",
204
+ antialias=self.interpolate_antialias
205
+ )
206
+
207
+ assert int(w0) == patch_pos_embed.shape[-2]
208
+ assert int(h0) == patch_pos_embed.shape[-1]
209
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
210
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
211
+
212
+ def prepare_tokens_with_masks(self, x, masks=None):
213
+ B, nc, w, h = x.shape
214
+ x = self.patch_embed(x)
215
+ if masks is not None:
216
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
217
+
218
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
219
+ x = x + self.interpolate_pos_encoding(x, w, h)
220
+
221
+ if self.register_tokens is not None:
222
+ x = torch.cat(
223
+ (
224
+ x[:, :1],
225
+ self.register_tokens.expand(x.shape[0], -1, -1),
226
+ x[:, 1:],
227
+ ),
228
+ dim=1,
229
+ )
230
+
231
+ return x
232
+
233
+ def forward_features_list(self, x_list, masks_list):
234
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
235
+ for blk in self.blocks:
236
+ x = blk(x)
237
+
238
+ all_x = x
239
+ output = []
240
+ for x, masks in zip(all_x, masks_list):
241
+ x_norm = self.norm(x)
242
+ output.append(
243
+ {
244
+ "x_norm_clstoken": x_norm[:, 0],
245
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
246
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
247
+ "x_prenorm": x,
248
+ "masks": masks,
249
+ }
250
+ )
251
+ return output
252
+
253
+ def forward_features(self, x, masks=None):
254
+ if isinstance(x, list):
255
+ return self.forward_features_list(x, masks)
256
+
257
+ x = self.prepare_tokens_with_masks(x, masks)
258
+
259
+ for blk in self.blocks:
260
+ x = blk(x)
261
+
262
+ x_norm = self.norm(x)
263
+ return {
264
+ "x_norm_clstoken": x_norm[:, 0],
265
+ "x_norm_regtokens": x_norm[:, 1 : self.num_register_tokens + 1],
266
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1 :],
267
+ "x_prenorm": x,
268
+ "masks": masks,
269
+ }
270
+
271
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
272
+ x = self.prepare_tokens_with_masks(x)
273
+ # If n is an int, take the n last blocks. If it's a list, take them
274
+ output, total_block_len = [], len(self.blocks)
275
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
276
+ for i, blk in enumerate(self.blocks):
277
+ x = blk(x)
278
+ if i in blocks_to_take:
279
+ output.append(x)
280
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
281
+ return output
282
+
283
+ def _get_intermediate_layers_chunked(self, x, n=1):
284
+ x = self.prepare_tokens_with_masks(x)
285
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
286
+ # If n is an int, take the n last blocks. If it's a list, take them
287
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
288
+ for block_chunk in self.blocks:
289
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
290
+ x = blk(x)
291
+ if i in blocks_to_take:
292
+ output.append(x)
293
+ i += 1
294
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
295
+ return output
296
+
297
+ def get_intermediate_layers(
298
+ self,
299
+ x: torch.Tensor,
300
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
301
+ reshape: bool = False,
302
+ return_class_token: bool = False,
303
+ norm=True
304
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
305
+ if self.chunked_blocks:
306
+ outputs = self._get_intermediate_layers_chunked(x, n)
307
+ else:
308
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
309
+ if norm:
310
+ outputs = [self.norm(out) for out in outputs]
311
+ class_tokens = [out[:, 0] for out in outputs]
312
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
313
+ if reshape:
314
+ B, _, w, h = x.shape
315
+ outputs = [
316
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
317
+ for out in outputs
318
+ ]
319
+ if return_class_token:
320
+ return tuple(zip(outputs, class_tokens))
321
+ return tuple(outputs)
322
+
323
+ def forward(self, *args, is_training=False, **kwargs):
324
+ ret = self.forward_features(*args, **kwargs)
325
+ if is_training:
326
+ return ret
327
+ else:
328
+ return self.head(ret["x_norm_clstoken"])
329
+
330
+
331
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
332
+ """ViT weight initialization, original timm impl (for reproducibility)"""
333
+ if isinstance(module, nn.Linear):
334
+ trunc_normal_(module.weight, std=0.02)
335
+ if module.bias is not None:
336
+ nn.init.zeros_(module.bias)
337
+
338
+
339
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
340
+ model = DinoVisionTransformer(
341
+ patch_size=patch_size,
342
+ embed_dim=384,
343
+ depth=12,
344
+ num_heads=6,
345
+ mlp_ratio=4,
346
+ block_fn=partial(Block, attn_class=MemEffAttention),
347
+ num_register_tokens=num_register_tokens,
348
+ **kwargs,
349
+ )
350
+ return model
351
+
352
+
353
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
354
+ model = DinoVisionTransformer(
355
+ patch_size=patch_size,
356
+ embed_dim=768,
357
+ depth=12,
358
+ num_heads=12,
359
+ mlp_ratio=4,
360
+ block_fn=partial(Block, attn_class=MemEffAttention),
361
+ num_register_tokens=num_register_tokens,
362
+ **kwargs,
363
+ )
364
+ return model
365
+
366
+
367
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
368
+ model = DinoVisionTransformer(
369
+ patch_size=patch_size,
370
+ embed_dim=1024,
371
+ depth=24,
372
+ num_heads=16,
373
+ mlp_ratio=4,
374
+ block_fn=partial(Block, attn_class=MemEffAttention),
375
+ num_register_tokens=num_register_tokens,
376
+ **kwargs,
377
+ )
378
+ return model
379
+
380
+
381
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
382
+ """
383
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
384
+ """
385
+ model = DinoVisionTransformer(
386
+ patch_size=patch_size,
387
+ embed_dim=1536,
388
+ depth=40,
389
+ num_heads=24,
390
+ mlp_ratio=4,
391
+ block_fn=partial(Block, attn_class=MemEffAttention),
392
+ num_register_tokens=num_register_tokens,
393
+ **kwargs,
394
+ )
395
+ return model
396
+
397
+
398
+ def DINOv2(model_name):
399
+ model_zoo = {
400
+ "vits": vit_small,
401
+ "vitb": vit_base,
402
+ "vitl": vit_large,
403
+ "vitg": vit_giant2
404
+ }
405
+
406
+ return model_zoo[model_name](
407
+ img_size=518,
408
+ patch_size=14,
409
+ init_values=1.0,
410
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
411
+ block_chunks=0,
412
+ num_register_tokens=0,
413
+ interpolate_antialias=False,
414
+ interpolate_offset=0.1
415
+ )
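A quick sketch of pulling patch features from this backbone (randomly initialised here, no pretrained weights; 518 is chosen to match the 14-pixel patch grid). With xFormers installed, MemEffAttention needs a device/dtype that memory-efficient attention supports, so run this on GPU or in an environment without xFormers:

import torch

backbone = DINOv2("vits")                          # ViT-S/14, img_size=518, randomly initialised
x = torch.randn(1, 3, 518, 518)
with torch.no_grad():
    feats = backbone.get_intermediate_layers(x, n=4, reshape=True)
print([f.shape for f in feats])                    # 4 x (1, 384, 37, 37)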
finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/attention.py ADDED
@@ -0,0 +1,83 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+
17
+ logger = logging.getLogger("dinov2")
18
+
19
+
20
+ try:
21
+ from xformers.ops import memory_efficient_attention, unbind, fmha
22
+
23
+ XFORMERS_AVAILABLE = True
24
+ except ImportError:
25
+ logger.warning("xFormers not available")
26
+ XFORMERS_AVAILABLE = False
27
+
28
+
29
+ class Attention(nn.Module):
30
+ def __init__(
31
+ self,
32
+ dim: int,
33
+ num_heads: int = 8,
34
+ qkv_bias: bool = False,
35
+ proj_bias: bool = True,
36
+ attn_drop: float = 0.0,
37
+ proj_drop: float = 0.0,
38
+ ) -> None:
39
+ super().__init__()
40
+ self.num_heads = num_heads
41
+ head_dim = dim // num_heads
42
+ self.scale = head_dim**-0.5
43
+
44
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
45
+ self.attn_drop = nn.Dropout(attn_drop)
46
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
47
+ self.proj_drop = nn.Dropout(proj_drop)
48
+
49
+ def forward(self, x: Tensor) -> Tensor:
50
+ B, N, C = x.shape
51
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
52
+
53
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
54
+ attn = q @ k.transpose(-2, -1)
55
+
56
+ attn = attn.softmax(dim=-1)
57
+ attn = self.attn_drop(attn)
58
+
59
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
60
+ x = self.proj(x)
61
+ x = self.proj_drop(x)
62
+ return x
63
+
64
+
65
+ class MemEffAttention(Attention):
66
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
67
+ if not XFORMERS_AVAILABLE:
68
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
69
+ return super().forward(x)
70
+
71
+ B, N, C = x.shape
72
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
73
+
74
+ q, k, v = unbind(qkv, 2)
75
+
76
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
77
+ x = x.reshape([B, N, C])
78
+
79
+ x = self.proj(x)
80
+ x = self.proj_drop(x)
81
+ return x
82
+
83
+
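A tiny interface check (illustrative sizes): without xFormers, MemEffAttention simply defers to the Attention.forward path above, so both classes map (B, N, C) to (B, N, C).

import torch

attn = Attention(dim=384, num_heads=6, qkv_bias=True)
tokens = torch.randn(2, 197, 384)
with torch.no_grad():
    out = attn(tokens)
print(out.shape)               # torch.Size([2, 197, 384])
# MemEffAttention(dim=384, num_heads=6)(tokens) gives the same shape; it uses
# xformers.memory_efficient_attention when available and falls back to this path otherwise.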
finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path1(ffn_residual_func(x)) # FIXME: drop_path2
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls1, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError
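A short sketch of the two call modes of NestedTensorBlock (sizes are illustrative): a plain tensor runs through the standard Block path with any attention class, while the list form nests variable-length sequences and requires MemEffAttention plus xFormers.

import torch

blk = NestedTensorBlock(dim=384, num_heads=6, attn_class=Attention).eval()
x = torch.randn(2, 197, 384)
with torch.no_grad():
    y = blk(x)                 # falls through to Block.forward
print(y.shape)                 # torch.Size([2, 197, 384])
# blk([x1, x2]) would dispatch to forward_nested, which asserts MemEffAttention and xFormers.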
finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/drop_path.py ADDED
@@ -0,0 +1,35 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+
12
+ from torch import nn
13
+
14
+
15
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
16
+ if drop_prob == 0.0 or not training:
17
+ return x
18
+ keep_prob = 1 - drop_prob
19
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
20
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
21
+ if keep_prob > 0.0:
22
+ random_tensor.div_(keep_prob)
23
+ output = x * random_tensor
24
+ return output
25
+
26
+
27
+ class DropPath(nn.Module):
28
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
29
+
30
+ def __init__(self, drop_prob=None):
31
+ super(DropPath, self).__init__()
32
+ self.drop_prob = drop_prob
33
+
34
+ def forward(self, x):
35
+ return drop_path(x, self.drop_prob, self.training)
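A short numerical check of the stochastic-depth scaling above (illustrative): rows that survive are rescaled by 1/keep_prob, so the expected activation matches the input, and the module is the identity at inference time.

import torch

dp = DropPath(drop_prob=0.2).train()
x = torch.ones(10000, 8)
y = dp(x)
print(y.mean().item())          # ~1.0: zeroed rows are offset by the 1/0.8 rescale of survivors
dp.eval()
assert torch.equal(dp(x), x)    # identity when not training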
finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/layer_scale.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
8
+
9
+ from typing import Union
10
+
11
+ import torch
12
+ from torch import Tensor
13
+ from torch import nn
14
+
15
+
16
+ class LayerScale(nn.Module):
17
+ def __init__(
18
+ self,
19
+ dim: int,
20
+ init_values: Union[float, Tensor] = 1e-5,
21
+ inplace: bool = False,
22
+ ) -> None:
23
+ super().__init__()
24
+ self.inplace = inplace
25
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
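LayerScale is simply a learnable per-channel gain on a residual branch; the small init (1e-5) keeps that branch nearly silent at the start of training. An inline stand-in equivalent to LayerScale(dim=8, init_values=1e-5):

import torch
from torch import nn

gamma = nn.Parameter(1e-5 * torch.ones(8))   # one gain per channel, as in the class above
x = torch.randn(2, 16, 8)                    # (batch, tokens, dim)
y = x * gamma                                # broadcast over batch and tokens
print(y.shape, float(gamma.mean()))          # torch.Size([2, 16, 8]) ~1e-05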
finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/mlp.py ADDED
@@ -0,0 +1,41 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10
+
11
+
12
+ from typing import Callable, Optional
13
+
14
+ from torch import Tensor, nn
15
+
16
+
17
+ class Mlp(nn.Module):
18
+ def __init__(
19
+ self,
20
+ in_features: int,
21
+ hidden_features: Optional[int] = None,
22
+ out_features: Optional[int] = None,
23
+ act_layer: Callable[..., nn.Module] = nn.GELU,
24
+ drop: float = 0.0,
25
+ bias: bool = True,
26
+ ) -> None:
27
+ super().__init__()
28
+ out_features = out_features or in_features
29
+ hidden_features = hidden_features or in_features
30
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
31
+ self.act = act_layer()
32
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
33
+ self.drop = nn.Dropout(drop)
34
+
35
+ def forward(self, x: Tensor) -> Tensor:
36
+ x = self.fc1(x)
37
+ x = self.act(x)
38
+ x = self.drop(x)
39
+ x = self.fc2(x)
40
+ x = self.drop(x)
41
+ return x
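A brief shape-only usage sketch of Mlp; the import path below assumes the repository root is on sys.path (adjust as needed):

import torch
from finetune.modules.depth_warping.depth_anything_v2.dinov2_layers.mlp import Mlp  # assumed path

mlp = Mlp(in_features=384, hidden_features=4 * 384, drop=0.1)
tokens = torch.randn(2, 196, 384)   # (batch, tokens, dim)
out = mlp(tokens)
print(out.shape)                    # torch.Size([2, 196, 384]) -- the token dimension is preserved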
finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/patch_embed.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ from typing import Callable, Optional, Tuple, Union
12
+
13
+ from torch import Tensor
14
+ import torch.nn as nn
15
+
16
+
17
+ def make_2tuple(x):
18
+ if isinstance(x, tuple):
19
+ assert len(x) == 2
20
+ return x
21
+
22
+ assert isinstance(x, int)
23
+ return (x, x)
24
+
25
+
26
+ class PatchEmbed(nn.Module):
27
+ """
28
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
29
+
30
+ Args:
31
+ img_size: Image size.
32
+ patch_size: Patch token size.
33
+ in_chans: Number of input image channels.
34
+ embed_dim: Number of linear projection output channels.
35
+ norm_layer: Normalization layer.
36
+ """
37
+
38
+ def __init__(
39
+ self,
40
+ img_size: Union[int, Tuple[int, int]] = 224,
41
+ patch_size: Union[int, Tuple[int, int]] = 16,
42
+ in_chans: int = 3,
43
+ embed_dim: int = 768,
44
+ norm_layer: Optional[Callable] = None,
45
+ flatten_embedding: bool = True,
46
+ ) -> None:
47
+ super().__init__()
48
+
49
+ image_HW = make_2tuple(img_size)
50
+ patch_HW = make_2tuple(patch_size)
51
+ patch_grid_size = (
52
+ image_HW[0] // patch_HW[0],
53
+ image_HW[1] // patch_HW[1],
54
+ )
55
+
56
+ self.img_size = image_HW
57
+ self.patch_size = patch_HW
58
+ self.patches_resolution = patch_grid_size
59
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
60
+
61
+ self.in_chans = in_chans
62
+ self.embed_dim = embed_dim
63
+
64
+ self.flatten_embedding = flatten_embedding
65
+
66
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
67
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
68
+
69
+ def forward(self, x: Tensor) -> Tensor:
70
+ _, _, H, W = x.shape
71
+ patch_H, patch_W = self.patch_size
72
+
73
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
74
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width {patch_W}"
75
+
76
+ x = self.proj(x) # B C H W
77
+ H, W = x.size(2), x.size(3)
78
+ x = x.flatten(2).transpose(1, 2) # B HW C
79
+ x = self.norm(x)
80
+ if not self.flatten_embedding:
81
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
82
+ return x
83
+
84
+ def flops(self) -> float:
85
+ Ho, Wo = self.patches_resolution
86
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
87
+ if self.norm is not None:
88
+ flops += Ho * Wo * self.embed_dim
89
+ return flops
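A small sketch of the (B, C, H, W) -> (B, N, D) mapping; the import path is an assumption about how the package is laid out on sys.path:

import torch
from finetune.modules.depth_warping.depth_anything_v2.dinov2_layers.patch_embed import PatchEmbed  # assumed path

embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
img = torch.randn(1, 3, 224, 224)
tokens = embed(img)
print(tokens.shape, embed.num_patches)   # torch.Size([1, 196, 768]) 196  (a 14 x 14 patch grid)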
finetune/modules/depth_warping/depth_anything_v2/dinov2_layers/swiglu_ffn.py ADDED
@@ -0,0 +1,63 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import Callable, Optional
8
+
9
+ from torch import Tensor, nn
10
+ import torch.nn.functional as F
11
+
12
+
13
+ class SwiGLUFFN(nn.Module):
14
+ def __init__(
15
+ self,
16
+ in_features: int,
17
+ hidden_features: Optional[int] = None,
18
+ out_features: Optional[int] = None,
19
+ act_layer: Callable[..., nn.Module] = None,
20
+ drop: float = 0.0,
21
+ bias: bool = True,
22
+ ) -> None:
23
+ super().__init__()
24
+ out_features = out_features or in_features
25
+ hidden_features = hidden_features or in_features
26
+ self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
27
+ self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
28
+
29
+ def forward(self, x: Tensor) -> Tensor:
30
+ x12 = self.w12(x)
31
+ x1, x2 = x12.chunk(2, dim=-1)
32
+ hidden = F.silu(x1) * x2
33
+ return self.w3(hidden)
34
+
35
+
36
+ try:
37
+ from xformers.ops import SwiGLU
38
+
39
+ XFORMERS_AVAILABLE = True
40
+ except ImportError:
41
+ SwiGLU = SwiGLUFFN
42
+ XFORMERS_AVAILABLE = False
43
+
44
+
45
+ class SwiGLUFFNFused(SwiGLU):
46
+ def __init__(
47
+ self,
48
+ in_features: int,
49
+ hidden_features: Optional[int] = None,
50
+ out_features: Optional[int] = None,
51
+ act_layer: Callable[..., nn.Module] = None,
52
+ drop: float = 0.0,
53
+ bias: bool = True,
54
+ ) -> None:
55
+ out_features = out_features or in_features
56
+ hidden_features = hidden_features or in_features
57
+ hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
58
+ super().__init__(
59
+ in_features=in_features,
60
+ hidden_features=hidden_features,
61
+ out_features=out_features,
62
+ bias=bias,
63
+ )
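Two details worth noting: the gating splits a single projection into two halves and computes silu(x1) * x2, and SwiGLUFFNFused shrinks the requested hidden width to 2/3 and rounds it up to a multiple of 8. A self-contained check of both:

import torch
import torch.nn.functional as F

# Gating on the two halves of one projection, as in SwiGLUFFN.forward.
x12 = torch.randn(2, 10, 2 * 256)
x1, x2 = x12.chunk(2, dim=-1)
hidden = F.silu(x1) * x2
print(hidden.shape)                                   # torch.Size([2, 10, 256])

# Hidden-width rule from SwiGLUFFNFused: 2/3 of the request, rounded up to a multiple of 8.
hidden_features = 4 * 384
print((int(hidden_features * 2 / 3) + 7) // 8 * 8)    # 1024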
finetune/modules/depth_warping/depth_anything_v2/dpt.py ADDED
@@ -0,0 +1,235 @@
1
+ import cv2
2
+ import torch
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ import torchvision.transforms as tf
6
+ from torchvision.transforms import Compose
7
+
8
+ from .dinov2 import DINOv2
9
+ from .util.blocks import FeatureFusionBlock, _make_scratch
10
+ from .util.transform import Resize, NormalizeImage, PrepareForNet
11
+
12
+
13
+ def _make_fusion_block(features, use_bn, size=None):
14
+ return FeatureFusionBlock(
15
+ features,
16
+ nn.ReLU(False),
17
+ deconv=False,
18
+ bn=use_bn,
19
+ expand=False,
20
+ align_corners=True,
21
+ size=size,
22
+ )
23
+
24
+
25
+ class ConvBlock(nn.Module):
26
+ def __init__(self, in_feature, out_feature):
27
+ super().__init__()
28
+
29
+ self.conv_block = nn.Sequential(
30
+ nn.Conv2d(in_feature, out_feature, kernel_size=3, stride=1, padding=1),
31
+ nn.BatchNorm2d(out_feature),
32
+ nn.ReLU(True)
33
+ )
34
+
35
+ def forward(self, x):
36
+ return self.conv_block(x)
37
+
38
+
39
+ class DPTHead(nn.Module):
40
+ def __init__(
41
+ self,
42
+ in_channels,
43
+ features=256,
44
+ use_bn=False,
45
+ out_channels=[256, 512, 1024, 1024],
46
+ use_clstoken=False
47
+ ):
48
+ super(DPTHead, self).__init__()
49
+
50
+ self.use_clstoken = use_clstoken
51
+
52
+ self.projects = nn.ModuleList([
53
+ nn.Conv2d(
54
+ in_channels=in_channels,
55
+ out_channels=out_channel,
56
+ kernel_size=1,
57
+ stride=1,
58
+ padding=0,
59
+ ) for out_channel in out_channels
60
+ ])
61
+
62
+ self.resize_layers = nn.ModuleList([
63
+ nn.ConvTranspose2d(
64
+ in_channels=out_channels[0],
65
+ out_channels=out_channels[0],
66
+ kernel_size=4,
67
+ stride=4,
68
+ padding=0),
69
+ nn.ConvTranspose2d(
70
+ in_channels=out_channels[1],
71
+ out_channels=out_channels[1],
72
+ kernel_size=2,
73
+ stride=2,
74
+ padding=0),
75
+ nn.Identity(),
76
+ nn.Conv2d(
77
+ in_channels=out_channels[3],
78
+ out_channels=out_channels[3],
79
+ kernel_size=3,
80
+ stride=2,
81
+ padding=1)
82
+ ])
83
+
84
+ if use_clstoken:
85
+ self.readout_projects = nn.ModuleList()
86
+ for _ in range(len(self.projects)):
87
+ self.readout_projects.append(
88
+ nn.Sequential(
89
+ nn.Linear(2 * in_channels, in_channels),
90
+ nn.GELU()))
91
+
92
+ self.scratch = _make_scratch(
93
+ out_channels,
94
+ features,
95
+ groups=1,
96
+ expand=False,
97
+ )
98
+
99
+ self.scratch.stem_transpose = None
100
+
101
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
102
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
103
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
104
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
105
+
106
+ head_features_1 = features
107
+ head_features_2 = 32
108
+
109
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
110
+ self.scratch.output_conv2 = nn.Sequential(
111
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
112
+ nn.ReLU(True),
113
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
114
+ nn.Sigmoid()
115
+ )
116
+
117
+ def forward(self, out_features, patch_h, patch_w):
118
+ out = []
119
+ for i, x in enumerate(out_features):
120
+ if self.use_clstoken:
121
+ x, cls_token = x[0], x[1]
122
+ readout = cls_token.unsqueeze(1).expand_as(x)
123
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
124
+ else:
125
+ x = x[0]
126
+
127
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
128
+
129
+ x = self.projects[i](x)
130
+ x = self.resize_layers[i](x)
131
+
132
+ out.append(x)
133
+
134
+ layer_1, layer_2, layer_3, layer_4 = out
135
+
136
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
137
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
138
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
139
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
140
+
141
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
142
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
143
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
144
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
145
+
146
+ out = self.scratch.output_conv1(path_1)
147
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
148
+ out = self.scratch.output_conv2(out)
149
+
150
+ return out
151
+
152
+
153
+ class DepthAnythingV2(nn.Module):
154
+ def __init__(
155
+ self,
156
+ encoder='vitl',
157
+ features=256,
158
+ out_channels=[256, 512, 1024, 1024],
159
+ use_bn=False,
160
+ use_clstoken=False,
161
+ max_depth=20.0
162
+ ):
163
+ super(DepthAnythingV2, self).__init__()
164
+
165
+ self.intermediate_layer_idx = {
166
+ 'vits': [2, 5, 8, 11],
167
+ 'vitb': [2, 5, 8, 11],
168
+ 'vitl': [4, 11, 17, 23],
169
+ 'vitg': [9, 19, 29, 39]
170
+ }
171
+
172
+ self.max_depth = max_depth
173
+
174
+ self.encoder = encoder
175
+ self.pretrained = DINOv2(model_name=encoder)
176
+
177
+ self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels, use_clstoken=use_clstoken)
178
+
179
+ def forward(self, x):
180
+ patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
181
+
182
+ features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder], return_class_token=True)
183
+
184
+ depth = self.depth_head(features, patch_h, patch_w) * self.max_depth
185
+
186
+ return depth.squeeze(1)
187
+
188
+ @torch.no_grad()
189
+ def infer_image(self, raw_image, input_size=518):
190
+ image, (h, w) = self.image2tensor(raw_image, input_size)
191
+
192
+ depth = self.forward(image)
193
+
194
+ depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
195
+
196
+ return depth
197
+ # return depth.cpu().numpy()
198
+
199
+
200
+ # TODO. transform for torch.Tensor
201
+ # TODO. inference for torch.Tensor
202
+ # def image2tensor_pt(self, raw_image, input_size=518):
203
+ # transform = Compose([
204
+ # tf
205
+ # ])
206
+
207
+
208
+ def image2tensor(self, raw_image, input_size=518):
209
+ transform = Compose([
210
+ Resize(
211
+ width=input_size,
212
+ height=input_size,
213
+ resize_target=False,
214
+ keep_aspect_ratio=True,
215
+ ensure_multiple_of=14,
216
+ resize_method='lower_bound',
217
+ image_interpolation_method=cv2.INTER_CUBIC,
218
+ ),
219
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
220
+ PrepareForNet(),
221
+ ])
222
+
223
+ h, w = raw_image.shape[:2]
224
+
225
+ # raw_image already has RGB order, [0,255]
226
+ # image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
227
+ image = raw_image / 255.0
228
+
229
+ image = transform({'image': image})['image']
230
+ image = torch.from_numpy(image).unsqueeze(0)
231
+
232
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
233
+ image = image.to(DEVICE)
234
+
235
+ return image, (h, w)
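A hedged end-to-end sketch of infer_image; the import path and the ViT-S configuration (features=64, out_channels=[48, 96, 192, 384]) are assumptions, and no checkpoint is loaded, so this only exercises shapes and the internal resize-to-a-multiple-of-14 logic:

import numpy as np
import torch
from finetune.modules.depth_warping.depth_anything_v2.dpt import DepthAnythingV2  # assumed path

device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
model = DepthAnythingV2(encoder='vits', features=64, out_channels=[48, 96, 192, 384]).to(device).eval()

rgb = (np.random.rand(480, 640, 3) * 255).astype(np.float32)   # RGB, [0, 255], as image2tensor expects
depth = model.infer_image(rgb)       # resized internally, then interpolated back to 480 x 640
print(depth.shape)                   # torch.Size([480, 640]); values lie in [0, max_depth]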
finetune/modules/depth_warping/depth_anything_v2/util/blocks.py ADDED
@@ -0,0 +1,148 @@
1
+ import torch.nn as nn
2
+
3
+
4
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
5
+ scratch = nn.Module()
6
+
7
+ out_shape1 = out_shape
8
+ out_shape2 = out_shape
9
+ out_shape3 = out_shape
10
+ if len(in_shape) >= 4:
11
+ out_shape4 = out_shape
12
+
13
+ if expand:
14
+ out_shape1 = out_shape
15
+ out_shape2 = out_shape * 2
16
+ out_shape3 = out_shape * 4
17
+ if len(in_shape) >= 4:
18
+ out_shape4 = out_shape * 8
19
+
20
+ scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
21
+ scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
22
+ scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
23
+ if len(in_shape) >= 4:
24
+ scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
25
+
26
+ return scratch
27
+
28
+
29
+ class ResidualConvUnit(nn.Module):
30
+ """Residual convolution module.
31
+ """
32
+
33
+ def __init__(self, features, activation, bn):
34
+ """Init.
35
+
36
+ Args:
37
+ features (int): number of features
38
+ """
39
+ super().__init__()
40
+
41
+ self.bn = bn
42
+
43
+ self.groups=1
44
+
45
+ self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
46
+
47
+ self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
48
+
49
+ if self.bn == True:
50
+ self.bn1 = nn.BatchNorm2d(features)
51
+ self.bn2 = nn.BatchNorm2d(features)
52
+
53
+ self.activation = activation
54
+
55
+ self.skip_add = nn.quantized.FloatFunctional()
56
+
57
+ def forward(self, x):
58
+ """Forward pass.
59
+
60
+ Args:
61
+ x (tensor): input
62
+
63
+ Returns:
64
+ tensor: output
65
+ """
66
+
67
+ out = self.activation(x)
68
+ out = self.conv1(out)
69
+ if self.bn == True:
70
+ out = self.bn1(out)
71
+
72
+ out = self.activation(out)
73
+ out = self.conv2(out)
74
+ if self.bn == True:
75
+ out = self.bn2(out)
76
+
77
+ if self.groups > 1:
78
+ out = self.conv_merge(out)
79
+
80
+ return self.skip_add.add(out, x)
81
+
82
+
83
+ class FeatureFusionBlock(nn.Module):
84
+ """Feature fusion block.
85
+ """
86
+
87
+ def __init__(
88
+ self,
89
+ features,
90
+ activation,
91
+ deconv=False,
92
+ bn=False,
93
+ expand=False,
94
+ align_corners=True,
95
+ size=None
96
+ ):
97
+ """Init.
98
+
99
+ Args:
100
+ features (int): number of features
101
+ """
102
+ super(FeatureFusionBlock, self).__init__()
103
+
104
+ self.deconv = deconv
105
+ self.align_corners = align_corners
106
+
107
+ self.groups=1
108
+
109
+ self.expand = expand
110
+ out_features = features
111
+ if self.expand == True:
112
+ out_features = features // 2
113
+
114
+ self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
115
+
116
+ self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
117
+ self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
118
+
119
+ self.skip_add = nn.quantized.FloatFunctional()
120
+
121
+ self.size=size
122
+
123
+ def forward(self, *xs, size=None):
124
+ """Forward pass.
125
+
126
+ Returns:
127
+ tensor: output
128
+ """
129
+ output = xs[0]
130
+
131
+ if len(xs) == 2:
132
+ res = self.resConfUnit1(xs[1])
133
+ output = self.skip_add.add(output, res)
134
+
135
+ output = self.resConfUnit2(output)
136
+
137
+ if (size is None) and (self.size is None):
138
+ modifier = {"scale_factor": 2}
139
+ elif size is None:
140
+ modifier = {"size": self.size}
141
+ else:
142
+ modifier = {"size": size}
143
+
144
+ output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
145
+
146
+ output = self.out_conv(output)
147
+
148
+ return output
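FeatureFusionBlock upsamples by a factor of 2 unless an explicit target size is supplied (via size= at call time or in the constructor). A small shape check, assuming the import path below:

import torch
from torch import nn
from finetune.modules.depth_warping.depth_anything_v2.util.blocks import FeatureFusionBlock  # assumed path

block = FeatureFusionBlock(features=32, activation=nn.ReLU(False), bn=False)
a = torch.randn(1, 32, 24, 24)            # coarser decoder feature map
b = torch.randn(1, 32, 24, 24)            # skip connection at the same resolution
print(block(a, b).shape)                  # torch.Size([1, 32, 48, 48]) -- default 2x upsampling
print(block(a, b, size=(30, 40)).shape)   # torch.Size([1, 32, 30, 40]) -- explicit target size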
finetune/modules/depth_warping/depth_anything_v2/util/transform.py ADDED
@@ -0,0 +1,158 @@
1
+ import numpy as np
2
+ import cv2
3
+
4
+
5
+ class Resize(object):
6
+ """Resize sample to given size (width, height).
7
+ """
8
+
9
+ def __init__(
10
+ self,
11
+ width,
12
+ height,
13
+ resize_target=True,
14
+ keep_aspect_ratio=False,
15
+ ensure_multiple_of=1,
16
+ resize_method="lower_bound",
17
+ image_interpolation_method=cv2.INTER_AREA,
18
+ ):
19
+ """Init.
20
+
21
+ Args:
22
+ width (int): desired output width
23
+ height (int): desired output height
24
+ resize_target (bool, optional):
25
+ True: Resize the full sample (image, mask, target).
26
+ False: Resize image only.
27
+ Defaults to True.
28
+ keep_aspect_ratio (bool, optional):
29
+ True: Keep the aspect ratio of the input sample.
30
+ Output sample might not have the given width and height, and
31
+ resize behaviour depends on the parameter 'resize_method'.
32
+ Defaults to False.
33
+ ensure_multiple_of (int, optional):
34
+ Output width and height is constrained to be multiple of this parameter.
35
+ Defaults to 1.
36
+ resize_method (str, optional):
37
+ "lower_bound": Output will be at least as large as the given size.
38
+ "upper_bound": Output will be at most as large as the given size. (Output size might be smaller than given size.)
39
+ "minimal": Scale as little as possible. (Output size might be smaller than given size.)
40
+ Defaults to "lower_bound".
41
+ """
42
+ self.__width = width
43
+ self.__height = height
44
+
45
+ self.__resize_target = resize_target
46
+ self.__keep_aspect_ratio = keep_aspect_ratio
47
+ self.__multiple_of = ensure_multiple_of
48
+ self.__resize_method = resize_method
49
+ self.__image_interpolation_method = image_interpolation_method
50
+
51
+ def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
52
+ y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
53
+
54
+ if max_val is not None and y > max_val:
55
+ y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
56
+
57
+ if y < min_val:
58
+ y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
59
+
60
+ return y
61
+
62
+ def get_size(self, width, height):
63
+ # determine new height and width
64
+ scale_height = self.__height / height
65
+ scale_width = self.__width / width
66
+
67
+ if self.__keep_aspect_ratio:
68
+ if self.__resize_method == "lower_bound":
69
+ # scale such that output size is lower bound
70
+ if scale_width > scale_height:
71
+ # fit width
72
+ scale_height = scale_width
73
+ else:
74
+ # fit height
75
+ scale_width = scale_height
76
+ elif self.__resize_method == "upper_bound":
77
+ # scale such that output size is upper bound
78
+ if scale_width < scale_height:
79
+ # fit width
80
+ scale_height = scale_width
81
+ else:
82
+ # fit height
83
+ scale_width = scale_height
84
+ elif self.__resize_method == "minimal":
85
+ # scale as little as possible
86
+ if abs(1 - scale_width) < abs(1 - scale_height):
87
+ # fit width
88
+ scale_height = scale_width
89
+ else:
90
+ # fit height
91
+ scale_width = scale_height
92
+ else:
93
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
94
+
95
+ if self.__resize_method == "lower_bound":
96
+ new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
97
+ new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
98
+ elif self.__resize_method == "upper_bound":
99
+ new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
100
+ new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
101
+ elif self.__resize_method == "minimal":
102
+ new_height = self.constrain_to_multiple_of(scale_height * height)
103
+ new_width = self.constrain_to_multiple_of(scale_width * width)
104
+ else:
105
+ raise ValueError(f"resize_method {self.__resize_method} not implemented")
106
+
107
+ return (new_width, new_height)
108
+
109
+ def __call__(self, sample):
110
+ width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])
111
+
112
+ # resize sample
113
+ sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)
114
+
115
+ if self.__resize_target:
116
+ if "depth" in sample:
117
+ sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)
118
+
119
+ if "mask" in sample:
120
+ sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height), interpolation=cv2.INTER_NEAREST)
121
+
122
+ return sample
123
+
124
+
125
+ class NormalizeImage(object):
126
+ """Normalize image by the given mean and std.
127
+ """
128
+
129
+ def __init__(self, mean, std):
130
+ self.__mean = mean
131
+ self.__std = std
132
+
133
+ def __call__(self, sample):
134
+ sample["image"] = (sample["image"] - self.__mean) / self.__std
135
+
136
+ return sample
137
+
138
+
139
+ class PrepareForNet(object):
140
+ """Prepare sample for usage as network input.
141
+ """
142
+
143
+ def __init__(self):
144
+ pass
145
+
146
+ def __call__(self, sample):
147
+ image = np.transpose(sample["image"], (2, 0, 1))
148
+ sample["image"] = np.ascontiguousarray(image).astype(np.float32)
149
+
150
+ if "depth" in sample:
151
+ depth = sample["depth"].astype(np.float32)
152
+ sample["depth"] = np.ascontiguousarray(depth)
153
+
154
+ if "mask" in sample:
155
+ sample["mask"] = sample["mask"].astype(np.float32)
156
+ sample["mask"] = np.ascontiguousarray(sample["mask"])
157
+
158
+ return sample
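With the settings used in dpt.image2tensor (lower_bound, keep_aspect_ratio, multiple of 14), the resize makes the short side at least 518 and snaps both sides to multiples of 14. A quick check of get_size, assuming the import path below:

import cv2
from finetune.modules.depth_warping.depth_anything_v2.util.transform import Resize  # assumed path

resize = Resize(
    width=518, height=518,
    resize_target=False, keep_aspect_ratio=True,
    ensure_multiple_of=14, resize_method='lower_bound',
    image_interpolation_method=cv2.INTER_CUBIC,
)
w, h = resize.get_size(640, 480)
print(int(w), int(h))   # 686 518 -- short side lands on 518, both sides are multiples of 14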
finetune/modules/depth_warping/depth_pro/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+ """Depth Pro package."""
3
+
4
+ from .depth_pro import create_model_and_transforms # noqa
5
+ from .utils import load_rgb # noqa
finetune/modules/depth_warping/depth_pro/cli/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+ """Depth Pro CLI and tools."""
3
+
4
+ from .run import main as run_main # noqa
finetune/modules/depth_warping/depth_pro/cli/run.py ADDED
@@ -0,0 +1,154 @@
1
+ #!/usr/bin/env python3
2
+ """Sample script to run DepthPro.
3
+
4
+ Copyright (C) 2024 Apple Inc. All Rights Reserved.
5
+ """
6
+
7
+
8
+ import argparse
9
+ import logging
10
+ from pathlib import Path
11
+
12
+ import numpy as np
13
+ import PIL.Image
14
+ import torch
15
+ from matplotlib import pyplot as plt
16
+ from tqdm import tqdm
17
+
18
+ from depth_pro import create_model_and_transforms, load_rgb
19
+
20
+ LOGGER = logging.getLogger(__name__)
21
+
22
+
23
+ def get_torch_device() -> torch.device:
24
+ """Get the Torch device."""
25
+ device = torch.device("cpu")
26
+ if torch.cuda.is_available():
27
+ device = torch.device("cuda:0")
28
+ elif torch.backends.mps.is_available():
29
+ device = torch.device("mps")
30
+ return device
31
+
32
+
33
+ def run(args):
34
+ """Run Depth Pro on a sample image."""
35
+ if args.verbose:
36
+ logging.basicConfig(level=logging.INFO)
37
+
38
+ # Load model.
39
+ model, transform = create_model_and_transforms(
40
+ device=get_torch_device(),
41
+ precision=torch.half,
42
+ )
43
+ model.eval()
44
+
45
+ image_paths = [args.image_path]
46
+ if args.image_path.is_dir():
47
+ image_paths = args.image_path.glob("**/*")
48
+ relative_path = args.image_path
49
+ else:
50
+ relative_path = args.image_path.parent
51
+
52
+ if not args.skip_display:
53
+ plt.ion()
54
+ fig = plt.figure()
55
+ ax_rgb = fig.add_subplot(121)
56
+ ax_disp = fig.add_subplot(122)
57
+
58
+ for image_path in tqdm(image_paths):
59
+ # Load image and focal length from exif info (if found).
60
+ try:
61
+ LOGGER.info(f"Loading image {image_path} ...")
62
+ image, _, f_px = load_rgb(image_path)
63
+ except Exception as e:
64
+ LOGGER.error(str(e))
65
+ continue
66
+ # Run prediction. If `f_px` is provided, it is used to estimate the final metric depth,
67
+ # otherwise the model estimates `f_px` to compute the depth metricness.
68
+ prediction = model.infer(transform(image), f_px=f_px)
69
+
70
+ # Extract the depth and focal length.
71
+ depth = prediction["depth"].detach().cpu().numpy().squeeze()
72
+ if f_px is not None:
73
+ LOGGER.debug(f"Focal length (from exif): {f_px:0.2f}")
74
+ elif prediction["focallength_px"] is not None:
75
+ focallength_px = prediction["focallength_px"].detach().cpu().item()
76
+ LOGGER.info(f"Estimated focal length: {focallength_px}")
77
+
78
+ inverse_depth = 1 / depth
79
+ # Visualize inverse depth instead of depth, clipped to [0.1m;250m] range for better visualization.
80
+ max_invdepth_vizu = min(inverse_depth.max(), 1 / 0.1)
81
+ min_invdepth_vizu = max(1 / 250, inverse_depth.min())
82
+ inverse_depth_normalized = (inverse_depth - min_invdepth_vizu) / (
83
+ max_invdepth_vizu - min_invdepth_vizu
84
+ )
85
+
86
+ # Save Depth as npz file.
87
+ if args.output_path is not None:
88
+ output_file = (
89
+ args.output_path
90
+ / image_path.relative_to(relative_path).parent
91
+ / image_path.stem
92
+ )
93
+ LOGGER.info(f"Saving depth map to: {str(output_file)}")
94
+ output_file.parent.mkdir(parents=True, exist_ok=True)
95
+ np.savez_compressed(output_file, depth=depth)
96
+
97
+ # Save as color-mapped "turbo" jpg image.
98
+ cmap = plt.get_cmap("turbo")
99
+ color_depth = (cmap(inverse_depth_normalized)[..., :3] * 255).astype(
100
+ np.uint8
101
+ )
102
+ color_map_output_file = str(output_file) + ".jpg"
103
+ LOGGER.info(f"Saving color-mapped depth to: {color_map_output_file}")
104
+ PIL.Image.fromarray(color_depth).save(
105
+ color_map_output_file, format="JPEG", quality=90
106
+ )
107
+
108
+ # Display the image and estimated depth map.
109
+ if not args.skip_display:
110
+ ax_rgb.imshow(image)
111
+ ax_disp.imshow(inverse_depth_normalized, cmap="turbo")
112
+ fig.canvas.draw()
113
+ fig.canvas.flush_events()
114
+
115
+ LOGGER.info("Done predicting depth!")
116
+ if not args.skip_display:
117
+ plt.show(block=True)
118
+
119
+
120
+ def main():
121
+ """Run DepthPro inference example."""
122
+ parser = argparse.ArgumentParser(
123
+ description="Inference scripts of DepthPro with PyTorch models."
124
+ )
125
+ parser.add_argument(
126
+ "-i",
127
+ "--image-path",
128
+ type=Path,
129
+ default="./data/example.jpg",
130
+ help="Path to input image.",
131
+ )
132
+ parser.add_argument(
133
+ "-o",
134
+ "--output-path",
135
+ type=Path,
136
+ help="Path to store output files.",
137
+ )
138
+ parser.add_argument(
139
+ "--skip-display",
140
+ action="store_true",
141
+ help="Skip matplotlib display.",
142
+ )
143
+ parser.add_argument(
144
+ "-v",
145
+ "--verbose",
146
+ action="store_true",
147
+ help="Show verbose output."
148
+ )
149
+
150
+ run(parser.parse_args())
151
+
152
+
153
+ if __name__ == "__main__":
154
+ main()
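The visualization step maps inverse depth, clipped to roughly the [0.1 m, 250 m] range, into [0, 1] before color-mapping. The normalization in isolation, on synthetic values:

import numpy as np

depth = np.random.uniform(0.5, 80.0, size=(4, 4))      # synthetic metric depth in meters
inverse_depth = 1 / depth
max_invdepth_vizu = min(inverse_depth.max(), 1 / 0.1)  # cap at 0.1 m (inverse depth 10)
min_invdepth_vizu = max(1 / 250, inverse_depth.min())  # floor at 250 m (inverse depth 0.004)
normalized = (inverse_depth - min_invdepth_vizu) / (max_invdepth_vizu - min_invdepth_vizu)
print(normalized.min(), normalized.max())              # stays within [0, 1] for depths in range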
finetune/modules/depth_warping/depth_pro/depth_pro.py ADDED
@@ -0,0 +1,298 @@
1
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
2
+ # Depth Pro: Sharp Monocular Metric Depth in Less Than a Second
3
+
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Mapping, Optional, Tuple, Union
9
+
10
+ import torch
11
+ from torch import nn
12
+ from torchvision.transforms import (
13
+ Compose,
14
+ ConvertImageDtype,
15
+ Lambda,
16
+ Normalize,
17
+ ToTensor,
18
+ )
19
+
20
+ from .network.decoder import MultiresConvDecoder
21
+ from .network.encoder import DepthProEncoder
22
+ from .network.fov import FOVNetwork
23
+ from .network.vit_factory import VIT_CONFIG_DICT, ViTPreset, create_vit
24
+
25
+
26
+ @dataclass
27
+ class DepthProConfig:
28
+ """Configuration for DepthPro."""
29
+
30
+ patch_encoder_preset: ViTPreset
31
+ image_encoder_preset: ViTPreset
32
+ decoder_features: int
33
+
34
+ checkpoint_uri: Optional[str] = None
35
+ fov_encoder_preset: Optional[ViTPreset] = None
36
+ use_fov_head: bool = True
37
+
38
+
39
+ DEFAULT_MONODEPTH_CONFIG_DICT = DepthProConfig(
40
+ patch_encoder_preset="dinov2l16_384",
41
+ image_encoder_preset="dinov2l16_384",
42
+ checkpoint_uri="./checkpoints/depth_pro.pt",
43
+ decoder_features=256,
44
+ use_fov_head=True,
45
+ fov_encoder_preset="dinov2l16_384",
46
+ )
47
+
48
+
49
+ def create_backbone_model(
50
+ preset: ViTPreset
51
+ ) -> Tuple[nn.Module, ViTPreset]:
52
+ """Create and load a backbone model given a config.
53
+
54
+ Args:
55
+ ----
56
+ preset: A backbone preset to load pre-defined configs.
57
+
58
+ Returns:
59
+ -------
60
+ A Torch module and the associated config.
61
+
62
+ """
63
+ if preset in VIT_CONFIG_DICT:
64
+ config = VIT_CONFIG_DICT[preset]
65
+ model = create_vit(preset=preset, use_pretrained=False)
66
+ else:
67
+ raise KeyError(f"Preset {preset} not found.")
68
+
69
+ return model, config
70
+
71
+
72
+ def create_model_and_transforms(
73
+ config: DepthProConfig = DEFAULT_MONODEPTH_CONFIG_DICT,
74
+ device: torch.device = torch.device("cpu"),
75
+ precision: torch.dtype = torch.float32,
76
+ ) -> Tuple[DepthPro, Compose]:
77
+ """Create a DepthPro model and load weights from `config.checkpoint_uri`.
78
+
79
+ Args:
80
+ ----
81
+ config: The configuration for the DPT model architecture.
82
+ device: The optional Torch device to load the model onto, default runs on "cpu".
83
+ precision: The optional precision used for the model, default is FP32.
84
+
85
+ Returns:
86
+ -------
87
+ The Torch DepthPro model and associated Transform.
88
+
89
+ """
90
+ patch_encoder, patch_encoder_config = create_backbone_model(
91
+ preset=config.patch_encoder_preset
92
+ )
93
+ image_encoder, _ = create_backbone_model(
94
+ preset=config.image_encoder_preset
95
+ )
96
+
97
+ fov_encoder = None
98
+ if config.use_fov_head and config.fov_encoder_preset is not None:
99
+ fov_encoder, _ = create_backbone_model(preset=config.fov_encoder_preset)
100
+
101
+ dims_encoder = patch_encoder_config.encoder_feature_dims
102
+ hook_block_ids = patch_encoder_config.encoder_feature_layer_ids
103
+ encoder = DepthProEncoder(
104
+ dims_encoder=dims_encoder,
105
+ patch_encoder=patch_encoder,
106
+ image_encoder=image_encoder,
107
+ hook_block_ids=hook_block_ids,
108
+ decoder_features=config.decoder_features,
109
+ )
110
+ decoder = MultiresConvDecoder(
111
+ dims_encoder=[config.decoder_features] + list(encoder.dims_encoder),
112
+ dim_decoder=config.decoder_features,
113
+ )
114
+ model = DepthPro(
115
+ encoder=encoder,
116
+ decoder=decoder,
117
+ last_dims=(32, 1),
118
+ use_fov_head=config.use_fov_head,
119
+ fov_encoder=fov_encoder,
120
+ ).to(device)
121
+
122
+ if precision == torch.half:
123
+ model.half()
124
+
125
+ transform = Compose(
126
+ [
127
+ ToTensor(),
128
+ Lambda(lambda x: x.to(device)),
129
+ Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
130
+ ConvertImageDtype(precision),
131
+ ]
132
+ )
133
+
134
+ if config.checkpoint_uri is not None:
135
+ state_dict = torch.load(config.checkpoint_uri, map_location="cpu")
136
+ missing_keys, unexpected_keys = model.load_state_dict(
137
+ state_dict=state_dict, strict=True
138
+ )
139
+
140
+ if len(unexpected_keys) != 0:
141
+ raise KeyError(
142
+ f"Found unexpected keys when loading monodepth: {unexpected_keys}"
143
+ )
144
+
145
+ # fc_norm is only for the classification head,
146
+ # which we would not use. We only use the encoding.
147
+ missing_keys = [key for key in missing_keys if "fc_norm" not in key]
148
+ if len(missing_keys) != 0:
149
+ raise KeyError(f"Keys are missing when loading monodepth: {missing_keys}")
150
+
151
+ return model, transform
152
+
153
+
154
+ class DepthPro(nn.Module):
155
+ """DepthPro network."""
156
+
157
+ def __init__(
158
+ self,
159
+ encoder: DepthProEncoder,
160
+ decoder: MultiresConvDecoder,
161
+ last_dims: tuple[int, int],
162
+ use_fov_head: bool = True,
163
+ fov_encoder: Optional[nn.Module] = None,
164
+ ):
165
+ """Initialize DepthPro.
166
+
167
+ Args:
168
+ ----
169
+ encoder: The DepthProEncoder backbone.
170
+ decoder: The MultiresConvDecoder decoder.
171
+ last_dims: The dimension for the last convolution layers.
172
+ use_fov_head: Whether to use the field-of-view head.
173
+ fov_encoder: A separate encoder for the field of view.
174
+
175
+ """
176
+ super().__init__()
177
+
178
+ self.encoder = encoder
179
+ self.decoder = decoder
180
+
181
+ dim_decoder = decoder.dim_decoder
182
+ self.head = nn.Sequential(
183
+ nn.Conv2d(
184
+ dim_decoder, dim_decoder // 2, kernel_size=3, stride=1, padding=1
185
+ ),
186
+ nn.ConvTranspose2d(
187
+ in_channels=dim_decoder // 2,
188
+ out_channels=dim_decoder // 2,
189
+ kernel_size=2,
190
+ stride=2,
191
+ padding=0,
192
+ bias=True,
193
+ ),
194
+ nn.Conv2d(
195
+ dim_decoder // 2,
196
+ last_dims[0],
197
+ kernel_size=3,
198
+ stride=1,
199
+ padding=1,
200
+ ),
201
+ nn.ReLU(True),
202
+ nn.Conv2d(last_dims[0], last_dims[1], kernel_size=1, stride=1, padding=0),
203
+ nn.ReLU(),
204
+ )
205
+
206
+ # Set the final convolution layer's bias to be 0.
207
+ self.head[4].bias.data.fill_(0)
208
+
209
+ # Set the FOV estimation head.
210
+ if use_fov_head:
211
+ self.fov = FOVNetwork(num_features=dim_decoder, fov_encoder=fov_encoder)
212
+
213
+ @property
214
+ def img_size(self) -> int:
215
+ """Return the internal image size of the network."""
216
+ return self.encoder.img_size
217
+
218
+ def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
219
+ """Decode by projection and fusion of multi-resolution encodings.
220
+
221
+ Args:
222
+ ----
223
+ x (torch.Tensor): Input image.
224
+
225
+ Returns:
226
+ -------
227
+ The canonical inverse depth map [m] and the optional estimated field of view [deg].
228
+
229
+ """
230
+ _, _, H, W = x.shape
231
+ assert H == self.img_size and W == self.img_size
232
+
233
+ encodings = self.encoder(x)
234
+ features, features_0 = self.decoder(encodings)
235
+ canonical_inverse_depth = self.head(features)
236
+
237
+ fov_deg = None
238
+ if hasattr(self, "fov"):
239
+ fov_deg = self.fov.forward(x, features_0.detach())
240
+
241
+ return canonical_inverse_depth, fov_deg
242
+
243
+ @torch.no_grad()
244
+ def infer(
245
+ self,
246
+ x: torch.Tensor,
247
+ f_px: Optional[Union[float, torch.Tensor]] = None,
248
+ interpolation_mode="bilinear",
249
+ ) -> Mapping[str, torch.Tensor]:
250
+ """Infer depth and fov for a given image.
251
+
252
+ If the image is not at network resolution, it is resized to 1536x1536 and
253
+ the estimated depth is resized to the original image resolution.
254
+ Note: if the focal length is given, the estimated value is ignored and the provided
255
+ focal length is use to generate the metric depth values.
256
+
257
+ Args:
258
+ ----
259
+ x (torch.Tensor): Input image
260
+ f_px (torch.Tensor): Optional focal length in pixels corresponding to `x`.
261
+ interpolation_mode (str): Interpolation function for downsampling/upsampling.
262
+
263
+ Returns:
264
+ -------
265
+ Tensor dictionary (torch.Tensor): depth [m], focallength [pixels].
266
+
267
+ """
268
+ if len(x.shape) == 3:
269
+ x = x.unsqueeze(0)
270
+ _, _, H, W = x.shape
271
+ resize = H != self.img_size or W != self.img_size
272
+
273
+ if resize:
274
+ x = nn.functional.interpolate(
275
+ x,
276
+ size=(self.img_size, self.img_size),
277
+ mode=interpolation_mode,
278
+ align_corners=False,
279
+ )
280
+
281
+ canonical_inverse_depth, fov_deg = self.forward(x)
282
+ if f_px is None:
283
+ f_px = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_deg.to(torch.float)))
284
+
285
+ inverse_depth = canonical_inverse_depth * (W / f_px)
286
+ f_px = f_px.squeeze()
287
+
288
+ if resize:
289
+ inverse_depth = nn.functional.interpolate(
290
+ inverse_depth, size=(H, W), mode=interpolation_mode, align_corners=False
291
+ )
292
+
293
+ depth = 1.0 / torch.clamp(inverse_depth, min=1e-4, max=1e4)
294
+
295
+ return {
296
+ "depth": depth.squeeze(),
297
+ "focallength_px": f_px,
298
+ }
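When no focal length is supplied, infer derives it from the predicted field of view as f_px = 0.5 * W / tan(0.5 * fov). A small numeric round trip of that formula (the 60-degree FOV is just an example value):

import torch

W = 1536.0                                    # network input width in pixels
fov_deg = torch.tensor(60.0)                  # example predicted horizontal field of view
f_px = 0.5 * W / torch.tan(0.5 * torch.deg2rad(fov_deg))
print(float(f_px))                            # ~1330.2 px

fov_back = torch.rad2deg(2 * torch.atan(0.5 * W / f_px))
print(float(fov_back))                        # ~60.0 deg -- the inverse mapping recovers the FOV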
finetune/modules/depth_warping/depth_pro/eval/boundary_metrics.py ADDED
@@ -0,0 +1,332 @@
1
+ from typing import List, Tuple
2
+
3
+ import numpy as np
4
+
5
+
6
+ def connected_component(r: np.ndarray, c: np.ndarray) -> List[List[int]]:
7
+ """Find connected components in the given row and column indices.
8
+
9
+ Args:
10
+ ----
11
+ r (np.ndarray): Row indices.
12
+ c (np.ndarray): Column indices.
13
+
14
+ Yields:
15
+ ------
16
+ List[int]: Indices of connected components.
17
+
18
+ """
19
+ indices = [0]
20
+ for i in range(1, r.size):
21
+ if r[i] == r[indices[-1]] and c[i] == c[indices[-1]] + 1:
22
+ indices.append(i)
23
+ else:
24
+ yield indices
25
+ indices = [i]
26
+ yield indices
27
+
28
+
29
+ def nms_horizontal(ratio: np.ndarray, threshold: float) -> np.ndarray:
30
+ """Apply Non-Maximum Suppression (NMS) horizontally on the given ratio matrix.
31
+
32
+ Args:
33
+ ----
34
+ ratio (np.ndarray): Input ratio matrix.
35
+ threshold (float): Threshold for NMS.
36
+
37
+ Returns:
38
+ -------
39
+ np.ndarray: Binary mask after applying NMS.
40
+
41
+ """
42
+ mask = np.zeros_like(ratio, dtype=bool)
43
+ r, c = np.nonzero(ratio > threshold)
44
+ if len(r) == 0:
45
+ return mask
46
+ for ids in connected_component(r, c):
47
+ values = [ratio[r[i], c[i]] for i in ids]
48
+ mi = np.argmax(values)
49
+ mask[r[ids[mi]], c[ids[mi]]] = True
50
+ return mask
51
+
52
+
53
+ def nms_vertical(ratio: np.ndarray, threshold: float) -> np.ndarray:
54
+ """Apply Non-Maximum Suppression (NMS) vertically on the given ratio matrix.
55
+
56
+ Args:
57
+ ----
58
+ ratio (np.ndarray): Input ratio matrix.
59
+ threshold (float): Threshold for NMS.
60
+
61
+ Returns:
62
+ -------
63
+ np.ndarray: Binary mask after applying NMS.
64
+
65
+ """
66
+ return np.transpose(nms_horizontal(np.transpose(ratio), threshold))
67
+
68
+
69
+ def fgbg_depth(
70
+ d: np.ndarray, t: float
71
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
72
+ """Find foreground-background relations between neighboring pixels.
73
+
74
+ Args:
75
+ ----
76
+ d (np.ndarray): Depth matrix.
77
+ t (float): Threshold for comparison.
78
+
79
+ Returns:
80
+ -------
81
+ Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating
82
+ left, top, right, and bottom foreground-background relations.
83
+
84
+ """
85
+ right_is_big_enough = (d[..., :, 1:] / d[..., :, :-1]) > t
86
+ left_is_big_enough = (d[..., :, :-1] / d[..., :, 1:]) > t
87
+ bottom_is_big_enough = (d[..., 1:, :] / d[..., :-1, :]) > t
88
+ top_is_big_enough = (d[..., :-1, :] / d[..., 1:, :]) > t
89
+ return (
90
+ left_is_big_enough,
91
+ top_is_big_enough,
92
+ right_is_big_enough,
93
+ bottom_is_big_enough,
94
+ )
95
+
96
+
97
+ def fgbg_depth_thinned(
98
+ d: np.ndarray, t: float
99
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
100
+ """Find foreground-background relations between neighboring pixels with Non-Maximum Suppression.
101
+
102
+ Args:
103
+ ----
104
+ d (np.ndarray): Depth matrix.
105
+ t (float): Threshold for NMS.
106
+
107
+ Returns:
108
+ -------
109
+ Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating
110
+ left, top, right, and bottom foreground-background relations with NMS applied.
111
+
112
+ """
113
+ right_is_big_enough = nms_horizontal(d[..., :, 1:] / d[..., :, :-1], t)
114
+ left_is_big_enough = nms_horizontal(d[..., :, :-1] / d[..., :, 1:], t)
115
+ bottom_is_big_enough = nms_vertical(d[..., 1:, :] / d[..., :-1, :], t)
116
+ top_is_big_enough = nms_vertical(d[..., :-1, :] / d[..., 1:, :], t)
117
+ return (
118
+ left_is_big_enough,
119
+ top_is_big_enough,
120
+ right_is_big_enough,
121
+ bottom_is_big_enough,
122
+ )
123
+
124
+
125
+ def fgbg_binary_mask(
126
+ d: np.ndarray,
127
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
128
+ """Find foreground-background relations between neighboring pixels in binary masks.
129
+
130
+ Args:
131
+ ----
132
+ d (np.ndarray): Binary depth matrix.
133
+
134
+ Returns:
135
+ -------
136
+ Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: Four matrices indicating
137
+ left, top, right, and bottom foreground-background relations in binary masks.
138
+
139
+ """
140
+ assert d.dtype == bool
141
+ right_is_big_enough = d[..., :, 1:] & ~d[..., :, :-1]
142
+ left_is_big_enough = d[..., :, :-1] & ~d[..., :, 1:]
143
+ bottom_is_big_enough = d[..., 1:, :] & ~d[..., :-1, :]
144
+ top_is_big_enough = d[..., :-1, :] & ~d[..., 1:, :]
145
+ return (
146
+ left_is_big_enough,
147
+ top_is_big_enough,
148
+ right_is_big_enough,
149
+ bottom_is_big_enough,
150
+ )
151
+
152
+
153
+ def edge_recall_matting(pr: np.ndarray, gt: np.ndarray, t: float) -> float:
154
+ """Calculate edge recall for image matting.
155
+
156
+ Args:
157
+ ----
158
+ pr (np.ndarray): Predicted depth matrix.
159
+ gt (np.ndarray): Ground truth binary mask.
160
+ t (float): Threshold for NMS.
161
+
162
+ Returns:
163
+ -------
164
+ float: Edge recall value.
165
+
166
+ """
167
+ assert gt.dtype == bool
168
+ ap, bp, cp, dp = fgbg_depth_thinned(pr, t)
169
+ ag, bg, cg, dg = fgbg_binary_mask(gt)
170
+ return 0.25 * (
171
+ np.count_nonzero(ap & ag) / max(np.count_nonzero(ag), 1)
172
+ + np.count_nonzero(bp & bg) / max(np.count_nonzero(bg), 1)
173
+ + np.count_nonzero(cp & cg) / max(np.count_nonzero(cg), 1)
174
+ + np.count_nonzero(dp & dg) / max(np.count_nonzero(dg), 1)
175
+ )
176
+
177
+
178
+ def boundary_f1(
179
+ pr: np.ndarray,
180
+ gt: np.ndarray,
181
+ t: float,
182
+ return_p: bool = False,
183
+ return_r: bool = False,
184
+ ) -> float:
185
+ """Calculate Boundary F1 score.
186
+
187
+ Args:
188
+ ----
189
+ pr (np.ndarray): Predicted depth matrix.
190
+ gt (np.ndarray): Ground truth depth matrix.
191
+ t (float): Threshold for comparison.
192
+ return_p (bool, optional): If True, return precision. Defaults to False.
193
+ return_r (bool, optional): If True, return recall. Defaults to False.
194
+
195
+ Returns:
196
+ -------
197
+ float: Boundary F1 score, or precision, or recall depending on the flags.
198
+
199
+ """
200
+ ap, bp, cp, dp = fgbg_depth(pr, t)
201
+ ag, bg, cg, dg = fgbg_depth(gt, t)
202
+
203
+ r = 0.25 * (
204
+ np.count_nonzero(ap & ag) / max(np.count_nonzero(ag), 1)
205
+ + np.count_nonzero(bp & bg) / max(np.count_nonzero(bg), 1)
206
+ + np.count_nonzero(cp & cg) / max(np.count_nonzero(cg), 1)
207
+ + np.count_nonzero(dp & dg) / max(np.count_nonzero(dg), 1)
208
+ )
209
+ p = 0.25 * (
210
+ np.count_nonzero(ap & ag) / max(np.count_nonzero(ap), 1)
211
+ + np.count_nonzero(bp & bg) / max(np.count_nonzero(bp), 1)
212
+ + np.count_nonzero(cp & cg) / max(np.count_nonzero(cp), 1)
213
+ + np.count_nonzero(dp & dg) / max(np.count_nonzero(dp), 1)
214
+ )
215
+ if r + p == 0:
216
+ return 0.0
217
+ if return_p:
218
+ return p
219
+ if return_r:
220
+ return r
221
+ return 2 * (r * p) / (r + p)
222
+
223
+
224
+ def get_thresholds_and_weights(
225
+ t_min: float, t_max: float, N: int
226
+ ) -> Tuple[np.ndarray, np.ndarray]:
227
+ """Generate thresholds and weights for the given range.
228
+
229
+ Args:
230
+ ----
231
+ t_min (float): Minimum threshold.
232
+ t_max (float): Maximum threshold.
233
+ N (int): Number of thresholds.
234
+
235
+ Returns:
236
+ -------
237
+ Tuple[np.ndarray, np.ndarray]: Array of thresholds and corresponding weights.
238
+
239
+ """
240
+ thresholds = np.linspace(t_min, t_max, N)
241
+ weights = thresholds / thresholds.sum()
242
+ return thresholds, weights
243
+
244
+
245
+ def invert_depth(depth: np.ndarray, eps: float = 1e-6) -> np.ndarray:
246
+ """Inverts a depth map with numerical stability.
247
+
248
+ Args:
249
+ ----
250
+ depth (np.ndarray): Depth map to be inverted.
251
+ eps (float): Minimum value to avoid division by zero (default is 1e-6).
252
+
253
+ Returns:
254
+ -------
255
+ np.ndarray: Inverted depth map.
256
+
257
+ """
258
+ inverse_depth = 1.0 / depth.clip(min=eps)
259
+ return inverse_depth
260
+
261
+
262
+ def SI_boundary_F1(
263
+ predicted_depth: np.ndarray,
264
+ target_depth: np.ndarray,
265
+ t_min: float = 1.05,
266
+ t_max: float = 1.25,
267
+ N: int = 10,
268
+ ) -> float:
269
+ """Calculate Scale-Invariant Boundary F1 Score for depth-based ground-truth.
270
+
271
+ Args:
272
+ ----
273
+ predicted_depth (np.ndarray): Predicted depth matrix.
274
+ target_depth (np.ndarray): Ground truth depth matrix.
275
+ t_min (float, optional): Minimum threshold. Defaults to 1.05.
276
+ t_max (float, optional): Maximum threshold. Defaults to 1.25.
277
+ N (int, optional): Number of thresholds. Defaults to 10.
278
+
279
+ Returns:
280
+ -------
281
+ float: Scale-Invariant Boundary F1 Score.
282
+
283
+ """
284
+ assert predicted_depth.ndim == target_depth.ndim == 2
285
+ thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)
286
+ f1_scores = np.array(
287
+ [
288
+ boundary_f1(invert_depth(predicted_depth), invert_depth(target_depth), t)
289
+ for t in thresholds
290
+ ]
291
+ )
292
+ return np.sum(f1_scores * weights)
293
+
294
+
295
+ def SI_boundary_Recall(
296
+ predicted_depth: np.ndarray,
297
+ target_mask: np.ndarray,
298
+ t_min: float = 1.05,
299
+ t_max: float = 1.25,
300
+ N: int = 10,
301
+ alpha_threshold: float = 0.1,
302
+ ) -> float:
303
+ """Calculate Scale-Invariant Boundary Recall Score for mask-based ground-truth.
304
+
305
+ Args:
306
+ ----
307
+ predicted_depth (np.ndarray): Predicted depth matrix.
308
+ target_mask (np.ndarray): Ground truth binary mask.
309
+ t_min (float, optional): Minimum threshold. Defaults to 1.05.
310
+ t_max (float, optional): Maximum threshold. Defaults to 1.25.
311
+ N (int, optional): Number of thresholds. Defaults to 10.
312
+ alpha_threshold (float, optional): Threshold for alpha masking. Defaults to 0.1.
313
+
314
+ Returns:
315
+ -------
316
+ float: Scale-Invariant Boundary Recall Score.
317
+
318
+ """
319
+ assert predicted_depth.ndim == target_mask.ndim == 2
320
+ thresholds, weights = get_thresholds_and_weights(t_min, t_max, N)
321
+ thresholded_target = target_mask > alpha_threshold
322
+
323
+ recall_scores = np.array(
324
+ [
325
+ edge_recall_matting(
326
+ invert_depth(predicted_depth), thresholded_target, t=float(t)
327
+ )
328
+ for t in thresholds
329
+ ]
330
+ )
331
+ weighted_recall = np.sum(recall_scores * weights)
332
+ return weighted_recall
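A hedged usage sketch of SI_boundary_F1 on synthetic depth maps with one sharp vertical step; the import path is an assumption. Note that the metric averages over four edge directions, so a perfectly matched one-directional step scores 0.25 rather than 1.0:

import numpy as np
from finetune.modules.depth_warping.depth_pro.eval.boundary_metrics import SI_boundary_F1  # assumed path

gt = np.full((64, 64), 10.0)
gt[:, 32:] = 2.0                        # sharp far-to-near step at column 32
pred_good = gt.copy()                   # reproduces the step exactly
pred_flat = np.full_like(gt, 5.0)       # predicts no depth discontinuity at all

print(SI_boundary_F1(pred_good, gt))    # ~0.25: only the 'right' direction carries ground-truth edges
print(SI_boundary_F1(pred_flat, gt))    # 0.0: no boundaries predicted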
finetune/modules/depth_warping/depth_pro/eval/dis5k_sample_list.txt ADDED
@@ -0,0 +1,200 @@
1
+ DIS5K/DIS-TE1/im/12#Graphics#4#TrafficSign#8245751856_821be14f86_o.jpg
2
+ DIS5K/DIS-TE1/im/13#Insect#4#Butterfly#16023994688_7ff8cdccb1_o.jpg
3
+ DIS5K/DIS-TE1/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205538.jpg
4
+ DIS5K/DIS-TE1/im/14#Kitchenware#8#SweetStand#4848284981_fc90f54b50_o.jpg
5
+ DIS5K/DIS-TE1/im/17#Non-motor Vehicle#4#Cart#15012855035_d10b57014f_o.jpg
6
+ DIS5K/DIS-TE1/im/2#Aircraft#5#Kite#13104545564_5afceec9bd_o.jpg
7
+ DIS5K/DIS-TE1/im/20#Sports#10#Skateboarding#8472763540_bb2390e928_o.jpg
8
+ DIS5K/DIS-TE1/im/21#Tool#14#Sword#32473146960_dcc6b77848_o.jpg
9
+ DIS5K/DIS-TE1/im/21#Tool#15#Tapeline#9680492386_2d2020f282_o.jpg
10
+ DIS5K/DIS-TE1/im/21#Tool#4#Flag#507752845_ef852100f0_o.jpg
11
+ DIS5K/DIS-TE1/im/21#Tool#6#Key#11966089533_3becd78b44_o.jpg
12
+ DIS5K/DIS-TE1/im/21#Tool#8#Scale#31946428472_d28def471b_o.jpg
13
+ DIS5K/DIS-TE1/im/22#Weapon#4#Rifle#8472656430_3eb908b211_o.jpg
14
+ DIS5K/DIS-TE1/im/8#Electronics#3#Earphone#1177468301_641df8c267_o.jpg
15
+ DIS5K/DIS-TE1/im/8#Electronics#9#MusicPlayer#2235782872_7d47847bb4_o.jpg
16
+ DIS5K/DIS-TE2/im/11#Furniture#13#Ladder#3878434417_2ed740586e_o.jpg
17
+ DIS5K/DIS-TE2/im/13#Insect#1#Ant#27047700955_3b3a1271f8_o.jpg
18
+ DIS5K/DIS-TE2/im/13#Insect#11#Spider#5567179191_38d1f65589_o.jpg
19
+ DIS5K/DIS-TE2/im/13#Insect#8#Locust#5237933769_e6687c05e4_o.jpg
20
+ DIS5K/DIS-TE2/im/14#Kitchenware#2#DishRack#70838854_40cf689da7_o.jpg
21
+ DIS5K/DIS-TE2/im/14#Kitchenware#8#SweetStand#8467929412_fef7f4275d_o.jpg
22
+ DIS5K/DIS-TE2/im/16#Music Instrument#2#Harp#28058219806_28e05ff24a_o.jpg
23
+ DIS5K/DIS-TE2/im/17#Non-motor Vehicle#1#BabyCarriage#29794777180_2e1695a0cf_o.jpg
24
+ DIS5K/DIS-TE2/im/19#Ship#3#Sailboat#22442908623_5977e3becf_o.jpg
25
+ DIS5K/DIS-TE2/im/2#Aircraft#5#Kite#44654358051_1400e71cc4_o.jpg
26
+ DIS5K/DIS-TE2/im/21#Tool#11#Stand#IMG_20210520_205442.jpg
27
+ DIS5K/DIS-TE2/im/21#Tool#17#Tripod#9318977876_34615ec9a0_o.jpg
28
+ DIS5K/DIS-TE2/im/5#Artifact#3#Handcraft#50860882577_8482143b1b_o.jpg
29
+ DIS5K/DIS-TE2/im/8#Electronics#10#Robot#3093360210_fee54dc5c5_o.jpg
30
+ DIS5K/DIS-TE2/im/8#Electronics#6#Microphone#47411477652_6da66cbc10_o.jpg
31
+ DIS5K/DIS-TE3/im/14#Kitchenware#4#Kitchenware#2451122898_ef883175dd_o.jpg
32
+ DIS5K/DIS-TE3/im/15#Machine#4#SewingMachine#9311164128_97ba1d3947_o.jpg
33
+ DIS5K/DIS-TE3/im/16#Music Instrument#2#Harp#7670920550_59e992fd7b_o.jpg
34
+ DIS5K/DIS-TE3/im/17#Non-motor Vehicle#1#BabyCarriage#8389984877_1fddf8715c_o.jpg
35
+ DIS5K/DIS-TE3/im/17#Non-motor Vehicle#3#Carriage#5947122724_98e0fc3d1f_o.jpg
36
+ DIS5K/DIS-TE3/im/2#Aircraft#2#Balloon#2487168092_641505883f_o.jpg
37
+ DIS5K/DIS-TE3/im/2#Aircraft#4#Helicopter#8401177591_06c71c8df2_o.jpg
38
+ DIS5K/DIS-TE3/im/20#Sports#1#Archery#12520003103_faa43ea3e0_o.jpg
39
+ DIS5K/DIS-TE3/im/21#Tool#11#Stand#IMG_20210709_221507.jpg
40
+ DIS5K/DIS-TE3/im/21#Tool#2#Clip#5656649687_63d0c6696d_o.jpg
41
+ DIS5K/DIS-TE3/im/21#Tool#6#Key#12878459244_6387a140ea_o.jpg
42
+ DIS5K/DIS-TE3/im/3#Aquatic#1#Lobster#109214461_f52b4b6093_o.jpg
43
+ DIS5K/DIS-TE3/im/4#Architecture#19#Windmill#20195851863_2627117e0e_o.jpg
44
+ DIS5K/DIS-TE3/im/5#Artifact#2#Cage#5821476369_ea23927487_o.jpg
45
+ DIS5K/DIS-TE3/im/8#Electronics#7#MobileHolder#49732997896_7f53c290b5_o.jpg
46
+ DIS5K/DIS-TE4/im/13#Insect#6#Centipede#15302179708_a267850881_o.jpg
47
+ DIS5K/DIS-TE4/im/17#Non-motor Vehicle#11#Tricycle#5771069105_a3aef6f665_o.jpg
48
+ DIS5K/DIS-TE4/im/17#Non-motor Vehicle#2#Bicycle#4245936196_fdf812dcb7_o.jpg
49
+ DIS5K/DIS-TE4/im/17#Non-motor Vehicle#9#ShoppingCart#4674052920_a5b7a2b236_o.jpg
50
+ DIS5K/DIS-TE4/im/18#Plant#1#Bonsai#3539420884_ca8973e2c0_o.jpg
51
+ DIS5K/DIS-TE4/im/2#Aircraft#6#Parachute#33590416634_9d6f2325e7_o.jpg
52
+ DIS5K/DIS-TE4/im/20#Sports#1#Archery#46924476515_0be1caa684_o.jpg
53
+ DIS5K/DIS-TE4/im/20#Sports#8#Racket#19337607166_dd1985fb59_o.jpg
54
+ DIS5K/DIS-TE4/im/21#Tool#6#Key#3193329588_839b0c74ce_o.jpg
55
+ DIS5K/DIS-TE4/im/5#Artifact#2#Cage#5821886526_0573ba2d0d_o.jpg
56
+ DIS5K/DIS-TE4/im/5#Artifact#3#Handcraft#50105138282_3c1d02c968_o.jpg
57
+ DIS5K/DIS-TE4/im/8#Electronics#1#Antenna#4305034305_874f21a701_o.jpg
58
+ DIS5K/DIS-TR/im/1#Accessories#1#Bag#15554964549_3105e51b6f_o.jpg
59
+ DIS5K/DIS-TR/im/1#Accessories#1#Bag#41104261980_098a6c4a56_o.jpg
60
+ DIS5K/DIS-TR/im/1#Accessories#2#Clothes#2284764037_871b2e8ca4_o.jpg
61
+ DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#1824643784_70d0134156_o.jpg
62
+ DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#3590020230_37b09a29b3_o.jpg
63
+ DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#4809652879_4da8a69f3b_o.jpg
64
+ DIS5K/DIS-TR/im/1#Accessories#3#Eyeglasses#792204934_f9b28f99b4_o.jpg
65
+ DIS5K/DIS-TR/im/1#Accessories#5#Jewelry#13909132974_c4750c5fb7_o.jpg
66
+ DIS5K/DIS-TR/im/1#Accessories#7#Shoe#2483391615_9199ece8d6_o.jpg
67
+ DIS5K/DIS-TR/im/1#Accessories#8#Watch#4343266960_f6633b029b_o.jpg
68
+ DIS5K/DIS-TR/im/10#Frame#2#BicycleFrame#17897573_42964dd104_o.jpg
69
+ DIS5K/DIS-TR/im/10#Frame#5#Rack#15898634812_64807069ff_o.jpg
70
+ DIS5K/DIS-TR/im/10#Frame#5#Rack#23928546819_c184cb0b60_o.jpg
71
+ DIS5K/DIS-TR/im/11#Furniture#19#Shower#6189119596_77bcfe80ee_o.jpg
72
+ DIS5K/DIS-TR/im/11#Furniture#2#Bench#3263647075_9306e280b5_o.jpg
73
+ DIS5K/DIS-TR/im/11#Furniture#5#CoatHanger#12774091054_cd5ff520ef_o.jpg
74
+ DIS5K/DIS-TR/im/11#Furniture#6#DentalChair#13878156865_d0439dcb32_o.jpg
75
+ DIS5K/DIS-TR/im/11#Furniture#9#Easel#5861024714_2070cd480c_o.jpg
76
+ DIS5K/DIS-TR/im/12#Graphics#4#TrafficSign#40621867334_f3c32ec189_o.jpg
77
+ DIS5K/DIS-TR/im/13#Insect#1#Ant#3295038190_db5dd0d4f4_o.jpg
78
+ DIS5K/DIS-TR/im/13#Insect#10#Mosquito#24341339_a88a1dad4c_o.jpg
79
+ DIS5K/DIS-TR/im/13#Insect#11#Spider#27171518270_63b78069ff_o.jpg
80
+ DIS5K/DIS-TR/im/13#Insect#11#Spider#49925050281_fa727c154e_o.jpg
81
+ DIS5K/DIS-TR/im/13#Insect#2#Beatle#279616486_2f1e64f591_o.jpg
82
+ DIS5K/DIS-TR/im/13#Insect#3#Bee#43892067695_82cf3e536b_o.jpg
83
+ DIS5K/DIS-TR/im/13#Insect#6#Centipede#20874281788_3e15c90a1c_o.jpg
84
+ DIS5K/DIS-TR/im/13#Insect#7#Dragonfly#14106671120_1b824d77e4_o.jpg
85
+ DIS5K/DIS-TR/im/13#Insect#8#Locust#21637491048_676ef7c9f7_o.jpg
86
+ DIS5K/DIS-TR/im/13#Insect#9#Mantis#1381120202_9dff6987b2_o.jpg
87
+ DIS5K/DIS-TR/im/14#Kitchenware#1#Cup#12812517473_327d6474b8_o.jpg
88
+ DIS5K/DIS-TR/im/14#Kitchenware#10#WineGlass#6402491641_389275d4d1_o.jpg
89
+ DIS5K/DIS-TR/im/14#Kitchenware#3#Hydrovalve#3129932040_8c05825004_o.jpg
90
+ DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#2881934780_87d5218ebb_o.jpg
91
+ DIS5K/DIS-TR/im/14#Kitchenware#4#Kitchenware#IMG_20210520_205527.jpg
92
+ DIS5K/DIS-TR/im/14#Kitchenware#6#Spoon#32989113501_b69eccf0df_o.jpg
93
+ DIS5K/DIS-TR/im/14#Kitchenware#8#SweetStand#2867322189_c56d1e0b87_o.jpg
94
+ DIS5K/DIS-TR/im/15#Machine#1#Gear#19217846720_f5f2807475_o.jpg
95
+ DIS5K/DIS-TR/im/15#Machine#2#Machine#1620160659_9571b7a7ab_o.jpg
96
+ DIS5K/DIS-TR/im/16#Music Instrument#2#Harp#6012801603_1a6e2c16a6_o.jpg
97
+ DIS5K/DIS-TR/im/16#Music Instrument#5#Trombone#8683292118_d223c17ccb_o.jpg
98
+ DIS5K/DIS-TR/im/16#Music Instrument#6#Trumpet#8393262740_b8c216142c_o.jpg
99
+ DIS5K/DIS-TR/im/16#Music Instrument#8#Violin#1511267391_40e4949d68_o.jpg
100
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#1#BabyCarriage#6989512997_38b3dbc88b_o.jpg
101
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#14627183228_b2d68cf501_o.jpg
102
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#2932226475_1b2403e549_o.jpg
103
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#12#Wheel#5420155648_86459905b8_o.jpg
104
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#2#Bicycle#IMG_20210513_134904.jpg
105
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#3#Carriage#3311962551_6f211b7bd6_o.jpg
106
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#4#Cart#2609732026_baf7fff3a1_o.jpg
107
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#5#Handcart#5821282211_201cefeaf2_o.jpg
108
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#7#Mower#5779003232_3bb3ae531a_o.jpg
109
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#10051622843_ace07e32b8_o.jpg
110
+ DIS5K/DIS-TR/im/17#Non-motor Vehicle#9#ShoppingCart#8075259294_f23e243849_o.jpg
111
+ DIS5K/DIS-TR/im/18#Plant#2#Tree#44800999741_e377e16dbb_o.jpg
112
+ DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#2631761913_3ac67d0223_o.jpg
113
+ DIS5K/DIS-TR/im/2#Aircraft#1#Airplane#37707911566_e908a261b6_o.jpg
114
+ DIS5K/DIS-TR/im/2#Aircraft#3#HangGlider#2557220131_b8506920c5_o.jpg
115
+ DIS5K/DIS-TR/im/2#Aircraft#4#Helicopter#6215659280_5dbd9b4546_o.jpg
116
+ DIS5K/DIS-TR/im/2#Aircraft#6#Parachute#20185790493_e56fcaf8c6_o.jpg
117
+ DIS5K/DIS-TR/im/20#Sports#1#Archery#3871269982_ae4c59a7eb_o.jpg
118
+ DIS5K/DIS-TR/im/20#Sports#9#RockClimbing#9662433268_51299bc50e_o.jpg
119
+ DIS5K/DIS-TR/im/21#Tool#14#Sword#26258479365_2950d7fa37_o.jpg
120
+ DIS5K/DIS-TR/im/21#Tool#15#Tapeline#15505703447_e0fdeaa5a6_o.jpg
121
+ DIS5K/DIS-TR/im/21#Tool#4#Flag#26678602024_9b665742de_o.jpg
122
+ DIS5K/DIS-TR/im/21#Tool#4#Flag#5774823110_d603ce3cc8_o.jpg
123
+ DIS5K/DIS-TR/im/21#Tool#5#Hook#6867989814_dba18d673c_o.jpg
124
+ DIS5K/DIS-TR/im/22#Weapon#4#Rifle#4451713125_cd91719189_o.jpg
125
+ DIS5K/DIS-TR/im/3#Aquatic#2#Seadragon#4910944581_913139b238_o.jpg
126
+ DIS5K/DIS-TR/im/4#Architecture#12#Scaffold#3661448960_8aff24cc4d_o.jpg
127
+ DIS5K/DIS-TR/im/4#Architecture#13#Sculpture#6385318715_9a88d4eba7_o.jpg
128
+ DIS5K/DIS-TR/im/4#Architecture#17#Well#5011603479_75cf42808a_o.jpg
129
+ DIS5K/DIS-TR/im/5#Artifact#2#Cage#4892828841_7f1bc05682_o.jpg
130
+ DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#15404211628_9e9ff2ce2e_o.jpg
131
+ DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#3200169865_7c84cfcccf_o.jpg
132
+ DIS5K/DIS-TR/im/5#Artifact#3#Handcraft#5859295071_c217e7c22f_o.jpg
133
+ DIS5K/DIS-TR/im/6#Automobile#10#SteeringWheel#17200338026_f1e2122d8e_o.jpg
134
+ DIS5K/DIS-TR/im/6#Automobile#3#Car#3780893425_1a7d275e09_o.jpg
135
+ DIS5K/DIS-TR/im/6#Automobile#5#Crane#15282506502_1b1132a7c3_o.jpg
136
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#16767791875_8e6df41752_o.jpg
137
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#3291433361_38747324c4_o.jpg
138
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#4195104238_12a754c61a_o.jpg
139
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#49645415132_61e5664ecf_o.jpg
140
+ DIS5K/DIS-TR/im/7#Electrical#1#Cable#IMG_20210521_232406.jpg
141
+ DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#3298312021_92f431e3e9_o.jpg
142
+ DIS5K/DIS-TR/im/7#Electrical#10#UtilityPole#47950134773_fbfff63f4e_o.jpg
143
+ DIS5K/DIS-TR/im/7#Electrical#11#VacuumCleaner#5448403677_6a29e21881_o.jpg
144
+ DIS5K/DIS-TR/im/7#Electrical#2#CeilingLamp#611568868_680ed5d39f_o.jpg
145
+ DIS5K/DIS-TR/im/7#Electrical#3#Fan#3391683115_990525a693_o.jpg
146
+ DIS5K/DIS-TR/im/7#Electrical#6#StreetLamp#150049122_0692266618_o.jpg
147
+ DIS5K/DIS-TR/im/7#Electrical#9#TransmissionTower#31433908671_7e7e277dfe_o.jpg
148
+ DIS5K/DIS-TR/im/8#Electronics#1#Antenna#8727884873_e0622ee5c4_o.jpg
149
+ DIS5K/DIS-TR/im/8#Electronics#2#Camcorder#4172690390_7e5f280ace_o.jpg
150
+ DIS5K/DIS-TR/im/8#Electronics#3#Earphone#413984555_f290febdf5_o.jpg
151
+ DIS5K/DIS-TR/im/8#Electronics#5#Headset#30574225373_3717ed9fa4_o.jpg
152
+ DIS5K/DIS-TR/im/8#Electronics#6#Microphone#538006482_4aae4f5bd6_o.jpg
153
+ DIS5K/DIS-TR/im/8#Electronics#9#MusicPlayer#1306012480_2ea80d2afd_o.jpg
154
+ DIS5K/DIS-TR/im/9#Entertainment#1#GymEquipment#33071754135_8f3195cbd1_o.jpg
155
+ DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#2305807849_be53d724ea_o.jpg
156
+ DIS5K/DIS-TR/im/9#Entertainment#2#KidsPlayground#3862040422_5bbf903204_o.jpg
157
+ DIS5K/DIS-TR/im/9#Entertainment#3#OutdoorFitnessEquipment#10814507005_3dacaa28b3_o.jpg
158
+ DIS5K/DIS-TR/im/9#Entertainment#4#FerrisWheel#81640293_4b0ee62040_o.jpg
159
+ DIS5K/DIS-TR/im/9#Entertainment#5#Swing#49867339188_08073f4b76_o.jpg
160
+ DIS5K/DIS-VD/im/1#Accessories#1#Bag#6815402415_e01c1a41e6_o.jpg
161
+ DIS5K/DIS-VD/im/1#Accessories#5#Jewelry#2744070193_1486582e8d_o.jpg
162
+ DIS5K/DIS-VD/im/10#Frame#1#BasketballHoop#IMG_20210521_232650.jpg
163
+ DIS5K/DIS-VD/im/10#Frame#5#Rack#6156611713_49ebf12b1e_o.jpg
164
+ DIS5K/DIS-VD/im/11#Furniture#11#Handrail#3276641240_1b84b5af85_o.jpg
165
+ DIS5K/DIS-VD/im/11#Furniture#13#Ladder#33423266_5391cf47e9_o.jpg
166
+ DIS5K/DIS-VD/im/11#Furniture#17#Table#3725111755_4fc101e7ab_o.jpg
167
+ DIS5K/DIS-VD/im/11#Furniture#2#Bench#35556410400_7235b58070_o.jpg
168
+ DIS5K/DIS-VD/im/11#Furniture#4#Chair#3301769985_e49de6739f_o.jpg
169
+ DIS5K/DIS-VD/im/11#Furniture#6#DentalChair#23811071619_2a95c3a688_o.jpg
170
+ DIS5K/DIS-VD/im/11#Furniture#9#Easel#8322807354_df6d56542e_o.jpg
171
+ DIS5K/DIS-VD/im/13#Insect#10#Mosquito#12391674863_0cdf430d3f_o.jpg
172
+ DIS5K/DIS-VD/im/13#Insect#7#Dragonfly#14693028899_344ea118f2_o.jpg
173
+ DIS5K/DIS-VD/im/14#Kitchenware#10#WineGlass#4450148455_8f460f541a_o.jpg
174
+ DIS5K/DIS-VD/im/14#Kitchenware#3#Hydrovalve#IMG_20210520_203410.jpg
175
+ DIS5K/DIS-VD/im/15#Machine#3#PlowHarrow#34521712846_df4babb024_o.jpg
176
+ DIS5K/DIS-VD/im/16#Music Instrument#5#Trombone#6222242743_e7189405cd_o.jpg
177
+ DIS5K/DIS-VD/im/17#Non-motor Vehicle#12#Wheel#25677578797_ea47e1d9e8_o.jpg
178
+ DIS5K/DIS-VD/im/17#Non-motor Vehicle#2#Bicycle#5153474856_21560b081b_o.jpg
179
+ DIS5K/DIS-VD/im/17#Non-motor Vehicle#7#Mower#16992510572_8a6ff27398_o.jpg
180
+ DIS5K/DIS-VD/im/19#Ship#2#Canoe#40571458163_7faf8b73d9_o.jpg
181
+ DIS5K/DIS-VD/im/2#Aircraft#1#Airplane#4270588164_66a619e834_o.jpg
182
+ DIS5K/DIS-VD/im/2#Aircraft#4#Helicopter#86789665_650b94b2ee_o.jpg
183
+ DIS5K/DIS-VD/im/20#Sports#14#Wakesurfing#5589577652_5061c168d2_o.jpg
184
+ DIS5K/DIS-VD/im/21#Tool#10#Spade#37018312543_63b21b0784_o.jpg
185
+ DIS5K/DIS-VD/im/21#Tool#14#Sword#24789047250_42df9bf422_o.jpg
186
+ DIS5K/DIS-VD/im/21#Tool#18#Umbrella#IMG_20210513_140445.jpg
187
+ DIS5K/DIS-VD/im/21#Tool#6#Key#43939732715_5a6e28b518_o.jpg
188
+ DIS5K/DIS-VD/im/22#Weapon#1#Cannon#12758066705_90b54295e7_o.jpg
189
+ DIS5K/DIS-VD/im/22#Weapon#4#Rifle#8019368790_fb6dc469a7_o.jpg
190
+ DIS5K/DIS-VD/im/3#Aquatic#5#Shrimp#2582833427_7a99e7356e_o.jpg
191
+ DIS5K/DIS-VD/im/4#Architecture#12#Scaffold#1013402687_590750354e_o.jpg
192
+ DIS5K/DIS-VD/im/4#Architecture#13#Sculpture#17176841759_272a3ed6e3_o.jpg
193
+ DIS5K/DIS-VD/im/4#Architecture#14#Stair#15079108505_0d11281624_o.jpg
194
+ DIS5K/DIS-VD/im/4#Architecture#19#Windmill#2928111082_ceb3051c04_o.jpg
195
+ DIS5K/DIS-VD/im/4#Architecture#3#Crack#3551574032_17dd106d31_o.jpg
196
+ DIS5K/DIS-VD/im/4#Architecture#5#GasStation#4564307581_c3069bdc62_o.jpg
197
+ DIS5K/DIS-VD/im/4#Architecture#8#ObservationTower#2704526950_d4f0ddc807_o.jpg
198
+ DIS5K/DIS-VD/im/5#Artifact#3#Handcraft#10873642323_1bafce3aa5_o.jpg
199
+ DIS5K/DIS-VD/im/6#Automobile#11#Tractor#8594504006_0c2c557d85_o.jpg
200
+ DIS5K/DIS-VD/im/8#Electronics#3#Earphone#8106454803_1178d867cc_o.jpg
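Each entry in this sample list follows a fixed naming scheme: DIS5K/<split>/im/<category-id>#<Category>#<subcategory-id>#<SubCategory>#<image-name>.jpg. A minimal parsing sketch in Python (the field names below are labels inferred from the paths above, not something the dataset file itself defines):

from pathlib import PurePosixPath

def parse_dis5k_entry(entry: str) -> dict:
    """Split one sample-list line into its split name and '#'-delimited fields."""
    path = PurePosixPath(entry)
    split = path.parts[1]  # e.g. "DIS-TR", "DIS-VD", "DIS-TE2"
    cat_id, category, sub_id, subcategory, image = path.name.split("#", 4)
    return {
        "split": split,
        "category_id": int(cat_id),
        "category": category,
        "subcategory_id": int(sub_id),
        "subcategory": subcategory,
        "image": image,
    }

# Example with an entry taken from the list above.
print(parse_dis5k_entry("DIS5K/DIS-TE3/im/2#Aircraft#2#Balloon#2487168092_641505883f_o.jpg"))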
finetune/modules/depth_warping/depth_pro/network/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # Copyright (C) 2024 Apple Inc. All Rights Reserved.
+ """Depth Pro network blocks."""
finetune/modules/depth_warping/depth_pro/network/decoder.py ADDED
@@ -0,0 +1,206 @@
+ """Copyright (C) 2024 Apple Inc. All Rights Reserved.
+
+ Dense Prediction Transformer Decoder architecture.
+
+ Implements a variant of Vision Transformers for Dense Prediction, https://arxiv.org/abs/2103.13413
+ """
+
+ from __future__ import annotations
+
+ from typing import Iterable
+
+ import torch
+ from torch import nn
+
+
+ class MultiresConvDecoder(nn.Module):
+     """Decoder for multi-resolution encodings."""
+
+     def __init__(
+         self,
+         dims_encoder: Iterable[int],
+         dim_decoder: int,
+     ):
+         """Initialize multiresolution convolutional decoder.
+
+         Args:
+         ----
+             dims_encoder: Expected dims at each level from the encoder.
+             dim_decoder: Dim of decoder features.
+
+         """
+         super().__init__()
+         self.dims_encoder = list(dims_encoder)
+         self.dim_decoder = dim_decoder
+         self.dim_out = dim_decoder
+
+         num_encoders = len(self.dims_encoder)
+
+         # At the highest resolution, i.e. level 0, we apply projection w/ 1x1 convolution
+         # when the dimensions mismatch. Otherwise we do not do anything, which is
+         # the default behavior of monodepth.
+         conv0 = (
+             nn.Conv2d(self.dims_encoder[0], dim_decoder, kernel_size=1, bias=False)
+             if self.dims_encoder[0] != dim_decoder
+             else nn.Identity()
+         )
+
+         convs = [conv0]
+         for i in range(1, num_encoders):
+             convs.append(
+                 nn.Conv2d(
+                     self.dims_encoder[i],
+                     dim_decoder,
+                     kernel_size=3,
+                     stride=1,
+                     padding=1,
+                     bias=False,
+                 )
+             )
+
+         self.convs = nn.ModuleList(convs)
+
+         fusions = []
+         for i in range(num_encoders):
+             fusions.append(
+                 FeatureFusionBlock2d(
+                     num_features=dim_decoder,
+                     deconv=(i != 0),
+                     batch_norm=False,
+                 )
+             )
+         self.fusions = nn.ModuleList(fusions)
+
+     def forward(self, encodings: list[torch.Tensor]) -> tuple[torch.Tensor, torch.Tensor]:
+         """Decode the multi-resolution encodings into (features, lowres_features)."""
+         num_levels = len(encodings)
+         num_encoders = len(self.dims_encoder)
+
+         if num_levels != num_encoders:
+             raise ValueError(
+                 f"Got encoder output levels={num_levels}, expected levels={num_encoders}."
+             )
+
+         # Project features of different encoder dims to the same decoder dim.
+         # Fuse features from the lowest resolution (num_levels-1)
+         # to the highest (0).
+         features = self.convs[-1](encodings[-1])
+         lowres_features = features
+         features = self.fusions[-1](features)
+         for i in range(num_levels - 2, -1, -1):
+             features_i = self.convs[i](encodings[i])
+             features = self.fusions[i](features, features_i)
+         return features, lowres_features
+
+
+ class ResidualBlock(nn.Module):
+     """Generic implementation of residual blocks.
+
+     This implements a generic residual block from
+     He et al. - Identity Mappings in Deep Residual Networks (2016),
+     https://arxiv.org/abs/1603.05027
+     which can be further customized via factory functions.
+     """
+
+     def __init__(self, residual: nn.Module, shortcut: nn.Module | None = None) -> None:
+         """Initialize ResidualBlock."""
+         super().__init__()
+         self.residual = residual
+         self.shortcut = shortcut
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         """Apply residual block."""
+         delta_x = self.residual(x)
+
+         if self.shortcut is not None:
+             x = self.shortcut(x)
+
+         return x + delta_x
+
+
+ class FeatureFusionBlock2d(nn.Module):
+     """Feature fusion for DPT."""
+
+     def __init__(
+         self,
+         num_features: int,
+         deconv: bool = False,
+         batch_norm: bool = False,
+     ):
+         """Initialize feature fusion block.
+
+         Args:
+         ----
+             num_features: Input and output dimensions.
+             deconv: Whether to use deconv before the final output conv.
+             batch_norm: Whether to use batch normalization in resnet blocks.
+
+         """
+         super().__init__()
+
+         self.resnet1 = self._residual_block(num_features, batch_norm)
+         self.resnet2 = self._residual_block(num_features, batch_norm)
+
+         self.use_deconv = deconv
+         if deconv:
+             self.deconv = nn.ConvTranspose2d(
+                 in_channels=num_features,
+                 out_channels=num_features,
+                 kernel_size=2,
+                 stride=2,
+                 padding=0,
+                 bias=False,
+             )
+
+         self.out_conv = nn.Conv2d(
+             num_features,
+             num_features,
+             kernel_size=1,
+             stride=1,
+             padding=0,
+             bias=True,
+         )
+
+         self.skip_add = nn.quantized.FloatFunctional()
+
+     def forward(self, x0: torch.Tensor, x1: torch.Tensor | None = None) -> torch.Tensor:
+         """Process and fuse input features."""
+         x = x0
+
+         if x1 is not None:
+             res = self.resnet1(x1)
+             x = self.skip_add.add(x, res)
+
+         x = self.resnet2(x)
+
+         if self.use_deconv:
+             x = self.deconv(x)
+         x = self.out_conv(x)
+
+         return x
+
+     @staticmethod
+     def _residual_block(num_features: int, batch_norm: bool):
+         """Create a residual block."""
+
+         def _create_block(dim: int, batch_norm: bool) -> list[nn.Module]:
+             layers = [
+                 nn.ReLU(False),
+                 nn.Conv2d(
+                     num_features,
+                     num_features,
+                     kernel_size=3,
+                     stride=1,
+                     padding=1,
+                     bias=not batch_norm,
+                 ),
+             ]
+             if batch_norm:
+                 layers.append(nn.BatchNorm2d(dim))
+             return layers
+
+         residual = nn.Sequential(
+             *_create_block(dim=num_features, batch_norm=batch_norm),
+             *_create_block(dim=num_features, batch_norm=batch_norm),
+         )
+         return ResidualBlock(residual)
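A minimal usage sketch for the decoder added above (not part of the commit). The encoder dims, tensor shapes, batch size, and dotted import path are illustrative assumptions; the real constraint is that each 2x deconv in a fusion block must land on the spatial size of the next finer encoder level:

import torch

from finetune.modules.depth_warping.depth_pro.network.decoder import MultiresConvDecoder

# Hypothetical encoder channel dims, ordered from level 0 (finest) to level 3 (coarsest).
decoder = MultiresConvDecoder(dims_encoder=[256, 512, 1024, 1024], dim_decoder=256)

# Illustrative multi-resolution features; each level halves the spatial size of the previous one.
encodings = [
    torch.randn(1, 256, 192, 192),   # level 0
    torch.randn(1, 512, 96, 96),     # level 1
    torch.randn(1, 1024, 48, 48),    # level 2
    torch.randn(1, 1024, 24, 24),    # level 3
]

features, lowres_features = decoder(encodings)
print(features.shape)         # torch.Size([1, 256, 192, 192]), fused and upsampled back to level 0
print(lowres_features.shape)  # torch.Size([1, 256, 24, 24]), coarsest projected features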