qitaoz committed
Commit 4562a06 · verified · 1 Parent(s): 253f3e1

Upload 57 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +22 -0
  2. LICENSE +21 -0
  3. assets/demo.png +3 -0
  4. conf/config.yaml +83 -0
  5. conf/diffusion.yml +110 -0
  6. data/demo/jellycat/001.jpg +3 -0
  7. data/demo/jellycat/002.jpg +3 -0
  8. data/demo/jellycat/003.jpg +3 -0
  9. data/demo/jellycat/004.jpg +3 -0
  10. data/demo/jordan/001.png +3 -0
  11. data/demo/jordan/002.png +3 -0
  12. data/demo/jordan/003.png +3 -0
  13. data/demo/jordan/004.png +3 -0
  14. data/demo/jordan/005.png +3 -0
  15. data/demo/jordan/006.png +3 -0
  16. data/demo/jordan/007.png +3 -0
  17. data/demo/jordan/008.png +3 -0
  18. data/demo/kew_gardens_ruined_arch/001.jpeg +3 -0
  19. data/demo/kew_gardens_ruined_arch/002.jpeg +3 -0
  20. data/demo/kew_gardens_ruined_arch/003.jpeg +3 -0
  21. data/demo/kotor_cathedral/001.jpeg +3 -0
  22. data/demo/kotor_cathedral/002.jpeg +3 -0
  23. data/demo/kotor_cathedral/003.jpeg +3 -0
  24. data/demo/kotor_cathedral/004.jpeg +3 -0
  25. data/demo/kotor_cathedral/005.jpeg +3 -0
  26. data/demo/kotor_cathedral/006.jpeg +3 -0
  27. diffusionsfm/__init__.py +1 -0
  28. diffusionsfm/dataset/__init__.py +0 -0
  29. diffusionsfm/dataset/co3d_v2.py +792 -0
  30. diffusionsfm/dataset/custom.py +105 -0
  31. diffusionsfm/eval/__init__.py +0 -0
  32. diffusionsfm/eval/eval_category.py +292 -0
  33. diffusionsfm/eval/eval_jobs.py +175 -0
  34. diffusionsfm/inference/__init__.py +0 -0
  35. diffusionsfm/inference/ddim.py +145 -0
  36. diffusionsfm/inference/load_model.py +97 -0
  37. diffusionsfm/inference/predict.py +93 -0
  38. diffusionsfm/model/base_model.py +16 -0
  39. diffusionsfm/model/blocks.py +247 -0
  40. diffusionsfm/model/diffuser.py +195 -0
  41. diffusionsfm/model/diffuser_dpt.py +331 -0
  42. diffusionsfm/model/dit.py +428 -0
  43. diffusionsfm/model/feature_extractors.py +176 -0
  44. diffusionsfm/model/memory_efficient_attention.py +51 -0
  45. diffusionsfm/model/scheduler.py +128 -0
  46. diffusionsfm/utils/__init__.py +0 -0
  47. diffusionsfm/utils/configs.py +66 -0
  48. diffusionsfm/utils/distortion.py +144 -0
  49. diffusionsfm/utils/distributed.py +31 -0
  50. diffusionsfm/utils/geometry.py +145 -0
.gitattributes CHANGED
@@ -33,3 +33,25 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/demo.png filter=lfs diff=lfs merge=lfs -text
+ data/demo/jellycat/001.jpg filter=lfs diff=lfs merge=lfs -text
+ data/demo/jellycat/002.jpg filter=lfs diff=lfs merge=lfs -text
+ data/demo/jellycat/003.jpg filter=lfs diff=lfs merge=lfs -text
+ data/demo/jellycat/004.jpg filter=lfs diff=lfs merge=lfs -text
+ data/demo/jordan/001.png filter=lfs diff=lfs merge=lfs -text
+ data/demo/jordan/002.png filter=lfs diff=lfs merge=lfs -text
+ data/demo/jordan/003.png filter=lfs diff=lfs merge=lfs -text
+ data/demo/jordan/004.png filter=lfs diff=lfs merge=lfs -text
+ data/demo/jordan/005.png filter=lfs diff=lfs merge=lfs -text
+ data/demo/jordan/006.png filter=lfs diff=lfs merge=lfs -text
+ data/demo/jordan/007.png filter=lfs diff=lfs merge=lfs -text
+ data/demo/jordan/008.png filter=lfs diff=lfs merge=lfs -text
+ data/demo/kew_gardens_ruined_arch/001.jpeg filter=lfs diff=lfs merge=lfs -text
+ data/demo/kew_gardens_ruined_arch/002.jpeg filter=lfs diff=lfs merge=lfs -text
+ data/demo/kew_gardens_ruined_arch/003.jpeg filter=lfs diff=lfs merge=lfs -text
+ data/demo/kotor_cathedral/001.jpeg filter=lfs diff=lfs merge=lfs -text
+ data/demo/kotor_cathedral/002.jpeg filter=lfs diff=lfs merge=lfs -text
+ data/demo/kotor_cathedral/003.jpeg filter=lfs diff=lfs merge=lfs -text
+ data/demo/kotor_cathedral/004.jpeg filter=lfs diff=lfs merge=lfs -text
+ data/demo/kotor_cathedral/005.jpeg filter=lfs diff=lfs merge=lfs -text
+ data/demo/kotor_cathedral/006.jpeg filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Qitao Zhao
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
assets/demo.png ADDED

Git LFS Details

  • SHA256: f5021efbf6bf2ad1de68447a1e9d313581422b79c4f460ccb94654d5c08bb83c
  • Pointer size: 132 Bytes
  • Size of remote file: 1.25 MB
conf/config.yaml ADDED
@@ -0,0 +1,83 @@
+ training:
+   resume: False # If True, must set hydra.run.dir accordingly
+   pretrain_path: ""
+   interval_visualize: 1000
+   interval_save_checkpoint: 5000
+   interval_delete_checkpoint: 10000
+   interval_evaluate: 5000
+   delete_all_checkpoints_after_training: False
+   lr: 1e-4
+   mixed_precision: True
+   matmul_precision: high
+   max_iterations: 100000
+   batch_size: 64
+   num_workers: 8
+   gpu_id: 0
+   freeze_encoder: True
+   seed: 0
+   job_key: "" # Use this for submitit sweeps where timestamps might collide
+   translation_scale: 1.0
+   regression: False
+   prob_unconditional: 0
+   load_extra_cameras: False
+   calculate_intrinsics: False
+   distort: False
+   normalize_first_camera: True
+   diffuse_origins_and_endpoints: True
+   diffuse_depths: False
+   depth_resolution: 1
+   dpt_head: False
+   full_num_patches_x: 16
+   full_num_patches_y: 16
+   dpt_encoder_features: True
+   nearest_neighbor: True
+   no_bg_targets: True
+   unit_normalize_scene: False
+   sd_scale: 2
+   bfloat: True
+   first_cam_mediod: True
+   gradient_clipping: False
+   l1_loss: False
+   grad_accumulation: False
+   reinit: False
+
+ model:
+   pred_x0: True
+   model_type: dit
+   num_patches_x: 16
+   num_patches_y: 16
+   depth: 16
+   num_images: 1
+   random_num_images: True
+   feature_extractor: dino
+   append_ndc: True
+   within_image: False
+   use_homogeneous: True
+   freeze_transformer: False
+   cond_depth_mask: True
+
+ noise_scheduler:
+   type: linear
+   max_timesteps: 100
+   beta_start: 0.0120
+   beta_end: 0.00085
+   marigold_ddim: False
+
+ dataset:
+   name: co3d
+   shape: all_train
+   apply_augmentation: True
+   use_global_intrinsics: True
+   mask_holes: True
+   image_size: 224
+
+ debug:
+   wandb: True
+   project_name: diffusionsfm
+   run_name:
+   anomaly_detection: False
+
+ hydra:
+   run:
+     dir: ./output/${now:%m%d_%H%M%S_%f}${training.job_key}
+   output_subdir: hydra
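
For quick orientation (not part of this commit), the config above can be inspected directly with OmegaConf, which Hydra uses under the hood; the keys shown are the ones the training code reads, e.g. training.batch_size and model.feature_extractor. A minimal sketch, assuming the file is read standalone rather than composed by Hydra:

    from omegaconf import OmegaConf

    cfg = OmegaConf.load("conf/config.yaml")
    print(cfg.training.batch_size)            # 64
    print(cfg.model.feature_extractor)        # dino
    print(cfg.noise_scheduler.max_timesteps)  # 100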
conf/diffusion.yml ADDED
@@ -0,0 +1,110 @@
1
+ name: diffusion
2
+ channels:
3
+ - conda-forge
4
+ - iopath
5
+ - nvidia
6
+ - pkgs/main
7
+ - pytorch
8
+ - xformers
9
+ dependencies:
10
+ - _libgcc_mutex=0.1=conda_forge
11
+ - _openmp_mutex=4.5=2_gnu
12
+ - blas=1.0=mkl
13
+ - brotli-python=1.0.9=py39h5a03fae_9
14
+ - bzip2=1.0.8=h7f98852_4
15
+ - ca-certificates=2023.7.22=hbcca054_0
16
+ - certifi=2023.7.22=pyhd8ed1ab_0
17
+ - charset-normalizer=3.2.0=pyhd8ed1ab_0
18
+ - colorama=0.4.6=pyhd8ed1ab_0
19
+ - cuda-cudart=11.7.99=0
20
+ - cuda-cupti=11.7.101=0
21
+ - cuda-libraries=11.7.1=0
22
+ - cuda-nvrtc=11.7.99=0
23
+ - cuda-nvtx=11.7.91=0
24
+ - cuda-runtime=11.7.1=0
25
+ - ffmpeg=4.3=hf484d3e_0
26
+ - filelock=3.12.2=pyhd8ed1ab_0
27
+ - freetype=2.12.1=hca18f0e_1
28
+ - fvcore=0.1.5.post20221221=pyhd8ed1ab_0
29
+ - gmp=6.2.1=h58526e2_0
30
+ - gmpy2=2.1.2=py39h376b7d2_1
31
+ - gnutls=3.6.13=h85f3911_1
32
+ - idna=3.4=pyhd8ed1ab_0
33
+ - intel-openmp=2022.1.0=h9e868ea_3769
34
+ - iopath=0.1.9=py39
35
+ - jinja2=3.1.2=pyhd8ed1ab_1
36
+ - jpeg=9e=h0b41bf4_3
37
+ - lame=3.100=h166bdaf_1003
38
+ - lcms2=2.15=hfd0df8a_0
39
+ - ld_impl_linux-64=2.40=h41732ed_0
40
+ - lerc=4.0.0=h27087fc_0
41
+ - libblas=3.9.0=16_linux64_mkl
42
+ - libcblas=3.9.0=16_linux64_mkl
43
+ - libcublas=11.10.3.66=0
44
+ - libcufft=10.7.2.124=h4fbf590_0
45
+ - libcufile=1.7.1.12=0
46
+ - libcurand=10.3.3.129=0
47
+ - libcusolver=11.4.0.1=0
48
+ - libcusparse=11.7.4.91=0
49
+ - libdeflate=1.17=h0b41bf4_0
50
+ - libffi=3.3=h58526e2_2
51
+ - libgcc-ng=13.1.0=he5830b7_0
52
+ - libgomp=13.1.0=he5830b7_0
53
+ - libiconv=1.17=h166bdaf_0
54
+ - liblapack=3.9.0=16_linux64_mkl
55
+ - libnpp=11.7.4.75=0
56
+ - libnvjpeg=11.8.0.2=0
57
+ - libpng=1.6.39=h753d276_0
58
+ - libsqlite=3.42.0=h2797004_0
59
+ - libstdcxx-ng=13.1.0=hfd8a6a1_0
60
+ - libtiff=4.5.0=h6adf6a1_2
61
+ - libwebp-base=1.3.1=hd590300_0
62
+ - libxcb=1.13=h7f98852_1004
63
+ - libzlib=1.2.13=hd590300_5
64
+ - markupsafe=2.1.3=py39hd1e30aa_0
65
+ - mkl=2022.1.0=hc2b9512_224
66
+ - mpc=1.3.1=hfe3b2da_0
67
+ - mpfr=4.2.0=hb012696_0
68
+ - mpmath=1.3.0=pyhd8ed1ab_0
69
+ - ncurses=6.4=hcb278e6_0
70
+ - nettle=3.6=he412f7d_0
71
+ - networkx=3.1=pyhd8ed1ab_0
72
+ - numpy=1.25.2=py39h6183b62_0
73
+ - openh264=2.1.1=h780b84a_0
74
+ - openjpeg=2.5.0=hfec8fc6_2
75
+ - openssl=1.1.1v=hd590300_0
76
+ - pillow=9.4.0=py39h2320bf1_1
77
+ - pip=23.2.1=pyhd8ed1ab_0
78
+ - portalocker=2.7.0=py39hf3d152e_0
79
+ - pthread-stubs=0.4=h36c2ea0_1001
80
+ - pysocks=1.7.1=pyha2e5f31_6
81
+ - python=3.9.0=hffdb5ce_5_cpython
82
+ - python_abi=3.9=3_cp39
83
+ - pytorch=2.0.1=py3.9_cuda11.7_cudnn8.5.0_0
84
+ - pytorch-cuda=11.7=h778d358_5
85
+ - pytorch-mutex=1.0=cuda
86
+ - pyyaml=6.0=py39hb9d737c_5
87
+ - readline=8.2=h8228510_1
88
+ - requests=2.31.0=pyhd8ed1ab_0
89
+ - setuptools=68.0.0=pyhd8ed1ab_0
90
+ - sqlite=3.42.0=h2c6b66d_0
91
+ - sympy=1.12=pypyh9d50eac_103
92
+ - tabulate=0.9.0=pyhd8ed1ab_1
93
+ - termcolor=2.3.0=pyhd8ed1ab_0
94
+ - tk=8.6.12=h27826a3_0
95
+ - torchaudio=2.0.2=py39_cu117
96
+ - torchtriton=2.0.0=py39
97
+ - torchvision=0.15.2=py39_cu117
98
+ - tqdm=4.66.1=pyhd8ed1ab_0
99
+ - typing_extensions=4.7.1=pyha770c72_0
100
+ - tzdata=2023c=h71feb2d_0
101
+ - urllib3=2.0.4=pyhd8ed1ab_0
102
+ - wheel=0.41.1=pyhd8ed1ab_0
103
+ - xformers=0.0.21=py39_cu11.8.0_pyt2.0.1
104
+ - xorg-libxau=1.0.11=hd590300_0
105
+ - xorg-libxdmcp=1.1.3=h7f98852_0
106
+ - xz=5.2.6=h166bdaf_0
107
+ - yacs=0.1.8=pyhd8ed1ab_0
108
+ - yaml=0.2.5=h7f98852_2
109
+ - zlib=1.2.13=hd590300_5
110
+ - zstd=1.5.2=hfc55251_7
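
The environment file pins PyTorch 2.0.1 against CUDA 11.7 with xformers 0.0.21. A small sanity check (not part of the repo) that the environment created from conf/diffusion.yml resolved to those pins:

    import torch
    import xformers

    # Versions pinned in conf/diffusion.yml
    assert torch.__version__.startswith("2.0.1"), torch.__version__
    assert torch.version.cuda == "11.7", torch.version.cuda
    print("xformers", xformers.__version__)  # expected 0.0.21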
data/demo/jellycat/001.jpg ADDED

Git LFS Details

  • SHA256: bb252fabcd6588b924266098efbf0538c4cc77f6fd623166a3328692ea04b221
  • Pointer size: 132 Bytes
  • Size of remote file: 6.91 MB
data/demo/jellycat/002.jpg ADDED

Git LFS Details

  • SHA256: 4e36092e3ef63d3de0d9645ce001829c248b2c5ae78011c3578276d4f0009ce6
  • Pointer size: 132 Bytes
  • Size of remote file: 6.86 MB
data/demo/jellycat/003.jpg ADDED

Git LFS Details

  • SHA256: 6fcd32e046e04b809529c202f594a49a210d1fcd38a4664ca619704dd317b550
  • Pointer size: 131 Bytes
  • Size of remote file: 169 kB
data/demo/jellycat/004.jpg ADDED

Git LFS Details

  • SHA256: 082007c67949ce96af89d34fbb3dd8a6eeca4d000e4dc39a920215881ee5a4e1
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
data/demo/jordan/001.png ADDED

Git LFS Details

  • SHA256: dff6883afa87339f94ac9d2b07a61e01f9107f2e37a7bda326956f209a5b1c61
  • Pointer size: 131 Bytes
  • Size of remote file: 128 kB
data/demo/jordan/002.png ADDED

Git LFS Details

  • SHA256: bee5060ab4b105fd9383398ab47dc1caa2dd329f9e83ae310bb068870d445270
  • Pointer size: 131 Bytes
  • Size of remote file: 126 kB
data/demo/jordan/003.png ADDED

Git LFS Details

  • SHA256: 34353a07643bb0f6dcc8d1a40d1658e393998af73db0cd17e392558581840c03
  • Pointer size: 131 Bytes
  • Size of remote file: 135 kB
data/demo/jordan/004.png ADDED

Git LFS Details

  • SHA256: c671d0fb4ff49d59e6b044e8a673ad6e9293337423f6db1345c3e1c45f0c7427
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
data/demo/jordan/005.png ADDED

Git LFS Details

  • SHA256: aefe2f7ca57407dad2a7ce86759b86698d536d5f6c7fd9f97776d3905bb3ce19
  • Pointer size: 131 Bytes
  • Size of remote file: 115 kB
data/demo/jordan/006.png ADDED

Git LFS Details

  • SHA256: f5f23767c8a3830921e1c5299cc8373daacf9af68e96c596df5b8edbdc0f4836
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
data/demo/jordan/007.png ADDED

Git LFS Details

  • SHA256: c61521925f93ec2721a02c4c5b4898171985147c763702cb5f9a7efbb341cc2d
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
data/demo/jordan/008.png ADDED

Git LFS Details

  • SHA256: bb1e7c3d5fc1ad0067d2d28d9f83e5d5243010353f0ec0fd12f196cf5939f231
  • Pointer size: 131 Bytes
  • Size of remote file: 143 kB
data/demo/kew_gardens_ruined_arch/001.jpeg ADDED

Git LFS Details

  • SHA256: 96dfde51e8d0857120387e3b81fff665b9b94524a6f1a9f35246faed4f3e8986
  • Pointer size: 131 Bytes
  • Size of remote file: 624 kB
data/demo/kew_gardens_ruined_arch/002.jpeg ADDED

Git LFS Details

  • SHA256: 8c2c07e43d51594fbbea708fce0040fbe6b5ecd4e01c8b10898a2f71f3abf186
  • Pointer size: 131 Bytes
  • Size of remote file: 590 kB
data/demo/kew_gardens_ruined_arch/003.jpeg ADDED

Git LFS Details

  • SHA256: bfeea8fcb46fcbb0d77450227927851472a73a714c62de21e75b4d60a3dba317
  • Pointer size: 131 Bytes
  • Size of remote file: 586 kB
data/demo/kotor_cathedral/001.jpeg ADDED

Git LFS Details

  • SHA256: 732a5d344ddcfc2e50a97abc3792bb444cf40b82a93638f2d52d955f6595c90a
  • Pointer size: 131 Bytes
  • Size of remote file: 617 kB
data/demo/kotor_cathedral/002.jpeg ADDED

Git LFS Details

  • SHA256: e2dc18fde3559ae7333351ded6d765b217a5b754fd828087c5f88cbc11c84793
  • Pointer size: 131 Bytes
  • Size of remote file: 760 kB
data/demo/kotor_cathedral/003.jpeg ADDED

Git LFS Details

  • SHA256: b100bdbc2d9943151424b86e504ece203f15ff5d616dd4c515fabc7b3d39d11c
  • Pointer size: 131 Bytes
  • Size of remote file: 697 kB
data/demo/kotor_cathedral/004.jpeg ADDED

Git LFS Details

  • SHA256: 26d6433fa500c03bd982a3abaf2f8028d26a9726e409b880192de5eea17d83b5
  • Pointer size: 131 Bytes
  • Size of remote file: 583 kB
data/demo/kotor_cathedral/005.jpeg ADDED

Git LFS Details

  • SHA256: 15b4e342917ae6df3a43a82aba0fb199098fd5ff9946303109b5e21a613a6d30
  • Pointer size: 131 Bytes
  • Size of remote file: 902 kB
data/demo/kotor_cathedral/006.jpeg ADDED

Git LFS Details

  • SHA256: da1098cc0360bbc34eb1a61a224ccdaa43677c4e6b26687c8f9bb95fbf7a2f42
  • Pointer size: 131 Bytes
  • Size of remote file: 411 kB
diffusionsfm/__init__.py ADDED
@@ -0,0 +1 @@
+ from .utils.rays import cameras_to_rays, rays_to_cameras, Rays
diffusionsfm/dataset/__init__.py ADDED
File without changes
diffusionsfm/dataset/co3d_v2.py ADDED
@@ -0,0 +1,792 @@
1
+ import gzip
2
+ import json
3
+ import os.path as osp
4
+ import random
5
+ import socket
6
+ import time
7
+ import torch
8
+ import warnings
9
+
10
+ import numpy as np
11
+ from PIL import Image, ImageFile
12
+ from tqdm import tqdm
13
+ from pytorch3d.renderer import PerspectiveCameras
14
+ from torch.utils.data import Dataset
15
+ from torchvision import transforms
16
+ import matplotlib.pyplot as plt
17
+ from scipy import ndimage as nd
18
+
19
+ from diffusionsfm.utils.distortion import distort_image
20
+
21
+
22
+ HOSTNAME = socket.gethostname()
23
+
24
+ CO3D_DIR = "../co3d_data" # update this
25
+ CO3D_ANNOTATION_DIR = osp.join(CO3D_DIR, "co3d_annotations")
26
+ CO3D_DIR = CO3D_DEPTH_DIR = osp.join(CO3D_DIR, "co3d")
27
+ order_path = osp.join(
28
+ CO3D_DIR, "co3d_v2_random_order_{sample_num}/{category}.json"
29
+ )
30
+
31
+
32
+ TRAINING_CATEGORIES = [
33
+ "apple",
34
+ "backpack",
35
+ "banana",
36
+ "baseballbat",
37
+ "baseballglove",
38
+ "bench",
39
+ "bicycle",
40
+ "bottle",
41
+ "bowl",
42
+ "broccoli",
43
+ "cake",
44
+ "car",
45
+ "carrot",
46
+ "cellphone",
47
+ "chair",
48
+ "cup",
49
+ "donut",
50
+ "hairdryer",
51
+ "handbag",
52
+ "hydrant",
53
+ "keyboard",
54
+ "laptop",
55
+ "microwave",
56
+ "motorcycle",
57
+ "mouse",
58
+ "orange",
59
+ "parkingmeter",
60
+ "pizza",
61
+ "plant",
62
+ "stopsign",
63
+ "teddybear",
64
+ "toaster",
65
+ "toilet",
66
+ "toybus",
67
+ "toyplane",
68
+ "toytrain",
69
+ "toytruck",
70
+ "tv",
71
+ "umbrella",
72
+ "vase",
73
+ "wineglass",
74
+ ]
75
+
76
+ TEST_CATEGORIES = [
77
+ "ball",
78
+ "book",
79
+ "couch",
80
+ "frisbee",
81
+ "hotdog",
82
+ "kite",
83
+ "remote",
84
+ "sandwich",
85
+ "skateboard",
86
+ "suitcase",
87
+ ]
88
+
89
+ assert len(TRAINING_CATEGORIES) + len(TEST_CATEGORIES) == 51
90
+
91
+ Image.MAX_IMAGE_PIXELS = None
92
+ ImageFile.LOAD_TRUNCATED_IMAGES = True
93
+
94
+
95
+ def fill_depths(data, invalid=None):
96
+ data_list = []
97
+ for i in range(data.shape[0]):
98
+ data_item = data[i].numpy()
99
+ # Invalid must be 1 where stuff is invalid, 0 where valid
100
+ ind = nd.distance_transform_edt(
101
+ invalid[i], return_distances=False, return_indices=True
102
+ )
103
+ data_list.append(torch.tensor(data_item[tuple(ind)]))
104
+ return torch.stack(data_list, dim=0)
105
+
106
+
107
+ def full_scene_scale(batch):
108
+ cameras = PerspectiveCameras(R=batch["R"], T=batch["T"], device="cuda")
109
+ cc = cameras.get_camera_center()
110
+ centroid = torch.mean(cc, dim=0)
111
+
112
+ diffs = cc - centroid
113
+ norms = torch.linalg.norm(diffs, dim=1)
114
+
115
+ furthest_index = torch.argmax(norms).item()
116
+ scale = norms[furthest_index].item()
117
+ return scale
118
+
119
+
120
+ def square_bbox(bbox, padding=0.0, astype=None, tight=False):
121
+ """
122
+ Computes a square bounding box, with optional padding parameters.
123
+ Args:
124
+ bbox: Bounding box in xyxy format (4,).
125
+ Returns:
126
+ square_bbox in xyxy format (4,).
127
+ """
128
+ if astype is None:
129
+ astype = type(bbox[0])
130
+ bbox = np.array(bbox)
131
+ center = (bbox[:2] + bbox[2:]) / 2
132
+ extents = (bbox[2:] - bbox[:2]) / 2
133
+
134
+ # No black bars if tight
135
+ if tight:
136
+ s = min(extents) * (1 + padding)
137
+ else:
138
+ s = max(extents) * (1 + padding)
139
+
140
+ square_bbox = np.array(
141
+ [center[0] - s, center[1] - s, center[0] + s, center[1] + s],
142
+ dtype=astype,
143
+ )
144
+ return square_bbox
145
+
146
+
147
+ def unnormalize_image(image, return_numpy=True, return_int=True):
148
+ if isinstance(image, torch.Tensor):
149
+ image = image.detach().cpu().numpy()
150
+
151
+ if image.ndim == 3:
152
+ if image.shape[0] == 3:
153
+ image = image[None, ...]
154
+ elif image.shape[2] == 3:
155
+ image = image.transpose(2, 0, 1)[None, ...]
156
+ else:
157
+ raise ValueError(f"Unexpected image shape: {image.shape}")
158
+ elif image.ndim == 4:
159
+ if image.shape[1] == 3:
160
+ pass
161
+ elif image.shape[3] == 3:
162
+ image = image.transpose(0, 3, 1, 2)
163
+ else:
164
+ raise ValueError(f"Unexpected batch image shape: {image.shape}")
165
+ else:
166
+ raise ValueError(f"Unsupported input shape: {image.shape}")
167
+
168
+ mean = np.array([0.485, 0.456, 0.406])[None, :, None, None]
169
+ std = np.array([0.229, 0.224, 0.225])[None, :, None, None]
170
+ image = image * std + mean
171
+
172
+ if return_int:
173
+ image = np.clip(image * 255.0, 0, 255).astype(np.uint8)
174
+ else:
175
+ image = np.clip(image, 0.0, 1.0)
176
+
177
+ if image.shape[0] == 1:
178
+ image = image[0]
179
+
180
+ if return_numpy:
181
+ return image
182
+ else:
183
+ return torch.from_numpy(image)
184
+
185
+
186
+ def unnormalize_image_for_vis(image):
187
+ assert len(image.shape) == 5 and image.shape[2] == 3
188
+ mean = torch.tensor([0.485, 0.456, 0.406]).view(1, 1, 3, 1, 1).to(image.device)
189
+ std = torch.tensor([0.229, 0.224, 0.225]).view(1, 1, 3, 1, 1).to(image.device)
190
+ image = image * std + mean
191
+ image = (image - 0.5) / 0.5
192
+ return image
193
+
194
+
195
+ def _transform_intrinsic(image, bbox, principal_point, focal_length):
196
+ # Rescale intrinsics to match bbox
197
+ half_box = np.array([image.width, image.height]).astype(np.float32) / 2
198
+ org_scale = min(half_box).astype(np.float32)
199
+
200
+ # Pixel coordinates
201
+ principal_point_px = half_box - (np.array(principal_point) * org_scale)
202
+ focal_length_px = np.array(focal_length) * org_scale
203
+ principal_point_px -= bbox[:2]
204
+ new_bbox = (bbox[2:] - bbox[:2]) / 2
205
+ new_scale = min(new_bbox)
206
+
207
+ # NDC coordinates
208
+ new_principal_ndc = (new_bbox - principal_point_px) / new_scale
209
+ new_focal_ndc = focal_length_px / new_scale
210
+
211
+ principal_point = torch.tensor(new_principal_ndc.astype(np.float32))
212
+ focal_length = torch.tensor(new_focal_ndc.astype(np.float32))
213
+
214
+ return principal_point, focal_length
215
+
216
+
217
+ def construct_camera_from_batch(batch, device):
218
+ if isinstance(device, int):
219
+ device = f"cuda:{device}"
220
+
221
+ return PerspectiveCameras(
222
+ R=batch["R"].reshape(-1, 3, 3),
223
+ T=batch["T"].reshape(-1, 3),
224
+ focal_length=batch["focal_lengths"].reshape(-1, 2),
225
+ principal_point=batch["principal_points"].reshape(-1, 2),
226
+ image_size=batch["image_sizes"].reshape(-1, 2),
227
+ device=device,
228
+ )
229
+
230
+
231
+ def save_batch_images(images, fname):
232
+ cmap = plt.get_cmap("hsv")
233
+ num_frames = len(images)
234
+ num_rows = len(images)
235
+ num_cols = 4
236
+ figsize = (num_cols * 2, num_rows * 2)
237
+ fig, axs = plt.subplots(num_rows, num_cols, figsize=figsize)
238
+ axs = axs.flatten()
239
+ for i in range(num_rows):
240
+ for j in range(4):
241
+ if i < num_frames:
242
+ axs[i * 4 + j].imshow(unnormalize_image(images[i][j]))
243
+ for s in ["bottom", "top", "left", "right"]:
244
+ axs[i * 4 + j].spines[s].set_color(cmap(i / (num_frames)))
245
+ axs[i * 4 + j].spines[s].set_linewidth(5)
246
+ axs[i * 4 + j].set_xticks([])
247
+ axs[i * 4 + j].set_yticks([])
248
+ else:
249
+ axs[i * 4 + j].axis("off")
250
+ plt.tight_layout()
251
+ plt.savefig(fname)
252
+
253
+
254
+ def jitter_bbox(
255
+ square_bbox,
256
+ jitter_scale=(1.1, 1.2),
257
+ jitter_trans=(-0.07, 0.07),
258
+ direction_from_size=None,
259
+ ):
260
+
261
+ square_bbox = np.array(square_bbox.astype(float))
262
+ s = np.random.uniform(jitter_scale[0], jitter_scale[1])
263
+
264
+ # Jitter only one dimension if center cropping
265
+ tx, ty = np.random.uniform(jitter_trans[0], jitter_trans[1], size=2)
266
+ if direction_from_size is not None:
267
+ if direction_from_size[0] > direction_from_size[1]:
268
+ tx = 0
269
+ else:
270
+ ty = 0
271
+
272
+ side_length = square_bbox[2] - square_bbox[0]
273
+ center = (square_bbox[:2] + square_bbox[2:]) / 2 + np.array([tx, ty]) * side_length
274
+ extent = side_length / 2 * s
275
+ ul = center - extent
276
+ lr = ul + 2 * extent
277
+ return np.concatenate((ul, lr))
278
+
279
+
280
+ class Co3dDataset(Dataset):
281
+ def __init__(
282
+ self,
283
+ category=("all_train",),
284
+ split="train",
285
+ transform=None,
286
+ num_images=2,
287
+ img_size=224,
288
+ mask_images=False,
289
+ crop_images=True,
290
+ co3d_dir=None,
291
+ co3d_annotation_dir=None,
292
+ precropped_images=False,
293
+ apply_augmentation=True,
294
+ normalize_cameras=True,
295
+ no_images=False,
296
+ sample_num=None,
297
+ seed=0,
298
+ load_extra_cameras=False,
299
+ distort_image=False,
300
+ load_depths=False,
301
+ center_crop=False,
302
+ depth_size=256,
303
+ mask_holes=False,
304
+ object_mask=True,
305
+ ):
306
+ """
307
+ Args:
308
+ num_images: Number of images in each batch.
309
+ perspective_correction (str):
310
+ "none": No perspective correction.
311
+ "warp": Warp the image and label.
312
+ "label_only": Correct the label only.
313
+ """
314
+ start_time = time.time()
315
+
316
+ self.category = category
317
+ self.split = split
318
+ self.transform = transform
319
+ self.num_images = num_images
320
+ self.img_size = img_size
321
+ self.mask_images = mask_images
322
+ self.crop_images = crop_images
323
+ self.precropped_images = precropped_images
324
+ self.apply_augmentation = apply_augmentation
325
+ self.normalize_cameras = normalize_cameras
326
+ self.no_images = no_images
327
+ self.sample_num = sample_num
328
+ self.load_extra_cameras = load_extra_cameras
329
+ self.distort = distort_image
330
+ self.load_depths = load_depths
331
+ self.center_crop = center_crop
332
+ self.depth_size = depth_size
333
+ self.mask_holes = mask_holes
334
+ self.object_mask = object_mask
335
+
336
+ if self.apply_augmentation:
337
+ if self.center_crop:
338
+ self.jitter_scale = (0.8, 1.1)
339
+ self.jitter_trans = (0.0, 0.0)
340
+ else:
341
+ self.jitter_scale = (1.1, 1.2)
342
+ self.jitter_trans = (-0.07, 0.07)
343
+ else:
344
+ # Note if trained with apply_augmentation, we should still use
345
+ # apply_augmentation at test time.
346
+ self.jitter_scale = (1, 1)
347
+ self.jitter_trans = (0.0, 0.0)
348
+
349
+ if self.distort:
350
+ self.k1_max = 1.0
351
+ self.k2_max = 1.0
352
+
353
+ if co3d_dir is not None:
354
+ self.co3d_dir = co3d_dir
355
+ self.co3d_annotation_dir = co3d_annotation_dir
356
+ else:
357
+ self.co3d_dir = CO3D_DIR
358
+ self.co3d_annotation_dir = CO3D_ANNOTATION_DIR
359
+ self.co3d_depth_dir = CO3D_DEPTH_DIR
360
+
361
+ if isinstance(self.category, str):
362
+ self.category = [self.category]
363
+
364
+ if "all_train" in self.category:
365
+ self.category = TRAINING_CATEGORIES
366
+ if "all_test" in self.category:
367
+ self.category = TEST_CATEGORIES
368
+ if "full" in self.category:
369
+ self.category = TRAINING_CATEGORIES + TEST_CATEGORIES
370
+ self.category = sorted(self.category)
371
+ self.is_single_category = len(self.category) == 1
372
+
373
+ # Fixing seed
374
+ torch.manual_seed(seed)
375
+ random.seed(seed)
376
+ np.random.seed(seed)
377
+
378
+ print(f"Co3d ({split}):")
379
+
380
+ self.low_quality_translations = [
381
+ "411_55952_107659",
382
+ "427_59915_115716",
383
+ "435_61970_121848",
384
+ "112_13265_22828",
385
+ "110_13069_25642",
386
+ "165_18080_34378",
387
+ "368_39891_78502",
388
+ "391_47029_93665",
389
+ "20_695_1450",
390
+ "135_15556_31096",
391
+ "417_57572_110680",
392
+ ] # Initialized with sequences with poor depth masks
393
+ self.rotations = {}
394
+ self.category_map = {}
395
+ for c in tqdm(self.category):
396
+ annotation_file = osp.join(
397
+ self.co3d_annotation_dir, f"{c}_{self.split}.jgz"
398
+ )
399
+ with gzip.open(annotation_file, "r") as fin:
400
+ annotation = json.loads(fin.read())
401
+
402
+ counter = 0
403
+ for seq_name, seq_data in annotation.items():
404
+ counter += 1
405
+ if len(seq_data) < self.num_images:
406
+ continue
407
+
408
+ filtered_data = []
409
+ self.category_map[seq_name] = c
410
+ bad_seq = False
411
+ for data in seq_data:
412
+ # Make sure translations are not ridiculous and rotations are valid
413
+ det = np.linalg.det(data["R"])
414
+ if (np.abs(data["T"]) > 1e5).any() or det < 0.99 or det > 1.01:
415
+ bad_seq = True
416
+ self.low_quality_translations.append(seq_name)
417
+ break
418
+
419
+ # Ignore all unnecessary information.
420
+ filtered_data.append(
421
+ {
422
+ "filepath": data["filepath"],
423
+ "bbox": data["bbox"],
424
+ "R": data["R"],
425
+ "T": data["T"],
426
+ "focal_length": data["focal_length"],
427
+ "principal_point": data["principal_point"],
428
+ },
429
+ )
430
+
431
+ if not bad_seq:
432
+ self.rotations[seq_name] = filtered_data
433
+
434
+ self.sequence_list = list(self.rotations.keys())
435
+
436
+ IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406)
437
+ IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225)
438
+
439
+ if self.transform is None:
440
+ self.transform = transforms.Compose(
441
+ [
442
+ transforms.ToTensor(),
443
+ transforms.Resize(self.img_size, antialias=True),
444
+ transforms.Normalize(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD),
445
+ ]
446
+ )
447
+
448
+ self.transform_depth = transforms.Compose(
449
+ [
450
+ transforms.Resize(
451
+ self.depth_size,
452
+ antialias=False,
453
+ interpolation=transforms.InterpolationMode.NEAREST_EXACT,
454
+ ),
455
+ ]
456
+ )
457
+
458
+ print(
459
+ f"Low quality translation sequences, not used: {self.low_quality_translations}"
460
+ )
461
+ print(f"Data size: {len(self)}")
462
+ print(f"Data loading took {(time.time()-start_time)} seconds.")
463
+
464
+ def __len__(self):
465
+ return len(self.sequence_list)
466
+
467
+ def __getitem__(self, index):
468
+ num_to_load = self.num_images if not self.load_extra_cameras else 8
469
+
470
+ sequence_name = self.sequence_list[index % len(self.sequence_list)]
471
+ metadata = self.rotations[sequence_name]
472
+
473
+ if self.sample_num is not None:
474
+ with open(
475
+ order_path.format(sample_num=self.sample_num, category=self.category[0])
476
+ ) as f:
477
+ order = json.load(f)
478
+ ids = order[sequence_name][:num_to_load]
479
+ else:
480
+ replace = len(metadata) < 8
481
+ ids = np.random.choice(len(metadata), num_to_load, replace=replace)
482
+
483
+ return self.get_data(index=index, ids=ids, num_valid_frames=num_to_load)
484
+
485
+ def _get_scene_scale(self, sequence_name):
486
+ n = len(self.rotations[sequence_name])
487
+
488
+ R = torch.zeros(n, 3, 3)
489
+ T = torch.zeros(n, 3)
490
+
491
+ for i, ann in enumerate(self.rotations[sequence_name]):
492
+ R[i, ...] = torch.tensor(self.rotations[sequence_name][i]["R"])
493
+ T[i, ...] = torch.tensor(self.rotations[sequence_name][i]["T"])
494
+
495
+ cameras = PerspectiveCameras(R=R, T=T)
496
+ cc = cameras.get_camera_center()
497
+ centeroid = torch.mean(cc, dim=0)
498
+ diff = cc - centeroid
499
+
500
+ norm = torch.norm(diff, dim=1)
501
+ scale = torch.max(norm).item()
502
+
503
+ return scale
504
+
505
+ def _crop_image(self, image, bbox):
506
+ image_crop = transforms.functional.crop(
507
+ image,
508
+ top=bbox[1],
509
+ left=bbox[0],
510
+ height=bbox[3] - bbox[1],
511
+ width=bbox[2] - bbox[0],
512
+ )
513
+ return image_crop
514
+
515
+ def _transform_intrinsic(self, image, bbox, principal_point, focal_length):
516
+ half_box = np.array([image.width, image.height]).astype(np.float32) / 2
517
+ org_scale = min(half_box).astype(np.float32)
518
+
519
+ # Pixel coordinates
520
+ principal_point_px = half_box - (np.array(principal_point) * org_scale)
521
+ focal_length_px = np.array(focal_length) * org_scale
522
+ principal_point_px -= bbox[:2]
523
+ new_bbox = (bbox[2:] - bbox[:2]) / 2
524
+ new_scale = min(new_bbox)
525
+
526
+ # NDC coordinates
527
+ new_principal_ndc = (new_bbox - principal_point_px) / new_scale
528
+ new_focal_ndc = focal_length_px / new_scale
529
+
530
+ return new_principal_ndc.astype(np.float32), new_focal_ndc.astype(np.float32)
531
+
532
+ def get_data(
533
+ self,
534
+ index=None,
535
+ sequence_name=None,
536
+ ids=(0, 1),
537
+ no_images=False,
538
+ num_valid_frames=None,
539
+ load_using_order=None,
540
+ ):
541
+ if load_using_order is not None:
542
+ with open(
543
+ order_path.format(sample_num=self.sample_num, category=self.category[0])
544
+ ) as f:
545
+ order = json.load(f)
546
+ ids = order[sequence_name][:load_using_order]
547
+
548
+ if sequence_name is None:
549
+ index = index % len(self.sequence_list)
550
+ sequence_name = self.sequence_list[index]
551
+ metadata = self.rotations[sequence_name]
552
+ category = self.category_map[sequence_name]
553
+
554
+ # Read image & camera information from annotations
555
+ annos = [metadata[i] for i in ids]
556
+ images = []
557
+ image_sizes = []
558
+ PP = []
559
+ FL = []
560
+ crop_parameters = []
561
+ filenames = []
562
+ distortion_parameters = []
563
+ depths = []
564
+ depth_masks = []
565
+ object_masks = []
566
+ dino_images = []
567
+ for anno in annos:
568
+ filepath = anno["filepath"]
569
+
570
+ if not no_images:
571
+ image = Image.open(osp.join(self.co3d_dir, filepath)).convert("RGB")
572
+ image_size = image.size
573
+
574
+ # Optionally mask images with black background
575
+ if self.mask_images:
576
+ black_image = Image.new("RGB", image_size, (0, 0, 0))
577
+ mask_name = osp.basename(filepath.replace(".jpg", ".png"))
578
+
579
+ mask_path = osp.join(
580
+ self.co3d_dir, category, sequence_name, "masks", mask_name
581
+ )
582
+ mask = Image.open(mask_path).convert("L")
583
+
584
+ if mask.size != image_size:
585
+ mask = mask.resize(image_size)
586
+ mask = Image.fromarray(np.array(mask) > 125)
587
+ image = Image.composite(image, black_image, mask)
588
+
589
+ if self.object_mask:
590
+ mask_name = osp.basename(filepath.replace(".jpg", ".png"))
591
+ mask_path = osp.join(
592
+ self.co3d_dir, category, sequence_name, "masks", mask_name
593
+ )
594
+ mask = Image.open(mask_path).convert("L")
595
+
596
+ if mask.size != image_size:
597
+ mask = mask.resize(image_size)
598
+ mask = torch.from_numpy(np.array(mask) > 125)
599
+
600
+ # Determine crop, Resnet wants square images
601
+ bbox = np.array(anno["bbox"])
602
+ good_bbox = ((bbox[2:] - bbox[:2]) > 30).all()
603
+ bbox = (
604
+ anno["bbox"]
605
+ if not self.center_crop and good_bbox
606
+ else [0, 0, image.width, image.height]
607
+ )
608
+
609
+ # Distort image and bbox if desired
610
+ if self.distort:
611
+ k1 = random.uniform(0, self.k1_max)
612
+ k2 = random.uniform(0, self.k2_max)
613
+
614
+ try:
615
+ image, bbox = distort_image(
616
+ image, np.array(bbox), k1, k2, modify_bbox=True
617
+ )
618
+
619
+ except:
620
+ print("INFO:")
621
+ print(sequence_name)
622
+ print(index)
623
+ print(ids)
624
+ print(k1)
625
+ print(k2)
626
+
627
+ distortion_parameters.append(torch.FloatTensor([k1, k2]))
628
+
629
+ bbox = square_bbox(np.array(bbox), tight=self.center_crop)
630
+ if self.apply_augmentation:
631
+ bbox = jitter_bbox(
632
+ bbox,
633
+ jitter_scale=self.jitter_scale,
634
+ jitter_trans=self.jitter_trans,
635
+ direction_from_size=image.size if self.center_crop else None,
636
+ )
637
+ bbox = np.around(bbox).astype(int)
638
+
639
+ # Crop parameters
640
+ crop_center = (bbox[:2] + bbox[2:]) / 2
641
+ principal_point = torch.tensor(anno["principal_point"])
642
+ focal_length = torch.tensor(anno["focal_length"])
643
+
644
+ # convert crop center to correspond to a "square" image
645
+ width, height = image.size
646
+ length = max(width, height)
647
+ s = length / min(width, height)
648
+ crop_center = crop_center + (length - np.array([width, height])) / 2
649
+
650
+ # convert to NDC
651
+ cc = s - 2 * s * crop_center / length
652
+ crop_width = 2 * s * (bbox[2] - bbox[0]) / length
653
+ crop_params = torch.tensor([-cc[0], -cc[1], crop_width, s])
654
+
655
+ # Crop and normalize image
656
+ if not self.precropped_images:
657
+ image = self._crop_image(image, bbox)
658
+
659
+ try:
660
+ image = self.transform(image)
661
+ except:
662
+ print("INFO:")
663
+ print(sequence_name)
664
+ print(index)
665
+ print(ids)
666
+ print(k1)
667
+ print(k2)
668
+
669
+ images.append(image[:, : self.img_size, : self.img_size])
670
+ crop_parameters.append(crop_params)
671
+
672
+ if self.load_depths:
673
+ # Open depth map
674
+ depth_name = osp.basename(
675
+ filepath.replace(".jpg", ".jpg.geometric.png")
676
+ )
677
+ depth_path = osp.join(
678
+ self.co3d_depth_dir,
679
+ category,
680
+ sequence_name,
681
+ "depths",
682
+ depth_name,
683
+ )
684
+ depth_pil = Image.open(depth_path)
685
+
686
+ # 16 bit float type casting
687
+ depth = torch.tensor(
688
+ np.frombuffer(
689
+ np.array(depth_pil, dtype=np.uint16), dtype=np.float16
690
+ )
691
+ .astype(np.float32)
692
+ .reshape((depth_pil.size[1], depth_pil.size[0]))
693
+ )
694
+
695
+ # Crop and resize as with images
696
+ if depth_pil.size != image_size:
697
+ # bbox may have the wrong scale
698
+ bbox = depth_pil.size[0] * bbox / image_size[0]
699
+
700
+ if self.object_mask:
701
+ assert mask.shape == depth.shape
702
+
703
+ bbox = np.around(bbox).astype(int)
704
+ depth = self._crop_image(depth, bbox)
705
+
706
+ # Resize
707
+ depth = self.transform_depth(depth.unsqueeze(0))[
708
+ 0, : self.depth_size, : self.depth_size
709
+ ]
710
+ depths.append(depth)
711
+
712
+ if self.object_mask:
713
+ mask = self._crop_image(mask, bbox)
714
+ mask = self.transform_depth(mask.unsqueeze(0))[
715
+ 0, : self.depth_size, : self.depth_size
716
+ ]
717
+ object_masks.append(mask)
718
+
719
+ PP.append(principal_point)
720
+ FL.append(focal_length)
721
+ image_sizes.append(torch.tensor([self.img_size, self.img_size]))
722
+ filenames.append(filepath)
723
+
724
+ if not no_images:
725
+ if self.load_depths:
726
+ depths = torch.stack(depths)
727
+
728
+ depth_masks = torch.logical_or(depths <= 0, depths.isinf())
729
+ depth_masks = (~depth_masks).long()
730
+
731
+ if self.object_mask:
732
+ object_masks = torch.stack(object_masks, dim=0)
733
+
734
+ if self.mask_holes:
735
+ depths = fill_depths(depths, depth_masks == 0)
736
+
737
+ # Sometimes mask_holes misses stuff
738
+ new_masks = torch.logical_or(depths <= 0, depths.isinf())
739
+ new_masks = (~new_masks).long()
740
+ depths[new_masks == 0] = -1
741
+
742
+ assert torch.logical_or(depths > 0, depths == -1).all()
743
+ assert not (depths.isinf()).any()
744
+ assert not (depths.isnan()).any()
745
+
746
+ if self.load_extra_cameras:
747
+ # Remove the extra loaded image, for saving space
748
+ images = images[: self.num_images]
749
+
750
+ if self.distort:
751
+ distortion_parameters = torch.stack(distortion_parameters)
752
+
753
+ images = torch.stack(images)
754
+ crop_parameters = torch.stack(crop_parameters)
755
+ focal_lengths = torch.stack(FL)
756
+ principal_points = torch.stack(PP)
757
+ image_sizes = torch.stack(image_sizes)
758
+ else:
759
+ images = None
760
+ crop_parameters = None
761
+ distortion_parameters = None
762
+ focal_lengths = []
763
+ principal_points = []
764
+ image_sizes = []
765
+
766
+ # Assemble batch info to send back
767
+ R = torch.stack([torch.tensor(anno["R"]) for anno in annos])
768
+ T = torch.stack([torch.tensor(anno["T"]) for anno in annos])
769
+
770
+ batch = {
771
+ "model_id": sequence_name,
772
+ "category": category,
773
+ "n": len(metadata),
774
+ "num_valid_frames": num_valid_frames,
775
+ "ind": torch.tensor(ids),
776
+ "image": images,
777
+ "depth": depths,
778
+ "depth_masks": depth_masks,
779
+ "object_masks": object_masks,
780
+ "R": R,
781
+ "T": T,
782
+ "focal_length": focal_lengths,
783
+ "principal_point": principal_points,
784
+ "image_size": image_sizes,
785
+ "crop_parameters": crop_parameters,
786
+ "distortion_parameters": torch.zeros(4),
787
+ "filename": filenames,
788
+ "category": category,
789
+ "dataset": "co3d",
790
+ }
791
+
792
+ return batch
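
A minimal usage sketch for the dataset above (not part of the commit). It assumes CO3D_DIR and CO3D_ANNOTATION_DIR at the top of co3d_v2.py point at a local CO3D download and the precomputed annotation .jgz files; only arguments visible in the constructor are used:

    from diffusionsfm.dataset.co3d_v2 import Co3dDataset

    dataset = Co3dDataset(
        category="hydrant",   # or "all_train" / "all_test" / "full"
        split="train",
        num_images=8,
        apply_augmentation=True,
        load_depths=True,
        mask_holes=True,
    )
    batch = dataset[0]           # dict with "image", "R", "T", "crop_parameters", ...
    print(batch["image"].shape)  # (8, 3, 224, 224)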
diffusionsfm/dataset/custom.py ADDED
@@ -0,0 +1,105 @@
+
+ import torch
+ import numpy as np
+ import matplotlib.pyplot as plt
+
+ from PIL import Image, ImageOps
+ from torch.utils.data import Dataset
+ from torchvision import transforms
+
+ from diffusionsfm.dataset.co3d_v2 import square_bbox
+
+
+ class CustomDataset(Dataset):
+     def __init__(
+         self,
+         image_list,
+     ):
+         self.images = []
+
+         for image_path in sorted(image_list):
+             img = Image.open(image_path)
+             img = ImageOps.exif_transpose(img).convert("RGB")  # Apply EXIF rotation
+             self.images.append(img)
+
+         self.n = len(self.images)
+         self.jitter_scale = [1, 1]
+         self.jitter_trans = [0, 0]
+         self.transform = transforms.Compose(
+             [
+                 transforms.ToTensor(),
+                 transforms.Resize(224),
+                 transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
+             ]
+         )
+         self.transform_for_vis = transforms.Compose(
+             [
+                 transforms.Resize(224),
+             ]
+         )
+
+     def __len__(self):
+         return 1
+
+     def _crop_image(self, image, bbox, white_bg=False):
+         if white_bg:
+             # Only supports PIL Images
+             image_crop = Image.new(
+                 "RGB", (bbox[2] - bbox[0], bbox[3] - bbox[1]), (255, 255, 255)
+             )
+             image_crop.paste(image, (-bbox[0], -bbox[1]))
+         else:
+             image_crop = transforms.functional.crop(
+                 image,
+                 top=bbox[1],
+                 left=bbox[0],
+                 height=bbox[3] - bbox[1],
+                 width=bbox[2] - bbox[0],
+             )
+         return image_crop
+
+     def __getitem__(self, index):  # index is unused; the full image set is always returned
+         return self.get_data()
+
+     def get_data(self):
+         cmap = plt.get_cmap("hsv")
+         ids = [i for i in range(len(self.images))]
+         images = [self.images[i] for i in ids]
+         images_transformed = []
+         images_for_vis = []
+         crop_parameters = []
+
+         for i, image in enumerate(images):
+             bbox = np.array([0, 0, image.width, image.height])
+             bbox = square_bbox(bbox, tight=True)
+             bbox = np.around(bbox).astype(int)
+             image = self._crop_image(image, bbox)
+             images_transformed.append(self.transform(image))
+             image_for_vis = self.transform_for_vis(image)
+             color_float = cmap(i / len(images))
+             color_rgb = tuple(int(255 * c) for c in color_float[:3])
+             image_for_vis = ImageOps.expand(image_for_vis, border=3, fill=color_rgb)
+             images_for_vis.append(image_for_vis)
+
+             width, height = image.size
+             length = max(width, height)
+             s = length / min(width, height)
+             crop_center = (bbox[:2] + bbox[2:]) / 2
+             crop_center = crop_center + (length - np.array([width, height])) / 2
+             # Convert to NDC
+             cc = s - 2 * s * crop_center / length
+             crop_width = 2 * s * (bbox[2] - bbox[0]) / length
+             crop_params = torch.tensor([-cc[0], -cc[1], crop_width, s])
+
+             crop_parameters.append(crop_params)
+         images = images_transformed
+
+         batch = {}
+         batch["image"] = torch.stack(images)
+         batch["image_for_vis"] = images_for_vis
+         batch["n"] = len(images)
+         batch["ind"] = torch.tensor(ids)  # fixed: a trailing comma previously made this a tuple
+         batch["crop_parameters"] = torch.stack(crop_parameters)
+         batch["distortion_parameters"] = torch.zeros(4)
+
+         return batch
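
A brief usage sketch for CustomDataset (not part of the upload); the glob pattern is a placeholder pointing at one of the demo folders added in this commit, and the downstream model call is omitted:

    from glob import glob
    from diffusionsfm.dataset.custom import CustomDataset

    image_paths = sorted(glob("data/demo/kotor_cathedral/*.jpeg"))
    dataset = CustomDataset(image_paths)
    batch = dataset.get_data()
    print(batch["n"], batch["image"].shape)  # 6, (6, 3, 224, 224)
    print(batch["crop_parameters"].shape)    # (6, 4)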
diffusionsfm/eval/__init__.py ADDED
File without changes
diffusionsfm/eval/eval_category.py ADDED
@@ -0,0 +1,292 @@
1
+ import os
2
+ import json
3
+ import torch
4
+ import torchvision
5
+ import numpy as np
6
+ from tqdm.auto import tqdm
7
+
8
+ from diffusionsfm.dataset.co3d_v2 import (
9
+ Co3dDataset,
10
+ full_scene_scale,
11
+ )
12
+ from pytorch3d.renderer import PerspectiveCameras
13
+ from diffusionsfm.utils.visualization import filter_and_align_point_clouds
14
+ from diffusionsfm.inference.load_model import load_model
15
+ from diffusionsfm.inference.predict import predict_cameras
16
+ from diffusionsfm.utils.geometry import (
17
+ compute_angular_error_batch,
18
+ get_error,
19
+ n_to_np_rotations,
20
+ )
21
+ from diffusionsfm.utils.slurm import init_slurm_signals_if_slurm
22
+ from diffusionsfm.utils.rays import cameras_to_rays
23
+ from diffusionsfm.utils.rays import normalize_cameras_batch
24
+
25
+
26
+ @torch.no_grad()
27
+ def evaluate(
28
+ cfg,
29
+ model,
30
+ dataset,
31
+ num_images,
32
+ device,
33
+ use_pbar=True,
34
+ calculate_intrinsics=True,
35
+ additional_timesteps=(),
36
+ num_evaluate=None,
37
+ max_num_images=None,
38
+ mode=None,
39
+ metrics=True,
40
+ load_depth=True,
41
+ ):
42
+ if cfg.training.get("dpt_head", False):
43
+ H_in = W_in = 224
44
+ H_out = W_out = cfg.training.full_num_patches_y
45
+ else:
46
+ H_in = H_out = cfg.model.num_patches_x
47
+ W_in = W_out = cfg.model.num_patches_y
48
+
49
+ results = {}
50
+ instances = np.arange(0, len(dataset)) if num_evaluate is None else np.linspace(0, len(dataset) - 1, num_evaluate, endpoint=True, dtype=int)
51
+ instances = tqdm(instances) if use_pbar else instances
52
+
53
+ for counter, idx in enumerate(instances):
54
+ batch = dataset[idx]
55
+ instance = batch["model_id"]
56
+ images = batch["image"].to(device)
57
+ focal_length = batch["focal_length"].to(device)[:num_images]
58
+ R = batch["R"].to(device)[:num_images]
59
+ T = batch["T"].to(device)[:num_images]
60
+ crop_parameters = batch["crop_parameters"].to(device)[:num_images]
61
+
62
+ if load_depth:
63
+ depths = batch["depth"].to(device)[:num_images]
64
+ depth_masks = batch["depth_masks"].to(device)[:num_images]
65
+ try:
66
+ object_masks = batch["object_masks"].to(device)[:num_images]
67
+ except KeyError:
68
+ object_masks = depth_masks.clone()
69
+
70
+ # Normalize cameras and scale depths for output resolution
71
+ cameras_gt = PerspectiveCameras(
72
+ R=R, T=T, focal_length=focal_length, device=device
73
+ )
74
+ cameras_gt, _, _ = normalize_cameras_batch(
75
+ [cameras_gt],
76
+ first_cam_mediod=cfg.training.first_cam_mediod,
77
+ normalize_first_camera=cfg.training.normalize_first_camera,
78
+ depths=depths.unsqueeze(0),
79
+ crop_parameters=crop_parameters.unsqueeze(0),
80
+ num_patches_x=H_in,
81
+ num_patches_y=W_in,
82
+ return_scales=True,
83
+ )
84
+ cameras_gt = cameras_gt[0]
85
+
86
+ gt_rays = cameras_to_rays(
87
+ cameras=cameras_gt,
88
+ num_patches_x=H_in,
89
+ num_patches_y=W_in,
90
+ crop_parameters=crop_parameters,
91
+ depths=depths,
92
+ mode=mode,
93
+ )
94
+ gt_points = gt_rays.get_segments().view(num_images, -1, 3)
95
+
96
+ resize = torchvision.transforms.Resize(
97
+ 224,
98
+ antialias=False,
99
+ interpolation=torchvision.transforms.InterpolationMode.NEAREST_EXACT,
100
+ )
101
+ else:
102
+ cameras_gt = PerspectiveCameras(
103
+ R=R, T=T, focal_length=focal_length, device=device
104
+ )
105
+
106
+ pred_cameras, additional_cams = predict_cameras(
107
+ model,
108
+ images,
109
+ device,
110
+ crop_parameters=crop_parameters,
111
+ num_patches_x=H_out,
112
+ num_patches_y=W_out,
113
+ max_num_images=max_num_images,
114
+ additional_timesteps=additional_timesteps,
115
+ calculate_intrinsics=calculate_intrinsics,
116
+ mode=mode,
117
+ return_rays=True,
118
+ use_homogeneous=cfg.model.get("use_homogeneous", False),
119
+ )
120
+ cameras_to_evaluate = additional_cams + [pred_cameras]
121
+
122
+ all_cams_batch = dataset.get_data(
123
+ sequence_name=instance, ids=np.arange(0, batch["n"]), no_images=True
124
+ )
125
+ gt_scene_scale = full_scene_scale(all_cams_batch)
126
+ R_gt = R
127
+ T_gt = T
128
+
129
+ errors = []
130
+ for _, (camera, pred_rays) in enumerate(cameras_to_evaluate):
131
+ R_pred = camera.R
132
+ T_pred = camera.T
133
+ f_pred = camera.focal_length
134
+
135
+ R_pred_rel = n_to_np_rotations(num_images, R_pred).cpu().numpy()
136
+ R_gt_rel = n_to_np_rotations(num_images, batch["R"]).cpu().numpy()
137
+ R_error = compute_angular_error_batch(R_pred_rel, R_gt_rel)
138
+
139
+ CC_error, _ = get_error(True, R_pred, T_pred, R_gt, T_gt, gt_scene_scale)
140
+
141
+ if load_depth and metrics:
142
+ # Evaluate outputs at the same resolution as DUSt3R
143
+ pred_points = pred_rays.get_segments().view(num_images, H_out, H_out, 3)
144
+ pred_points = pred_points.permute(0, 3, 1, 2)
145
+ pred_points = resize(pred_points).permute(0, 2, 3, 1).view(num_images, H_in*W_in, 3)
146
+
147
+ (
148
+ _,
149
+ _,
150
+ _,
151
+ _,
152
+ metric_values,
153
+ ) = filter_and_align_point_clouds(
154
+ num_images,
155
+ gt_points,
156
+ pred_points,
157
+ depth_masks,
158
+ depth_masks,
159
+ images,
160
+ metrics=metrics,
161
+ num_patches_x=H_in,
162
+ )
163
+
164
+ (
165
+ _,
166
+ _,
167
+ _,
168
+ _,
169
+ object_metric_values,
170
+ ) = filter_and_align_point_clouds(
171
+ num_images,
172
+ gt_points,
173
+ pred_points,
174
+ depth_masks * object_masks,
175
+ depth_masks * object_masks,
176
+ images,
177
+ metrics=metrics,
178
+ num_patches_x=H_in,
179
+ )
180
+
181
+ result = {
182
+ "R_pred": R_pred.detach().cpu().numpy().tolist(),
183
+ "T_pred": T_pred.detach().cpu().numpy().tolist(),
184
+ "f_pred": f_pred.detach().cpu().numpy().tolist(),
185
+ "R_gt": R_gt.detach().cpu().numpy().tolist(),
186
+ "T_gt": T_gt.detach().cpu().numpy().tolist(),
187
+ "f_gt": focal_length.detach().cpu().numpy().tolist(),
188
+ "scene_scale": gt_scene_scale,
189
+ "R_error": R_error.tolist(),
190
+ "CC_error": CC_error,
191
+ }
192
+
193
+ if load_depth and metrics:
194
+ result["CD"] = metric_values[1]
195
+ result["CD_Object"] = object_metric_values[1]
196
+ else:
197
+ result["CD"] = 0
198
+ result["CD_Object"] = 0
199
+
200
+ errors.append(result)
201
+
202
+ results[instance] = errors
203
+
204
+ if counter == len(dataset) - 1:
205
+ break
206
+ return results
207
+
208
+
209
+ def save_results(
210
+ output_dir,
211
+ checkpoint=800_000,
212
+ category="hydrant",
213
+ num_images=None,
214
+ calculate_additional_timesteps=True,
215
+ calculate_intrinsics=True,
216
+ split="test",
217
+ force=False,
218
+ sample_num=1,
219
+ max_num_images=None,
220
+ dataset="co3d",
221
+ ):
222
+ init_slurm_signals_if_slurm()
223
+ os.umask(000) # Default to 777 permissions
224
+ eval_path = os.path.join(
225
+ output_dir,
226
+ f"eval_{dataset}",
227
+ f"{category}_{num_images}_{sample_num}_ckpt{checkpoint}.json",
228
+ )
229
+
230
+ if os.path.exists(eval_path) and not force:
231
+ print(f"File {eval_path} already exists. Skipping.")
232
+ return
233
+
234
+ if num_images is not None and num_images > 8:
235
+ custom_keys = {"model.num_images": num_images}
236
+ ignore_keys = ["pos_table"]
237
+ else:
238
+ custom_keys = None
239
+ ignore_keys = []
240
+
241
+ device = torch.device("cuda")
242
+ model, cfg = load_model(
243
+ output_dir,
244
+ checkpoint=checkpoint,
245
+ device=device,
246
+ custom_keys=custom_keys,
247
+ ignore_keys=ignore_keys,
248
+ )
249
+ if num_images is None:
250
+ num_images = cfg.dataset.num_images
251
+
252
+ if cfg.training.dpt_head:
253
+ # Evaluate outputs at the same resolution as DUSt3R
254
+ depth_size = 224
255
+ else:
256
+ depth_size = cfg.model.num_patches_x
257
+
258
+ dataset = Co3dDataset(
259
+ category=category,
260
+ split=split,
261
+ num_images=num_images,
262
+ apply_augmentation=False,
263
+ sample_num=None if split == "train" else sample_num,
264
+ use_global_intrinsics=cfg.dataset.use_global_intrinsics,
265
+ load_depths=True,
266
+ center_crop=True,
267
+ depth_size=depth_size,
268
+ mask_holes=not cfg.training.regression,
269
+ img_size=256 if cfg.model.unet_diffuser else 224,
270
+ )
271
+ print(f"Category {category} {len(dataset)}")
272
+
273
+ if calculate_additional_timesteps:
274
+ additional_timesteps = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90]
275
+ else:
276
+ additional_timesteps = []
277
+
278
+ results = evaluate(
279
+ cfg=cfg,
280
+ model=model,
281
+ dataset=dataset,
282
+ num_images=num_images,
283
+ device=device,
284
+ calculate_intrinsics=calculate_intrinsics,
285
+ additional_timesteps=additional_timesteps,
286
+ max_num_images=max_num_images,
287
+ mode="segment",
288
+ )
289
+
290
+ os.makedirs(os.path.dirname(eval_path), exist_ok=True)
291
+ with open(eval_path, "w") as f:
292
+ json.dump(results, f)
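
For a single category, the evaluation entry point above can be called directly (a hedged example, not part of the commit; eval_jobs.py below sweeps the same function over categories, image counts, and samples). The output_dir value is a placeholder matching the run directory used in eval_jobs.py:

    from diffusionsfm.eval.eval_category import save_results

    save_results(
        output_dir="output/multi_diffusionsfm_dense",  # trained run directory (placeholder)
        checkpoint=800_000,
        category="hydrant",
        num_images=8,
        sample_num=0,
    )
    # Writes output/multi_diffusionsfm_dense/eval_co3d/hydrant_8_0_ckpt800000.json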
diffusionsfm/eval/eval_jobs.py ADDED
@@ -0,0 +1,175 @@
1
+ """
2
+ python -m diffusionsfm.eval.eval_jobs --eval_path output/multi_diffusionsfm_dense --use_submitit
3
+ """
4
+
5
+ import os
6
+ import json
7
+ import submitit
8
+ import argparse
9
+ import itertools
10
+ from glob import glob
11
+
12
+ import numpy as np
13
+ from tqdm.auto import tqdm
14
+
15
+ from diffusionsfm.dataset.co3d_v2 import TEST_CATEGORIES, TRAINING_CATEGORIES
16
+ from diffusionsfm.eval.eval_category import save_results
17
+ from diffusionsfm.utils.slurm import submitit_job_watcher
18
+
19
+
20
+ def evaluate_diffusionsfm(eval_path, use_submitit, mode):
21
+ JOB_PARAMS = {
22
+ "output_dir": [eval_path],
23
+ "checkpoint": [800_000],
24
+ "num_images": [2, 3, 4, 5, 6, 7, 8],
25
+ "sample_num": [0, 1, 2, 3, 4],
26
+ "category": TEST_CATEGORIES, # TRAINING_CATEGORIES + TEST_CATEGORIES,
27
+ "calculate_additional_timesteps": [True],
28
+ }
29
+ if mode == "test":
30
+ JOB_PARAMS["category"] = TEST_CATEGORIES
31
+ elif mode == "train1":
32
+ JOB_PARAMS["category"] = TRAINING_CATEGORIES[:len(TRAINING_CATEGORIES) // 2]
33
+ elif mode == "train2":
34
+ JOB_PARAMS["category"] = TRAINING_CATEGORIES[len(TRAINING_CATEGORIES) // 2:]
35
+ keys, values = zip(*JOB_PARAMS.items())
36
+ job_configs = [dict(zip(keys, p)) for p in itertools.product(*values)]
37
+
38
+ if use_submitit:
39
+ log_output = "./slurm_logs"
40
+ executor = submitit.AutoExecutor(
41
+ cluster=None, folder=log_output, slurm_max_num_timeout=10
42
+ )
43
+ # Use your own parameters
44
+ executor.update_parameters(
45
+ slurm_additional_parameters={
46
+ "nodes": 1,
47
+ "cpus-per-task": 5,
48
+ "gpus": 1,
49
+ "time": "6:00:00",
50
+ "partition": "all",
51
+ "exclude": "grogu-1-9, grogu-1-14,"
52
+ }
53
+ )
54
+ jobs = []
55
+ with executor.batch():
56
+ # This context manager submits all jobs at once at the end.
57
+ for params in job_configs:
58
+ job = executor.submit(save_results, **params)
59
+ job_param = f"{params['category']}_N{params['num_images']}_{params['sample_num']}"
60
+ jobs.append((job_param, job))
61
+ jobs = {f"{job_param}_{job.job_id}": job for job_param, job in jobs}
62
+ submitit_job_watcher(jobs)
63
+ else:
64
+ for job_config in tqdm(job_configs):
65
+ # This is much slower.
66
+ save_results(**job_config)
67
+
68
+
69
+ def process_predictions(eval_path, pred_index, checkpoint=800_000, threshold_R=15, threshold_CC=0.1):
70
+ """
71
+ pred_index should be 1 (corresponding to T=90)
72
+ """
73
+ def aggregate_per_category(categories, metric_key, num_images, sample_num, threshold=None):
74
+ """
75
+ Aggregates one metric over all data points in a prediction file and then across categories.
76
+ - For R_error and CC_error: threshold each error and report the mean accuracy
77
+ - For CD and CD_Object: use median to reduce the effect of outliers
78
+ """
79
+ per_category_values = []
80
+
81
+ for category in tqdm(categories, desc=f"Sample {sample_num}, N={num_images}, {metric_key}"):
82
+ per_pred_values = []
83
+
84
+ data_path = glob(
85
+ os.path.join(eval_path, "eval", f"{category}_{num_images}_{sample_num}_ckpt{checkpoint}*.json")
86
+ )[0]
87
+
88
+ with open(data_path) as f:
89
+ eval_data = json.load(f)
90
+
91
+ for preds in eval_data.values():
92
+ if metric_key in ["R_error", "CC_error"]:
93
+ vals = np.array(preds[pred_index][metric_key])
94
+ per_pred_values.append(np.mean(vals < threshold))
95
+ else:
96
+ per_pred_values.append(preds[pred_index][metric_key])
97
+
98
+ # Aggregate over all predictions within this category
99
+ per_category_values.append(
100
+ np.mean(per_pred_values) if metric_key in ["R_error", "CC_error"]
101
+ else np.median(per_pred_values) # CD or CD_Object — use median to filter outliers
102
+ )
103
+
104
+ if metric_key in ["R_error", "CC_error"]:
105
+ return np.mean(per_category_values)
106
+ else:
107
+ return np.median(per_category_values)
108
+
109
+ def aggregate_metric(categories, metric_key, num_images, threshold=None):
110
+ """Aggregates one metric over 5 random samples per category and returns the final mean"""
111
+ return np.mean([
112
+ aggregate_per_category(categories, metric_key, num_images, sample_num, threshold=threshold)
113
+ for sample_num in range(5)
114
+ ])
115
+
116
+ # Output containers
117
+ all_seen_acc_R, all_seen_acc_CC = [], []
118
+ all_seen_CD, all_seen_CD_Object = [], []
119
+ all_unseen_acc_R, all_unseen_acc_CC = [], []
120
+ all_unseen_CD, all_unseen_CD_Object = [], []
121
+
122
+ for num_images in range(2, 9):
123
+ # Seen categories
124
+ all_seen_acc_R.append(
125
+ aggregate_metric(TRAINING_CATEGORIES, "R_error", num_images, threshold=threshold_R)
126
+ )
127
+ all_seen_acc_CC.append(
128
+ aggregate_metric(TRAINING_CATEGORIES, "CC_error", num_images, threshold=threshold_CC)
129
+ )
130
+ all_seen_CD.append(
131
+ aggregate_metric(TRAINING_CATEGORIES, "CD", num_images)
132
+ )
133
+ all_seen_CD_Object.append(
134
+ aggregate_metric(TRAINING_CATEGORIES, "CD_Object", num_images)
135
+ )
136
+
137
+ # Unseen categories
138
+ all_unseen_acc_R.append(
139
+ aggregate_metric(TEST_CATEGORIES, "R_error", num_images, threshold=threshold_R)
140
+ )
141
+ all_unseen_acc_CC.append(
142
+ aggregate_metric(TEST_CATEGORIES, "CC_error", num_images, threshold=threshold_CC)
143
+ )
144
+ all_unseen_CD.append(
145
+ aggregate_metric(TEST_CATEGORIES, "CD", num_images)
146
+ )
147
+ all_unseen_CD_Object.append(
148
+ aggregate_metric(TEST_CATEGORIES, "CD_Object", num_images)
149
+ )
150
+
151
+ # Print the results in formatted rows
152
+ print("N= ", " ".join(f"{i: 5}" for i in range(2, 9)))
153
+ print("Seen R ", " ".join([f"{x:0.3f}" for x in all_seen_acc_R]))
154
+ print("Seen CC ", " ".join([f"{x:0.3f}" for x in all_seen_acc_CC]))
155
+ print("Seen CD ", " ".join([f"{x:0.3f}" for x in all_seen_CD]))
156
+ print("Seen CD_Obj ", " ".join([f"{x:0.3f}" for x in all_seen_CD_Object]))
157
+ print("Unseen R ", " ".join([f"{x:0.3f}" for x in all_unseen_acc_R]))
158
+ print("Unseen CC ", " ".join([f"{x:0.3f}" for x in all_unseen_acc_CC]))
159
+ print("Unseen CD ", " ".join([f"{x:0.3f}" for x in all_unseen_CD]))
160
+ print("Unseen CD_Obj", " ".join([f"{x:0.3f}" for x in all_unseen_CD_Object]))
161
+
162
+
163
+ if __name__ == "__main__":
164
+ parser = argparse.ArgumentParser()
165
+ parser.add_argument("--eval_path", type=str, default=None)
166
+ parser.add_argument("--use_submitit", action="store_true")
167
+ parser.add_argument("--mode", type=str, default="test")
168
+ args = parser.parse_args()
169
+
170
+ eval_path = "output/multi_diffusionsfm_dense" if args.eval_path is None else args.eval_path
171
+ use_submitit = args.use_submitit
172
+ mode = args.mode
173
+
174
+ evaluate_diffusionsfm(eval_path, use_submitit, mode)
175
+ process_predictions(eval_path, 1)
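
For a single evaluation job outside the sweep, save_results can be called directly with one combination of the parameters enumerated in JOB_PARAMS. A minimal sketch (the output directory and checkpoint mirror the defaults above; the category name is assumed to be a valid CO3D category, and CO3D data plus a GPU are required):

    from diffusionsfm.eval.eval_category import save_results

    save_results(
        output_dir="output/multi_diffusionsfm_dense",
        checkpoint=800_000,
        category="ball",                      # assumed entry of TEST_CATEGORIES
        num_images=4,
        sample_num=0,
        calculate_additional_timesteps=True,
    )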
diffusionsfm/inference/__init__.py ADDED
File without changes
diffusionsfm/inference/ddim.py ADDED
@@ -0,0 +1,145 @@
1
+ import torch
2
+ import random
3
+ import numpy as np
4
+ from tqdm.auto import tqdm
5
+
6
+ from diffusionsfm.utils.rays import compute_ndc_coordinates
7
+
8
+
9
+ def inference_ddim(
10
+ model,
11
+ images,
12
+ device,
13
+ crop_parameters=None,
14
+ eta=0,
15
+ num_inference_steps=100,
16
+ pbar=True,
17
+ num_patches_x=16,
18
+ num_patches_y=16,
19
+ visualize=False,
20
+ seed=0,
21
+ ):
22
+ """
23
+ Implements DDIM-style inference.
24
+
25
+ To get multiple samples, batch the images multiple times.
26
+
27
+ Args:
28
+ model: Ray Diffuser.
29
+ images (torch.Tensor): (B, N, C, H, W).
30
+ crop_parameters (torch.Tensor, optional): Per-image crop parameters (B, N, 4),
31
+ used to compute the NDC conditioning coordinates when the model requires them.
32
+ eta (float, optional): Stochasticity coefficient. 0 is completely deterministic,
33
+ 1 is equivalent to DDPM. (Default: 0)
34
+ num_inference_steps (int, optional): Number of inference steps. (Default: 100)
35
+ pbar (bool, optional): Whether to show progress bar. (Default: True)
36
+ """
37
+ timesteps = model.noise_scheduler.compute_inference_timesteps(num_inference_steps)
38
+ batch_size = images.shape[0]
39
+ num_images = images.shape[1]
40
+
41
+ if isinstance(eta, list):
42
+ eta_0, eta_1 = float(eta[0]), float(eta[1])
43
+ else:
44
+ eta_0, eta_1 = 0, 0
45
+
46
+ # Fixing seed
47
+ if seed is not None:
48
+ torch.manual_seed(seed)
49
+ random.seed(seed)
50
+ np.random.seed(seed)
51
+
52
+ with torch.no_grad():
53
+ x_tau = torch.randn(
54
+ batch_size,
55
+ num_images,
56
+ model.ray_out if hasattr(model, "ray_out") else model.ray_dim,
57
+ num_patches_x,
58
+ num_patches_y,
59
+ device=device,
60
+ )
61
+
62
+ if visualize:
63
+ x_taus = [x_tau]
64
+ all_pred = []
65
+ noise_samples = []
66
+
67
+ image_features = model.feature_extractor(images, autoresize=True)
68
+
69
+ if model.append_ndc:
70
+ ndc_coordinates = compute_ndc_coordinates(
71
+ crop_parameters=crop_parameters,
72
+ no_crop_param_device="cpu",
73
+ num_patches_x=model.width,
74
+ num_patches_y=model.width,
75
+ distortion_coeffs=None,
76
+ )[..., :2].to(device)
77
+ ndc_coordinates = ndc_coordinates.permute(0, 1, 4, 2, 3)
78
+ else:
79
+ ndc_coordinates = None
80
+
81
+ loop = tqdm(range(len(timesteps))) if pbar else range(len(timesteps))
82
+ for t in loop:
83
+ tau = timesteps[t]
84
+
85
+ if tau > 0 and eta_1 > 0:
86
+ z = torch.randn(
87
+ batch_size,
88
+ num_images,
89
+ model.ray_out if hasattr(model, "ray_out") else model.ray_dim,
90
+ num_patches_x,
91
+ num_patches_y,
92
+ device=device,
93
+ )
94
+ else:
95
+ z = 0
96
+
97
+ alpha = model.noise_scheduler.alphas_cumprod[tau]
98
+ if tau > 0:
99
+ tau_prev = timesteps[t + 1]
100
+ alpha_prev = model.noise_scheduler.alphas_cumprod[tau_prev]
101
+ else:
102
+ alpha_prev = torch.tensor(1.0, device=device).float()
103
+
104
+ sigma_t = (
105
+ torch.sqrt((1 - alpha_prev) / (1 - alpha))
106
+ * torch.sqrt(1 - alpha / alpha_prev)
107
+ )
108
+
109
+ eps_pred, noise_sample = model(
110
+ features=image_features,
111
+ rays_noisy=x_tau,
112
+ t=int(tau),
113
+ ndc_coordinates=ndc_coordinates,
114
+ )
115
+
116
+ if model.use_homogeneous:
117
+ p1 = eps_pred[:, :, :4]
118
+ p2 = eps_pred[:, :, 4:]
119
+
120
+ c1 = torch.linalg.norm(p1, dim=2, keepdim=True)
121
+ c2 = torch.linalg.norm(p2, dim=2, keepdim=True)
122
+ eps_pred[:, :, :4] = p1 / c1
123
+ eps_pred[:, :, 4:] = p2 / c2
124
+
125
+ if visualize:
126
+ all_pred.append(eps_pred.clone())
127
+ noise_samples.append(noise_sample)
128
+
129
+ # TODO: Can simplify this a lot
130
+ x0_pred = eps_pred.clone()
131
+ eps_pred = (x_tau - torch.sqrt(alpha) * eps_pred) / torch.sqrt(
132
+ 1 - alpha
133
+ )
134
+
135
+ dir_x_tau = torch.sqrt(1 - alpha_prev - eta_0*sigma_t**2) * eps_pred
136
+ noise = eta_1 * sigma_t * z
137
+
138
+ new_x_tau = torch.sqrt(alpha_prev) * x0_pred + dir_x_tau + noise
139
+ x_tau = new_x_tau
140
+
141
+ if visualize:
142
+ x_taus.append(x_tau.detach().clone())
143
+ if visualize:
144
+ return x_tau, x_taus, all_pred, noise_samples
145
+ return x_tau
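
For reference, the loop above implements a generalized DDIM update in which the network output is treated as the clean-sample prediction \hat{x}_0; eta_0 = eta_1 = 0 gives deterministic DDIM and eta_0 = eta_1 = 1 gives DDPM-like sampling:

    \hat{\epsilon}_\tau = \frac{x_\tau - \sqrt{\bar\alpha_\tau}\,\hat{x}_0}{\sqrt{1 - \bar\alpha_\tau}},
    \qquad
    \sigma_\tau = \sqrt{\frac{1 - \bar\alpha_{\tau'}}{1 - \bar\alpha_\tau}}\,\sqrt{1 - \frac{\bar\alpha_\tau}{\bar\alpha_{\tau'}}},
    \qquad
    x_{\tau'} = \sqrt{\bar\alpha_{\tau'}}\,\hat{x}_0 + \sqrt{1 - \bar\alpha_{\tau'} - \eta_0\,\sigma_\tau^2}\;\hat{\epsilon}_\tau + \eta_1\,\sigma_\tau\, z,

where \tau' is the next (lower-noise) timestep in the inference schedule and z is standard Gaussian noise.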
diffusionsfm/inference/load_model.py ADDED
@@ -0,0 +1,97 @@
1
+ import os.path as osp
2
+ from glob import glob
3
+
4
+ import torch
5
+ from omegaconf import OmegaConf
6
+
7
+ from diffusionsfm.model.diffuser import RayDiffuser
8
+ from diffusionsfm.model.diffuser_dpt import RayDiffuserDPT
9
+ from diffusionsfm.model.scheduler import NoiseScheduler
10
+
11
+
12
+ def load_model(
13
+ output_dir, checkpoint=None, device="cuda:0", custom_keys=None, ignore_keys=()
14
+ ):
15
+ """
16
+ Loads a model and config from an output directory.
17
+
18
+ E.g. to load with different number of images,
19
+ ```
20
+ custom_keys={"model.num_images": 15}, ignore_keys=["pos_table"]
21
+ ```
22
+
23
+ Args:
24
+ output_dir (str): Path to the output directory.
25
+ checkpoint (str or int): Path to the checkpoint to load. If None, loads the
26
+ latest checkpoint.
27
+ device (str): Device to load the model on.
28
+ custom_keys (dict): Dictionary of custom keys to override in the config.
29
+ """
30
+ if checkpoint is None:
31
+ checkpoint_path = sorted(glob(osp.join(output_dir, "checkpoints", "*.pth")))[-1]
32
+ else:
33
+ if isinstance(checkpoint, int):
34
+ checkpoint_name = f"ckpt_{checkpoint:08d}.pth"
35
+ else:
36
+ checkpoint_name = checkpoint
37
+ checkpoint_path = osp.join(output_dir, "checkpoints", checkpoint_name)
38
+ print("Loading checkpoint", osp.basename(checkpoint_path))
39
+
40
+ cfg = OmegaConf.load(osp.join(output_dir, "hydra", "config.yaml"))
41
+ if custom_keys is not None:
42
+ for k, v in custom_keys.items():
43
+ OmegaConf.update(cfg, k, v)
44
+ noise_scheduler = NoiseScheduler(
45
+ type=cfg.noise_scheduler.type,
46
+ max_timesteps=cfg.noise_scheduler.max_timesteps,
47
+ beta_start=cfg.noise_scheduler.beta_start,
48
+ beta_end=cfg.noise_scheduler.beta_end,
49
+ )
50
+
51
+ if not cfg.training.get("dpt_head", False):
52
+ model = RayDiffuser(
53
+ depth=cfg.model.depth,
54
+ width=cfg.model.num_patches_x,
55
+ P=1,
56
+ max_num_images=cfg.model.num_images,
57
+ noise_scheduler=noise_scheduler,
58
+ feature_extractor=cfg.model.feature_extractor,
59
+ append_ndc=cfg.model.append_ndc,
60
+ diffuse_depths=cfg.training.get("diffuse_depths", False),
61
+ depth_resolution=cfg.training.get("depth_resolution", 1),
62
+ use_homogeneous=cfg.model.get("use_homogeneous", False),
63
+ cond_depth_mask=cfg.model.get("cond_depth_mask", False),
64
+ ).to(device)
65
+ else:
66
+ model = RayDiffuserDPT(
67
+ depth=cfg.model.depth,
68
+ width=cfg.model.num_patches_x,
69
+ P=1,
70
+ max_num_images=cfg.model.num_images,
71
+ noise_scheduler=noise_scheduler,
72
+ feature_extractor=cfg.model.feature_extractor,
73
+ append_ndc=cfg.model.append_ndc,
74
+ diffuse_depths=cfg.training.get("diffuse_depths", False),
75
+ depth_resolution=cfg.training.get("depth_resolution", 1),
76
+ encoder_features=cfg.training.get("dpt_encoder_features", False),
77
+ use_homogeneous=cfg.model.get("use_homogeneous", False),
78
+ cond_depth_mask=cfg.model.get("cond_depth_mask", False),
79
+ ).to(device)
80
+
81
+ data = torch.load(checkpoint_path)
82
+ state_dict = {}
83
+ for k, v in data["state_dict"].items():
84
+ include = True
85
+ for ignore_key in ignore_keys:
86
+ if ignore_key in k:
87
+ include = False
88
+ if include:
89
+ state_dict[k] = v
90
+
91
+ missing, unexpected = model.load_state_dict(state_dict, strict=False)
92
+ if len(missing) > 0:
93
+ print("Missing keys:", missing)
94
+ if len(unexpected) > 0:
95
+ print("Unexpected keys:", unexpected)
96
+ model = model.eval()
97
+ return model, cfg
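
A minimal usage sketch (the output directory and checkpoint number mirror the values used by the evaluation scripts; the device is an assumption):

    from diffusionsfm.inference.load_model import load_model

    # Reads <output_dir>/hydra/config.yaml and <output_dir>/checkpoints/ckpt_00800000.pth.
    model, cfg = load_model(
        "output/multi_diffusionsfm_dense",
        checkpoint=800_000,
        device="cuda:0",
    )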
diffusionsfm/inference/predict.py ADDED
@@ -0,0 +1,93 @@
1
+ from diffusionsfm.inference.ddim import inference_ddim
2
+ from diffusionsfm.utils.rays import (
3
+ Rays,
4
+ rays_to_cameras,
5
+ rays_to_cameras_homography,
6
+ )
7
+
8
+
9
+ def predict_cameras(
10
+ model,
11
+ images,
12
+ device,
13
+ crop_parameters=None,
14
+ num_patches_x=16,
15
+ num_patches_y=16,
16
+ additional_timesteps=(),
17
+ calculate_intrinsics=False,
18
+ max_num_images=None,
19
+ mode=None,
20
+ return_rays=False,
21
+ use_homogeneous=False,
22
+ seed=0,
23
+ ):
24
+ """
25
+ Args:
26
+ images (torch.Tensor): (N, C, H, W)
27
+ crop_parameters (torch.Tensor): (N, 4) or None
28
+ """
29
+ if calculate_intrinsics:
30
+ ray_to_cam = rays_to_cameras_homography
31
+ else:
32
+ ray_to_cam = rays_to_cameras
33
+
34
+ get_spatial_rays = Rays.from_spatial
35
+
36
+ rays_final, rays_intermediate, pred_intermediate, _ = inference_ddim(
37
+ model,
38
+ images.unsqueeze(0),
39
+ device,
40
+ visualize=True,
41
+ crop_parameters=crop_parameters.unsqueeze(0),
42
+ num_patches_x=num_patches_x,
43
+ num_patches_y=num_patches_y,
44
+ pbar=False,
45
+ eta=[1, 0],
46
+ num_inference_steps=100,
47
+ )
48
+
49
+ spatial_rays = get_spatial_rays(
50
+ rays_final[0],
51
+ mode=mode,
52
+ num_patches_x=num_patches_x,
53
+ num_patches_y=num_patches_y,
54
+ use_homogeneous=use_homogeneous,
55
+ )
56
+
57
+ pred_cam = ray_to_cam(
58
+ spatial_rays,
59
+ crop_parameters,
60
+ num_patches_x=num_patches_x,
61
+ num_patches_y=num_patches_y,
62
+ depth_resolution=model.depth_resolution,
63
+ average_centers=True,
64
+ directions_from_averaged_center=True,
65
+ )
66
+
67
+ additional_predictions = []
68
+ for t in additional_timesteps:
69
+ ray = pred_intermediate[t]
70
+
71
+ ray = get_spatial_rays(
72
+ ray[0],
73
+ mode=mode,
74
+ num_patches_x=num_patches_x,
75
+ num_patches_y=num_patches_y,
76
+ use_homogeneous=use_homogeneous,
77
+ )
78
+
79
+ cam = ray_to_cam(
80
+ ray,
81
+ crop_parameters,
82
+ num_patches_x=num_patches_x,
83
+ num_patches_y=num_patches_y,
84
+ average_centers=True,
85
+ directions_from_averaged_center=True,
86
+ )
87
+ if return_rays:
88
+ cam = (cam, ray)
89
+ additional_predictions.append(cam)
90
+
91
+ if return_rays:
92
+ return (pred_cam, spatial_rays), additional_predictions
93
+ return pred_cam, additional_predictions, spatial_rays
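
A hedged end-to-end sketch combining load_model and predict_cameras; the dummy tensors only illustrate the documented shapes, and real images should be preprocessed (resized, cropped, normalized) the same way the training dataloader does:

    import torch
    from diffusionsfm.inference.load_model import load_model
    from diffusionsfm.inference.predict import predict_cameras

    device = "cuda:0"
    model, cfg = load_model("output/multi_diffusionsfm_dense", checkpoint=800_000, device=device)

    images = torch.randn(4, 3, 224, 224, device=device)    # (N, C, H, W)
    crop_parameters = torch.zeros(4, 4, device=device)     # (N, 4)

    pred_cam, additional_preds, spatial_rays = predict_cameras(
        model,
        images,
        device,
        crop_parameters=crop_parameters,
        calculate_intrinsics=True,
        use_homogeneous=cfg.model.get("use_homogeneous", False),
        # mode is left at its default here; it may need to match the training setup.
    )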
diffusionsfm/model/base_model.py ADDED
@@ -0,0 +1,16 @@
1
+ import torch
2
+
3
+
4
+ class BaseModel(torch.nn.Module):
5
+ def load(self, path):
6
+ """Load model from file.
7
+
8
+ Args:
9
+ path (str): file path
10
+ """
11
+ parameters = torch.load(path, map_location=torch.device("cpu"))
12
+
13
+ if "optimizer" in parameters:
14
+ parameters = parameters["model"]
15
+
16
+ self.load_state_dict(parameters)
diffusionsfm/model/blocks.py ADDED
@@ -0,0 +1,247 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ from diffusionsfm.model.dit import TimestepEmbedder
4
+ import ipdb
5
+
6
+
7
+ def modulate(x, shift, scale):
8
+ return x * (1 + scale.unsqueeze(-1).unsqueeze(-1)) + shift.unsqueeze(-1).unsqueeze(
9
+ -1
10
+ )
11
+
12
+
13
+ def _make_fusion_block(features, use_bn, use_ln, dpt_time, resolution):
14
+ return FeatureFusionBlock_custom(
15
+ features,
16
+ nn.ReLU(False),
17
+ deconv=False,
18
+ bn=use_bn,
19
+ expand=False,
20
+ align_corners=True,
21
+ dpt_time=dpt_time,
22
+ ln=use_ln,
23
+ resolution=resolution
24
+ )
25
+
26
+
27
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
28
+ scratch = nn.Module()
29
+
30
+ out_shape1 = out_shape
31
+ out_shape2 = out_shape
32
+ out_shape3 = out_shape
33
+ out_shape4 = out_shape
34
+ if expand == True:
35
+ out_shape1 = out_shape
36
+ out_shape2 = out_shape * 2
37
+ out_shape3 = out_shape * 4
38
+ out_shape4 = out_shape * 8
39
+
40
+ scratch.layer1_rn = nn.Conv2d(
41
+ in_shape[0],
42
+ out_shape1,
43
+ kernel_size=3,
44
+ stride=1,
45
+ padding=1,
46
+ bias=False,
47
+ groups=groups,
48
+ )
49
+ scratch.layer2_rn = nn.Conv2d(
50
+ in_shape[1],
51
+ out_shape2,
52
+ kernel_size=3,
53
+ stride=1,
54
+ padding=1,
55
+ bias=False,
56
+ groups=groups,
57
+ )
58
+ scratch.layer3_rn = nn.Conv2d(
59
+ in_shape[2],
60
+ out_shape3,
61
+ kernel_size=3,
62
+ stride=1,
63
+ padding=1,
64
+ bias=False,
65
+ groups=groups,
66
+ )
67
+ scratch.layer4_rn = nn.Conv2d(
68
+ in_shape[3],
69
+ out_shape4,
70
+ kernel_size=3,
71
+ stride=1,
72
+ padding=1,
73
+ bias=False,
74
+ groups=groups,
75
+ )
76
+
77
+ return scratch
78
+
79
+
80
+ class ResidualConvUnit_custom(nn.Module):
81
+ """Residual convolution module."""
82
+
83
+ def __init__(self, features, activation, bn, ln, dpt_time=False, resolution=16):
84
+ """Init.
85
+
86
+ Args:
87
+ features (int): number of features
88
+ """
89
+ super().__init__()
90
+
91
+ self.bn = bn
92
+ self.ln = ln
93
+
94
+ self.groups = 1
95
+
96
+ self.conv1 = nn.Conv2d(
97
+ features,
98
+ features,
99
+ kernel_size=3,
100
+ stride=1,
101
+ padding=1,
102
+ bias=not self.bn,
103
+ groups=self.groups,
104
+ )
105
+
106
+ self.conv2 = nn.Conv2d(
107
+ features,
108
+ features,
109
+ kernel_size=3,
110
+ stride=1,
111
+ padding=1,
112
+ bias=not self.bn,
113
+ groups=self.groups,
114
+ )
115
+
116
+ nn.init.kaiming_uniform_(self.conv1.weight)
117
+ nn.init.kaiming_uniform_(self.conv2.weight)
118
+
119
+ if self.bn == True:
120
+ self.bn1 = nn.BatchNorm2d(features)
121
+ self.bn2 = nn.BatchNorm2d(features)
122
+
123
+ if self.ln == True:
124
+ self.bn1 = nn.LayerNorm((features, resolution, resolution))
125
+ self.bn2 = nn.LayerNorm((features, resolution, resolution))
126
+
127
+ self.activation = activation
128
+
129
+ if dpt_time:
130
+ self.t_embedder = TimestepEmbedder(hidden_size=features)
131
+ self.adaLN_modulation = nn.Sequential(
132
+ nn.SiLU(), nn.Linear(features, 3 * features, bias=True)
133
+ )
134
+
135
+ def forward(self, x, t=None):
136
+ """Forward pass.
137
+
138
+ Args:
139
+ x (tensor): input
140
+
141
+ Returns:
142
+ tensor: output
143
+ """
144
+ if t is not None:
145
+ # Embed timestamp & calculate shift parameters
146
+ t = self.t_embedder(t) # (B*N)
147
+ shift, scale, gate = self.adaLN_modulation(t).chunk(3, dim=1) # (B * N, T)
148
+
149
+ # Shift & scale x
150
+ x = modulate(x, shift, scale) # (B * N, T, H, W)
151
+
152
+ out = self.activation(x)
153
+ out = self.conv1(out)
154
+ if self.bn or self.ln:
155
+ out = self.bn1(out)
156
+
157
+ out = self.activation(out)
158
+ out = self.conv2(out)
159
+ if self.bn or self.ln:
160
+ out = self.bn2(out)
161
+
162
+ if self.groups > 1:
163
+ out = self.conv_merge(out)
164
+
165
+ if t is not None:
166
+ out = gate.unsqueeze(-1).unsqueeze(-1) * out
167
+
168
+ return out + x
169
+
170
+
171
+ class FeatureFusionBlock_custom(nn.Module):
172
+ """Feature fusion block."""
173
+
174
+ def __init__(
175
+ self,
176
+ features,
177
+ activation,
178
+ deconv=False,
179
+ bn=False,
180
+ ln=False,
181
+ expand=False,
182
+ align_corners=True,
183
+ dpt_time=False,
184
+ resolution=16,
185
+ ):
186
+ """Init.
187
+
188
+ Args:
189
+ features (int): number of features
190
+ """
191
+ super(FeatureFusionBlock_custom, self).__init__()
192
+
193
+ self.deconv = deconv
194
+ self.align_corners = align_corners
195
+
196
+ self.groups = 1
197
+
198
+ self.expand = expand
199
+ out_features = features
200
+ if self.expand == True:
201
+ out_features = features // 2
202
+
203
+ self.out_conv = nn.Conv2d(
204
+ features,
205
+ out_features,
206
+ kernel_size=1,
207
+ stride=1,
208
+ padding=0,
209
+ bias=True,
210
+ groups=1,
211
+ )
212
+
213
+ nn.init.kaiming_uniform_(self.out_conv.weight)
214
+
215
+ # The second block sees time
216
+ self.resConfUnit1 = ResidualConvUnit_custom(
217
+ features, activation, bn=bn, ln=ln, dpt_time=False, resolution=resolution
218
+ )
219
+ self.resConfUnit2 = ResidualConvUnit_custom(
220
+ features, activation, bn=bn, ln=ln, dpt_time=dpt_time, resolution=resolution
221
+ )
222
+
223
+ def forward(self, input, activation=None, t=None):
224
+ """Forward pass.
225
+
226
+ Returns:
227
+ tensor: output
228
+ """
229
+ output = input
230
+
231
+ if activation is not None:
232
+ res = self.resConfUnit1(activation)
233
+
234
+ output += res
235
+
236
+ output = self.resConfUnit2(output, t)
237
+
238
+ output = torch.nn.functional.interpolate(
239
+ output.float(),
240
+ scale_factor=2,
241
+ mode="bilinear",
242
+ align_corners=self.align_corners,
243
+ )
244
+
245
+ output = self.out_conv(output)
246
+
247
+ return output
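
A small shape check for the fusion block defined above (feature width, spatial resolution, and timestep values are arbitrary assumptions):

    import torch
    from diffusionsfm.model.blocks import _make_fusion_block

    block = _make_fusion_block(features=256, use_bn=False, use_ln=False, dpt_time=True, resolution=16)
    x = torch.randn(2, 256, 16, 16)    # (B*N, features, H, W)
    t = torch.randint(0, 100, (2,))    # one timestep per item
    out = block(x, t=t)                # bilinearly upsampled by 2x inside the block
    print(out.shape)                   # torch.Size([2, 256, 32, 32])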
diffusionsfm/model/diffuser.py ADDED
@@ -0,0 +1,195 @@
1
+ import ipdb # noqa: F401
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from diffusionsfm.model.dit import DiT
7
+ from diffusionsfm.model.feature_extractors import PretrainedVAE, SpatialDino
8
+ from diffusionsfm.model.scheduler import NoiseScheduler
9
+
10
+
11
+ class RayDiffuser(nn.Module):
12
+ def __init__(
13
+ self,
14
+ model_type="dit",
15
+ depth=8,
16
+ width=16,
17
+ hidden_size=1152,
18
+ P=1,
19
+ max_num_images=1,
20
+ noise_scheduler=None,
21
+ freeze_encoder=True,
22
+ feature_extractor="dino",
23
+ append_ndc=True,
24
+ use_unconditional=False,
25
+ diffuse_depths=False,
26
+ depth_resolution=1,
27
+ use_homogeneous=False,
28
+ cond_depth_mask=False,
29
+ ):
30
+ super().__init__()
31
+ if noise_scheduler is None:
32
+ self.noise_scheduler = NoiseScheduler()
33
+ else:
34
+ self.noise_scheduler = noise_scheduler
35
+
36
+ self.diffuse_depths = diffuse_depths
37
+ self.depth_resolution = depth_resolution
38
+ self.use_homogeneous = use_homogeneous
39
+
40
+ self.ray_dim = 3
41
+ if self.use_homogeneous:
42
+ self.ray_dim += 1
43
+
44
+ self.ray_dim += self.ray_dim * self.depth_resolution**2
45
+
46
+ if self.diffuse_depths:
47
+ self.ray_dim += 1
48
+
49
+ self.append_ndc = append_ndc
50
+ self.width = width
51
+
52
+ self.max_num_images = max_num_images
53
+ self.model_type = model_type
54
+ self.use_unconditional = use_unconditional
55
+ self.cond_depth_mask = cond_depth_mask
56
+
57
+ if feature_extractor == "dino":
58
+ self.feature_extractor = SpatialDino(
59
+ freeze_weights=freeze_encoder, num_patches_x=width, num_patches_y=width
60
+ )
61
+ self.feature_dim = self.feature_extractor.feature_dim
62
+ elif feature_extractor == "vae":
63
+ self.feature_extractor = PretrainedVAE(
64
+ freeze_weights=freeze_encoder, num_patches_x=width, num_patches_y=width
65
+ )
66
+ self.feature_dim = self.feature_extractor.feature_dim
67
+ else:
68
+ raise Exception(f"Unknown feature extractor {feature_extractor}")
69
+
70
+ if self.use_unconditional:
71
+ self.register_parameter(
72
+ "null_token", nn.Parameter(torch.randn(self.feature_dim, 1, 1))
73
+ )
74
+
75
+ self.input_dim = self.feature_dim * 2
76
+
77
+ if self.append_ndc:
78
+ self.input_dim += 2
79
+
80
+ if model_type == "dit":
81
+ self.ray_predictor = DiT(
82
+ in_channels=self.input_dim,
83
+ out_channels=self.ray_dim,
84
+ width=width,
85
+ depth=depth,
86
+ hidden_size=hidden_size,
87
+ max_num_images=max_num_images,
88
+ P=P,
89
+ )
90
+
91
+ self.scratch = nn.Module()
92
+ self.scratch.input_conv = nn.Linear(self.ray_dim + int(self.cond_depth_mask), self.feature_dim)
93
+
94
+ def forward_noise(
95
+ self, x, t, epsilon=None, zero_out_mask=None
96
+ ):
97
+ """
98
+ Applies forward diffusion (adds noise) to the input.
99
+
100
+ If a mask is provided, the noise is only applied to the masked inputs.
101
+ """
102
+ t = t.reshape(-1, 1, 1, 1, 1)
103
+
104
+ if epsilon is None:
105
+ epsilon = torch.randn_like(x)
106
+ else:
107
+ epsilon = epsilon.reshape(x.shape)
108
+
109
+ alpha_bar = self.noise_scheduler.alphas_cumprod[t]
110
+ x_noise = torch.sqrt(alpha_bar) * x + torch.sqrt(1 - alpha_bar) * epsilon
111
+
112
+ if zero_out_mask is not None and self.cond_depth_mask:
113
+ x_noise = x_noise * zero_out_mask
114
+
115
+ return x_noise, epsilon
116
+
117
+ def forward(
118
+ self,
119
+ features=None,
120
+ images=None,
121
+ rays=None,
122
+ rays_noisy=None,
123
+ t=None,
124
+ ndc_coordinates=None,
125
+ unconditional_mask=None,
126
+ return_dpt_activations=False,
127
+ depth_mask=None,
128
+ ):
129
+ """
130
+ Args:
131
+ images: (B, N, 3, H, W).
132
+ t: (B,).
133
+ rays: (B, N, 6, H, W).
134
+ rays_noisy: (B, N, 6, H, W).
135
+ ndc_coordinates: (B, N, 2, H, W).
136
+ unconditional_mask: (B, N) or (B,). Should be 1 for unconditional samples
137
+ and 0 else.
138
+ """
139
+
140
+ if features is None:
141
+ # VAE expects 256x256 images while DINO expects 224x224 images.
142
+ # Both feature extractors support autoresize=True, but ideally we should
143
+ # set this to be false and handle in the dataloader.
144
+ features = self.feature_extractor(images, autoresize=True)
145
+
146
+ B = features.shape[0]
147
+
148
+ if (
149
+ unconditional_mask is not None
150
+ and self.use_unconditional
151
+ ):
152
+ null_token = self.null_token.reshape(1, 1, self.feature_dim, 1, 1)
153
+ unconditional_mask = unconditional_mask.reshape(B, -1, 1, 1, 1)
154
+ features = (
155
+ features * (1 - unconditional_mask) + null_token * unconditional_mask
156
+ )
157
+
158
+ if isinstance(t, int) or isinstance(t, np.int64):
159
+ t = torch.ones(1, dtype=int).to(features.device) * t
160
+ else:
161
+ t = t.reshape(B)
162
+
163
+ if rays_noisy is None:
164
+ if self.cond_depth_mask:
165
+ rays_noisy, epsilon = self.forward_noise(rays, t, zero_out_mask=depth_mask.unsqueeze(2))
166
+ else:
167
+ rays_noisy, epsilon = self.forward_noise(rays, t)
168
+ else:
169
+ epsilon = None
170
+
171
+ if self.cond_depth_mask:
172
+ if depth_mask is None:
173
+ depth_mask = torch.ones_like(rays_noisy[:, :, 0])
174
+ ray_repr = torch.cat([rays_noisy, depth_mask.unsqueeze(2)], dim=2)
175
+ else:
176
+ ray_repr = rays_noisy
177
+
178
+ ray_repr = ray_repr.permute(0, 1, 3, 4, 2)
179
+ ray_repr = self.scratch.input_conv(ray_repr).permute(0, 1, 4, 2, 3).contiguous()
180
+
181
+ scene_features = torch.cat([features, ray_repr], dim=2)
182
+
183
+ if self.append_ndc:
184
+ scene_features = torch.cat([scene_features, ndc_coordinates], dim=2)
185
+
186
+ epsilon_pred = self.ray_predictor(
187
+ scene_features,
188
+ t,
189
+ return_dpt_activations=return_dpt_activations,
190
+ )
191
+
192
+ if return_dpt_activations:
193
+ return epsilon_pred, rays_noisy, epsilon
194
+
195
+ return epsilon_pred, epsilon
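
A minimal dummy forward pass for RayDiffuser with the default flags, so ray_dim = 3 + 3 * depth_resolution**2 = 6. append_ndc is disabled purely to avoid building NDC coordinates in this sketch, and a CUDA device is assumed because the DiT uses the xformers attention path when xformers is installed:

    import torch
    from diffusionsfm.model.diffuser import RayDiffuser
    from diffusionsfm.model.scheduler import NoiseScheduler

    device = "cuda:0"
    model = RayDiffuser(
        depth=8,
        width=16,
        max_num_images=8,
        noise_scheduler=NoiseScheduler(),
        append_ndc=False,
    ).to(device)

    images = torch.randn(1, 2, 3, 224, 224, device=device)  # ImageNet-normalized in practice
    rays = torch.randn(1, 2, 6, 16, 16, device=device)      # (B, N, ray_dim, H, W)
    t = torch.tensor([50], device=device)

    eps_pred, epsilon = model(images=images, rays=rays, t=t)
    print(eps_pred.shape)                                    # torch.Size([1, 2, 6, 16, 16])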
diffusionsfm/model/diffuser_dpt.py ADDED
@@ -0,0 +1,331 @@
1
+ import ipdb # noqa: F401
2
+ import numpy as np
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from diffusionsfm.model.dit import DiT
7
+ from diffusionsfm.model.feature_extractors import PretrainedVAE, SpatialDino
8
+ from diffusionsfm.model.blocks import _make_fusion_block, _make_scratch
9
+ from diffusionsfm.model.scheduler import NoiseScheduler
10
+
11
+
12
+ # functional implementation
13
+ def nearest_neighbor_upsample(x: torch.Tensor, scale_factor: int):
14
+ """Upsample {x} (NCHW) by scale factor {scale_factor} using nearest neighbor interpolation."""
15
+ s = scale_factor
16
+ return (
17
+ x.reshape(*x.shape, 1, 1)
18
+ .expand(*x.shape, s, s)
19
+ .transpose(-2, -3)
20
+ .reshape(*x.shape[:2], *(s * hw for hw in x.shape[2:]))
21
+ )
22
+
23
+
24
+ class ProjectReadout(nn.Module):
25
+ def __init__(self, in_features, start_index=1):
26
+ super(ProjectReadout, self).__init__()
27
+ self.start_index = start_index
28
+
29
+ self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
30
+
31
+ def forward(self, x):
32
+ readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
33
+ features = torch.cat((x[:, self.start_index :], readout), -1)
34
+
35
+ return self.project(features)
36
+
37
+
38
+ class RayDiffuserDPT(nn.Module):
39
+ def __init__(
40
+ self,
41
+ model_type="dit",
42
+ depth=8,
43
+ width=16,
44
+ hidden_size=1152,
45
+ P=1,
46
+ max_num_images=1,
47
+ noise_scheduler=None,
48
+ freeze_encoder=True,
49
+ feature_extractor="dino",
50
+ append_ndc=True,
51
+ use_unconditional=False,
52
+ diffuse_depths=False,
53
+ depth_resolution=1,
54
+ encoder_features=False,
55
+ use_homogeneous=False,
56
+ freeze_transformer=False,
57
+ cond_depth_mask=False,
58
+ ):
59
+ super().__init__()
60
+ if noise_scheduler is None:
61
+ self.noise_scheduler = NoiseScheduler()
62
+ else:
63
+ self.noise_scheduler = noise_scheduler
64
+
65
+ self.diffuse_depths = diffuse_depths
66
+ self.depth_resolution = depth_resolution
67
+ self.use_homogeneous = use_homogeneous
68
+
69
+ self.ray_dim = 3
70
+
71
+ if self.use_homogeneous:
72
+ self.ray_dim += 1
73
+ self.ray_dim += self.ray_dim * self.depth_resolution**2
74
+
75
+ if self.diffuse_depths:
76
+ self.ray_dim += 1
77
+
78
+ self.append_ndc = append_ndc
79
+ self.width = width
80
+
81
+ self.max_num_images = max_num_images
82
+ self.model_type = model_type
83
+ self.use_unconditional = use_unconditional
84
+ self.cond_depth_mask = cond_depth_mask
85
+ self.encoder_features = encoder_features
86
+
87
+ if feature_extractor == "dino":
88
+ self.feature_extractor = SpatialDino(
89
+ freeze_weights=freeze_encoder,
90
+ num_patches_x=width,
91
+ num_patches_y=width,
92
+ activation_hooks=self.encoder_features,
93
+ )
94
+ self.feature_dim = self.feature_extractor.feature_dim
95
+ elif feature_extractor == "vae":
96
+ self.feature_extractor = PretrainedVAE(
97
+ freeze_weights=freeze_encoder, num_patches_x=width, num_patches_y=width
98
+ )
99
+ self.feature_dim = self.feature_extractor.feature_dim
100
+ else:
101
+ raise Exception(f"Unknown feature extractor {feature_extractor}")
102
+
103
+ if self.use_unconditional:
104
+ self.register_parameter(
105
+ "null_token", nn.Parameter(torch.randn(self.feature_dim, 1, 1))
106
+ )
107
+
108
+ self.input_dim = self.feature_dim * 2
109
+
110
+ if self.append_ndc:
111
+ self.input_dim += 2
112
+
113
+ if model_type == "dit":
114
+ self.ray_predictor = DiT(
115
+ in_channels=self.input_dim,
116
+ out_channels=self.ray_dim,
117
+ width=width,
118
+ depth=depth,
119
+ hidden_size=hidden_size,
120
+ max_num_images=max_num_images,
121
+ P=P,
122
+ )
123
+
124
+ if freeze_transformer:
125
+ for param in self.ray_predictor.parameters():
126
+ param.requires_grad = False
127
+
128
+ # Fusion blocks
129
+ self.f = 256
130
+
131
+ if self.encoder_features:
132
+ feature_lens = [
133
+ self.feature_extractor.feature_dim,
134
+ self.feature_extractor.feature_dim,
135
+ self.ray_predictor.hidden_size,
136
+ self.ray_predictor.hidden_size,
137
+ ]
138
+ else:
139
+ feature_lens = [self.ray_predictor.hidden_size] * 4
140
+
141
+ self.scratch = _make_scratch(feature_lens, 256, groups=1, expand=False)
142
+ self.scratch.refinenet1 = _make_fusion_block(
143
+ self.f, use_bn=False, use_ln=False, dpt_time=True, resolution=128
144
+ )
145
+ self.scratch.refinenet2 = _make_fusion_block(
146
+ self.f, use_bn=False, use_ln=False, dpt_time=True, resolution=64
147
+ )
148
+ self.scratch.refinenet3 = _make_fusion_block(
149
+ self.f, use_bn=False, use_ln=False, dpt_time=True, resolution=32
150
+ )
151
+ self.scratch.refinenet4 = _make_fusion_block(
152
+ self.f, use_bn=False, use_ln=False, dpt_time=True, resolution=16
153
+ )
154
+
155
+ self.scratch.input_conv = nn.Conv2d(
156
+ self.ray_dim + int(self.cond_depth_mask),
157
+ self.feature_dim,
158
+ kernel_size=16,
159
+ stride=16,
160
+ padding=0
161
+ )
162
+
163
+ self.scratch.output_conv = nn.Sequential(
164
+ nn.Conv2d(self.f, self.f // 2, kernel_size=3, stride=1, padding=1),
165
+ nn.LeakyReLU(),
166
+ nn.Conv2d(self.f // 2, 32, kernel_size=3, stride=1, padding=1),
167
+ nn.LeakyReLU(),
168
+ nn.Conv2d(32, self.ray_dim, kernel_size=1, stride=1, padding=0),
169
+ nn.Identity(),
170
+ )
171
+
172
+ if self.encoder_features:
173
+ self.project_opers = nn.ModuleList([
174
+ ProjectReadout(in_features=self.feature_extractor.feature_dim),
175
+ ProjectReadout(in_features=self.feature_extractor.feature_dim),
176
+ ])
177
+
178
+ def forward_noise(
179
+ self, x, t, epsilon=None, zero_out_mask=None
180
+ ):
181
+ """
182
+ Applies forward diffusion (adds noise) to the input.
183
+
184
+ If a mask is provided, the noise is only applied to the masked inputs.
185
+ """
186
+ t = t.reshape(-1, 1, 1, 1, 1)
187
+ if epsilon is None:
188
+ epsilon = torch.randn_like(x)
189
+ else:
190
+ epsilon = epsilon.reshape(x.shape)
191
+
192
+ alpha_bar = self.noise_scheduler.alphas_cumprod[t]
193
+ x_noise = torch.sqrt(alpha_bar) * x + torch.sqrt(1 - alpha_bar) * epsilon
194
+
195
+ if zero_out_mask is not None and self.cond_depth_mask:
196
+ x_noise = zero_out_mask * x_noise
197
+
198
+ return x_noise, epsilon
199
+
200
+ def forward(
201
+ self,
202
+ features=None,
203
+ images=None,
204
+ rays=None,
205
+ rays_noisy=None,
206
+ t=None,
207
+ ndc_coordinates=None,
208
+ unconditional_mask=None,
209
+ encoder_patches=16,
210
+ depth_mask=None,
211
+ multiview_unconditional=False,
212
+ indices=None,
213
+ ):
214
+ """
215
+ Args:
216
+ images: (B, N, 3, H, W).
217
+ t: (B,).
218
+ rays: (B, N, 6, H, W).
219
+ rays_noisy: (B, N, 6, H, W).
220
+ ndc_coordinates: (B, N, 2, H, W).
221
+ unconditional_mask: (B, N) or (B,). Should be 1 for unconditional samples
222
+ and 0 else.
223
+ """
224
+
225
+ if features is None:
226
+ # VAE expects 256x256 images while DINO expects 224x224 images.
227
+ # Both feature extractors support autoresize=True, but ideally we should
228
+ # set this to be false and handle in the dataloader.
229
+ features = self.feature_extractor(images, autoresize=True)
230
+
231
+ B = features.shape[0]
232
+
233
+ if unconditional_mask is not None and self.use_unconditional:
234
+ null_token = self.null_token.reshape(1, 1, self.feature_dim, 1, 1)
235
+ unconditional_mask = unconditional_mask.reshape(B, -1, 1, 1, 1)
236
+ features = (
237
+ features * (1 - unconditional_mask) + null_token * unconditional_mask
238
+ )
239
+
240
+ if isinstance(t, int) or isinstance(t, np.int64):
241
+ t = torch.ones(1, dtype=int).to(features.device) * t
242
+ else:
243
+ t = t.reshape(B)
244
+
245
+ if rays_noisy is None:
246
+ if self.cond_depth_mask:
247
+ rays_noisy, epsilon = self.forward_noise(
248
+ rays, t, zero_out_mask=depth_mask.unsqueeze(2)
249
+ )
250
+ else:
251
+ rays_noisy, epsilon = self.forward_noise(
252
+ rays, t
253
+ )
254
+ else:
255
+ epsilon = None
256
+
257
+ # DOWNSAMPLE RAYS
258
+ B, N, C, H, W = rays_noisy.shape
259
+
260
+ if self.cond_depth_mask:
261
+ if depth_mask is None:
262
+ depth_mask = torch.ones_like(rays_noisy[:, :, 0])
263
+ ray_repr = torch.cat([rays_noisy, depth_mask.unsqueeze(2)], dim=2)
264
+ else:
265
+ ray_repr = rays_noisy
266
+
267
+ ray_repr = self.scratch.input_conv(ray_repr.reshape(B * N, -1, H, W))
268
+ _, CP, HP, WP = ray_repr.shape
269
+ ray_repr = ray_repr.reshape(B, N, CP, HP, WP)
270
+ scene_features = torch.cat([features, ray_repr], dim=2)
271
+
272
+ if self.append_ndc:
273
+ scene_features = torch.cat([scene_features, ndc_coordinates], dim=2)
274
+
275
+ # DIT FORWARD PASS
276
+ activations = self.ray_predictor(
277
+ scene_features,
278
+ t,
279
+ return_dpt_activations=True,
280
+ multiview_unconditional=multiview_unconditional,
281
+ )
282
+
283
+ # PROJECT ENCODER ACTIVATIONS & RESHAPE
284
+ if self.encoder_features:
285
+ for i in range(2):
286
+ name = f"encoder{i+1}"
287
+
288
+ if indices is not None:
289
+ act = self.feature_extractor.activations[name][indices]
290
+ else:
291
+ act = self.feature_extractor.activations[name]
292
+
293
+ act = self.project_opers[i](act).permute(0, 2, 1)
294
+ act = act.reshape(
295
+ (
296
+ B * N,
297
+ self.feature_extractor.feature_dim,
298
+ encoder_patches,
299
+ encoder_patches,
300
+ )
301
+ )
302
+ activations[i] = act
303
+
304
+ # UPSAMPLE ACTIVATIONS
305
+ for i, act in enumerate(activations):
306
+ k = 3 - i
307
+ activations[i] = nearest_neighbor_upsample(act, 2**k)
308
+
309
+ # FUSION BLOCKS
310
+ layer_1_rn = self.scratch.layer1_rn(activations[0])
311
+ layer_2_rn = self.scratch.layer2_rn(activations[1])
312
+ layer_3_rn = self.scratch.layer3_rn(activations[2])
313
+ layer_4_rn = self.scratch.layer4_rn(activations[3])
314
+
315
+ # RESHAPE TIMESTEPS
316
+ if t.shape[0] == B:
317
+ t = t.unsqueeze(-1).repeat((1, N)).reshape(B * N)
318
+ elif t.shape[0] == 1 and B > 1:
319
+ t = t.repeat((B * N))
320
+ else:
321
+ assert False
322
+
323
+ path_4 = self.scratch.refinenet4(layer_4_rn, t=t)
324
+ path_3 = self.scratch.refinenet3(path_4, activation=layer_3_rn, t=t)
325
+ path_2 = self.scratch.refinenet2(path_3, activation=layer_2_rn, t=t)
326
+ path_1 = self.scratch.refinenet1(path_2, activation=layer_1_rn, t=t)
327
+
328
+ epsilon_pred = self.scratch.output_conv(path_1)
329
+ epsilon_pred = epsilon_pred.reshape((B, N, C, H, W))
330
+
331
+ return epsilon_pred, epsilon
diffusionsfm/model/dit.py ADDED
@@ -0,0 +1,428 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # --------------------------------------------------------
7
+ # References:
8
+ # GLIDE: https://github.com/openai/glide-text2im
9
+ # MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
10
+ # --------------------------------------------------------
11
+
12
+ import math
13
+
14
+ import ipdb # noqa: F401
15
+ import numpy as np
16
+ import torch
17
+ import torch.nn as nn
18
+ from timm.models.vision_transformer import Attention, Mlp, PatchEmbed
19
+ from diffusionsfm.model.memory_efficient_attention import MEAttention
20
+
21
+
22
+ def modulate(x, shift, scale):
23
+ return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
24
+
25
+
26
+ #################################################################################
27
+ # Embedding Layers for Timesteps and Class Labels #
28
+ #################################################################################
29
+
30
+
31
+ class TimestepEmbedder(nn.Module):
32
+ """
33
+ Embeds scalar timesteps into vector representations.
34
+ """
35
+
36
+ def __init__(self, hidden_size, frequency_embedding_size=256):
37
+ super().__init__()
38
+ self.mlp = nn.Sequential(
39
+ nn.Linear(frequency_embedding_size, hidden_size, bias=True),
40
+ nn.SiLU(),
41
+ nn.Linear(hidden_size, hidden_size, bias=True),
42
+ )
43
+ self.frequency_embedding_size = frequency_embedding_size
44
+
45
+ @staticmethod
46
+ def timestep_embedding(t, dim, max_period=10000):
47
+ """
48
+ Create sinusoidal timestep embeddings.
49
+ :param t: a 1-D Tensor of N indices, one per batch element.
50
+ These may be fractional.
51
+ :param dim: the dimension of the output.
52
+ :param max_period: controls the minimum frequency of the embeddings.
53
+ :return: an (N, D) Tensor of positional embeddings.
54
+ """
55
+ # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
56
+ half = dim // 2
57
+ freqs = torch.exp(
58
+ -math.log(max_period)
59
+ * torch.arange(start=0, end=half, dtype=torch.float32)
60
+ / half
61
+ ).to(device=t.device)
62
+ args = t[:, None].float() * freqs[None]
63
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
64
+ if dim % 2:
65
+ embedding = torch.cat(
66
+ [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
67
+ )
68
+ return embedding
69
+
70
+ def forward(self, t):
71
+ t_freq = self.timestep_embedding(t, self.frequency_embedding_size)
72
+ t_emb = self.mlp(t_freq)
73
+ return t_emb
74
+
75
+
76
+ #################################################################################
77
+ # Core DiT Model #
78
+ #################################################################################
79
+
80
+
81
+ class DiTBlock(nn.Module):
82
+ """
83
+ A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.
84
+ """
85
+
86
+ def __init__(
87
+ self,
88
+ hidden_size,
89
+ num_heads,
90
+ mlp_ratio=4.0,
91
+ use_xformers_attention=False,
92
+ **block_kwargs
93
+ ):
94
+ super().__init__()
95
+ self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
96
+ attn = MEAttention if use_xformers_attention else Attention
97
+ self.attn = attn(
98
+ hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs
99
+ )
100
+ self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
101
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
102
+
103
+ def approx_gelu():
104
+ return nn.GELU(approximate="tanh")
105
+
106
+ self.mlp = Mlp(
107
+ in_features=hidden_size,
108
+ hidden_features=mlp_hidden_dim,
109
+ act_layer=approx_gelu,
110
+ drop=0,
111
+ )
112
+ self.adaLN_modulation = nn.Sequential(
113
+ nn.SiLU(), nn.Linear(hidden_size, 6 * hidden_size, bias=True)
114
+ )
115
+
116
+ def forward(self, x, c):
117
+ (
118
+ shift_msa,
119
+ scale_msa,
120
+ gate_msa,
121
+ shift_mlp,
122
+ scale_mlp,
123
+ gate_mlp,
124
+ ) = self.adaLN_modulation(c).chunk(6, dim=1)
125
+ x = x + gate_msa.unsqueeze(1) * self.attn(
126
+ modulate(self.norm1(x), shift_msa, scale_msa)
127
+ )
128
+ x = x + gate_mlp.unsqueeze(1) * self.mlp(
129
+ modulate(self.norm2(x), shift_mlp, scale_mlp)
130
+ )
131
+ return x
132
+
133
+
134
+ class FinalLayer(nn.Module):
135
+ """
136
+ The final layer of DiT.
137
+ """
138
+
139
+ def __init__(self, hidden_size, patch_size, out_channels):
140
+ super().__init__()
141
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
142
+ self.linear = nn.Linear(
143
+ hidden_size, patch_size * patch_size * out_channels, bias=True
144
+ )
145
+ self.adaLN_modulation = nn.Sequential(
146
+ nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True)
147
+ )
148
+
149
+ def forward(self, x, c):
150
+ shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
151
+ x = modulate(self.norm_final(x), shift, scale)
152
+ x = self.linear(x)
153
+ return x
154
+
155
+
156
+ class DiT(nn.Module):
157
+ """
158
+ Diffusion model with a Transformer backbone.
159
+ """
160
+
161
+ def __init__(
162
+ self,
163
+ in_channels=442,
164
+ out_channels=6,
165
+ width=16,
166
+ hidden_size=1152,
167
+ depth=8,
168
+ num_heads=16,
169
+ mlp_ratio=4.0,
170
+ max_num_images=8,
171
+ P=1,
172
+ within_image=False,
173
+ ):
174
+ super().__init__()
175
+ self.num_heads = num_heads
176
+ self.in_channels = in_channels
177
+ self.out_channels = out_channels
178
+ self.width = width
179
+ self.hidden_size = hidden_size
180
+ self.max_num_images = max_num_images
181
+ self.P = P
182
+ self.within_image = within_image
183
+
184
+ # self.x_embedder = nn.Linear(in_channels, hidden_size)
185
+ # self.x_embedder = PatchEmbed(in_channels, hidden_size, kernel_size=P, hidden_size=P)
186
+ self.x_embedder = PatchEmbed(
187
+ img_size=self.width,
188
+ patch_size=self.P,
189
+ in_chans=in_channels,
190
+ embed_dim=hidden_size,
191
+ bias=True,
192
+ flatten=False,
193
+ )
194
+ self.x_pos_enc = FeaturePositionalEncoding(
195
+ max_num_images, hidden_size, width**2, P=self.P
196
+ )
197
+ self.t_embedder = TimestepEmbedder(hidden_size)
198
+
199
+ try:
200
+ import xformers
201
+
202
+ use_xformers_attention = True
203
+ except ImportError:
204
+ # xformers not available
205
+ use_xformers_attention = False
206
+
207
+ self.blocks = nn.ModuleList(
208
+ [
209
+ DiTBlock(
210
+ hidden_size,
211
+ num_heads,
212
+ mlp_ratio=mlp_ratio,
213
+ use_xformers_attention=use_xformers_attention,
214
+ )
215
+ for _ in range(depth)
216
+ ]
217
+ )
218
+ self.final_layer = FinalLayer(hidden_size, P, out_channels)
219
+ self.initialize_weights()
220
+
221
+ def initialize_weights(self):
222
+ # Initialize transformer layers:
223
+ def _basic_init(module):
224
+ if isinstance(module, nn.Linear):
225
+ torch.nn.init.xavier_uniform_(module.weight)
226
+ if module.bias is not None:
227
+ nn.init.constant_(module.bias, 0)
228
+
229
+ self.apply(_basic_init)
230
+
231
+ # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
232
+ w = self.x_embedder.proj.weight.data
233
+ nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
234
+ nn.init.constant_(self.x_embedder.proj.bias, 0)
235
+
236
+ # Initialize timestep embedding MLP:
237
+ nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
238
+ nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
239
+
240
+ # Zero-out adaLN modulation layers in DiT blocks:
241
+ for block in self.blocks:
242
+ nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
243
+ nn.init.constant_(block.adaLN_modulation[-1].bias, 0)
244
+
245
+ # Zero-out output layers:
246
+ # nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
247
+ # nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
248
+ # nn.init.constant_(self.final_layer.linear.weight, 0)
249
+ # nn.init.constant_(self.final_layer.linear.bias, 0)
250
+
251
+ def unpatchify(self, x):
252
+ """
253
+ x: (N, T, patch_size**2 * C)
254
+ imgs: (N, H, W, C)
255
+ """
256
+ c = self.out_channels
257
+ p = self.x_embedder.patch_size[0]
258
+ h = w = int(x.shape[1] ** 0.5)
259
+
260
+ # print("unpatchify", c, p, h, w, x.shape)
261
+ # assert h * w == x.shape[2]
262
+
263
+ x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
264
+ x = torch.einsum("nhwpqc->nhpwqc", x)
265
+ imgs = x.reshape(shape=(x.shape[0], h * p, h * p, c))
266
+ return imgs
267
+
268
+ def forward(
269
+ self,
270
+ x,
271
+ t,
272
+ return_dpt_activations=False,
273
+ multiview_unconditional=False,
274
+ ):
275
+ """
276
+
277
+ Args:
278
+ x: Image/Ray features (B, N, C, H, W).
279
+ t: Timesteps (B,), or a length-1 tensor that is broadcast across the batch.
280
+
281
+ Returns:
282
+ (B, N, D, H, W)
283
+ """
284
+ B, N, c, h, w = x.shape
285
+ P = self.P
286
+
287
+ x = x.reshape((B * N, c, h, w)) # (B * N, C, H, W)
288
+ x = self.x_embedder(x) # (B * N, C, H / P, W / P)
289
+
290
+ x = x.permute(0, 2, 3, 1) # (B * N, H / P, W / P, C)
291
+ # (B, N, H / P, W / P, C)
292
+ x = x.reshape((B, N, h // P, w // P, self.hidden_size))
293
+ x = self.x_pos_enc(x) # (B, N, H * W / P ** 2, C)
294
+ # TODO: fix positional encoding to work with (N, C, H, W) format.
295
+
296
+ # Eval time, we get a scalar t
297
+ if x.shape[0] != t.shape[0] and t.shape[0] == 1:
298
+ t = t.repeat_interleave(B)
299
+
300
+ if self.within_image or multiview_unconditional:
301
+ t_within = t.repeat_interleave(N)
302
+ t_within = self.t_embedder(t_within)
303
+
304
+ t = self.t_embedder(t)
305
+
306
+ dpt_activations = []
307
+ for i, block in enumerate(self.blocks):
308
+ # Within image block
309
+ if (self.within_image and i % 2 == 0) or multiview_unconditional:
310
+ x = x.reshape((B * N, h * w // P**2, self.hidden_size))
311
+ x = block(x, t_within)
312
+
313
+ # All patches block
314
+ # Final layer is an all patches layer
315
+ else:
316
+ x = x.reshape((B, N * h * w // P**2, self.hidden_size))
317
+ x = block(x, t) # (N, T, D)
318
+
319
+ if return_dpt_activations and i % 4 == 3:
320
+ x_prime = x.reshape(B, N, h, w, self.hidden_size)
321
+ x_prime = x.reshape(B * N, h, w, self.hidden_size)
322
+ x_prime = x_prime.permute((0, 3, 1, 2))
323
+ dpt_activations.append(x_prime)
324
+
325
+ # Reshape the output back to original shape
326
+ if multiview_unconditional:
327
+ x = x.reshape((B, N * h * w // P**2, self.hidden_size))
328
+
329
+ # (B, N * H * W / P ** 2, D)
330
+ x = self.final_layer(
331
+ x, t
332
+ ) # (B, N * H * W / P ** 2, 6 * P ** 2) or (N, T, patch_size ** 2 * out_channels)
333
+
334
+ x = x.reshape((B * N, w * w // P**2, self.out_channels * P**2))
335
+ x = self.unpatchify(x) # (B * N, H, W, C)
336
+ x = x.reshape((B, N) + x.shape[1:])
337
+ x = x.permute(0, 1, 4, 2, 3) # (B, N, C, H, W)
338
+
339
+ if return_dpt_activations:
340
+ return dpt_activations[:4]
341
+
342
+ return x
343
+
344
+
345
+ class FeaturePositionalEncoding(nn.Module):
346
+ def _get_sinusoid_encoding_table(self, n_position, d_hid, base):
347
+ """Sinusoid position encoding table"""
348
+
349
+ def get_position_angle_vec(position):
350
+ return [
351
+ position / np.power(base, 2 * (hid_j // 2) / d_hid)
352
+ for hid_j in range(d_hid)
353
+ ]
354
+
355
+ sinusoid_table = np.array(
356
+ [get_position_angle_vec(pos_i) for pos_i in range(n_position)]
357
+ )
358
+ sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2]) # dim 2i
359
+ sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2]) # dim 2i+1
360
+
361
+ return torch.FloatTensor(sinusoid_table).unsqueeze(0)
362
+
363
+ def __init__(self, max_num_images=8, feature_dim=1152, num_patches=256, P=1):
364
+ super().__init__()
365
+ self.max_num_images = max_num_images
366
+ self.feature_dim = feature_dim
367
+ self.P = P
368
+ self.num_patches = num_patches // self.P**2
369
+
370
+ self.register_buffer(
371
+ "image_pos_table",
372
+ self._get_sinusoid_encoding_table(
373
+ self.max_num_images, self.feature_dim, 10000
374
+ ),
375
+ )
376
+
377
+ self.register_buffer(
378
+ "token_pos_table",
379
+ self._get_sinusoid_encoding_table(
380
+ self.num_patches, self.feature_dim, 70007
381
+ ),
382
+ )
383
+
384
+ def forward(self, x):
385
+ batch_size = x.shape[0]
386
+ num_images = x.shape[1]
387
+
388
+ x = x.reshape(batch_size, num_images, self.num_patches, self.feature_dim)
389
+
390
+ # To encode image index
391
+ pe1 = self.image_pos_table[:, :num_images].clone().detach()
392
+ pe1 = pe1.reshape((1, num_images, 1, self.feature_dim))
393
+ pe1 = pe1.repeat((batch_size, 1, self.num_patches, 1))
394
+
395
+ # To encode patch index
396
+ pe2 = self.token_pos_table.clone().detach()
397
+ pe2 = pe2.reshape((1, 1, self.num_patches, self.feature_dim))
398
+ pe2 = pe2.repeat((batch_size, num_images, 1, 1))
399
+
400
+ x_pe = x + pe1 + pe2
401
+ x_pe = x_pe.reshape(
402
+ (batch_size, num_images * self.num_patches, self.feature_dim)
403
+ )
404
+
405
+ return x_pe
406
+
407
+ def forward_unet(self, x, B, N):
408
+ D = int(self.num_patches**0.5)
409
+
410
+ # x should be (B, N, T, D, D)
411
+ x = x.permute((0, 2, 3, 1))
412
+ x = x.reshape(B, N, self.num_patches, self.feature_dim)
413
+
414
+ # To encode image index
415
+ pe1 = self.image_pos_table[:, :N].clone().detach()
416
+ pe1 = pe1.reshape((1, N, 1, self.feature_dim))
417
+ pe1 = pe1.repeat((B, 1, self.num_patches, 1))
418
+
419
+ # To encode patch index
420
+ pe2 = self.token_pos_table.clone().detach()
421
+ pe2 = pe2.reshape((1, 1, self.num_patches, self.feature_dim))
422
+ pe2 = pe2.repeat((B, N, 1, 1))
423
+
424
+ x_pe = x + pe1 + pe2
425
+ x_pe = x_pe.reshape((B * N, D, D, self.feature_dim))
426
+ x_pe = x_pe.permute((0, 3, 1, 2))
427
+
428
+ return x_pe
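
A standalone shape check for the DiT backbone; 770 input channels correspond to the concatenation used by the diffusers above (2 x 384 DINO feature channels + 2 NDC channels), and the depth is kept small for the sketch. The xformers attention path, when available, expects CUDA tensors:

    import torch
    from diffusionsfm.model.dit import DiT

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    dit = DiT(in_channels=770, out_channels=6, width=16, hidden_size=1152, depth=2, max_num_images=8).to(device)

    x = torch.randn(1, 2, 770, 16, 16, device=device)  # (B, N, C, H, W) scene features
    t = torch.tensor([10], device=device)              # one timestep per batch element
    out = dit(x, t)
    print(out.shape)                                   # torch.Size([1, 2, 6, 16, 16])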
diffusionsfm/model/feature_extractors.py ADDED
@@ -0,0 +1,176 @@
1
+ import importlib
2
+ import os
3
+ import socket
4
+ import sys
5
+
6
+ import ipdb # noqa: F401
7
+ import torch
8
+ import torch.nn as nn
9
+ from omegaconf import OmegaConf
10
+
11
+ HOSTNAME = socket.gethostname()
12
+
13
+ if "trinity" in HOSTNAME:
14
+ # Might be outdated
15
+ config_path = "/home/amylin2/latent-diffusion/configs/autoencoder/autoencoder_kl_16x16x16.yaml"
16
+ weights_path = "/home/amylin2/latent-diffusion/model.ckpt"
17
+ elif "grogu" in HOSTNAME:
18
+ # Might be outdated
19
+ config_path = "/home/jasonzh2/code/latent-diffusion/configs/autoencoder/autoencoder_kl_16x16x16.yaml"
20
+ weights_path = "/home/jasonzh2/code/latent-diffusion/model.ckpt"
21
+ elif "ender" in HOSTNAME:
22
+ config_path = "/home/jason/ray_diffusion/external/latent-diffusion/configs/autoencoder/autoencoder_kl_16x16x16.yaml"
23
+ weights_path = "/home/jason/ray_diffusion/external/latent-diffusion/model.ckpt"
24
+ else:
25
+ config_path = None
26
+ weights_path = None
27
+
28
+
29
+ if weights_path is not None:
30
+ LDM_PATH = os.path.dirname(weights_path)
31
+ if LDM_PATH not in sys.path:
32
+ sys.path.append(LDM_PATH)
33
+
34
+
35
+ def resize(image, size=None, scale_factor=None):
36
+ return nn.functional.interpolate(
37
+ image,
38
+ size=size,
39
+ scale_factor=scale_factor,
40
+ mode="bilinear",
41
+ align_corners=False,
42
+ )
43
+
44
+
45
+ def instantiate_from_config(config):
46
+ if "target" not in config:
47
+ if config == "__is_first_stage__":
48
+ return None
49
+ elif config == "__is_unconditional__":
50
+ return None
51
+ raise KeyError("Expected key `target` to instantiate.")
52
+ return get_obj_from_str(config["target"])(**config.get("params", dict()))
53
+
54
+
55
+ def get_obj_from_str(string, reload=False):
56
+ module, cls = string.rsplit(".", 1)
57
+ if reload:
58
+ module_imp = importlib.import_module(module)
59
+ importlib.reload(module_imp)
60
+ return getattr(importlib.import_module(module, package=None), cls)
61
+
62
+
63
+ class PretrainedVAE(nn.Module):
64
+ def __init__(self, freeze_weights=True, num_patches_x=16, num_patches_y=16):
65
+ super().__init__()
66
+ config = OmegaConf.load(config_path)
67
+ self.model = instantiate_from_config(config.model)
68
+ self.model.init_from_ckpt(weights_path)
69
+ self.model.eval()
70
+ self.feature_dim = 16
71
+ self.num_patches_x = num_patches_x
72
+ self.num_patches_y = num_patches_y
73
+
74
+ if freeze_weights:
75
+ for param in self.model.parameters():
76
+ param.requires_grad = False
77
+
78
+ def forward(self, x, autoresize=False):
79
+ """
80
+ Spatial dimensions of output will be H // 16, W // 16. If autoresize is True,
81
+ then the input will be resized such that the output feature map is the correct
82
+ dimensions.
83
+
84
+ Args:
85
+ x (torch.Tensor): Images (B, C, H, W). Should be normalized to be [-1, 1].
86
+ autoresize (bool): Whether to resize the input to match the num_patch
87
+ dimensions.
88
+
89
+ Returns:
90
+ torch.Tensor: Latent sample (B, 16, h, w)
91
+ """
92
+
93
+ *B, c, h, w = x.shape
94
+ x = x.reshape(-1, c, h, w)
95
+ if autoresize:
96
+ new_w = self.num_patches_x * 16
97
+ new_h = self.num_patches_y * 16
98
+ x = resize(x, size=(new_h, new_w))
99
+
100
+ decoded, latent = self.model(x)
101
+ # Note: feature_dim, num_patches_y, and num_patches_x all default to 16; the layout here is (c, h, w)
102
+ latent_sample = latent.sample().reshape(
103
+ *B, self.feature_dim, self.num_patches_y, self.num_patches_x
104
+ )
105
+ return latent_sample
106
+
107
+
108
+ activations = {}
109
+
110
+
111
+ def get_activation(name):
112
+ def hook(model, input, output):
113
+ activations[name] = output
114
+
115
+ return hook
116
+
117
+
118
+ class SpatialDino(nn.Module):
119
+ def __init__(
120
+ self,
121
+ freeze_weights=True,
122
+ model_type="dinov2_vits14",
123
+ num_patches_x=16,
124
+ num_patches_y=16,
125
+ activation_hooks=False,
126
+ ):
127
+ super().__init__()
128
+ self.model = torch.hub.load("facebookresearch/dinov2", model_type)
129
+ self.feature_dim = self.model.embed_dim
130
+ self.num_patches_x = num_patches_x
131
+ self.num_patches_y = num_patches_y
132
+ if freeze_weights:
133
+ for param in self.model.parameters():
134
+ param.requires_grad = False
135
+
136
+ self.activation_hooks = activation_hooks
137
+
138
+ if self.activation_hooks:
139
+ self.model.blocks[5].register_forward_hook(get_activation("encoder1"))
140
+ self.model.blocks[11].register_forward_hook(get_activation("encoder2"))
141
+ self.activations = activations
142
+
143
+ def forward(self, x, autoresize=False):
144
+ """
145
+ Spatial dimensions of output will be H // 14, W // 14. If autoresize is True,
146
+ then the output will be resized to the correct dimensions.
147
+
148
+ Args:
149
+ x (torch.Tensor): Images (B, C, H, W). Should be ImageNet normalized.
150
+ autoresize (bool): Whether to resize the input to match the num_patch
151
+ dimensions.
152
+
153
+ Returns:
154
+ feature_map (torch.tensor): (B, C, h, w)
155
+ """
156
+ *B, c, h, w = x.shape
157
+
158
+ x = x.reshape(-1, c, h, w)
159
+ # if autoresize:
160
+ # new_w = self.num_patches_x * 14
161
+ # new_h = self.num_patches_y * 14
162
+ # x = resize(x, size=(new_h, new_w))
163
+
164
+ # Output will be (B, H * W, C)
165
+ features = self.model.forward_features(x)["x_norm_patchtokens"]
166
+ features = features.permute(0, 2, 1)
167
+ features = features.reshape( # (B, C, H, W)
168
+ -1, self.feature_dim, h // 14, w // 14
169
+ )
170
+ if autoresize:
171
+ features = resize(features, size=(self.num_patches_y, self.num_patches_x))
172
+
173
+ features = features.reshape(
174
+ *B, self.feature_dim, self.num_patches_y, self.num_patches_x
175
+ )
176
+ return features
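
A quick usage sketch for the DINOv2 extractor above (not part of the upload): the batch shape, the 224x224 resolution, and the assumption that the input is already ImageNet-normalized are illustration choices on my part, and the first call downloads the dinov2_vits14 weights through torch.hub.

# Usage sketch for SpatialDino (illustrative only; shapes and sizes are assumed).
import torch

from diffusionsfm.model.feature_extractors import SpatialDino

extractor = SpatialDino(freeze_weights=True, num_patches_x=16, num_patches_y=16)
extractor.eval()

# A batch of 2 scenes x 3 views of ImageNet-normalized 224x224 images.
images = torch.randn(2, 3, 3, 224, 224)

with torch.no_grad():
    features = extractor(images, autoresize=True)

# dinov2_vits14 has embed_dim 384 and 224 // 14 = 16 patches per side,
# so this prints torch.Size([2, 3, 384, 16, 16]).
print(features.shape)
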
diffusionsfm/model/memory_efficient_attention.py ADDED
@@ -0,0 +1,51 @@
1
+ import ipdb  # noqa: F401
2
+ import torch.nn as nn
3
+ from xformers.ops import memory_efficient_attention
4
+
5
+
6
+ class MEAttention(nn.Module):
7
+ def __init__(
8
+ self,
9
+ dim,
10
+ num_heads=8,
11
+ qkv_bias=False,
12
+ qk_norm=False,
13
+ attn_drop=0.0,
14
+ proj_drop=0.0,
15
+ norm_layer=nn.LayerNorm,
16
+ ):
17
+ super().__init__()
18
+ assert dim % num_heads == 0, "dim should be divisible by num_heads"
19
+ self.num_heads = num_heads
20
+ self.head_dim = dim // num_heads
21
+ self.scale = self.head_dim**-0.5
22
+
23
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
24
+ self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
25
+ self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
26
+ self.attn_drop = nn.Dropout(attn_drop)
27
+ self.proj = nn.Linear(dim, dim)
28
+ self.proj_drop = nn.Dropout(proj_drop)
29
+
30
+ def forward(self, x):
31
+ B, N, C = x.shape
32
+ qkv = (
33
+ self.qkv(x)
34
+ .reshape(B, N, 3, self.num_heads, self.head_dim)
35
+ .permute(2, 0, 3, 1, 4)
36
+ )
37
+ q, k, v = qkv.unbind(0)
38
+ q, k = self.q_norm(q), self.k_norm(k)
39
+
40
+ # MEA expects [B, N, H, D], whereas timm uses [B, H, N, D]
41
+ x = memory_efficient_attention(
42
+ q.transpose(1, 2),
43
+ k.transpose(1, 2),
44
+ v.transpose(1, 2),
45
+ scale=self.scale,
46
+ )
47
+ x = x.reshape(B, N, C)
48
+
49
+ x = self.proj(x)
50
+ x = self.proj_drop(x)
51
+ return x
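
A rough smoke test for the block above; xformers' memory_efficient_attention kernels expect CUDA tensors, so this sketch assumes a GPU is available, and the width, sequence length, and dtype are arbitrary choices.

# Smoke-test sketch for MEAttention (assumes xformers and a CUDA device).
import torch

from diffusionsfm.model.memory_efficient_attention import MEAttention

if torch.cuda.is_available():
    attn = MEAttention(dim=384, num_heads=8, qkv_bias=True).cuda().half()
    tokens = torch.randn(2, 256, 384, device="cuda", dtype=torch.float16)  # (B, N, C)
    out = attn(tokens)
    assert out.shape == tokens.shape  # attention preserves the token shape
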
diffusionsfm/model/scheduler.py ADDED
@@ -0,0 +1,128 @@
1
+ import ipdb # noqa: F401
2
+ import matplotlib.pyplot as plt
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+
7
+ from diffusionsfm.utils.visualization import plot_to_image
8
+
9
+
10
+ class NoiseScheduler(nn.Module):
11
+ def __init__(
12
+ self,
13
+ max_timesteps=1000,
14
+ beta_start=0.0001,
15
+ beta_end=0.02,
16
+ cos_power=2,
17
+ num_inference_steps=100,
18
+ type="linear",
19
+ ):
20
+ super().__init__()
21
+ self.max_timesteps = max_timesteps
22
+ self.num_inference_steps = num_inference_steps
23
+ self.beta_start = beta_start
24
+ self.beta_end = beta_end
25
+ self.cos_power = cos_power
26
+ self.type = type
27
+
28
+ if type == "linear":
29
+ self.register_linear_schedule()
30
+ elif type == "cosine":
31
+ self.register_cosine_schedule(cos_power)
32
+ elif type == "scaled_linear":
33
+ self.register_scaled_linear_schedule()
34
+
35
+ self.inference_timesteps = self.compute_inference_timesteps()
36
+
37
+ def register_linear_schedule(self):
38
+ # zero terminal SNR (https://arxiv.org/pdf/2305.08891)
39
+ betas = torch.linspace(
40
+ self.beta_start,
41
+ self.beta_end,
42
+ self.max_timesteps,
43
+ dtype=torch.float32,
44
+ )
45
+ alphas = 1.0 - betas
46
+ alphas_cumprod = torch.cumprod(alphas, dim=0)
47
+ alphas_bar_sqrt = alphas_cumprod.sqrt()
48
+
49
+ alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
50
+ alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
51
+
52
+ alphas_bar_sqrt -= alphas_bar_sqrt_T
53
+ alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
54
+
55
+ alphas_bar = alphas_bar_sqrt**2
56
+ alphas = alphas_bar[1:] / alphas_bar[:-1]
57
+ alphas = torch.cat([alphas_bar[0:1], alphas])
58
+ betas = 1 - alphas
59
+
60
+ self.register_buffer(
61
+ "betas",
62
+ betas,
63
+ )
64
+ self.register_buffer("alphas", 1.0 - self.betas)
65
+ self.register_buffer("alphas_cumprod", torch.cumprod(self.alphas, dim=0))
66
+
67
+ def register_cosine_schedule(self, cos_power, s=0.008):
68
+ timesteps = (
69
+ torch.arange(self.max_timesteps + 1, dtype=torch.float32)
70
+ / self.max_timesteps
71
+ )
72
+ alpha_bars = (timesteps + s) / (1 + s) * np.pi / 2
73
+ alpha_bars = torch.cos(alpha_bars).pow(cos_power)
74
+ alpha_bars = alpha_bars / alpha_bars[0]
75
+ betas = 1 - alpha_bars[1:] / alpha_bars[:-1]
76
+ betas = torch.clamp(betas, min=0.0, max=0.999)
77
+
78
+ self.register_buffer(
79
+ "betas",
80
+ betas,
81
+ )
82
+ self.register_buffer("alphas", 1.0 - betas)
83
+ self.register_buffer("alphas_cumprod", torch.cumprod(self.alphas, dim=0))
84
+
85
+ def register_scaled_linear_schedule(self):
86
+ self.register_buffer(
87
+ "betas",
88
+ torch.linspace(
89
+ self.beta_start**0.5,
90
+ self.beta_end**0.5,
91
+ self.max_timesteps,
92
+ dtype=torch.float32,
93
+ )
94
+ ** 2,
95
+ )
96
+ self.register_buffer("alphas", 1.0 - self.betas)
97
+ self.register_buffer("alphas_cumprod", torch.cumprod(self.alphas, dim=0))
98
+
99
+ def compute_inference_timesteps(
100
+ self, num_inference_steps=None, num_train_steps=None
101
+ ):
102
+ # based on diffusers's scheduling code
103
+ if num_inference_steps is None:
104
+ num_inference_steps = self.num_inference_steps
105
+ if num_train_steps is None:
106
+ num_train_steps = self.max_timesteps
107
+ step_ratio = num_train_steps // num_inference_steps
108
+ timesteps = (
109
+ (np.arange(0, num_inference_steps) * step_ratio).round()[::-1].astype(int)
110
+ )
111
+ return timesteps
112
+
113
+ def plot_schedule(self, return_image=False):
114
+ fig = plt.figure(figsize=(6, 4), dpi=100)
115
+ alpha_bars = self.alphas_cumprod.cpu().numpy()
116
+ plt.plot(np.sqrt(alpha_bars))
117
+ plt.grid()
118
+ if self.type == "linear":
119
+ plt.title(
120
+ f"Linear (T={self.max_timesteps}, S={self.beta_start}, E={self.beta_end})"
121
+ )
122
+ else:
+ # cosine or scaled_linear schedule
+ plt.title(f"Cosine (T={self.max_timesteps}, P={self.cos_power})")
125
+ if return_image:
126
+ image = plot_to_image(fig)
127
+ plt.close(fig)
128
+ return image
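
The scheduler only registers the beta/alpha buffers; the actual noising step lives in the training code elsewhere in this upload. For reference, the standard DDPM forward process written against the alphas_cumprod buffer would look roughly like the sketch below (shapes are arbitrary; this is not the repo's training loop).

# Sketch: sample x_t ~ q(x_t | x_0) using NoiseScheduler's registered buffers.
import torch

from diffusionsfm.model.scheduler import NoiseScheduler

scheduler = NoiseScheduler(max_timesteps=1000, num_inference_steps=100)  # linear schedule

x0 = torch.randn(4, 6, 16, 16)                       # clean targets (shape is arbitrary here)
t = torch.randint(0, scheduler.max_timesteps, (4,))  # one timestep per sample
noise = torch.randn_like(x0)

alpha_bar = scheduler.alphas_cumprod[t].view(-1, 1, 1, 1)
x_t = alpha_bar.sqrt() * x0 + (1.0 - alpha_bar).sqrt() * noise

# Inference uses the subsampled timesteps computed in the constructor.
print(scheduler.inference_timesteps[:5])
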
diffusionsfm/utils/__init__.py ADDED
File without changes
diffusionsfm/utils/configs.py ADDED
@@ -0,0 +1,66 @@
1
+ import argparse
2
+ import os
3
+
4
+ from omegaconf import OmegaConf
5
+
6
+
7
+ def load_cfg(config_path):
8
+ """
9
+ Loads a yaml configuration file.
10
+
11
+ Follows the chain of yaml configuration files that have a `_BASE` key, and updates
12
+ the new keys accordingly. _BASE configurations can be specified using relative
13
+ paths.
14
+ """
15
+ config_dir = os.path.dirname(config_path)
16
+ config_path = os.path.basename(config_path)
17
+ return load_cfg_recursive(config_dir, config_path)
18
+
19
+
20
+ def load_cfg_recursive(config_dir, config_path):
21
+ """
22
+ Recursively loads config files.
23
+
24
+ Follows the chain of yaml configuration files that have a `_BASE` key, and updates
25
+ the new keys accordingly. _BASE configurations can be specified using relative
26
+ paths.
27
+ """
28
+ cfg = OmegaConf.load(os.path.join(config_dir, config_path))
29
+ base_path = OmegaConf.select(cfg, "_BASE", default=None)
30
+ if base_path is not None:
31
+ base_cfg = load_cfg_recursive(config_dir, base_path)
32
+ cfg = OmegaConf.merge(base_cfg, cfg)
33
+ return cfg
34
+
35
+
36
+ def get_cfg():
37
+ parser = argparse.ArgumentParser()
38
+ parser.add_argument("--config-path", type=str, required=True)
39
+ args = parser.parse_args()
40
+ cfg = load_cfg(args.config_path)
41
+ print(OmegaConf.to_yaml(cfg))
42
+
43
+ exp_dir = os.path.join(cfg.training.runs_dir, cfg.training.exp_tag)
44
+ os.makedirs(exp_dir, exist_ok=True)
45
+ to_path = os.path.join(exp_dir, os.path.basename(args.config_path))
46
+ if not os.path.exists(to_path):
47
+ OmegaConf.save(config=cfg, f=to_path)
48
+ return cfg
49
+
50
+
51
+ def get_cfg_from_path(config_path):
52
+ """
53
+ Args:
+ config_path: Path to the yaml configuration file.
55
+ """
56
+ print("getting config from path")
57
+
58
+ cfg = load_cfg(config_path)
59
+ print(OmegaConf.to_yaml(cfg))
60
+
61
+ exp_dir = os.path.join(cfg.training.runs_dir, cfg.training.exp_tag)
62
+ os.makedirs(exp_dir, exist_ok=True)
63
+ to_path = os.path.join(exp_dir, os.path.basename(config_path))
64
+ if not os.path.exists(to_path):
65
+ OmegaConf.save(config=cfg, f=to_path)
66
+ return cfg
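
To make the `_BASE` chaining concrete, here is a hypothetical pair of configs and how load_cfg would resolve them; the file names and keys below are invented for the example.

# Hypothetical _BASE chain (file names and keys are made up):
#
#   configs/base.yaml
#       training:
#         runs_dir: runs
#         batch_size: 8
#
#   configs/exp_small.yaml
#       _BASE: base.yaml          # resolved relative to configs/
#       training:
#         exp_tag: exp_small
#         batch_size: 4
#
from diffusionsfm.utils.configs import load_cfg

cfg = load_cfg("configs/exp_small.yaml")
# The child config is merged over the base, so:
#   cfg.training.batch_size == 4
#   cfg.training.runs_dir == "runs"
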
diffusionsfm/utils/distortion.py ADDED
@@ -0,0 +1,144 @@
1
+ import cv2
2
+ import ipdb  # noqa: F401
3
+ import numpy as np
4
+ from PIL import Image
5
+ import torch
6
+
7
+
8
+ # https://gist.github.com/davegreenwood/820d51ac5ec88a2aeda28d3079e7d9eb
9
+ def apply_distortion(pts, k1, k2):
10
+ """
11
+ Arguments:
12
+ pts (N x 2): numpy array in NDC coordinates
13
+ k1, k2 distortion coefficients
14
+ Return:
15
+ pts (N x 2): distorted points in NDC coordinates
16
+ """
17
+ r2 = np.square(pts).sum(-1)
18
+ f = 1 + k1 * r2 + k2 * r2**2
19
+ return f[..., None] * pts
20
+
21
+
22
+ # https://gist.github.com/davegreenwood/820d51ac5ec88a2aeda28d3079e7d9eb
23
+ def apply_distortion_tensor(pts, k1, k2):
24
+ """
25
+ Arguments:
26
+ pts (N x 2): torch tensor in NDC coordinates
27
+ k1, k2 distortion coefficients
28
+ Return:
29
+ pts (N x 2): distorted points in NDC coordinates
30
+ """
31
+ r2 = torch.square(pts).sum(-1)
32
+ f = 1 + k1 * r2 + k2 * r2**2
33
+ return f[..., None] * pts
34
+
35
+
36
+ # https://gist.github.com/davegreenwood/820d51ac5ec88a2aeda28d3079e7d9eb
37
+ def remove_distortion_iter(points, k1, k2):
38
+ """
39
+ Arguments:
40
+ pts (N x 2): distorted points (numpy array) in NDC coordinates
41
+ k1, k2 distortion coefficients
42
+ Return:
43
+ pts (N x 2): undistorted points in NDC coordinates
44
+ """
45
+ pts = ptsd = points
46
+ for _ in range(5):
47
+ r2 = np.square(pts).sum(-1)
48
+ f = 1 + k1 * r2 + k2 * r2**2
49
+ pts = ptsd / f[..., None]
50
+
51
+ return pts
52
+
53
+
54
+ def make_square(im, fill_color=(0, 0, 0)):
55
+ x, y = im.size
56
+ size = max(x, y)
57
+ new_im = Image.new("RGB", (size, size), fill_color)
58
+ corner = (int((size - x) / 2), int((size - y) / 2))
59
+ new_im.paste(im, corner)
60
+ return new_im, corner
61
+
62
+
63
+ def pixel_to_ndc(coords, image_size):
64
+ """
65
+ Converts pixel coordinates to normalized device coordinates (Pytorch3D convention
66
+ with upper left = (1, 1)) for a square image.
67
+
68
+ Args:
69
+ coords: Pixel coordinates UL=(0, 0), LR=(image_size, image_size).
70
+ image_size (int): Image size.
71
+
72
+ Returns:
73
+ NDC coordinates UL=(1, 1) LR=(-1, -1).
74
+ """
75
+ coords = np.array(coords)
76
+ return 1 - coords / image_size * 2
77
+
78
+
79
+ def ndc_to_pixel(coords, image_size):
80
+ """
81
+ Converts normalized device coordinates to pixel coordinates for a square image.
82
+ """
83
+ num_points = coords.shape[0]
84
+ sizes = np.tile(np.array(image_size, dtype=np.float32)[None, ...], (num_points, 1))
85
+
86
+ coords = np.array(coords, dtype=np.float32)
87
+ return (1 - coords) * sizes / 2
88
+
89
+
90
+ def distort_image(image, bbox, k1, k2, modify_bbox=False):
91
+ # We want to operate in -1 to 1 space using the padded square of the original image
92
+ image, corner = make_square(image)
93
+ bbox[:2] += np.array(corner)
94
+ bbox[2:] += np.array(corner)
95
+
96
+ # Construct grid points
97
+ x = np.linspace(1, -1, image.width, dtype=np.float32)
98
+ y = np.linspace(1, -1, image.height, dtype=np.float32)
99
+ x, y = np.meshgrid(x, y, indexing="xy")
100
+ xy_grid = np.stack((x, y), axis=-1)
101
+ points = xy_grid.reshape((image.height * image.width, 2))
102
+ new_points = ndc_to_pixel(apply_distortion(points, k1, k2), image.size)
103
+
104
+ # Distort image by remapping
105
+ map_x = new_points[:, 0].reshape((image.height, image.width))
106
+ map_y = new_points[:, 1].reshape((image.height, image.width))
107
+ distorted = cv2.remap(
108
+ np.asarray(image),
109
+ map_x,
110
+ map_y,
111
+ cv2.INTER_LINEAR,
112
+ )
113
+ distorted = Image.fromarray(distorted)
114
+
115
+ # Find distorted crop bounds - inverse process of above
116
+ if modify_bbox:
117
+ center = (bbox[:2] + bbox[2:]) / 2
118
+ top, bottom = (bbox[0], center[1]), (bbox[2], center[1])
119
+ left, right = (center[0], bbox[1]), (center[0], bbox[3])
120
+ bbox_points = np.array(
121
+ [
122
+ pixel_to_ndc(top, image.size),
123
+ pixel_to_ndc(left, image.size),
124
+ pixel_to_ndc(bottom, image.size),
125
+ pixel_to_ndc(right, image.size),
126
+ ],
127
+ dtype=np.float32,
128
+ )
129
+ else:
130
+ bbox_points = np.array(
131
+ [pixel_to_ndc(bbox[:2], image.size), pixel_to_ndc(bbox[2:], image.size)],
132
+ dtype=np.float32,
133
+ )
134
+
135
+ # Inverse mapping
136
+ distorted_bbox = remove_distortion_iter(bbox_points, k1, k2)
137
+
138
+ if modify_bbox:
139
+ p = ndc_to_pixel(distorted_bbox, image.size)
140
+ distorted_bbox = np.array([p[0][0], p[1][1], p[2][0], p[3][1]])
141
+ else:
142
+ distorted_bbox = ndc_to_pixel(distorted_bbox, image.size).reshape(4)
143
+
144
+ return distorted, distorted_bbox
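
A small usage sketch for distort_image; the image path, crop box, and radial coefficients are placeholders. Note that the function shifts the bbox in place when padding the image to a square, which is why a copy is passed.

# Placeholder usage of distort_image (path, bbox, and k1/k2 are made up).
import numpy as np
from PIL import Image

from diffusionsfm.utils.distortion import distort_image

image = Image.open("example.jpg").convert("RGB")
bbox = np.array([50.0, 60.0, 250.0, 260.0])  # (x0, y0, x1, y1) crop in pixels

distorted, distorted_bbox = distort_image(image, bbox.copy(), k1=-0.2, k2=0.05)
distorted.save("example_distorted.jpg")
print(distorted_bbox)  # the crop box mapped into the distorted image
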
diffusionsfm/utils/distributed.py ADDED
@@ -0,0 +1,31 @@
1
+ import os
2
+ import socket
3
+ from contextlib import closing
4
+
5
+ import torch.distributed as dist
6
+
7
+
8
+ def get_open_port():
9
+ with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
10
+ s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+ s.bind(("", 0))
12
+ return s.getsockname()[1]
13
+
14
+
15
+ # Distributed process group
16
+ def ddp_setup(rank, world_size, port="12345"):
17
+ """
18
+ Args:
19
+ rank: Unique identifier of the current process.
+ world_size: Total number of processes.
21
+ """
22
+ os.environ["MASTER_ADDR"] = "localhost"
23
+ print(f"MasterPort: {str(port)}")
24
+ os.environ["MASTER_PORT"] = str(port)
25
+
26
+ # initialize the process group
27
+ dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)
28
+
29
+
30
+ def cleanup():
31
+ dist.destroy_process_group()
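
These helpers are intended to be used with torch.multiprocessing; a minimal launch sketch follows (the worker body is a stub, and one GPU per process is assumed).

# Minimal DDP launch sketch using the helpers above (worker body is a stub).
import torch
import torch.multiprocessing as mp

from diffusionsfm.utils.distributed import cleanup, ddp_setup, get_open_port


def worker(rank, world_size, port):
    ddp_setup(rank, world_size, port=port)
    torch.cuda.set_device(rank)
    # ... build the model, wrap it with DistributedDataParallel, run training ...
    cleanup()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    mp.spawn(worker, args=(world_size, get_open_port()), nprocs=world_size)
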
diffusionsfm/utils/geometry.py ADDED
@@ -0,0 +1,145 @@
1
+ import numpy as np
2
+ import torch
3
+ from pytorch3d.renderer import FoVPerspectiveCameras
4
+ from pytorch3d.transforms import quaternion_to_matrix
5
+
6
+
7
+ def generate_random_rotations(N=1, device="cpu"):
8
+ q = torch.randn(N, 4, device=device)
9
+ q = q / q.norm(dim=-1, keepdim=True)
10
+ return quaternion_to_matrix(q)
11
+
12
+
13
+ def symmetric_orthogonalization(x):
14
+ """Maps 9D input vectors onto SO(3) via symmetric orthogonalization.
15
+
16
+ x: should have size [batch_size, 9]
17
+
18
+ Output has size [batch_size, 3, 3], where each inner 3x3 matrix is in SO(3).
19
+ """
20
+ m = x.view(-1, 3, 3)
21
+ u, s, v = torch.svd(m)
22
+ vt = torch.transpose(v, 1, 2)
23
+ det = torch.det(torch.matmul(u, vt))
24
+ det = det.view(-1, 1, 1)
25
+ vt = torch.cat((vt[:, :2, :], vt[:, -1:, :] * det), 1)
26
+ r = torch.matmul(u, vt)
27
+ return r
28
+
29
+
30
+ def get_permutations(num_images):
31
+ permutations = []
32
+ for i in range(0, num_images):
33
+ for j in range(0, num_images):
34
+ if i != j:
35
+ permutations.append((j, i))
36
+
37
+ return permutations
38
+
39
+
40
+ def n_to_np_rotations(num_frames, n_rots):
41
+ R_pred_rel = []
42
+ permutations = get_permutations(num_frames)
43
+ for i, j in permutations:
44
+ R_pred_rel.append(n_rots[i].T @ n_rots[j])
45
+ R_pred_rel = torch.stack(R_pred_rel)
46
+
47
+ return R_pred_rel
48
+
49
+
50
+ def compute_angular_error_batch(rotation1, rotation2):
51
+ R_rel = np.einsum("Bij,Bjk ->Bik", rotation2, rotation1.transpose(0, 2, 1))
52
+ t = (np.trace(R_rel, axis1=1, axis2=2) - 1) / 2
53
+ theta = np.arccos(np.clip(t, -1, 1))
54
+ return theta * 180 / np.pi
55
+
56
+
57
+ # A should be GT, B should be predicted
58
+ def compute_optimal_alignment(A, B):
59
+ """
60
+ Compute the optimal scale s, rotation R, and translation t that minimizes:
61
+ || A - (s * B @ R + T) || ^ 2
62
+
63
+ Reference: Umeyama (TPAMI 91)
64
+
65
+ Args:
66
+ A (torch.Tensor): (N, 3).
67
+ B (torch.Tensor): (N, 3).
68
+
69
+ Returns:
70
+ s (float): scale.
71
+ R (torch.Tensor): rotation matrix (3, 3).
72
+ t (torch.Tensor): translation (3,).
73
+ """
74
+ A_bar = A.mean(0)
75
+ B_bar = B.mean(0)
76
+ # normally with R @ B, this would be A @ B.T
77
+ H = (B - B_bar).T @ (A - A_bar)
78
+ U, S, Vh = torch.linalg.svd(H, full_matrices=True)
79
+ s = torch.linalg.det(U @ Vh)
80
+ S_prime = torch.diag(torch.tensor([1, 1, torch.sign(s)], device=A.device))
81
+ variance = torch.sum((B - B_bar) ** 2)
82
+ scale = 1 / variance * torch.trace(torch.diag(S) @ S_prime)
83
+ R = U @ S_prime @ Vh
84
+ t = A_bar - scale * B_bar @ R
85
+
86
+ A_hat = scale * B @ R + t
87
+ return A_hat, scale, R, t
88
+
89
+
90
+ def compute_optimal_translation_alignment(T_A, T_B, R_B):
91
+ """
92
+ Assuming right-multiplied rotation matrices.
93
+
94
+ E.g., for world2cam R and T, a world coordinate is transformed to camera coordinate
95
+ system using X_cam = X_world.T @ R + T = R.T @ X_world + T
96
+
97
+ Finds s, t that minimizes || T_A - (s * T_B + R_B.T @ t) ||^2
98
+
99
+ Args:
100
+ T_A (torch.Tensor): Target translation (N, 3).
101
+ T_B (torch.Tensor): Initial translation (N, 3).
102
+ R_B (torch.Tensor): Initial rotation (N, 3, 3).
103
+
104
+ Returns:
105
+ T_A_hat (torch.Tensor): s * T_B + t @ R_B (N, 3).
106
+ scale s (torch.Tensor): (1,).
107
+ translation t (torch.Tensor): (1, 3).
108
+ """
109
+ n = len(T_A)
110
+
111
+ T_A = T_A.unsqueeze(2)
112
+ T_B = T_B.unsqueeze(2)
113
+
114
+ A = torch.sum(T_B * T_A)
115
+ B = (T_B.transpose(1, 2) @ R_B.transpose(1, 2)).sum(0) @ (R_B @ T_A).sum(0) / n
116
+ C = torch.sum(T_B * T_B)
117
+ D = (T_B.transpose(1, 2) @ R_B.transpose(1, 2)).sum(0)
118
+ E = (D * D).sum() / n
119
+
120
+ s = (A - B.sum()) / (C - E.sum())
121
+
122
+ t = (R_B @ (T_A - s * T_B)).sum(0) / n
123
+
124
+ T_A_hat = s * T_B + R_B.transpose(1, 2) @ t
125
+
126
+ return T_A_hat.squeeze(2), s, t.transpose(1, 0)
127
+
128
+
129
+ def get_error(predict_rotations, R_pred, T_pred, R_gt, T_gt, gt_scene_scale):
130
+ if predict_rotations:
131
+ cameras_gt = FoVPerspectiveCameras(R=R_gt, T=T_gt)
132
+ cc_gt = cameras_gt.get_camera_center()
133
+ cameras_pred = FoVPerspectiveCameras(R=R_pred, T=T_pred)
134
+ cc_pred = cameras_pred.get_camera_center()
135
+
136
+ A_hat, _, _, _ = compute_optimal_alignment(cc_gt, cc_pred)
137
+ norm = torch.linalg.norm(cc_gt - A_hat, dim=1) / gt_scene_scale
138
+
139
+ norms = np.ndarray.tolist(norm.detach().cpu().numpy())
140
+ return norms, A_hat
141
+ else:
142
+ T_A_hat, _, _ = compute_optimal_translation_alignment(T_gt, T_pred, R_pred)
143
+ norm = torch.linalg.norm(T_gt - T_A_hat, dim=1) / gt_scene_scale
144
+ norms = np.ndarray.tolist(norm.detach().cpu().numpy())
145
+ return norms, T_A_hat
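
Finally, a toy sketch of how these utilities combine when scoring predicted cameras against ground truth; the tensors are random stand-ins, so the printed numbers are meaningless, and PyTorch3D must be installed for get_error.

# Toy evaluation sketch: pairwise rotation error plus aligned camera-center error.
import torch

from diffusionsfm.utils.geometry import (
    compute_angular_error_batch,
    generate_random_rotations,
    get_error,
    n_to_np_rotations,
)

N = 8
R_gt, R_pred = generate_random_rotations(N), generate_random_rotations(N)
T_gt, T_pred = torch.randn(N, 3), torch.randn(N, 3)

# Relative-rotation angular errors (degrees) over all ordered pairs.
rel_gt = n_to_np_rotations(N, R_gt).numpy()
rel_pred = n_to_np_rotations(N, R_pred).numpy()
rotation_errors = compute_angular_error_batch(rel_pred, rel_gt)

# Camera-center errors after the optimal similarity alignment.
center_errors, _ = get_error(True, R_pred, T_pred, R_gt, T_gt, gt_scene_scale=1.0)

print(rotation_errors.mean(), sum(center_errors) / len(center_errors))
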