Spaces:

CVPR
/

monoscene_lite

Runtime error

App Files Files Community

anhquancao committed on Jun 29, 2022

Commit

4d85df4

0 Parent(s):

up

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +28 -0
.gitignore +4 -0
README.md +13 -0
__pycache__/fusion.cpython-37.pyc +0 -0
app.py +115 -0
calib.txt +5 -0
fusion.py +507 -0
helpers.py +336 -0
images/08/000010.jpg +0 -0
images/08/000085.jpg +0 -0
images/08/000103.jpg +0 -0
images/08/000187.jpg +0 -0
images/08/000234.jpg +0 -0
images/08/000290.jpg +0 -0
images/08/000465.jpg +0 -0
images/08/000511.jpg +0 -0
images/08/000618.jpg +0 -0
images/08/000790.jpg +0 -0
images/08/000822.jpg +0 -0
images/08/001005.jpg +0 -0
images/08/001122.jpg +0 -0
images/08/001380.jpg +0 -0
images/08/001446.jpg +0 -0
images/08/001530.jpg +0 -0
images/08/001687.jpg +0 -0
images/08/001842.jpg +0 -0
images/08/002010.jpg +0 -0
images/08/002128.jpg +0 -0
images/08/002272.jpg +0 -0
images/08/002360.jpg +0 -0
images/08/002505.jpg +0 -0
images/08/002716.jpg +0 -0
images/08/002944.jpg +0 -0
images/08/003149.jpg +0 -0
images/08/003365.jpg +0 -0
images/08/003533.jpg +0 -0
images/08/003790.jpg +0 -0
images/08/003929.jpg +0 -0
images/08/004059.jpg +0 -0
monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py +97 -0
monoscene/.ipynb_checkpoints/config-checkpoint.py +34 -0
monoscene/.ipynb_checkpoints/modules-checkpoint.py +194 -0
monoscene/.ipynb_checkpoints/monoscene-checkpoint.py +123 -0
monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py +22 -0
monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py +88 -0
monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py +90 -0
monoscene/CRP3D.py +97 -0
monoscene/DDR.py +139 -0
monoscene/__init__.py +0 -0
monoscene/__pycache__/CRP3D.cpython-37.pyc +0 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,28 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+monoscene_kitti.ckpt filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+__pycache__
+.ipynb_checkpoints
+*.ckpt
+gradio*

README.md ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+title: MonoScene
+emoji: 🦀
+colorFrom: indigo
+colorTo: blue
+sdk: gradio
+sdk_version: 3.0.20
+app_file: app.py
+pinned: false
+license: apache-2.0
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

__pycache__/fusion.cpython-37.pyc ADDED Viewed

Binary file (14.9 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+import gradio as gr
+import numpy as np
+from torchvision import transforms
+import torch
+from helpers import *
+import sys
+import csv
+from monoscene.monoscene import MonoScene
+# Raise the csv module's per-field size limit to sys.maxsize.
+csv.field_size_limit(sys.maxsize)
+# The app only runs inference, so autograd is switched off globally.
+torch.set_grad_enabled(False)
+# pipeline = pipeline(model="anhquancao/monoscene_kitti")
+# model = AutoModel.from_pretrained(
+#     "anhquancao/monoscene_kitti", trust_remote_code=True, revision='bf033f87c2a86b60903ab811b790a1532c1ae313'
+# )#.cuda()
+# Load the model once at import time from the checkpoint file in the working
+# directory; the keyword arguments are forwarded to load_from_checkpoint.
+model = MonoScene.load_from_checkpoint(
+        "monoscene_kitti.ckpt",
+        dataset="kitti",
+        n_classes=20,
+        feature = 64,
+        project_scale = 2,
+        full_scene_size = (256, 256, 32),
+    )
+# Image width and height passed to get_projections() inside predict().
+img_W, img_H = 1220, 370
+# Built once at import time: the pipeline has no per-request state, so there
+# is no reason to reconstruct it on every call.
+_NORMALIZE_RGB = transforms.Compose(
+    [
+        transforms.ToTensor(),
+        transforms.Normalize(
+            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+        ),
+    ]
+)
+def predict(img):
+    """Run the model on one image and return the figure produced by `draw`.
+    Args:
+      img: Array-like image with channel values in [0, 255]; presumably the
+        (H, W, 3) array delivered by the Gradio image input - TODO confirm.
+    Returns:
+      The object returned by `draw` for the pooled prediction and the
+      `fov_mask_2` entry of the batch.
+    """
+    # np.asarray converts only when it has to. The former
+    # np.array(..., copy=False) raises ValueError on NumPy >= 2.0 whenever a
+    # copy is required, which a uint8 -> float32 conversion always is.
+    img = np.asarray(img, dtype=np.float32) / 255.0
+    img = _NORMALIZE_RGB(img)
+    batch = get_projections(img_W, img_H)
+    batch["img"] = img
+    # Add a leading batch dimension of size 1 to every entry.
+    batch = {key: value.unsqueeze(0) for key, value in batch.items()}
+    pred = model(batch).squeeze()
+    # Pool the prediction with k_size=2 before drawing it.
+    pred = majority_pooling(pred, k_size=2)
+    return draw(pred, batch['fov_mask_2'])
+# Text shown in the Gradio interface (passed as `description` below).
+description = """
+MonoScene Demo on SemanticKITTI Validation Set (Sequence 08), which uses the <b>camera parameters of Sequence 08</b>. \n
+Due to the <b>CPU-only</b> inference, it might take up to 20s to predict a scene. \n
+The output is <b>downsampled by 2</b> to be able to be rendered in browsers. <b>Darker</b> colors represent the <b>scenery outside the Field of View</b>, i.e. not visible on the image. \n
+Project page: https://cv-rits.github.io/MonoScene/
+"""
+title = "MonoScene: Monocular 3D Semantic Scene Completion"
+# HTML snippet passed as `article`; it embeds a visitor-counter badge image.
+article="""
+<center>
+    <img src='https://visitor-badge.glitch.me/badge?page_id=anhquancao.MonoScene&left_color=darkmagenta&right_color=purple' alt='visitor badge'>
+</center>
+"""
+# Example inputs offered in the UI; paths are relative to the app directory.
+examples = [
+    'images/08/000010.jpg',
+    'images/08/000085.jpg',
+    'images/08/000290.jpg',
+    'images/08/000465.jpg',
+    'images/08/000790.jpg',
+    'images/08/001005.jpg',
+    'images/08/001380.jpg',
+    'images/08/001530.jpg',
+    'images/08/002360.jpg',
+    'images/08/002505.jpg',
+    'images/08/004059.jpg',
+    'images/08/003149.jpg',
+    'images/08/001446.jpg',
+    'images/08/001122.jpg',
+    'images/08/003533.jpg',
+    'images/08/003365.jpg',
+    'images/08/002944.jpg',
+    'images/08/000822.jpg',
+    'images/08/000103.jpg',
+    'images/08/002716.jpg',
+    'images/08/000187.jpg',
+    'images/08/002128.jpg',
+    'images/08/000511.jpg',
+    'images/08/000618.jpg',
+    'images/08/002010.jpg',
+    'images/08/000234.jpg',
+    'images/08/001842.jpg',
+    'images/08/001687.jpg',
+    'images/08/003929.jpg',
+    'images/08/002272.jpg',
+]
+# Wire predict() to an image input and a plot output. The shape given to
+# gr.Image uses the same numbers as img_W, img_H (1220, 370).
+demo = gr.Interface(
+    predict,
+    gr.Image(shape=(1220, 370)),
+    gr.Plot(),
+    article=article,
+    title=title,
+    enable_queue=True,
+    examples=examples,
+    description=description)
+# Start the app at import time with queueing and debug output enabled.
+demo.launch(enable_queue=True, debug=True)

calib.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+P0: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 0.000000000000e+00 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.861448000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 4.538225000000e+01 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 -1.130887000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 3.779761000000e-03
+P3: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.372877000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 2.369057000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 4.915215000000e-03
+Tr: 4.276802385584e-04 -9.999672484946e-01 -8.084491683471e-03 -1.198459927713e-02 -7.210626507497e-03 8.081198471645e-03 -9.999413164504e-01 -5.403984729748e-02 9.999738645903e-01 4.859485810390e-04 -7.206933692422e-03 -2.921968648686e-01

fusion.py ADDED Viewed

	@@ -0,0 +1,507 @@

+"""
+Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py
+@inproceedings{zeng20163dmatch,
+    title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions},
+    author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas},
+    booktitle={CVPR},
+    year={2017}
+}
+"""
+import numpy as np
+from numba import njit, prange
+from skimage import measure
+FUSION_GPU_MODE = 0
+class TSDFVolume:
+    """Volumetric TSDF Fusion of RGB-D Images.
+    The voxel state lives in three float32 arrays of shape `vol_dim`: the TSDF
+    values, the accumulated observation weight per voxel, and a colour volume
+    in which each colour is folded into one float as b*256*256 + g*256 + r.
+    NOTE(review): the GPU branches use the names `cuda` and `SourceModule`,
+    which this module does not import. They are unreachable as long as the
+    module-level FUSION_GPU_MODE is 0.
+    """
+    def __init__(self, vol_bnds, voxel_size, use_gpu=True):
+        """Constructor.
+        Args:
+          vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the
+            xyz bounds (min/max) in meters. The input is copied, so the
+            caller's array is left untouched.
+          voxel_size (float): The volume discretization in meters.
+          use_gpu (bool): Request the CUDA path. It is only taken when the
+            module-level FUSION_GPU_MODE is truthy as well.
+        """
+        # Work on a private floating-point copy. The upper bounds are snapped
+        # to the voxel grid below; doing that on the caller's array modified
+        # it in place and truncated the snapped values for integer input.
+        vol_bnds = np.array(vol_bnds)
+        if not np.issubdtype(vol_bnds.dtype, np.floating):
+            vol_bnds = vol_bnds.astype(np.float64)
+        assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)."
+        # Define voxel volume parameters
+        self._vol_bnds = vol_bnds
+        self._voxel_size = float(voxel_size)
+        self._trunc_margin = 5 * self._voxel_size  # truncation on SDF
+        self._color_const = 256 * 256
+        # Adjust volume bounds and ensure C-order contiguous
+        self._vol_dim = (
+            np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size)
+            .copy(order="C")
+            .astype(int)
+        )
+        # Snap the upper bounds so the volume spans a whole number of voxels
+        self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
+        self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32)
+        print(
+            "Voxel volume size: {} x {} x {} - # points: {:,}".format(
+                self._vol_dim[0],
+                self._vol_dim[1],
+                self._vol_dim[2],
+                self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2],
+            )
+        )
+        # Initialize pointers to voxel volume in CPU memory
+        self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
+        # for computing the cumulative moving average of observations per voxel
+        self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
+        self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
+        self.gpu_mode = use_gpu and FUSION_GPU_MODE
+        # Copy voxel volumes to GPU
+        if self.gpu_mode:
+            self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
+            cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
+            self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
+            cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
+            self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
+            cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)
+            # Cuda kernel function (C++). The bounds check uses `>=`: valid
+            # flat indices are 0 .. N-1, so index N must be rejected too.
+            self._cuda_src_mod = SourceModule(
+                """
+        __global__ void integrate(float * tsdf_vol,
+                                  float * weight_vol,
+                                  float * color_vol,
+                                  float * vol_dim,
+                                  float * vol_origin,
+                                  float * cam_intr,
+                                  float * cam_pose,
+                                  float * other_params,
+                                  float * color_im,
+                                  float * depth_im) {
+          // Get voxel index
+          int gpu_loop_idx = (int) other_params[0];
+          int max_threads_per_block = blockDim.x;
+          int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
+          int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x;
+          int vol_dim_x = (int) vol_dim[0];
+          int vol_dim_y = (int) vol_dim[1];
+          int vol_dim_z = (int) vol_dim[2];
+          if (voxel_idx >= vol_dim_x*vol_dim_y*vol_dim_z)
+              return;
+          // Get voxel grid coordinates (note: be careful when casting)
+          float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z)));
+          float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z));
+          float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z);
+          // Voxel grid coordinates to world coordinates
+          float voxel_size = other_params[1];
+          float pt_x = vol_origin[0]+voxel_x*voxel_size;
+          float pt_y = vol_origin[1]+voxel_y*voxel_size;
+          float pt_z = vol_origin[2]+voxel_z*voxel_size;
+          // World coordinates to camera coordinates
+          float tmp_pt_x = pt_x-cam_pose[0*4+3];
+          float tmp_pt_y = pt_y-cam_pose[1*4+3];
+          float tmp_pt_z = pt_z-cam_pose[2*4+3];
+          float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z;
+          float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z;
+          float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z;
+          // Camera coordinates to image pixels
+          int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]);
+          int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]);
+          // Skip if outside view frustum
+          int im_h = (int) other_params[2];
+          int im_w = (int) other_params[3];
+          if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0)
+              return;
+          // Skip invalid depth
+          float depth_value = depth_im[pixel_y*im_w+pixel_x];
+          if (depth_value == 0)
+              return;
+          // Integrate TSDF
+          float trunc_margin = other_params[4];
+          float depth_diff = depth_value-cam_pt_z;
+          if (depth_diff < -trunc_margin)
+              return;
+          float dist = fmin(1.0f,depth_diff/trunc_margin);
+          float w_old = weight_vol[voxel_idx];
+          float obs_weight = other_params[5];
+          float w_new = w_old + obs_weight;
+          weight_vol[voxel_idx] = w_new;
+          tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new;
+          // Integrate color
+          float old_color = color_vol[voxel_idx];
+          float old_b = floorf(old_color/(256*256));
+          float old_g = floorf((old_color-old_b*256*256)/256);
+          float old_r = old_color-old_b*256*256-old_g*256;
+          float new_color = color_im[pixel_y*im_w+pixel_x];
+          float new_b = floorf(new_color/(256*256));
+          float new_g = floorf((new_color-new_b*256*256)/256);
+          float new_r = new_color-new_b*256*256-new_g*256;
+          new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f);
+          new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f);
+          new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f);
+          color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r;
+        }"""
+            )
+            self._cuda_integrate = self._cuda_src_mod.get_function("integrate")
+            # Determine block/grid size on GPU
+            gpu_dev = cuda.Device(0)
+            self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
+            n_blocks = int(
+                np.ceil(
+                    float(np.prod(self._vol_dim))
+                    / float(self._max_gpu_threads_per_block)
+                )
+            )
+            grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
+            grid_dim_y = min(
+                gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))
+            )
+            grid_dim_z = min(
+                gpu_dev.MAX_GRID_DIM_Z,
+                int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))),
+            )
+            self._max_gpu_grid_dim = np.array(
+                [grid_dim_x, grid_dim_y, grid_dim_z]
+            ).astype(int)
+            # Number of kernel launches needed to visit every voxel once
+            self._n_gpu_loops = int(
+                np.ceil(
+                    float(np.prod(self._vol_dim))
+                    / float(
+                        np.prod(self._max_gpu_grid_dim)
+                        * self._max_gpu_threads_per_block
+                    )
+                )
+            )
+        else:
+            # Get voxel grid coordinates: one (x, y, z) index row per voxel
+            xv, yv, zv = np.meshgrid(
+                range(self._vol_dim[0]),
+                range(self._vol_dim[1]),
+                range(self._vol_dim[2]),
+                indexing="ij",
+            )
+            self.vox_coords = (
+                np.concatenate(
+                    [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0
+                )
+                .astype(int)
+                .T
+            )
+    @staticmethod
+    @njit(parallel=True)
+    def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)):
+        """Convert voxel grid coordinates to world coordinates.
+        Each output coordinate is vol_origin + vox_size * (index + offset);
+        with the default offsets of 0.5 that is the centre of the voxel.
+        Returns a float32 array with the same shape as `vox_coords`.
+        """
+        vol_origin = vol_origin.astype(np.float32)
+        vox_coords = vox_coords.astype(np.float32)
+        cam_pts = np.empty_like(vox_coords, dtype=np.float32)
+        for i in prange(vox_coords.shape[0]):
+            for j in range(3):
+                cam_pts[i, j] = (
+                    vol_origin[j]
+                    + (vox_size * vox_coords[i, j])
+                    + vox_size * offsets[j]
+                )
+        return cam_pts
+    @staticmethod
+    @njit(parallel=True)
+    def cam2pix(cam_pts, intr):
+        """Convert camera coordinates to pixel coordinates.
+        Projects each point as (x * fx / z + cx, y * fy / z + cy), rounded to
+        the nearest integer. Returns an int64 array of shape (N, 2). Points are
+        not filtered here; callers mask out-of-image and z <= 0 results.
+        """
+        intr = intr.astype(np.float32)
+        fx, fy = intr[0, 0], intr[1, 1]
+        cx, cy = intr[0, 2], intr[1, 2]
+        pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64)
+        for i in prange(cam_pts.shape[0]):
+            pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx))
+            pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy))
+        return pix
+    @staticmethod
+    @njit(parallel=True)
+    def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight):
+        """Integrate the TSDF volume.
+        Computes, element-wise, the new weight w_old + obs_weight and the
+        weighted running average of the stored TSDF value and `dist`.
+        Returns the pair (updated TSDF values, updated weights) as float32.
+        """
+        tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32)
+        w_new = np.empty_like(w_old, dtype=np.float32)
+        for i in prange(len(tsdf_vol)):
+            w_new[i] = w_old[i] + obs_weight
+            tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i]
+        return tsdf_vol_int, w_new
+    def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0):
+        """Integrate an RGB-D frame into the TSDF volume.
+        Args:
+          color_im (ndarray): An RGB image of shape (H, W, 3).
+          depth_im (ndarray): A depth image of shape (H, W).
+          cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3).
+          cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4).
+          obs_weight (float): The weight to assign for the current observation.
+            It is added to each updated voxel's accumulated weight and used
+            as this frame's share in the running averages.
+        """
+        im_h, im_w = depth_im.shape
+        # Fold RGB color image into a single channel image
+        color_im = color_im.astype(np.float32)
+        color_im = np.floor(
+            color_im[..., 2] * self._color_const
+            + color_im[..., 1] * 256
+            + color_im[..., 0]
+        )
+        if self.gpu_mode:  # GPU mode: integrate voxel volume (calls CUDA kernel)
+            for gpu_loop_idx in range(self._n_gpu_loops):
+                self._cuda_integrate(
+                    self._tsdf_vol_gpu,
+                    self._weight_vol_gpu,
+                    self._color_vol_gpu,
+                    cuda.InOut(self._vol_dim.astype(np.float32)),
+                    cuda.InOut(self._vol_origin.astype(np.float32)),
+                    cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
+                    cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
+                    cuda.InOut(
+                        np.asarray(
+                            [
+                                gpu_loop_idx,
+                                self._voxel_size,
+                                im_h,
+                                im_w,
+                                self._trunc_margin,
+                                obs_weight,
+                            ],
+                            np.float32,
+                        )
+                    ),
+                    cuda.InOut(color_im.reshape(-1).astype(np.float32)),
+                    cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
+                    block=(self._max_gpu_threads_per_block, 1, 1),
+                    grid=(
+                        int(self._max_gpu_grid_dim[0]),
+                        int(self._max_gpu_grid_dim[1]),
+                        int(self._max_gpu_grid_dim[2]),
+                    ),
+                )
+        else:  # CPU mode: integrate voxel volume (vectorized implementation)
+            # Convert voxel grid coordinates to pixel coordinates
+            cam_pts = self.vox2world(
+                self._vol_origin, self.vox_coords, self._voxel_size
+            )
+            cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose))
+            pix_z = cam_pts[:, 2]
+            pix = self.cam2pix(cam_pts, cam_intr)
+            pix_x, pix_y = pix[:, 0], pix[:, 1]
+            # Eliminate pixels outside view frustum
+            valid_pix = np.logical_and(
+                pix_x >= 0,
+                np.logical_and(
+                    pix_x < im_w,
+                    np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)),
+                ),
+            )
+            depth_val = np.zeros(pix_x.shape)
+            depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]
+            # Integrate TSDF
+            # NOTE(review): unlike the CUDA kernel, this path uses a fixed
+            # cut-off of -10 instead of -self._trunc_margin and stores the raw
+            # depth difference without dividing by the margin or clamping to
+            # 1. Left as is; confirm that this is intended.
+            depth_diff = depth_val - pix_z
+            valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10)
+            dist = depth_diff
+            valid_vox_x = self.vox_coords[valid_pts, 0]
+            valid_vox_y = self.vox_coords[valid_pts, 1]
+            valid_vox_z = self.vox_coords[valid_pts, 2]
+            w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
+            tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
+            valid_dist = dist[valid_pts]
+            tsdf_vol_new, w_new = self.integrate_tsdf(
+                tsdf_vals, valid_dist, w_old, obs_weight
+            )
+            self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new
+            self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new
+            # Integrate color: unfold stored and new colours into channels,
+            # average them with the same weights, and fold the result back
+            old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
+            old_b = np.floor(old_color / self._color_const)
+            old_g = np.floor((old_color - old_b * self._color_const) / 256)
+            old_r = old_color - old_b * self._color_const - old_g * 256
+            new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]]
+            new_b = np.floor(new_color / self._color_const)
+            new_g = np.floor((new_color - new_b * self._color_const) / 256)
+            new_r = new_color - new_b * self._color_const - new_g * 256
+            new_b = np.minimum(
+                255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new)
+            )
+            new_g = np.minimum(
+                255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new)
+            )
+            new_r = np.minimum(
+                255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new)
+            )
+            self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = (
+                new_b * self._color_const + new_g * 256 + new_r
+            )
+    def get_volume(self):
+        """Return the (TSDF, folded colour) volumes as CPU arrays.
+        In GPU mode the device buffers are copied back into the CPU arrays
+        first.
+        """
+        if self.gpu_mode:
+            cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
+            cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
+        return self._tsdf_vol_cpu, self._color_vol_cpu
+    @staticmethod
+    def _marching_cubes(tsdf_vol):
+        """Run marching cubes on `tsdf_vol` at level 0 (verts, faces, normals, values)."""
+        # scikit-image removed `marching_cubes_lewiner` in 0.19; its
+        # replacement `marching_cubes` uses the Lewiner method by default.
+        # The old name is preferred while it exists so that older installs
+        # keep calling exactly the function they called before.
+        extract = getattr(measure, "marching_cubes_lewiner", None)
+        if extract is None:
+            extract = measure.marching_cubes
+        return extract(tsdf_vol, level=0)
+    def _vertex_colors(self, color_vol, verts_ind):
+        """Unfold the colours stored at the given voxel indices into (N, 3) uint8 RGB."""
+        rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
+        colors_b = np.floor(rgb_vals / self._color_const)
+        colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
+        colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
+        colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
+        return colors.astype(np.uint8)
+    def get_point_cloud(self):
+        """Extract a point cloud from the voxel volume.
+        Returns an (N, 6) array: world xyz followed by the r, g, b values of
+        the voxel nearest to each marching-cubes vertex.
+        """
+        tsdf_vol, color_vol = self.get_volume()
+        # Marching cubes
+        verts = self._marching_cubes(tsdf_vol)[0]
+        # Nearest voxel index, taken before converting to world coordinates
+        verts_ind = np.round(verts).astype(int)
+        verts = verts * self._voxel_size + self._vol_origin
+        # Get vertex colors
+        colors = self._vertex_colors(color_vol, verts_ind)
+        pc = np.hstack([verts, colors])
+        return pc
+    def get_mesh(self):
+        """Compute a mesh from the voxel volume using marching cubes.
+        Returns the tuple (verts, faces, norms, colors) with `verts` in world
+        coordinates and `colors` as uint8 RGB per vertex.
+        """
+        tsdf_vol, color_vol = self.get_volume()
+        # Marching cubes
+        verts, faces, norms, vals = self._marching_cubes(tsdf_vol)
+        verts_ind = np.round(verts).astype(int)
+        verts = (
+            verts * self._voxel_size + self._vol_origin
+        )  # voxel grid coordinates to world coordinates
+        # Get vertex colors
+        colors = self._vertex_colors(color_vol, verts_ind)
+        return verts, faces, norms, colors
+def rigid_transform(xyz, transform):
+    """Map an (N, 3) point cloud through a homogeneous transform.
+    Every point gets a trailing 1 appended, is multiplied by `transform`, and
+    the first three components of the result are returned as an (N, 3) array.
+    """
+    ones_col = np.ones((len(xyz), 1), dtype=np.float32)
+    homog = np.concatenate((xyz, ones_col), axis=1)
+    moved = np.matmul(transform, homog.T).T
+    return moved[:, :3]
+def get_view_frustum(depth_im, cam_intr, cam_pose):
+    """Get corners of 3D camera view frustum of depth image.
+    Returns a (3, 5) array: column 0 is the point at depth 0 and the other
+    four columns are the image corners back-projected to the largest depth
+    in `depth_im`, all passed through `rigid_transform` with `cam_pose`.
+    """
+    im_h, im_w = depth_im.shape[0], depth_im.shape[1]
+    max_depth = np.max(depth_im)
+    # Depth per frustum point: the apex at 0, the four corners at max_depth
+    depths = np.array([0, max_depth, max_depth, max_depth, max_depth])
+    corner_u = np.array([0, 0, 0, im_w, im_w])
+    corner_v = np.array([0, 0, im_h, 0, im_h])
+    fx, fy = cam_intr[0, 0], cam_intr[1, 1]
+    cx, cy = cam_intr[0, 2], cam_intr[1, 2]
+    frustum_cam = np.array(
+        [
+            (corner_u - cx) * depths / fx,
+            (corner_v - cy) * depths / fy,
+            depths,
+        ]
+    )
+    return rigid_transform(frustum_cam.T, cam_pose).T
+def meshwrite(filename, verts, faces, norms, colors):
+    """Save a 3D mesh to a polygon .ply file (ASCII).
+    Args:
+      filename: Path of the file to create or overwrite.
+      verts: (N, 3) vertex positions.
+      faces: (M, 3) vertex indices, one triangle per row.
+      norms: Per-vertex normals, indexed like `verts`.
+      colors: Per-vertex red, green, blue values, indexed like `verts`.
+    """
+    header = (
+        "ply\n"
+        "format ascii 1.0\n"
+        "element vertex %d\n"
+        "property float x\n"
+        "property float y\n"
+        "property float z\n"
+        "property float nx\n"
+        "property float ny\n"
+        "property float nz\n"
+        "property uchar red\n"
+        "property uchar green\n"
+        "property uchar blue\n"
+        "element face %d\n"
+        "property list uchar int vertex_index\n"
+        "end_header\n"
+    ) % (verts.shape[0], faces.shape[0])
+    # The context manager closes the file even if formatting a row raises.
+    with open(filename, "w") as ply_file:
+        ply_file.write(header)
+        # Write vertex list
+        ply_file.writelines(
+            "%f %f %f %f %f %f %d %d %d\n"
+            % (
+                verts[i, 0],
+                verts[i, 1],
+                verts[i, 2],
+                norms[i, 0],
+                norms[i, 1],
+                norms[i, 2],
+                colors[i, 0],
+                colors[i, 1],
+                colors[i, 2],
+            )
+            for i in range(verts.shape[0])
+        )
+        # Write face list
+        ply_file.writelines(
+            "3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2])
+            for i in range(faces.shape[0])
+        )
+def pcwrite(filename, xyzrgb):
+    """Save a point cloud to a polygon .ply file (ASCII).
+    Args:
+      filename: Path of the file to create or overwrite.
+      xyzrgb: (N, 6) array; columns 0-2 are x, y, z and the remaining
+        columns are cast to uint8 and written as red, green, blue.
+    """
+    xyz = xyzrgb[:, :3]
+    rgb = xyzrgb[:, 3:].astype(np.uint8)
+    header = (
+        "ply\n"
+        "format ascii 1.0\n"
+        "element vertex %d\n"
+        "property float x\n"
+        "property float y\n"
+        "property float z\n"
+        "property uchar red\n"
+        "property uchar green\n"
+        "property uchar blue\n"
+        "end_header\n"
+    ) % (xyz.shape[0])
+    # The file used to be opened and never closed; the context manager makes
+    # sure the data is flushed and the handle released, also on errors.
+    with open(filename, "w") as ply_file:
+        ply_file.write(header)
+        # Write vertex list
+        ply_file.writelines(
+            "%f %f %f %d %d %d\n"
+            % (
+                xyz[i, 0],
+                xyz[i, 1],
+                xyz[i, 2],
+                rgb[i, 0],
+                rgb[i, 1],
+                rgb[i, 2],
+            )
+            for i in range(xyz.shape[0])
+        )

helpers.py ADDED Viewed

	@@ -0,0 +1,336 @@

+import numpy as np
+import torch
+import fusion
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+def read_calib(calib_path):
+    """
+    Modify from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68
+    Reads "key: v0 v1 ..." lines until the first blank line or end of file.
+    :param calib_path: Path to a calibration text file.
+    :return: dict with calibration matrices: "P2" as a 3x4 array and "Tr" as
+        a 4x4 array whose last row is [0, 0, 0, 1].
+    :raises KeyError: if the file has no "P2" or no "Tr" entry.
+    """
+    calib_all = {}
+    with open(calib_path, "r") as f:
+        for line in f:
+            # A blank line ends the calibration block. Whitespace-only lines
+            # and "\r\n" line endings count as blank too; they used to fall
+            # through to the split below and raise ValueError.
+            if not line.strip():
+                break
+            key, value = line.split(":", 1)
+            calib_all[key] = np.array([float(x) for x in value.split()])
+    # reshape matrices
+    calib_out = {}
+    # 3x4 projection matrix for left camera
+    calib_out["P2"] = calib_all["P2"].reshape(3, 4)
+    calib_out["Tr"] = np.identity(4)  # 4x4 matrix
+    calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4)
+    return calib_out
+def vox2pix(cam_E, cam_k,
+            vox_origin, voxel_size,
+            img_W, img_H,
+            scene_size):
+    """
+    Project the centroid of every voxel of the scene grid into the image.
+    Parameters:
+    ----------
+    cam_E: 4x4
+        transform applied to the voxel centroids (expressed in the frame of
+        vox_origin) before they are projected with cam_k
+    cam_k: 3x3
+        camera intrinsics
+    vox_origin: (3,)
+        coordinates of the voxel at index (0, 0, 0)
+    voxel_size: float
+        edge length of one voxel, in the same unit as scene_size
+    img_W: int
+        image width
+    img_H: int
+        image height
+    scene_size: (3,)
+        scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2
+    Returns
+    -------
+    projected_pix: (N, 2)
+        Projected 2D pixel positions of the voxel centroids
+    fov_mask: (N,)
+        Boolean mask, True for voxels that project inside the image and have
+        positive depth
+    pix_z: (N,)
+        Depth (z after applying cam_E) of each voxel centroid
+    """
+    # Lower and upper corner of the scene along x, y, z
+    bounds = np.zeros((3, 2))
+    bounds[:, 0] = vox_origin
+    bounds[:, 1] = vox_origin + np.array(scene_size)
+    # Number of voxels per axis
+    extent = bounds[:, 1] - bounds[:, 0]
+    vol_dim = np.ceil(extent / voxel_size).astype(int)
+    # Integer index triple of every voxel, one row per voxel
+    axes = [np.arange(vol_dim[0]), np.arange(vol_dim[1]), np.arange(vol_dim[2])]
+    grids = np.meshgrid(*axes, indexing='ij')
+    vox_coords = np.stack([g.ravel() for g in grids], axis=0).astype(int).T
+    # Voxel centroids, then into the camera frame
+    centroids = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
+    cam_pts = fusion.rigid_transform(centroids, cam_E)
+    # Camera frame to pixel positions
+    projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k)
+    pix_x = projected_pix[:, 0]
+    pix_y = projected_pix[:, 1]
+    pix_z = cam_pts[:, 2]
+    # Keep only voxels that land inside the image and lie in front of the camera
+    inside_w = (pix_x >= 0) & (pix_x < img_W)
+    inside_h = (pix_y >= 0) & (pix_y < img_H)
+    fov_mask = inside_w & (inside_h & (pix_z > 0))
+    return torch.from_numpy(projected_pix), torch.from_numpy(fov_mask), torch.from_numpy(pix_z)
+def get_grid_coords(dims, resolution):
+    """
+    :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
+    :return coords_grid: is the center coords of voxels in the grid
+    """
+    g_xx = np.arange(0, dims[0] + 1)
+    g_yy = np.arange(0, dims[1] + 1)
+    sensor_pose = 10
+    g_zz = np.arange(0, dims[2] + 1)
+    # Obtaining the grid with coords...
+    xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
+    coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
+    coords_grid = coords_grid.astype(np.float)
+    coords_grid = (coords_grid * resolution) + resolution / 2
+    temp = np.copy(coords_grid)
+    temp[:, 0] = coords_grid[:, 1]
+    temp[:, 1] = coords_grid[:, 0]
+    coords_grid = np.copy(temp)
+    return coords_grid
+def get_projections(img_W, img_H):
+    scale_3ds = [1, 2]
+    data = {}
+    for scale_3d in scale_3ds:
+        scene_size = (51.2, 51.2, 6.4)
+        vox_origin = np.array([0, -25.6, -2])
+        voxel_size = 0.2
+        calib = read_calib("calib.txt")
+        cam_k = calib["P2"][:3, :3]
+        T_velo_2_cam = calib["Tr"]
+        # compute the 3D-2D mapping
+        projected_pix, fov_mask, pix_z = vox2pix(
+            T_velo_2_cam,
+            cam_k,
+            vox_origin,
+            voxel_size * scale_3d,
+            img_W,
+            img_H,
+            scene_size,
+        )
+        data["projected_pix_{}".format(scale_3d)] = projected_pix
+        data["pix_z_{}".format(scale_3d)] = pix_z
+        data["fov_mask_{}".format(scale_3d)] = fov_mask
+    return data
+def majority_pooling(grid, k_size=2):
+    result = np.zeros(
+        (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size)
+    )
+    for xx in range(0, int(np.floor(grid.shape[0] / k_size))):
+        for yy in range(0, int(np.floor(grid.shape[1] / k_size))):
+            for zz in range(0, int(np.floor(grid.shape[2] / k_size))):
+                sub_m = grid[
+                    (xx * k_size) : (xx * k_size) + k_size,
+                    (yy * k_size) : (yy * k_size) + k_size,
+                    (zz * k_size) : (zz * k_size) + k_size,
+                ]
+                unique, counts = np.unique(sub_m, return_counts=True)
+                if True in ((unique != 0) & (unique != 255)):
+                    # Remove counts with 0 and 255
+                    counts = counts[((unique != 0) & (unique != 255))]
+                    unique = unique[((unique != 0) & (unique != 255))]
+                else:
+                    if True in (unique == 0):
+                        counts = counts[(unique != 255)]
+                        unique = unique[(unique != 255)]
+                value = unique[np.argmax(counts)]
+                result[xx, yy, zz] = value
+    return result
+def draw(
+    voxels,
+    # T_velo_2_cam,
+    # vox_origin,
+    fov_mask,
+    # img_size,
+    # f,
+    voxel_size=0.4,
+    # d=7,  # 7m - determine the size of the mesh representing the camera
+):
+    fov_mask = fov_mask.reshape(-1)
+    # Compute the voxels coordinates
+    grid_coords = get_grid_coords(
+        [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size
+    )
+    # Attach the predicted class to every voxel
+    grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T
+    # Get the voxels inside FOV
+    fov_grid_coords = grid_coords[fov_mask, :]
+    # Get the voxels outside FOV
+    outfov_grid_coords = grid_coords[~fov_mask, :]
+    # Remove empty and unknown voxels
+    fov_voxels = fov_grid_coords[
+        (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255), :
+    ]
+    # print(np.unique(fov_voxels[:, 3], return_counts=True))
+    outfov_voxels = outfov_grid_coords[
+        (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255), :
+    ]
+    # figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1))
+    colors = np.array(
+        [
+            [0,0,0],
+            [100, 150, 245],
+            [100, 230, 245],
+            [30, 60, 150],
+            [80, 30, 180],
+            [100, 80, 250],
+            [255, 30, 30],
+            [255, 40, 200],
+            [150, 30, 90],
+            [255, 0, 255],
+            [255, 150, 255],
+            [75, 0, 75],
+            [175, 0, 75],
+            [255, 200, 0],
+            [255, 120, 50],
+            [0, 175, 0],
+            [135, 60, 0],
+            [150, 240, 80],
+            [255, 240, 150],
+            [255, 0, 0],
+        ]
+    ).astype(np.uint8)
+    pts_colors = [f'rgb({colors[int(i)][0]}, {colors[int(i)][1]}, {colors[int(i)][2]})' for i in fov_voxels[:, 3]]
+    out_fov_colors = [f'rgb({colors[int(i)][0]//3*2}, {colors[int(i)][1]//3*2}, {colors[int(i)][2]//3*2})' for i in outfov_voxels[:, 3]]
+    pts_colors = pts_colors + out_fov_colors
+    fov_voxels = np.concatenate([fov_voxels, outfov_voxels], axis=0)
+    x = fov_voxels[:, 0].flatten()
+    y = fov_voxels[:, 1].flatten()
+    z = fov_voxels[:, 2].flatten()
+    # label = fov_voxels[:, 3].flatten()
+    fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z,mode='markers',
+                    marker=dict(
+                            size=3,
+                            color=pts_colors,                # set color to an array/list of desired values
+                            # colorscale='Viridis',   # choose a colorscale
+                            opacity=1.0,
+                            symbol='square'
+                        ))])
+    fig.update_layout(
+    scene = dict(
+        aspectmode='data',
+        xaxis = dict(
+            backgroundcolor="rgb(255, 255, 255)",
+            gridcolor="black",
+            showbackground=True,
+            zerolinecolor="black",
+            nticks=4,
+            visible=False,
+            range=[-1,55],),
+        yaxis = dict(
+            backgroundcolor="rgb(255, 255, 255)",
+            gridcolor="black",
+            showbackground=True,
+            zerolinecolor="black",
+            visible=False,
+            nticks=4, range=[-1,55],),
+        zaxis = dict(
+            backgroundcolor="rgb(255, 255, 255)",
+            gridcolor="black",
+            showbackground=True,
+            zerolinecolor="black",
+            visible=False,
+            nticks=4, range=[-1,7],),
+        bgcolor="black",
+    ),
+    )
+    # fig = px.scatter_3d(
+    #     fov_voxels,
+    #     x=fov_voxels[:, 0], y="y", z="z", color="label")
+    # Draw occupied inside FOV voxels
+    # plt_plot_fov = mlab.points3d(
+    #     fov_voxels[:, 0],
+    #     fov_voxels[:, 1],
+    #     fov_voxels[:, 2],
+    #     fov_voxels[:, 3],
+    #     colormap="viridis",
+    #     scale_factor=voxel_size - 0.05 * voxel_size,
+    #     mode="cube",
+    #     opacity=1.0,
+    #     vmin=1,
+    #     vmax=19,
+    # )
+    # # Draw occupied outside FOV voxels
+    # plt_plot_outfov = mlab.points3d(
+    #     outfov_voxels[:, 0],
+    #     outfov_voxels[:, 1],
+    #     outfov_voxels[:, 2],
+    #     outfov_voxels[:, 3],
+    #     colormap="viridis",
+    #     scale_factor=voxel_size - 0.05 * voxel_size,
+    #     mode="cube",
+    #     opacity=1.0,
+    #     vmin=1,
+    #     vmax=19,
+    # )
+    # plt_plot_fov.glyph.scale_mode = "scale_by_vector"
+    # plt_plot_outfov.glyph.scale_mode = "scale_by_vector"
+    # plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors
+    # outfov_colors = colors
+    # outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2
+    # plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors
+    # mlab.show()
+    return fig

images/08/000010.jpg ADDED Viewed

images/08/000085.jpg ADDED Viewed

images/08/000103.jpg ADDED Viewed

images/08/000187.jpg ADDED Viewed

images/08/000234.jpg ADDED Viewed

images/08/000290.jpg ADDED Viewed

images/08/000465.jpg ADDED Viewed

images/08/000511.jpg ADDED Viewed

images/08/000618.jpg ADDED Viewed

images/08/000790.jpg ADDED Viewed

images/08/000822.jpg ADDED Viewed

images/08/001005.jpg ADDED Viewed

images/08/001122.jpg ADDED Viewed

images/08/001380.jpg ADDED Viewed

images/08/001446.jpg ADDED Viewed

images/08/001530.jpg ADDED Viewed

images/08/001687.jpg ADDED Viewed

images/08/001842.jpg ADDED Viewed

images/08/002010.jpg ADDED Viewed

images/08/002128.jpg ADDED Viewed

images/08/002272.jpg ADDED Viewed

images/08/002360.jpg ADDED Viewed

images/08/002505.jpg ADDED Viewed

images/08/002716.jpg ADDED Viewed

images/08/002944.jpg ADDED Viewed

images/08/003149.jpg ADDED Viewed

images/08/003365.jpg ADDED Viewed

images/08/003533.jpg ADDED Viewed

images/08/003790.jpg ADDED Viewed

images/08/003929.jpg ADDED Viewed

images/08/004059.jpg ADDED Viewed

monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import torch
+import torch.nn as nn
+from monoscene.modules import (
+    Process,
+    ASPP,
+)
+class CPMegaVoxels(nn.Module):
+    def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003):
+        super().__init__()
+        self.size = size
+        self.n_relations = n_relations
+        print("n_relations", self.n_relations)
+        self.flatten_size = size[0] * size[1] * size[2]
+        self.feature = feature
+        self.context_feature = feature * 2
+        self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+        padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2)
+        self.mega_context = nn.Sequential(
+            nn.Conv3d(
+                feature, self.context_feature, stride=2, padding=padding, kernel_size=3
+            ),
+        )
+        self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+        self.context_prior_logits = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.Conv3d(
+                        self.feature,
+                        self.flatten_context_size,
+                        padding=0,
+                        kernel_size=1,
+                    ),
+                )
+                for i in range(n_relations)
+            ]
+        )
+        self.aspp = ASPP(feature, [1, 2, 3])
+        self.resize = nn.Sequential(
+            nn.Conv3d(
+                self.context_feature * self.n_relations + feature,
+                feature,
+                kernel_size=1,
+                padding=0,
+                bias=False,
+            ),
+            Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]),
+        )
+    def forward(self, input):
+        ret = {}
+        bs = input.shape[0]
+        x_agg = self.aspp(input)
+        # get the mega context
+        x_mega_context_raw = self.mega_context(x_agg)
+        x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1)
+        x_mega_context = x_mega_context.permute(0, 2, 1)
+        # get context prior map
+        x_context_prior_logits = []
+        x_context_rels = []
+        for rel in range(self.n_relations):
+            # Compute the relation matrices
+            x_context_prior_logit = self.context_prior_logits[rel](x_agg)
+            x_context_prior_logit = x_context_prior_logit.reshape(
+                bs, self.flatten_context_size, self.flatten_size
+            )
+            x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1))
+            x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1)
+            x_context_prior = torch.sigmoid(x_context_prior_logit)
+            # Multiply the relation matrices with the mega context to gather context features
+            x_context_rel = torch.bmm(x_context_prior, x_mega_context)  # bs, N, f
+            x_context_rels.append(x_context_rel)
+        x_context = torch.cat(x_context_rels, dim=2)
+        x_context = x_context.permute(0, 2, 1)
+        x_context = x_context.reshape(
+            bs, x_context.shape[1], self.size[0], self.size[1], self.size[2]
+        )
+        x = torch.cat([input, x_context], dim=1)
+        x = self.resize(x)
+        x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1)
+        ret["P_logits"] = x_context_prior_logits
+        ret["x"] = x
+        return ret

monoscene/.ipynb_checkpoints/config-checkpoint.py ADDED Viewed

	@@ -0,0 +1,34 @@

+from transformers import PretrainedConfig
+from typing import List
+class MonoSceneConfig(PretrainedConfig):
+    def __init__(
+        self,
+        block_type="bottleneck",
+        layers: List[int] = [3, 4, 6, 3],
+        num_classes: int = 1000,
+        input_channels: int = 3,
+        cardinality: int = 1,
+        base_width: int = 64,
+        stem_width: int = 64,
+        stem_type: str = "",
+        avg_down: bool = False,
+        **kwargs,
+    ):
+        self.block_type = block_type
+        self.layers = layers
+        self.num_classes = num_classes
+        self.input_channels = input_channels
+        self.cardinality = cardinality
+        self.base_width = base_width
+        self.stem_width = stem_width
+        self.stem_type = stem_type
+        self.avg_down = avg_down
+        super().__init__(**kwargs)

monoscene/.ipynb_checkpoints/modules-checkpoint.py ADDED Viewed

	@@ -0,0 +1,194 @@

+import torch
+import torch.nn as nn
+from monoscene.DDR import Bottleneck3D
+class ASPP(nn.Module):
+    """
+    ASPP 3D
+    Adapt from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+    """
+    def __init__(self, planes, dilations_conv_list):
+        super().__init__()
+        # ASPP Block
+        self.conv_list = dilations_conv_list
+        self.conv1 = nn.ModuleList(
+            [
+                nn.Conv3d(
+                    planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+                )
+                for dil in dilations_conv_list
+            ]
+        )
+        self.bn1 = nn.ModuleList(
+            [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+        )
+        self.conv2 = nn.ModuleList(
+            [
+                nn.Conv3d(
+                    planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+                )
+                for dil in dilations_conv_list
+            ]
+        )
+        self.bn2 = nn.ModuleList(
+            [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+        )
+        self.relu = nn.ReLU()
+    def forward(self, x_in):
+        y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+        for i in range(1, len(self.conv_list)):
+            y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+        x_in = self.relu(y + x_in)  # modified
+        return x_in
+class SegmentationHead(nn.Module):
+    """
+    3D Segmentation heads to retrieve semantic segmentation at each scale.
+    Formed by Dim expansion, Conv3D, ASPP block, Conv3D.
+    Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+    """
+    def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list):
+        super().__init__()
+        # First convolution
+        self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1)
+        # ASPP Block
+        self.conv_list = dilations_conv_list
+        self.conv1 = nn.ModuleList(
+            [
+                nn.Conv3d(
+                    planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+                )
+                for dil in dilations_conv_list
+            ]
+        )
+        self.bn1 = nn.ModuleList(
+            [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+        )
+        self.conv2 = nn.ModuleList(
+            [
+                nn.Conv3d(
+                    planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+                )
+                for dil in dilations_conv_list
+            ]
+        )
+        self.bn2 = nn.ModuleList(
+            [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+        )
+        self.relu = nn.ReLU()
+        self.conv_classes = nn.Conv3d(
+            planes, nbr_classes, kernel_size=3, padding=1, stride=1
+        )
+    def forward(self, x_in):
+        # Convolution to go from inplanes to planes features...
+        x_in = self.relu(self.conv0(x_in))
+        y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+        for i in range(1, len(self.conv_list)):
+            y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+        x_in = self.relu(y + x_in)  # modified
+        x_in = self.conv_classes(x_in)
+        return x_in
+class ProcessKitti(nn.Module):
+    def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+        super(Process, self).__init__()
+        self.main = nn.Sequential(
+            *[
+                Bottleneck3D(
+                    feature,
+                    feature // 4,
+                    bn_momentum=bn_momentum,
+                    norm_layer=norm_layer,
+                    dilation=[i, i, i],
+                )
+                for i in dilations
+            ]
+        )
+    def forward(self, x):
+        return self.main(x)
+class Process(nn.Module):
+    def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+        super(Process, self).__init__()
+        self.main = nn.Sequential(
+            *[
+                Bottleneck3D(
+                    feature,
+                    feature // 4,
+                    bn_momentum=bn_momentum,
+                    norm_layer=norm_layer,
+                    dilation=[i, i, i],
+                )
+                for i in dilations
+            ]
+        )
+    def forward(self, x):
+        return self.main(x)
+class Upsample(nn.Module):
+    def __init__(self, in_channels, out_channels, norm_layer, bn_momentum):
+        super(Upsample, self).__init__()
+        self.main = nn.Sequential(
+            nn.ConvTranspose3d(
+                in_channels,
+                out_channels,
+                kernel_size=3,
+                stride=2,
+                padding=1,
+                dilation=1,
+                output_padding=1,
+            ),
+            norm_layer(out_channels, momentum=bn_momentum),
+            nn.ReLU(),
+        )
+    def forward(self, x):
+        return self.main(x)
+class Downsample(nn.Module):
+    def __init__(self, feature, norm_layer, bn_momentum, expansion=8):
+        super(Downsample, self).__init__()
+        self.main = Bottleneck3D(
+            feature,
+            feature // 4,
+            bn_momentum=bn_momentum,
+            expansion=expansion,
+            stride=2,
+            downsample=nn.Sequential(
+                nn.AvgPool3d(kernel_size=2, stride=2),
+                nn.Conv3d(
+                    feature,
+                    int(feature * expansion / 4),
+                    kernel_size=1,
+                    stride=1,
+                    bias=False,
+                ),
+                norm_layer(int(feature * expansion / 4), momentum=bn_momentum),
+            ),
+            norm_layer=norm_layer,
+        )
+    def forward(self, x):
+        return self.main(x)

monoscene/.ipynb_checkpoints/monoscene-checkpoint.py ADDED Viewed

	@@ -0,0 +1,123 @@

+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
+from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
+from monoscene.flosp import FLoSP
+import numpy as np
+import torch.nn.functional as F
+from monoscene.unet2d import UNet2D
+class MonoScene(pl.LightningModule):
+    def __init__(
+        self,
+        n_classes,
+        feature,
+        project_scale,
+        full_scene_size,
+        dataset,
+        n_relations=4,
+        context_prior=True,
+        fp_loss=True,
+        project_res=[],
+        frustum_size=4,
+        relation_loss=False,
+        CE_ssc_loss=True,
+        geo_scal_loss=True,
+        sem_scal_loss=True,
+        lr=1e-4,
+        weight_decay=1e-4,
+    ):
+        super().__init__()
+        self.project_res = project_res
+        self.fp_loss = fp_loss
+        self.dataset = dataset
+        self.context_prior = context_prior
+        self.frustum_size = frustum_size
+        self.relation_loss = relation_loss
+        self.CE_ssc_loss = CE_ssc_loss
+        self.sem_scal_loss = sem_scal_loss
+        self.geo_scal_loss = geo_scal_loss
+        self.project_scale = project_scale
+        self.lr = lr
+        self.weight_decay = weight_decay
+        self.projects = {}
+        self.scale_2ds = [1, 2, 4, 8]  # 2D scales
+        for scale_2d in self.scale_2ds:
+            self.projects[str(scale_2d)] = FLoSP(
+                full_scene_size, project_scale=self.project_scale, dataset=self.dataset
+            )
+        self.projects = nn.ModuleDict(self.projects)
+        self.n_classes = n_classes
+        if self.dataset == "NYU":
+            self.net_3d_decoder = UNet3DNYU(
+                self.n_classes,
+                nn.BatchNorm3d,
+                n_relations=n_relations,
+                feature=feature,
+                full_scene_size=full_scene_size,
+                context_prior=context_prior,
+            )
+        elif self.dataset == "kitti":
+            self.net_3d_decoder = UNet3DKitti(
+                self.n_classes,
+                nn.BatchNorm3d,
+                project_scale=project_scale,
+                feature=feature,
+                full_scene_size=full_scene_size,
+                context_prior=context_prior,
+            )
+        self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
+    def forward(self, batch):
+        img = batch["img"]
+        bs = len(img)
+        out = {}
+        x_rgb = self.net_rgb(img)
+        x3ds = []
+        for i in range(bs):
+            x3d = None
+            for scale_2d in self.project_res:
+                # project features at each 2D scale to target 3D scale
+                scale_2d = int(scale_2d)
+                projected_pix = batch["projected_pix_{}".format(self.project_scale)][i].cuda()
+                fov_mask = batch["fov_mask_{}".format(self.project_scale)][i].cuda()
+                # Sum all the 3D features
+                if x3d is None:
+                    x3d = self.projects[str(scale_2d)](
+                        x_rgb["1_" + str(scale_2d)][i],
+                        projected_pix // scale_2d,
+                        fov_mask,
+                    )
+                else:
+                    x3d += self.projects[str(scale_2d)](
+                        x_rgb["1_" + str(scale_2d)][i],
+                        projected_pix // scale_2d,
+                        fov_mask,
+                    )
+            x3ds.append(x3d)
+        input_dict = {
+            "x3d": torch.stack(x3ds),
+        }
+        out_dict = self.net_3d_decoder(input_dict)
+        ssc_pred = out_dict["ssc_logit"]
+        y_pred = ssc_pred.detach().cpu().numpy()
+        y_pred = np.argmax(y_pred, axis=1)
+        return y_pred

monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from transformers import PreTrainedModel
+from .config import MonoSceneConfig
+from monoscene.monoscene import MonoScene
+class MonoSceneModel(PreTrainedModel):
+    config_class = ResnetConfig
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = MonoScene(
+            dataset=config.dataset,
+            n_classes=config.n_classes,
+            feature=config.feature,
+            project_scale=config.project_scale,
+            full_scene_size=config.full_scene_size
+        )
+    def forward(self, tensor):
+        return self.model.forward(tensor)

monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py ADDED Viewed

	@@ -0,0 +1,88 @@

+# encoding: utf-8
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from monoscene.modules import SegmentationHead
+from monoscene.CRP3D import CPMegaVoxels
+from monoscene.modules import Process, Upsample, Downsample
+class UNet3D(nn.Module):
+    def __init__(
+        self,
+        class_num,
+        norm_layer,
+        full_scene_size,
+        feature,
+        project_scale,
+        context_prior=None,
+        bn_momentum=0.1,
+    ):
+        super(UNet3D, self).__init__()
+        self.business_layer = []
+        self.project_scale = project_scale
+        self.full_scene_size = full_scene_size
+        self.feature = feature
+        size_l1 = (
+            int(self.full_scene_size[0] / project_scale),
+            int(self.full_scene_size[1] / project_scale),
+            int(self.full_scene_size[2] / project_scale),
+        )
+        size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2)
+        size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2)
+        dilations = [1, 2, 3]
+        self.process_l1 = nn.Sequential(
+            Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+            Downsample(self.feature, norm_layer, bn_momentum),
+        )
+        self.process_l2 = nn.Sequential(
+            Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+            Downsample(self.feature * 2, norm_layer, bn_momentum),
+        )
+        self.up_13_l2 = Upsample(
+            self.feature * 4, self.feature * 2, norm_layer, bn_momentum
+        )
+        self.up_12_l1 = Upsample(
+            self.feature * 2, self.feature, norm_layer, bn_momentum
+        )
+        self.up_l1_lfull = Upsample(
+            self.feature, self.feature // 2, norm_layer, bn_momentum
+        )
+        self.ssc_head = SegmentationHead(
+            self.feature // 2, self.feature // 2, class_num, dilations
+        )
+        self.context_prior = context_prior
+        if context_prior:
+            self.CP_mega_voxels = CPMegaVoxels(
+                self.feature * 4, size_l3, bn_momentum=bn_momentum
+            )
+    def forward(self, input_dict):
+        res = {}
+        x3d_l1 = input_dict["x3d"]
+        x3d_l2 = self.process_l1(x3d_l1)
+        x3d_l3 = self.process_l2(x3d_l2)
+        if self.context_prior:
+            ret = self.CP_mega_voxels(x3d_l3)
+            x3d_l3 = ret["x"]
+            for k in ret.keys():
+                res[k] = ret[k]
+        x3d_up_l2 = self.up_13_l2(x3d_l3) + x3d_l2
+        x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1
+        x3d_up_lfull = self.up_l1_lfull(x3d_up_l1)
+        ssc_logit_full = self.ssc_head(x3d_up_lfull)
+        res["ssc_logit"] = ssc_logit_full
+        return res

monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py ADDED Viewed

	@@ -0,0 +1,90 @@

+# encoding: utf-8
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from monoscene.CRP3D import CPMegaVoxels
+from monoscene.modules import (
+    Process,
+    Upsample,
+    Downsample,
+    SegmentationHead,
+    ASPP,
+)
+class UNet3D(nn.Module):
+    def __init__(
+        self,
+        class_num,
+        norm_layer,
+        feature,
+        full_scene_size,
+        n_relations=4,
+        project_res=[],
+        context_prior=True,
+        bn_momentum=0.1,
+    ):
+        super(UNet3D, self).__init__()
+        self.business_layer = []
+        self.project_res = project_res
+        self.feature_1_4 = feature
+        self.feature_1_8 = feature * 2
+        self.feature_1_16 = feature * 4
+        self.feature_1_16_dec = self.feature_1_16
+        self.feature_1_8_dec = self.feature_1_8
+        self.feature_1_4_dec = self.feature_1_4
+        self.process_1_4 = nn.Sequential(
+            Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+            Downsample(self.feature_1_4, norm_layer, bn_momentum),
+        )
+        self.process_1_8 = nn.Sequential(
+            Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+            Downsample(self.feature_1_8, norm_layer, bn_momentum),
+        )
+        self.up_1_16_1_8 = Upsample(
+            self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum
+        )
+        self.up_1_8_1_4 = Upsample(
+            self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum
+        )
+        self.ssc_head_1_4 = SegmentationHead(
+            self.feature_1_4_dec, self.feature_1_4_dec, class_num, [1, 2, 3]
+        )
+        self.context_prior = context_prior
+        size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size)
+        if context_prior:
+            self.CP_mega_voxels = CPMegaVoxels(
+                self.feature_1_16,
+                size_1_16,
+                n_relations=n_relations,
+                bn_momentum=bn_momentum,
+            )
+    #
+    def forward(self, input_dict):
+        res = {}
+        x3d_1_4 = input_dict["x3d"]
+        x3d_1_8 = self.process_1_4(x3d_1_4)
+        x3d_1_16 = self.process_1_8(x3d_1_8)
+        if self.context_prior:
+            ret = self.CP_mega_voxels(x3d_1_16)
+            x3d_1_16 = ret["x"]
+            for k in ret.keys():
+                res[k] = ret[k]
+        x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8
+        x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4
+        ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4)
+        res["ssc_logit"] = ssc_logit_1_4
+        return res

monoscene/CRP3D.py ADDED Viewed

	@@ -0,0 +1,97 @@

+import torch
+import torch.nn as nn
+from monoscene.modules import (
+    Process,
+    ASPP,
+)
+class CPMegaVoxels(nn.Module):
+    def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003):
+        super().__init__()
+        self.size = size
+        self.n_relations = n_relations
+        print("n_relations", self.n_relations)
+        self.flatten_size = size[0] * size[1] * size[2]
+        self.feature = feature
+        self.context_feature = feature * 2
+        self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+        padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2)
+        self.mega_context = nn.Sequential(
+            nn.Conv3d(
+                feature, self.context_feature, stride=2, padding=padding, kernel_size=3
+            ),
+        )
+        self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+        self.context_prior_logits = nn.ModuleList(
+            [
+                nn.Sequential(
+                    nn.Conv3d(
+                        self.feature,
+                        self.flatten_context_size,
+                        padding=0,
+                        kernel_size=1,
+                    ),
+                )
+                for i in range(n_relations)
+            ]
+        )
+        self.aspp = ASPP(feature, [1, 2, 3])
+        self.resize = nn.Sequential(
+            nn.Conv3d(
+                self.context_feature * self.n_relations + feature,
+                feature,
+                kernel_size=1,
+                padding=0,
+                bias=False,
+            ),
+            Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]),
+        )
+    def forward(self, input):
+        ret = {}
+        bs = input.shape[0]
+        x_agg = self.aspp(input)
+        # get the mega context
+        x_mega_context_raw = self.mega_context(x_agg)
+        x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1)
+        x_mega_context = x_mega_context.permute(0, 2, 1)
+        # get context prior map
+        x_context_prior_logits = []
+        x_context_rels = []
+        for rel in range(self.n_relations):
+            # Compute the relation matrices
+            x_context_prior_logit = self.context_prior_logits[rel](x_agg)
+            x_context_prior_logit = x_context_prior_logit.reshape(
+                bs, self.flatten_context_size, self.flatten_size
+            )
+            x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1))
+            x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1)
+            x_context_prior = torch.sigmoid(x_context_prior_logit)
+            # Multiply the relation matrices with the mega context to gather context features
+            x_context_rel = torch.bmm(x_context_prior, x_mega_context)  # bs, N, f
+            x_context_rels.append(x_context_rel)
+        x_context = torch.cat(x_context_rels, dim=2)
+        x_context = x_context.permute(0, 2, 1)
+        x_context = x_context.reshape(
+            bs, x_context.shape[1], self.size[0], self.size[1], self.size[2]
+        )
+        x = torch.cat([input, x_context], dim=1)
+        x = self.resize(x)
+        x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1)
+        ret["P_logits"] = x_context_prior_logits
+        ret["x"] = x
+        return ret

monoscene/DDR.py ADDED Viewed

	@@ -0,0 +1,139 @@

+"""
+Most of the code in this file is taken from https://github.com/waterljwant/SSC/blob/master/models/DDR.py
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class SimpleRB(nn.Module):
+    def __init__(self, in_channel, norm_layer, bn_momentum):
+        super(SimpleRB, self).__init__()
+        self.path = nn.Sequential(
+            nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False),
+            norm_layer(in_channel, momentum=bn_momentum),
+            nn.ReLU(),
+            nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False),
+            norm_layer(in_channel, momentum=bn_momentum),
+        )
+        self.relu = nn.ReLU()
+    def forward(self, x):
+        residual = x
+        conv_path = self.path(x)
+        out = residual + conv_path
+        out = self.relu(out)
+        return out
+"""
+3D residual block: the 3x3x3 conv is replaced by three smaller 3D convs (1x1x3, 1x3x1, 3x1x1); adapted from DDRNet.
+"""
+class Bottleneck3D(nn.Module):
+    def __init__(
+        self,
+        inplanes,
+        planes,
+        norm_layer,
+        stride=1,
+        dilation=[1, 1, 1],
+        expansion=4,
+        downsample=None,
+        fist_dilation=1,
+        multi_grid=1,
+        bn_momentum=0.0003,
+    ):
+        super(Bottleneck3D, self).__init__()
+        # often，planes = inplanes // 4
+        self.expansion = expansion
+        self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = norm_layer(planes, momentum=bn_momentum)
+        self.conv2 = nn.Conv3d(
+            planes,
+            planes,
+            kernel_size=(1, 1, 3),
+            stride=(1, 1, stride),
+            dilation=(1, 1, dilation[0]),
+            padding=(0, 0, dilation[0]),
+            bias=False,
+        )
+        self.bn2 = norm_layer(planes, momentum=bn_momentum)
+        self.conv3 = nn.Conv3d(
+            planes,
+            planes,
+            kernel_size=(1, 3, 1),
+            stride=(1, stride, 1),
+            dilation=(1, dilation[1], 1),
+            padding=(0, dilation[1], 0),
+            bias=False,
+        )
+        self.bn3 = norm_layer(planes, momentum=bn_momentum)
+        self.conv4 = nn.Conv3d(
+            planes,
+            planes,
+            kernel_size=(3, 1, 1),
+            stride=(stride, 1, 1),
+            dilation=(dilation[2], 1, 1),
+            padding=(dilation[2], 0, 0),
+            bias=False,
+        )
+        self.bn4 = norm_layer(planes, momentum=bn_momentum)
+        self.conv5 = nn.Conv3d(
+            planes, planes * self.expansion, kernel_size=(1, 1, 1), bias=False
+        )
+        self.bn5 = norm_layer(planes * self.expansion, momentum=bn_momentum)
+        self.relu = nn.ReLU(inplace=False)
+        self.relu_inplace = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.dilation = dilation
+        self.stride = stride
+        self.downsample2 = nn.Sequential(
+            nn.AvgPool3d(kernel_size=(1, stride, 1), stride=(1, stride, 1)),
+            nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False),
+            norm_layer(planes, momentum=bn_momentum),
+        )
+        self.downsample3 = nn.Sequential(
+            nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)),
+            nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False),
+            norm_layer(planes, momentum=bn_momentum),
+        )
+        self.downsample4 = nn.Sequential(
+            nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)),
+            nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False),
+            norm_layer(planes, momentum=bn_momentum),
+        )
+    def forward(self, x):
+        residual = x
+        out1 = self.relu(self.bn1(self.conv1(x)))
+        out2 = self.bn2(self.conv2(out1))
+        out2_relu = self.relu(out2)
+        out3 = self.bn3(self.conv3(out2_relu))
+        if self.stride != 1:
+            out2 = self.downsample2(out2)
+        out3 = out3 + out2
+        out3_relu = self.relu(out3)
+        out4 = self.bn4(self.conv4(out3_relu))
+        if self.stride != 1:
+            out2 = self.downsample3(out2)
+            out3 = self.downsample4(out3)
+        out4 = out4 + out2 + out3
+        out4_relu = self.relu(out4)
+        out5 = self.bn5(self.conv5(out4_relu))
+        if self.downsample is not None:
+            residual = self.downsample(x)
+        out = out5 + residual
+        out_relu = self.relu(out)
+        return out_relu

monoscene/__init__.py ADDED Viewed

File without changes

monoscene/__pycache__/CRP3D.cpython-37.pyc ADDED Viewed

Binary file (2.34 kB). View file