diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..9d7dd240381bd4ef025168212e15f1f0b7478660
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,28 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+monoscene_kitti.ckpt filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..46769a0f0371bf73afac0874aa640b65361b0b19
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__
+.ipynb_checkpoints
+*.ckpt
+gradio*
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3a7f97e80109bd91f0ed142e6eb9aac585326881
--- /dev/null
+++ b/README.md
@@ -0,0 +1,14 @@
+---
+title: MonoScene
+emoji: 🚘🏙️
+colorFrom: purple
+colorTo: pink
+sdk: gradio
+sdk_version: 3.0.20
+app_file: app.py
+pinned: true
+license: apache-2.0
+duplicated_from: CVPR/MonoScene
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..01225a388b656d95f22d55a02c47d4d62772c3c3
--- /dev/null
+++ b/app.py
@@ -0,0 +1,126 @@
+import gradio as gr
+import numpy as np
+from torchvision import transforms
+import torch
+from helpers import *
+import sys
+import csv
+from monoscene.monoscene import MonoScene
+
+csv.field_size_limit(sys.maxsize)
+torch.set_grad_enabled(False)
+
+# pipeline = pipeline(model="anhquancao/monoscene_kitti")
+# model = AutoModel.from_pretrained(
+# "anhquancao/monoscene_kitti", trust_remote_code=True, revision='bf033f87c2a86b60903ab811b790a1532c1ae313'
+# )#.cuda()
+model = MonoScene.load_from_checkpoint(
+ "monoscene_kitti.ckpt",
+ dataset="kitti",
+ n_classes=20,
+    feature=64,
+    project_scale=2,
+    full_scene_size=(256, 256, 32),
+)
+
+img_W, img_H = 1220, 370
+
+
+def predict(img):
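+    # img: RGB image (H, W, 3) in [0, 255]; scaled to [0, 1] and normalized with
+    # ImageNet statistics before inference.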
+ img = np.array(img, dtype=np.float32, copy=False) / 255.0
+
+ normalize_rgb = transforms.Compose(
+ [
+ transforms.ToTensor(),
+ transforms.Normalize(
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+ ),
+ ]
+ )
+ img = normalize_rgb(img)
+
+ batch = get_projections(img_W, img_H)
+ batch["img"] = img
+ for k in batch:
+ batch[k] = batch[k].unsqueeze(0)#.cuda()
+
+ pred = model(batch).squeeze()
+ # print(pred.shape)
+ pred = majority_pooling(pred, k_size=2)
+ fig = draw(pred, batch['fov_mask_2'])
+
+
+ return fig
+
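+# Quick local sanity check (illustrative only; assumes Pillow is installed and the
+# bundled example images are present):
+#   from PIL import Image
+#   fig = predict(Image.open("images/08/000010.jpg"))
+#   fig.write_html("prediction.html")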
+
+description = """
+MonoScene demo on the SemanticKITTI validation set (Sequence 08); predictions use the camera parameters of Sequence 08.
+Because inference runs on CPU only, predicting a scene can take up to 20s. \n
+The output is downsampled by 2 for faster rendering. Darker colors mark voxels outside the camera field of view, i.e. not visible in the image.
+
+
+
+
+
+
+
+"""
+title = "MonoScene: Monocular 3D Semantic Scene Completion"
+article="""
+
+We also released a smaller MonoScene model (half resolution, without the 3D CRP) at: https://huggingface.co/spaces/CVPR/monoscene_lite
+
+
+"""
+
+examples = [
+ 'images/08/001385.jpg',
+ 'images/08/000295.jpg',
+ 'images/08/002505.jpg',
+ 'images/08/000085.jpg',
+ 'images/08/000290.jpg',
+ 'images/08/000465.jpg',
+ 'images/08/000790.jpg',
+ 'images/08/001005.jpg',
+ 'images/08/001380.jpg',
+ 'images/08/001530.jpg',
+ 'images/08/002360.jpg',
+ 'images/08/004059.jpg',
+ 'images/08/003149.jpg',
+ 'images/08/001446.jpg',
+ 'images/08/000010.jpg',
+ 'images/08/001122.jpg',
+ 'images/08/003533.jpg',
+ 'images/08/003365.jpg',
+ 'images/08/002944.jpg',
+ 'images/08/000822.jpg',
+ 'images/08/000103.jpg',
+ 'images/08/002716.jpg',
+ 'images/08/000187.jpg',
+ 'images/08/002128.jpg',
+ 'images/08/000511.jpg',
+ 'images/08/000618.jpg',
+ 'images/08/002010.jpg',
+ 'images/08/000234.jpg',
+ 'images/08/001842.jpg',
+ 'images/08/001687.jpg',
+ 'images/08/003929.jpg',
+ 'images/08/002272.jpg',
+]
+
+
+
+demo = gr.Interface(
+ predict,
+ gr.Image(shape=(1220, 370)),
+ gr.Plot(),
+ article=article,
+ title=title,
+ enable_queue=True,
+ cache_examples=False,
+ live=False,
+ examples=examples,
+ description=description)
+
+
+demo.launch(enable_queue=True, debug=False)
\ No newline at end of file
diff --git a/calib.txt b/calib.txt
new file mode 100644
index 0000000000000000000000000000000000000000..793946dabbfa14421b0ab261d69fca372137b76e
--- /dev/null
+++ b/calib.txt
@@ -0,0 +1,5 @@
+P0: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 0.000000000000e+00 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.861448000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 4.538225000000e+01 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 -1.130887000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 3.779761000000e-03
+P3: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.372877000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 2.369057000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 4.915215000000e-03
+Tr: 4.276802385584e-04 -9.999672484946e-01 -8.084491683471e-03 -1.198459927713e-02 -7.210626507497e-03 8.081198471645e-03 -9.999413164504e-01 -5.403984729748e-02 9.999738645903e-01 4.859485810390e-04 -7.206933692422e-03 -2.921968648686e-01
diff --git a/fusion.py b/fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..aecd5cba3b1e3dd1e0534cda347eca8956657926
--- /dev/null
+++ b/fusion.py
@@ -0,0 +1,507 @@
+"""
+Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py
+
+@inproceedings{zeng20163dmatch,
+ title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions},
+ author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas},
+ booktitle={CVPR},
+ year={2017}
+}
+"""
+
+import numpy as np
+
+from numba import njit, prange
+from skimage import measure
+
+FUSION_GPU_MODE = 0  # GPU path disabled: pycuda (cuda, SourceModule) is not imported in this CPU-only build
+
+
+class TSDFVolume:
+ """Volumetric TSDF Fusion of RGB-D Images."""
+
+ def __init__(self, vol_bnds, voxel_size, use_gpu=True):
+ """Constructor.
+
+ Args:
+ vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the
+ xyz bounds (min/max) in meters.
+ voxel_size (float): The volume discretization in meters.
+ """
+ vol_bnds = np.asarray(vol_bnds)
+ assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)."
+
+ # Define voxel volume parameters
+ self._vol_bnds = vol_bnds
+ self._voxel_size = float(voxel_size)
+ self._trunc_margin = 5 * self._voxel_size # truncation on SDF
+ # self._trunc_margin = 10 # truncation on SDF
+ self._color_const = 256 * 256
+
+ # Adjust volume bounds and ensure C-order contiguous
+ self._vol_dim = (
+ np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size)
+ .copy(order="C")
+ .astype(int)
+ )
+ self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
+ self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32)
+
+ print(
+ "Voxel volume size: {} x {} x {} - # points: {:,}".format(
+ self._vol_dim[0],
+ self._vol_dim[1],
+ self._vol_dim[2],
+ self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2],
+ )
+ )
+
+ # Initialize pointers to voxel volume in CPU memory
+ self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
+ # for computing the cumulative moving average of observations per voxel
+ self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
+ self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
+
+ self.gpu_mode = use_gpu and FUSION_GPU_MODE
+
+ # Copy voxel volumes to GPU
+ if self.gpu_mode:
+ self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
+ cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
+ self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
+ cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
+ self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
+ cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)
+
+ # Cuda kernel function (C++)
+ self._cuda_src_mod = SourceModule(
+ """
+ __global__ void integrate(float * tsdf_vol,
+ float * weight_vol,
+ float * color_vol,
+ float * vol_dim,
+ float * vol_origin,
+ float * cam_intr,
+ float * cam_pose,
+ float * other_params,
+ float * color_im,
+ float * depth_im) {
+ // Get voxel index
+ int gpu_loop_idx = (int) other_params[0];
+ int max_threads_per_block = blockDim.x;
+ int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
+ int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x;
+ int vol_dim_x = (int) vol_dim[0];
+ int vol_dim_y = (int) vol_dim[1];
+ int vol_dim_z = (int) vol_dim[2];
+ if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z)
+ return;
+ // Get voxel grid coordinates (note: be careful when casting)
+ float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z)));
+ float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z));
+ float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z);
+ // Voxel grid coordinates to world coordinates
+ float voxel_size = other_params[1];
+ float pt_x = vol_origin[0]+voxel_x*voxel_size;
+ float pt_y = vol_origin[1]+voxel_y*voxel_size;
+ float pt_z = vol_origin[2]+voxel_z*voxel_size;
+ // World coordinates to camera coordinates
+ float tmp_pt_x = pt_x-cam_pose[0*4+3];
+ float tmp_pt_y = pt_y-cam_pose[1*4+3];
+ float tmp_pt_z = pt_z-cam_pose[2*4+3];
+ float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z;
+ float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z;
+ float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z;
+ // Camera coordinates to image pixels
+ int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]);
+ int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]);
+ // Skip if outside view frustum
+ int im_h = (int) other_params[2];
+ int im_w = (int) other_params[3];
+ if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0)
+ return;
+ // Skip invalid depth
+ float depth_value = depth_im[pixel_y*im_w+pixel_x];
+ if (depth_value == 0)
+ return;
+ // Integrate TSDF
+ float trunc_margin = other_params[4];
+ float depth_diff = depth_value-cam_pt_z;
+ if (depth_diff < -trunc_margin)
+ return;
+ float dist = fmin(1.0f,depth_diff/trunc_margin);
+ float w_old = weight_vol[voxel_idx];
+ float obs_weight = other_params[5];
+ float w_new = w_old + obs_weight;
+ weight_vol[voxel_idx] = w_new;
+ tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new;
+ // Integrate color
+ float old_color = color_vol[voxel_idx];
+ float old_b = floorf(old_color/(256*256));
+ float old_g = floorf((old_color-old_b*256*256)/256);
+ float old_r = old_color-old_b*256*256-old_g*256;
+ float new_color = color_im[pixel_y*im_w+pixel_x];
+ float new_b = floorf(new_color/(256*256));
+ float new_g = floorf((new_color-new_b*256*256)/256);
+ float new_r = new_color-new_b*256*256-new_g*256;
+ new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f);
+ new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f);
+ new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f);
+ color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r;
+ }"""
+ )
+
+ self._cuda_integrate = self._cuda_src_mod.get_function("integrate")
+
+ # Determine block/grid size on GPU
+ gpu_dev = cuda.Device(0)
+ self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
+ n_blocks = int(
+ np.ceil(
+ float(np.prod(self._vol_dim))
+ / float(self._max_gpu_threads_per_block)
+ )
+ )
+ grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
+ grid_dim_y = min(
+ gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))
+ )
+ grid_dim_z = min(
+ gpu_dev.MAX_GRID_DIM_Z,
+ int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))),
+ )
+ self._max_gpu_grid_dim = np.array(
+ [grid_dim_x, grid_dim_y, grid_dim_z]
+ ).astype(int)
+ self._n_gpu_loops = int(
+ np.ceil(
+ float(np.prod(self._vol_dim))
+ / float(
+ np.prod(self._max_gpu_grid_dim)
+ * self._max_gpu_threads_per_block
+ )
+ )
+ )
+
+ else:
+ # Get voxel grid coordinates
+ xv, yv, zv = np.meshgrid(
+ range(self._vol_dim[0]),
+ range(self._vol_dim[1]),
+ range(self._vol_dim[2]),
+ indexing="ij",
+ )
+ self.vox_coords = (
+ np.concatenate(
+ [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0
+ )
+ .astype(int)
+ .T
+ )
+
+ @staticmethod
+ @njit(parallel=True)
+ def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)):
+ """Convert voxel grid coordinates to world coordinates."""
+ vol_origin = vol_origin.astype(np.float32)
+ vox_coords = vox_coords.astype(np.float32)
+ # print(np.min(vox_coords))
+ cam_pts = np.empty_like(vox_coords, dtype=np.float32)
+
+ for i in prange(vox_coords.shape[0]):
+ for j in range(3):
+ cam_pts[i, j] = (
+ vol_origin[j]
+ + (vox_size * vox_coords[i, j])
+ + vox_size * offsets[j]
+ )
+ return cam_pts
+
+ @staticmethod
+ @njit(parallel=True)
+ def cam2pix(cam_pts, intr):
+ """Convert camera coordinates to pixel coordinates."""
+ intr = intr.astype(np.float32)
+ fx, fy = intr[0, 0], intr[1, 1]
+ cx, cy = intr[0, 2], intr[1, 2]
+ pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64)
+ for i in prange(cam_pts.shape[0]):
+ pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx))
+ pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy))
+ return pix
+
+ @staticmethod
+ @njit(parallel=True)
+ def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight):
+ """Integrate the TSDF volume."""
+ tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32)
+ # print(tsdf_vol.shape)
+ w_new = np.empty_like(w_old, dtype=np.float32)
+ for i in prange(len(tsdf_vol)):
+ w_new[i] = w_old[i] + obs_weight
+ tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i]
+ return tsdf_vol_int, w_new
+
+ def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0):
+ """Integrate an RGB-D frame into the TSDF volume.
+
+ Args:
+ color_im (ndarray): An RGB image of shape (H, W, 3).
+ depth_im (ndarray): A depth image of shape (H, W).
+ cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3).
+ cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4).
+            obs_weight (float): The weight to assign to the current observation. A higher
+                value gives this observation more influence in the running average.
+        """
+ im_h, im_w = depth_im.shape
+
+ # Fold RGB color image into a single channel image
+ color_im = color_im.astype(np.float32)
+ color_im = np.floor(
+ color_im[..., 2] * self._color_const
+ + color_im[..., 1] * 256
+ + color_im[..., 0]
+ )
+
+ if self.gpu_mode: # GPU mode: integrate voxel volume (calls CUDA kernel)
+ for gpu_loop_idx in range(self._n_gpu_loops):
+ self._cuda_integrate(
+ self._tsdf_vol_gpu,
+ self._weight_vol_gpu,
+ self._color_vol_gpu,
+ cuda.InOut(self._vol_dim.astype(np.float32)),
+ cuda.InOut(self._vol_origin.astype(np.float32)),
+ cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
+ cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
+ cuda.InOut(
+ np.asarray(
+ [
+ gpu_loop_idx,
+ self._voxel_size,
+ im_h,
+ im_w,
+ self._trunc_margin,
+ obs_weight,
+ ],
+ np.float32,
+ )
+ ),
+ cuda.InOut(color_im.reshape(-1).astype(np.float32)),
+ cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
+ block=(self._max_gpu_threads_per_block, 1, 1),
+ grid=(
+ int(self._max_gpu_grid_dim[0]),
+ int(self._max_gpu_grid_dim[1]),
+ int(self._max_gpu_grid_dim[2]),
+ ),
+ )
+ else: # CPU mode: integrate voxel volume (vectorized implementation)
+ # Convert voxel grid coordinates to pixel coordinates
+ cam_pts = self.vox2world(
+ self._vol_origin, self.vox_coords, self._voxel_size
+ )
+ cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose))
+ pix_z = cam_pts[:, 2]
+ pix = self.cam2pix(cam_pts, cam_intr)
+ pix_x, pix_y = pix[:, 0], pix[:, 1]
+
+ # Eliminate pixels outside view frustum
+ valid_pix = np.logical_and(
+ pix_x >= 0,
+ np.logical_and(
+ pix_x < im_w,
+ np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)),
+ ),
+ )
+ depth_val = np.zeros(pix_x.shape)
+ depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]
+
+ # Integrate TSDF
+ depth_diff = depth_val - pix_z
+
+ valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10)
+ dist = depth_diff
+
+ valid_vox_x = self.vox_coords[valid_pts, 0]
+ valid_vox_y = self.vox_coords[valid_pts, 1]
+ valid_vox_z = self.vox_coords[valid_pts, 2]
+ w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
+ tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
+ valid_dist = dist[valid_pts]
+ tsdf_vol_new, w_new = self.integrate_tsdf(
+ tsdf_vals, valid_dist, w_old, obs_weight
+ )
+ self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new
+ self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new
+
+ # Integrate color
+ old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
+ old_b = np.floor(old_color / self._color_const)
+ old_g = np.floor((old_color - old_b * self._color_const) / 256)
+ old_r = old_color - old_b * self._color_const - old_g * 256
+ new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]]
+ new_b = np.floor(new_color / self._color_const)
+ new_g = np.floor((new_color - new_b * self._color_const) / 256)
+ new_r = new_color - new_b * self._color_const - new_g * 256
+ new_b = np.minimum(
+ 255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new)
+ )
+ new_g = np.minimum(
+ 255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new)
+ )
+ new_r = np.minimum(
+ 255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new)
+ )
+ self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = (
+ new_b * self._color_const + new_g * 256 + new_r
+ )
+
+ def get_volume(self):
+ if self.gpu_mode:
+ cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
+ cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
+ return self._tsdf_vol_cpu, self._color_vol_cpu
+
+ def get_point_cloud(self):
+ """Extract a point cloud from the voxel volume."""
+ tsdf_vol, color_vol = self.get_volume()
+
+ # Marching cubes
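+        # NOTE: measure.marching_cubes_lewiner is unavailable in newer scikit-image
+        # releases; there, measure.marching_cubes(tsdf_vol, level=0) is the equivalent call.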
+ verts = measure.marching_cubes_lewiner(tsdf_vol, level=0)[0]
+ verts_ind = np.round(verts).astype(int)
+ verts = verts * self._voxel_size + self._vol_origin
+
+ # Get vertex colors
+ rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
+ colors_b = np.floor(rgb_vals / self._color_const)
+ colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
+ colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
+ colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
+ colors = colors.astype(np.uint8)
+
+ pc = np.hstack([verts, colors])
+ return pc
+
+ def get_mesh(self):
+ """Compute a mesh from the voxel volume using marching cubes."""
+ tsdf_vol, color_vol = self.get_volume()
+
+ # Marching cubes
+ verts, faces, norms, vals = measure.marching_cubes_lewiner(tsdf_vol, level=0)
+ verts_ind = np.round(verts).astype(int)
+ verts = (
+ verts * self._voxel_size + self._vol_origin
+ ) # voxel grid coordinates to world coordinates
+
+ # Get vertex colors
+ rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
+ colors_b = np.floor(rgb_vals / self._color_const)
+ colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
+ colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
+ colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
+ colors = colors.astype(np.uint8)
+ return verts, faces, norms, colors
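+
+# Usage sketch (illustrative only; color_im, depth_im, cam_intr and cam_pose are
+# hypothetical placeholders, not data shipped with this repo):
+#   vol = TSDFVolume(np.array([[0., 5.], [0., 5.], [0., 3.]]), voxel_size=0.05, use_gpu=False)
+#   vol.integrate(color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0)
+#   verts, faces, norms, colors = vol.get_mesh()
+#   meshwrite("mesh.ply", verts, faces, norms, colors)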
+
+
+def rigid_transform(xyz, transform):
+ """Applies a rigid transform to an (N, 3) pointcloud."""
+ xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)])
+ xyz_t_h = np.dot(transform, xyz_h.T).T
+ return xyz_t_h[:, :3]
+
+
+def get_view_frustum(depth_im, cam_intr, cam_pose):
+ """Get corners of 3D camera view frustum of depth image"""
+ im_h = depth_im.shape[0]
+ im_w = depth_im.shape[1]
+ max_depth = np.max(depth_im)
+ view_frust_pts = np.array(
+ [
+ (np.array([0, 0, 0, im_w, im_w]) - cam_intr[0, 2])
+ * np.array([0, max_depth, max_depth, max_depth, max_depth])
+ / cam_intr[0, 0],
+ (np.array([0, 0, im_h, 0, im_h]) - cam_intr[1, 2])
+ * np.array([0, max_depth, max_depth, max_depth, max_depth])
+ / cam_intr[1, 1],
+ np.array([0, max_depth, max_depth, max_depth, max_depth]),
+ ]
+ )
+ view_frust_pts = rigid_transform(view_frust_pts.T, cam_pose).T
+ return view_frust_pts
+
+
+def meshwrite(filename, verts, faces, norms, colors):
+ """Save a 3D mesh to a polygon .ply file."""
+ # Write header
+ ply_file = open(filename, "w")
+ ply_file.write("ply\n")
+ ply_file.write("format ascii 1.0\n")
+ ply_file.write("element vertex %d\n" % (verts.shape[0]))
+ ply_file.write("property float x\n")
+ ply_file.write("property float y\n")
+ ply_file.write("property float z\n")
+ ply_file.write("property float nx\n")
+ ply_file.write("property float ny\n")
+ ply_file.write("property float nz\n")
+ ply_file.write("property uchar red\n")
+ ply_file.write("property uchar green\n")
+ ply_file.write("property uchar blue\n")
+ ply_file.write("element face %d\n" % (faces.shape[0]))
+ ply_file.write("property list uchar int vertex_index\n")
+ ply_file.write("end_header\n")
+
+ # Write vertex list
+ for i in range(verts.shape[0]):
+ ply_file.write(
+ "%f %f %f %f %f %f %d %d %d\n"
+ % (
+ verts[i, 0],
+ verts[i, 1],
+ verts[i, 2],
+ norms[i, 0],
+ norms[i, 1],
+ norms[i, 2],
+ colors[i, 0],
+ colors[i, 1],
+ colors[i, 2],
+ )
+ )
+
+ # Write face list
+ for i in range(faces.shape[0]):
+ ply_file.write("3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2]))
+
+ ply_file.close()
+
+
+def pcwrite(filename, xyzrgb):
+ """Save a point cloud to a polygon .ply file."""
+ xyz = xyzrgb[:, :3]
+ rgb = xyzrgb[:, 3:].astype(np.uint8)
+
+ # Write header
+ ply_file = open(filename, "w")
+ ply_file.write("ply\n")
+ ply_file.write("format ascii 1.0\n")
+ ply_file.write("element vertex %d\n" % (xyz.shape[0]))
+ ply_file.write("property float x\n")
+ ply_file.write("property float y\n")
+ ply_file.write("property float z\n")
+ ply_file.write("property uchar red\n")
+ ply_file.write("property uchar green\n")
+ ply_file.write("property uchar blue\n")
+ ply_file.write("end_header\n")
+
+ # Write vertex list
+ for i in range(xyz.shape[0]):
+ ply_file.write(
+ "%f %f %f %d %d %d\n"
+ % (
+ xyz[i, 0],
+ xyz[i, 1],
+ xyz[i, 2],
+ rgb[i, 0],
+ rgb[i, 1],
+ rgb[i, 2],
+ )
+ )
diff --git a/helpers.py b/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0c04e38af0140bd37508becf644802b098ae2e2
--- /dev/null
+++ b/helpers.py
@@ -0,0 +1,336 @@
+import numpy as np
+import torch
+import fusion
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+
+def read_calib(calib_path):
+ """
+    Modified from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68
+ :param calib_path: Path to a calibration text file.
+ :return: dict with calibration matrices.
+ """
+ calib_all = {}
+ with open(calib_path, "r") as f:
+ for line in f.readlines():
+ if line == "\n":
+ break
+ key, value = line.split(":", 1)
+ calib_all[key] = np.array([float(x) for x in value.split()])
+
+ # reshape matrices
+ calib_out = {}
+ # 3x4 projection matrix for left camera
+ calib_out["P2"] = calib_all["P2"].reshape(3, 4)
+ calib_out["Tr"] = np.identity(4) # 4x4 matrix
+ calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4)
+ return calib_out
+
+
+def vox2pix(cam_E, cam_k,
+ vox_origin, voxel_size,
+ img_W, img_H,
+ scene_size):
+ """
+    Compute the 2D projection of voxel centroids.
+
+    Parameters
+    ----------
+    cam_E: 4x4
+        camera pose for the NYUv2 dataset;
+        transformation from camera to lidar coordinates for SemanticKITTI
+    cam_k: 3x3
+        camera intrinsics
+    vox_origin: (3,)
+        world (NYU) / lidar (SemanticKITTI) coordinates of the voxel at index (0, 0, 0)
+    voxel_size: float
+        side length of a voxel in meters
+    img_W: int
+        image width
+    img_H: int
+        image height
+    scene_size: (3,)
+        scene size in meters: (51.2, 51.2, 6.4) for SemanticKITTI and (4.8, 4.8, 2.88) for NYUv2
+
+    Returns
+    -------
+    projected_pix: (N, 2)
+        projected 2D pixel positions of the voxel centroids
+    fov_mask: (N,)
+        boolean mask indicating which voxels fall inside the image FOV
+    pix_z: (N,)
+        voxel distances to the sensor in meters
+ """
+ # Compute the x, y, z bounding of the scene in meter
+ vol_bnds = np.zeros((3,2))
+ vol_bnds[:,0] = vox_origin
+ vol_bnds[:,1] = vox_origin + np.array(scene_size)
+
+    # Compute the voxel centroids in lidar coordinates
+ vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int)
+ xv, yv, zv = np.meshgrid(
+ range(vol_dim[0]),
+ range(vol_dim[1]),
+ range(vol_dim[2]),
+ indexing='ij'
+ )
+ vox_coords = np.concatenate([
+ xv.reshape(1,-1),
+ yv.reshape(1,-1),
+ zv.reshape(1,-1)
+ ], axis=0).astype(int).T
+
+    # Project voxel centroids from lidar coordinates to camera coordinates
+ cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
+ cam_pts = fusion.rigid_transform(cam_pts, cam_E)
+
+ # Project camera coordinates to pixel positions
+ projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k)
+ pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
+
+ # Eliminate pixels outside view frustum
+ pix_z = cam_pts[:, 2]
+ fov_mask = np.logical_and(pix_x >= 0,
+ np.logical_and(pix_x < img_W,
+ np.logical_and(pix_y >= 0,
+ np.logical_and(pix_y < img_H,
+ pix_z > 0))))
+
+
+ return torch.from_numpy(projected_pix), torch.from_numpy(fov_mask), torch.from_numpy(pix_z)
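+# Example with the SemanticKITTI settings used in get_projections below:
+#   projected_pix, fov_mask, pix_z = vox2pix(
+#       T_velo_2_cam, cam_k, np.array([0, -25.6, -2]), 0.2, 1220, 370, (51.2, 51.2, 6.4))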
+
+
+
+def get_grid_coords(dims, resolution):
+ """
+ :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
+    :return coords_grid: center coordinates of the voxels in the grid
+ """
+
+ g_xx = np.arange(0, dims[0] + 1)
+ g_yy = np.arange(0, dims[1] + 1)
+ sensor_pose = 10
+ g_zz = np.arange(0, dims[2] + 1)
+
+ # Obtaining the grid with coords...
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
+ coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
+    coords_grid = coords_grid.astype(float)  # np.float was removed in NumPy 1.24+
+
+ coords_grid = (coords_grid * resolution) + resolution / 2
+
+ temp = np.copy(coords_grid)
+ temp[:, 0] = coords_grid[:, 1]
+ temp[:, 1] = coords_grid[:, 0]
+ coords_grid = np.copy(temp)
+
+ return coords_grid
+
+def get_projections(img_W, img_H):
+ scale_3ds = [1, 2]
+ data = {}
+ for scale_3d in scale_3ds:
+ scene_size = (51.2, 51.2, 6.4)
+ vox_origin = np.array([0, -25.6, -2])
+ voxel_size = 0.2
+
+ calib = read_calib("calib.txt")
+ cam_k = calib["P2"][:3, :3]
+ T_velo_2_cam = calib["Tr"]
+
+ # compute the 3D-2D mapping
+ projected_pix, fov_mask, pix_z = vox2pix(
+ T_velo_2_cam,
+ cam_k,
+ vox_origin,
+ voxel_size * scale_3d,
+ img_W,
+ img_H,
+ scene_size,
+ )
+
+ data["projected_pix_{}".format(scale_3d)] = projected_pix
+ data["pix_z_{}".format(scale_3d)] = pix_z
+ data["fov_mask_{}".format(scale_3d)] = fov_mask
+ return data
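+# The returned dict holds projected_pix_{s}, pix_z_{s} and fov_mask_{s} for s in {1, 2},
+# which are the keys consumed by MonoScene.forward and by app.predict.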
+
+
+def majority_pooling(grid, k_size=2):
+ result = np.zeros(
+ (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size)
+ )
+ for xx in range(0, int(np.floor(grid.shape[0] / k_size))):
+ for yy in range(0, int(np.floor(grid.shape[1] / k_size))):
+ for zz in range(0, int(np.floor(grid.shape[2] / k_size))):
+
+ sub_m = grid[
+ (xx * k_size) : (xx * k_size) + k_size,
+ (yy * k_size) : (yy * k_size) + k_size,
+ (zz * k_size) : (zz * k_size) + k_size,
+ ]
+ unique, counts = np.unique(sub_m, return_counts=True)
+ if True in ((unique != 0) & (unique != 255)):
+ # Remove counts with 0 and 255
+ counts = counts[((unique != 0) & (unique != 255))]
+ unique = unique[((unique != 0) & (unique != 255))]
+ else:
+ if True in (unique == 0):
+ counts = counts[(unique != 255)]
+ unique = unique[(unique != 255)]
+ value = unique[np.argmax(counts)]
+ result[xx, yy, zz] = value
+ return result
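+# e.g. the (256, 256, 32) prediction produced in app.predict, pooled with k_size=2,
+# becomes a (128, 128, 16) grid before rendering.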
+
+
+def draw(
+ voxels,
+ # T_velo_2_cam,
+ # vox_origin,
+ fov_mask,
+ # img_size,
+ # f,
+ voxel_size=0.4,
+ # d=7, # 7m - determine the size of the mesh representing the camera
+):
+
+ fov_mask = fov_mask.reshape(-1)
+ # Compute the voxels coordinates
+ grid_coords = get_grid_coords(
+ [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size
+ )
+
+
+ # Attach the predicted class to every voxel
+ grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T
+
+ # Get the voxels inside FOV
+ fov_grid_coords = grid_coords[fov_mask, :]
+
+ # Get the voxels outside FOV
+ outfov_grid_coords = grid_coords[~fov_mask, :]
+
+ # Remove empty and unknown voxels
+ fov_voxels = fov_grid_coords[
+ (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255), :
+ ]
+ # print(np.unique(fov_voxels[:, 3], return_counts=True))
+ outfov_voxels = outfov_grid_coords[
+ (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255), :
+ ]
+
+ # figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1))
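+    # Per-class RGB palette indexed by predicted label; row 0 is unused here since
+    # empty (0) and unknown (255) voxels are filtered out above.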
+ colors = np.array(
+ [
+ [0,0,0],
+ [100, 150, 245],
+ [100, 230, 245],
+ [30, 60, 150],
+ [80, 30, 180],
+ [100, 80, 250],
+ [255, 30, 30],
+ [255, 40, 200],
+ [150, 30, 90],
+ [255, 0, 255],
+ [255, 150, 255],
+ [75, 0, 75],
+ [175, 0, 75],
+ [255, 200, 0],
+ [255, 120, 50],
+ [0, 175, 0],
+ [135, 60, 0],
+ [150, 240, 80],
+ [255, 240, 150],
+ [255, 0, 0],
+ ]
+ ).astype(np.uint8)
+
+ pts_colors = [f'rgb({colors[int(i)][0]}, {colors[int(i)][1]}, {colors[int(i)][2]})' for i in fov_voxels[:, 3]]
+ out_fov_colors = [f'rgb({colors[int(i)][0]//3*2}, {colors[int(i)][1]//3*2}, {colors[int(i)][2]//3*2})' for i in outfov_voxels[:, 3]]
+ pts_colors = pts_colors + out_fov_colors
+
+ fov_voxels = np.concatenate([fov_voxels, outfov_voxels], axis=0)
+ x = fov_voxels[:, 0].flatten()
+ y = fov_voxels[:, 1].flatten()
+ z = fov_voxels[:, 2].flatten()
+ # label = fov_voxels[:, 3].flatten()
+ fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z,mode='markers',
+ marker=dict(
+ size=2,
+ color=pts_colors, # set color to an array/list of desired values
+ # colorscale='Viridis', # choose a colorscale
+ opacity=1.0,
+ symbol='square'
+ ))])
+ fig.update_layout(
+ scene = dict(
+ aspectmode='data',
+ xaxis = dict(
+ backgroundcolor="rgb(255, 255, 255)",
+ gridcolor="black",
+ showbackground=True,
+ zerolinecolor="black",
+ nticks=4,
+ visible=False,
+ range=[-1,55],),
+ yaxis = dict(
+ backgroundcolor="rgb(255, 255, 255)",
+ gridcolor="black",
+ showbackground=True,
+ zerolinecolor="black",
+ visible=False,
+ nticks=4, range=[-1,55],),
+ zaxis = dict(
+ backgroundcolor="rgb(255, 255, 255)",
+ gridcolor="black",
+ showbackground=True,
+ zerolinecolor="black",
+ visible=False,
+ nticks=4, range=[-1,7],),
+ bgcolor="black",
+ ),
+
+ )
+
+ # fig = px.scatter_3d(
+ # fov_voxels,
+ # x=fov_voxels[:, 0], y="y", z="z", color="label")
+ # Draw occupied inside FOV voxels
+ # plt_plot_fov = mlab.points3d(
+ # fov_voxels[:, 0],
+ # fov_voxels[:, 1],
+ # fov_voxels[:, 2],
+ # fov_voxels[:, 3],
+ # colormap="viridis",
+ # scale_factor=voxel_size - 0.05 * voxel_size,
+ # mode="cube",
+ # opacity=1.0,
+ # vmin=1,
+ # vmax=19,
+ # )
+
+ # # Draw occupied outside FOV voxels
+ # plt_plot_outfov = mlab.points3d(
+ # outfov_voxels[:, 0],
+ # outfov_voxels[:, 1],
+ # outfov_voxels[:, 2],
+ # outfov_voxels[:, 3],
+ # colormap="viridis",
+ # scale_factor=voxel_size - 0.05 * voxel_size,
+ # mode="cube",
+ # opacity=1.0,
+ # vmin=1,
+ # vmax=19,
+ # )
+
+
+
+ # plt_plot_fov.glyph.scale_mode = "scale_by_vector"
+ # plt_plot_outfov.glyph.scale_mode = "scale_by_vector"
+
+ # plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors
+
+ # outfov_colors = colors
+ # outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2
+ # plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors
+
+ # mlab.show()
+ return fig
\ No newline at end of file
diff --git a/images/08/000010.jpg b/images/08/000010.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dce9bd4712215f082178d79da224fedcd7d1f324
Binary files /dev/null and b/images/08/000010.jpg differ
diff --git a/images/08/000085.jpg b/images/08/000085.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..92841f53fa1c483d5537341d2052c6a6921a8c07
Binary files /dev/null and b/images/08/000085.jpg differ
diff --git a/images/08/000103.jpg b/images/08/000103.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..14460f856654a3ad525c80c9aa28a9b2e59ff7e7
Binary files /dev/null and b/images/08/000103.jpg differ
diff --git a/images/08/000187.jpg b/images/08/000187.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3f1027f581c2d1ea8cd9fb7fdce028a3db1c2105
Binary files /dev/null and b/images/08/000187.jpg differ
diff --git a/images/08/000234.jpg b/images/08/000234.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2399753ecd614b6b0d1239e22c0d422e434f28ee
Binary files /dev/null and b/images/08/000234.jpg differ
diff --git a/images/08/000290.jpg b/images/08/000290.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d09734b6689231c78216da5fdbe48c1e075c5b91
Binary files /dev/null and b/images/08/000290.jpg differ
diff --git a/images/08/000295.jpg b/images/08/000295.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9113dad82318a509db95201dbab9b6ff834ecd9d
Binary files /dev/null and b/images/08/000295.jpg differ
diff --git a/images/08/000465.jpg b/images/08/000465.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..917d465fd5b9ec6065719e88d9d0cdf39fce5823
Binary files /dev/null and b/images/08/000465.jpg differ
diff --git a/images/08/000511.jpg b/images/08/000511.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..378e6bd8023e5f643e0f9df25d0ef27e6c286498
Binary files /dev/null and b/images/08/000511.jpg differ
diff --git a/images/08/000618.jpg b/images/08/000618.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6d036b40463933033371cbd8b821d8875dff2ae0
Binary files /dev/null and b/images/08/000618.jpg differ
diff --git a/images/08/000790.jpg b/images/08/000790.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..77aea4cbaec5ec03f6cd583a77da7a7e9e4fee88
Binary files /dev/null and b/images/08/000790.jpg differ
diff --git a/images/08/000822.jpg b/images/08/000822.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0139ce00faac15151767978b4c850b957f7f9298
Binary files /dev/null and b/images/08/000822.jpg differ
diff --git a/images/08/001005.jpg b/images/08/001005.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..117d1da2374d1b1107acd6ea4fac0a11193ccde7
Binary files /dev/null and b/images/08/001005.jpg differ
diff --git a/images/08/001122.jpg b/images/08/001122.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e1d8b7d6970c781e416ff1d3decd99241f568bd4
Binary files /dev/null and b/images/08/001122.jpg differ
diff --git a/images/08/001380.jpg b/images/08/001380.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9683401a216e810cd633971365b38fefaa827061
Binary files /dev/null and b/images/08/001380.jpg differ
diff --git a/images/08/001385.jpg b/images/08/001385.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a53a048254d36d4ccab94611fef4b33188a49bb4
Binary files /dev/null and b/images/08/001385.jpg differ
diff --git a/images/08/001446.jpg b/images/08/001446.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a058286e4be7273cac49ce3e600036b758c38b5e
Binary files /dev/null and b/images/08/001446.jpg differ
diff --git a/images/08/001530.jpg b/images/08/001530.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..69deae8b12a066d3159bff13006b0c8fd3cd123c
Binary files /dev/null and b/images/08/001530.jpg differ
diff --git a/images/08/001687.jpg b/images/08/001687.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6e6eb1f062bed3357d5465b38d1409e030eba8bf
Binary files /dev/null and b/images/08/001687.jpg differ
diff --git a/images/08/001842.jpg b/images/08/001842.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..482996d026e61f51f1ecf860395cf37a95efa833
Binary files /dev/null and b/images/08/001842.jpg differ
diff --git a/images/08/002010.jpg b/images/08/002010.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4bf5de984449de372b6ae8fe57bb7cf6dfa12d95
Binary files /dev/null and b/images/08/002010.jpg differ
diff --git a/images/08/002128.jpg b/images/08/002128.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2d0b1a7b0cff4997b202ed1cb322285aee208de2
Binary files /dev/null and b/images/08/002128.jpg differ
diff --git a/images/08/002272.jpg b/images/08/002272.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e0d2ebc9427cf5dc313f0d0c42b3e4301b328535
Binary files /dev/null and b/images/08/002272.jpg differ
diff --git a/images/08/002360.jpg b/images/08/002360.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..566632a905ecd04ae94b0ac599e56673ef92b92e
Binary files /dev/null and b/images/08/002360.jpg differ
diff --git a/images/08/002505.jpg b/images/08/002505.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..77b8cc2fb0fdc13604771274750bccc6ca05eae1
Binary files /dev/null and b/images/08/002505.jpg differ
diff --git a/images/08/002716.jpg b/images/08/002716.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5385ac65b015c458028b94d76d86a26cded7b203
Binary files /dev/null and b/images/08/002716.jpg differ
diff --git a/images/08/002944.jpg b/images/08/002944.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6ad1e8a50cab8643acc1b7f238d16aeeb55611bf
Binary files /dev/null and b/images/08/002944.jpg differ
diff --git a/images/08/003149.jpg b/images/08/003149.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bab4c49d2515086351188e69af2cdd5db5e32fce
Binary files /dev/null and b/images/08/003149.jpg differ
diff --git a/images/08/003365.jpg b/images/08/003365.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5d77465687501909179664683b2e0432ab5dedf1
Binary files /dev/null and b/images/08/003365.jpg differ
diff --git a/images/08/003533.jpg b/images/08/003533.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4b963deee51da8231a4be14854c571c1b1a822d0
Binary files /dev/null and b/images/08/003533.jpg differ
diff --git a/images/08/003790.jpg b/images/08/003790.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0e9ea9359144d5f6479a855dbc9d82d8bb527097
Binary files /dev/null and b/images/08/003790.jpg differ
diff --git a/images/08/003929.jpg b/images/08/003929.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..74cab6e5f72cdaa1b9badc124f33e19d2bef1540
Binary files /dev/null and b/images/08/003929.jpg differ
diff --git a/images/08/004059.jpg b/images/08/004059.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1620e3db907758dd12b4b4dc6ac7853916916540
Binary files /dev/null and b/images/08/004059.jpg differ
diff --git a/monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py b/monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..c88b7b309e6fe66f597cafe2a5eb8c6d29343b7e
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py
@@ -0,0 +1,97 @@
+import torch
+import torch.nn as nn
+from monoscene.modules import (
+ Process,
+ ASPP,
+)
+
+
+class CPMegaVoxels(nn.Module):
+ def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003):
+ super().__init__()
+ self.size = size
+ self.n_relations = n_relations
+ print("n_relations", self.n_relations)
+ self.flatten_size = size[0] * size[1] * size[2]
+ self.feature = feature
+ self.context_feature = feature * 2
+ self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+ padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2)
+
+ self.mega_context = nn.Sequential(
+ nn.Conv3d(
+ feature, self.context_feature, stride=2, padding=padding, kernel_size=3
+ ),
+ )
+ self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+
+ self.context_prior_logits = nn.ModuleList(
+ [
+ nn.Sequential(
+ nn.Conv3d(
+ self.feature,
+ self.flatten_context_size,
+ padding=0,
+ kernel_size=1,
+ ),
+ )
+ for i in range(n_relations)
+ ]
+ )
+ self.aspp = ASPP(feature, [1, 2, 3])
+
+ self.resize = nn.Sequential(
+ nn.Conv3d(
+ self.context_feature * self.n_relations + feature,
+ feature,
+ kernel_size=1,
+ padding=0,
+ bias=False,
+ ),
+ Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]),
+ )
+
+ def forward(self, input):
+ ret = {}
+ bs = input.shape[0]
+
+ x_agg = self.aspp(input)
+
+ # get the mega context
+ x_mega_context_raw = self.mega_context(x_agg)
+ x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1)
+ x_mega_context = x_mega_context.permute(0, 2, 1)
+
+ # get context prior map
+ x_context_prior_logits = []
+ x_context_rels = []
+ for rel in range(self.n_relations):
+
+ # Compute the relation matrices
+ x_context_prior_logit = self.context_prior_logits[rel](x_agg)
+ x_context_prior_logit = x_context_prior_logit.reshape(
+ bs, self.flatten_context_size, self.flatten_size
+ )
+ x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1))
+
+ x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1)
+ x_context_prior = torch.sigmoid(x_context_prior_logit)
+
+ # Multiply the relation matrices with the mega context to gather context features
+ x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f
+ x_context_rels.append(x_context_rel)
+
+ x_context = torch.cat(x_context_rels, dim=2)
+ x_context = x_context.permute(0, 2, 1)
+ x_context = x_context.reshape(
+ bs, x_context.shape[1], self.size[0], self.size[1], self.size[2]
+ )
+
+ x = torch.cat([input, x_context], dim=1)
+ x = self.resize(x)
+
+ x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1)
+ ret["P_logits"] = x_context_prior_logits
+ ret["x"] = x
+
+ return ret
diff --git a/monoscene/.ipynb_checkpoints/config-checkpoint.py b/monoscene/.ipynb_checkpoints/config-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cb935d3d3a41c8973e72210323205607aff2dc5
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/config-checkpoint.py
@@ -0,0 +1,34 @@
+from transformers import PretrainedConfig
+from typing import List
+
+
+class MonoSceneConfig(PretrainedConfig):
+
+ def __init__(
+ self,
+ block_type="bottleneck",
+ layers: List[int] = [3, 4, 6, 3],
+ num_classes: int = 1000,
+ input_channels: int = 3,
+ cardinality: int = 1,
+ base_width: int = 64,
+ stem_width: int = 64,
+ stem_type: str = "",
+ avg_down: bool = False,
+ **kwargs,
+ ):
+ self.block_type = block_type
+ self.layers = layers
+ self.num_classes = num_classes
+ self.input_channels = input_channels
+ self.cardinality = cardinality
+ self.base_width = base_width
+ self.stem_width = stem_width
+ self.stem_type = stem_type
+ self.avg_down = avg_down
+ super().__init__(**kwargs)
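+        # NOTE: these fields mirror the Hugging Face custom-model (ResNet) template; the
+        # MonoScene hyperparameters read by MonoSceneModel (dataset, n_classes, feature,
+        # project_scale, full_scene_size) would be supplied through **kwargs.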
+
+
+
+
+
diff --git a/monoscene/.ipynb_checkpoints/modules-checkpoint.py b/monoscene/.ipynb_checkpoints/modules-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e8bf875ccd6dffb51bb5acb25f0302fe0032d6c
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/modules-checkpoint.py
@@ -0,0 +1,194 @@
+import torch
+import torch.nn as nn
+from monoscene.DDR import Bottleneck3D
+
+
+class ASPP(nn.Module):
+ """
+ ASPP 3D
+ Adapt from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+ """
+
+ def __init__(self, planes, dilations_conv_list):
+ super().__init__()
+
+ # ASPP Block
+ self.conv_list = dilations_conv_list
+ self.conv1 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn1 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.conv2 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn2 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.relu = nn.ReLU()
+
+ def forward(self, x_in):
+
+ y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+ for i in range(1, len(self.conv_list)):
+ y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+ x_in = self.relu(y + x_in) # modified
+
+ return x_in
+
+
+class SegmentationHead(nn.Module):
+ """
+ 3D Segmentation heads to retrieve semantic segmentation at each scale.
+ Formed by Dim expansion, Conv3D, ASPP block, Conv3D.
+ Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+ """
+
+ def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list):
+ super().__init__()
+
+ # First convolution
+ self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1)
+
+ # ASPP Block
+ self.conv_list = dilations_conv_list
+ self.conv1 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn1 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.conv2 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn2 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.relu = nn.ReLU()
+
+ self.conv_classes = nn.Conv3d(
+ planes, nbr_classes, kernel_size=3, padding=1, stride=1
+ )
+
+ def forward(self, x_in):
+
+ # Convolution to go from inplanes to planes features...
+ x_in = self.relu(self.conv0(x_in))
+
+ y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+ for i in range(1, len(self.conv_list)):
+ y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+ x_in = self.relu(y + x_in) # modified
+
+ x_in = self.conv_classes(x_in)
+
+ return x_in
+
+
+class ProcessKitti(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+        super(ProcessKitti, self).__init__()
+ self.main = nn.Sequential(
+ *[
+ Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ norm_layer=norm_layer,
+ dilation=[i, i, i],
+ )
+ for i in dilations
+ ]
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
+class Process(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+ super(Process, self).__init__()
+ self.main = nn.Sequential(
+ *[
+ Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ norm_layer=norm_layer,
+ dilation=[i, i, i],
+ )
+ for i in dilations
+ ]
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
+class Upsample(nn.Module):
+ def __init__(self, in_channels, out_channels, norm_layer, bn_momentum):
+ super(Upsample, self).__init__()
+ self.main = nn.Sequential(
+ nn.ConvTranspose3d(
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ dilation=1,
+ output_padding=1,
+ ),
+ norm_layer(out_channels, momentum=bn_momentum),
+ nn.ReLU(),
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
+class Downsample(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, expansion=8):
+ super(Downsample, self).__init__()
+ self.main = Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ expansion=expansion,
+ stride=2,
+ downsample=nn.Sequential(
+ nn.AvgPool3d(kernel_size=2, stride=2),
+ nn.Conv3d(
+ feature,
+ int(feature * expansion / 4),
+ kernel_size=1,
+ stride=1,
+ bias=False,
+ ),
+ norm_layer(int(feature * expansion / 4), momentum=bn_momentum),
+ ),
+ norm_layer=norm_layer,
+ )
+
+ def forward(self, x):
+ return self.main(x)
diff --git a/monoscene/.ipynb_checkpoints/monoscene-checkpoint.py b/monoscene/.ipynb_checkpoints/monoscene-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc4d020729b6698887055771439f87a491572bd1
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/monoscene-checkpoint.py
@@ -0,0 +1,123 @@
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
+from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
+from monoscene.flosp import FLoSP
+import numpy as np
+import torch.nn.functional as F
+from monoscene.unet2d import UNet2D
+
+
+class MonoScene(pl.LightningModule):
+ def __init__(
+ self,
+ n_classes,
+ feature,
+ project_scale,
+ full_scene_size,
+ dataset,
+ n_relations=4,
+ context_prior=True,
+ fp_loss=True,
+ project_res=[],
+ frustum_size=4,
+ relation_loss=False,
+ CE_ssc_loss=True,
+ geo_scal_loss=True,
+ sem_scal_loss=True,
+ lr=1e-4,
+ weight_decay=1e-4,
+ ):
+ super().__init__()
+
+ self.project_res = project_res
+ self.fp_loss = fp_loss
+ self.dataset = dataset
+ self.context_prior = context_prior
+ self.frustum_size = frustum_size
+ self.relation_loss = relation_loss
+ self.CE_ssc_loss = CE_ssc_loss
+ self.sem_scal_loss = sem_scal_loss
+ self.geo_scal_loss = geo_scal_loss
+ self.project_scale = project_scale
+ self.lr = lr
+ self.weight_decay = weight_decay
+
+ self.projects = {}
+ self.scale_2ds = [1, 2, 4, 8] # 2D scales
+ for scale_2d in self.scale_2ds:
+ self.projects[str(scale_2d)] = FLoSP(
+ full_scene_size, project_scale=self.project_scale, dataset=self.dataset
+ )
+ self.projects = nn.ModuleDict(self.projects)
+
+ self.n_classes = n_classes
+ if self.dataset == "NYU":
+ self.net_3d_decoder = UNet3DNYU(
+ self.n_classes,
+ nn.BatchNorm3d,
+ n_relations=n_relations,
+ feature=feature,
+ full_scene_size=full_scene_size,
+ context_prior=context_prior,
+ )
+ elif self.dataset == "kitti":
+ self.net_3d_decoder = UNet3DKitti(
+ self.n_classes,
+ nn.BatchNorm3d,
+ project_scale=project_scale,
+ feature=feature,
+ full_scene_size=full_scene_size,
+ context_prior=context_prior,
+ )
+ self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
+
+ def forward(self, batch):
+
+ img = batch["img"]
+ bs = len(img)
+
+ out = {}
+
+ x_rgb = self.net_rgb(img)
+
+ x3ds = []
+ for i in range(bs):
+ x3d = None
+ for scale_2d in self.project_res:
+
+ # project features at each 2D scale to target 3D scale
+ scale_2d = int(scale_2d)
+ projected_pix = batch["projected_pix_{}".format(self.project_scale)][i].cuda()
+ fov_mask = batch["fov_mask_{}".format(self.project_scale)][i].cuda()
+
+ # Sum all the 3D features
+ if x3d is None:
+ x3d = self.projects[str(scale_2d)](
+ x_rgb["1_" + str(scale_2d)][i],
+ projected_pix // scale_2d,
+ fov_mask,
+ )
+ else:
+ x3d += self.projects[str(scale_2d)](
+ x_rgb["1_" + str(scale_2d)][i],
+ projected_pix // scale_2d,
+ fov_mask,
+ )
+ x3ds.append(x3d)
+
+ input_dict = {
+ "x3d": torch.stack(x3ds),
+ }
+
+ out_dict = self.net_3d_decoder(input_dict)
+
+ ssc_pred = out_dict["ssc_logit"]
+
+ y_pred = ssc_pred.detach().cpu().numpy()
+ y_pred = np.argmax(y_pred, axis=1)
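+        # With the configuration used in app.py (kitti, full_scene_size=(256, 256, 32),
+        # n_classes=20), y_pred has shape (batch, 256, 256, 32) with class indices in [0, 19].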
+
+ return y_pred
+
+
diff --git a/monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py b/monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf3d80ea531ff02b3229b862b7a4cd0aec8ec58
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py
@@ -0,0 +1,22 @@
+from transformers import PreTrainedModel
+from .config import MonoSceneConfig
+from monoscene.monoscene import MonoScene
+
+
+
+class MonoSceneModel(PreTrainedModel):
+    config_class = MonoSceneConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = MonoScene(
+ dataset=config.dataset,
+ n_classes=config.n_classes,
+ feature=config.feature,
+ project_scale=config.project_scale,
+ full_scene_size=config.full_scene_size
+ )
+
+
+ def forward(self, tensor):
+ return self.model.forward(tensor)
\ No newline at end of file
diff --git a/monoscene/CRP3D.py b/monoscene/CRP3D.py
new file mode 100644
index 0000000000000000000000000000000000000000..c88b7b309e6fe66f597cafe2a5eb8c6d29343b7e
--- /dev/null
+++ b/monoscene/CRP3D.py
@@ -0,0 +1,97 @@
+import torch
+import torch.nn as nn
+from monoscene.modules import (
+ Process,
+ ASPP,
+)
+
+
+class CPMegaVoxels(nn.Module):
+ def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003):
+ super().__init__()
+ self.size = size
+ self.n_relations = n_relations
+ print("n_relations", self.n_relations)
+ self.flatten_size = size[0] * size[1] * size[2]
+ self.feature = feature
+ self.context_feature = feature * 2
+ self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
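+        # pad even-sized axes by 1 so the stride-2 conv below outputs exactly
+        # size // 2 voxels per dimension, matching flatten_context_size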
+ padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2)
+
+ self.mega_context = nn.Sequential(
+ nn.Conv3d(
+ feature, self.context_feature, stride=2, padding=padding, kernel_size=3
+ ),
+ )
+ self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+
+ self.context_prior_logits = nn.ModuleList(
+ [
+ nn.Sequential(
+ nn.Conv3d(
+ self.feature,
+ self.flatten_context_size,
+ padding=0,
+ kernel_size=1,
+ ),
+ )
+ for i in range(n_relations)
+ ]
+ )
+ self.aspp = ASPP(feature, [1, 2, 3])
+
+ self.resize = nn.Sequential(
+ nn.Conv3d(
+ self.context_feature * self.n_relations + feature,
+ feature,
+ kernel_size=1,
+ padding=0,
+ bias=False,
+ ),
+ Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]),
+ )
+
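+    # Aggregate with ASPP, downsample to "mega context" voxels, predict one relation
+    # matrix per head, then gather context features back onto every voxel.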
+ def forward(self, input):
+ ret = {}
+ bs = input.shape[0]
+
+ x_agg = self.aspp(input)
+
+ # get the mega context
+ x_mega_context_raw = self.mega_context(x_agg)
+ x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1)
+ x_mega_context = x_mega_context.permute(0, 2, 1)
+
+ # get context prior map
+ x_context_prior_logits = []
+ x_context_rels = []
+ for rel in range(self.n_relations):
+
+ # Compute the relation matrices
+ x_context_prior_logit = self.context_prior_logits[rel](x_agg)
+ x_context_prior_logit = x_context_prior_logit.reshape(
+ bs, self.flatten_context_size, self.flatten_size
+ )
+ x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1))
+
+ x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1)
+ x_context_prior = torch.sigmoid(x_context_prior_logit)
+
+ # Multiply the relation matrices with the mega context to gather context features
+ x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f
+ x_context_rels.append(x_context_rel)
+
+ x_context = torch.cat(x_context_rels, dim=2)
+ x_context = x_context.permute(0, 2, 1)
+ x_context = x_context.reshape(
+ bs, x_context.shape[1], self.size[0], self.size[1], self.size[2]
+ )
+
+ x = torch.cat([input, x_context], dim=1)
+ x = self.resize(x)
+
+ x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1)
+ ret["P_logits"] = x_context_prior_logits
+ ret["x"] = x
+
+ return ret
diff --git a/monoscene/DDR.py b/monoscene/DDR.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d0928c0741433dc24523a2c26bfad9ef1ff920e
--- /dev/null
+++ b/monoscene/DDR.py
@@ -0,0 +1,139 @@
+"""
+Most of the code in this file is taken from https://github.com/waterljwant/SSC/blob/master/models/DDR.py
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class SimpleRB(nn.Module):
+ def __init__(self, in_channel, norm_layer, bn_momentum):
+ super(SimpleRB, self).__init__()
+ self.path = nn.Sequential(
+ nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False),
+ norm_layer(in_channel, momentum=bn_momentum),
+ nn.ReLU(),
+ nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False),
+ norm_layer(in_channel, momentum=bn_momentum),
+ )
+ self.relu = nn.ReLU()
+
+ def forward(self, x):
+ residual = x
+ conv_path = self.path(x)
+ out = residual + conv_path
+ out = self.relu(out)
+ return out
+
+
+"""
+3D Residual Block: a 3x3x3 conv decomposed into three smaller 3D convs, adapted from DDRNet
+"""
+
+
+class Bottleneck3D(nn.Module):
+ def __init__(
+ self,
+ inplanes,
+ planes,
+ norm_layer,
+ stride=1,
+ dilation=[1, 1, 1],
+ expansion=4,
+ downsample=None,
+ fist_dilation=1,
+ multi_grid=1,
+ bn_momentum=0.0003,
+ ):
+ super(Bottleneck3D, self).__init__()
+        # often, planes = inplanes // 4
+ self.expansion = expansion
+ self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
+ self.bn1 = norm_layer(planes, momentum=bn_momentum)
+ self.conv2 = nn.Conv3d(
+ planes,
+ planes,
+ kernel_size=(1, 1, 3),
+ stride=(1, 1, stride),
+ dilation=(1, 1, dilation[0]),
+ padding=(0, 0, dilation[0]),
+ bias=False,
+ )
+ self.bn2 = norm_layer(planes, momentum=bn_momentum)
+ self.conv3 = nn.Conv3d(
+ planes,
+ planes,
+ kernel_size=(1, 3, 1),
+ stride=(1, stride, 1),
+ dilation=(1, dilation[1], 1),
+ padding=(0, dilation[1], 0),
+ bias=False,
+ )
+ self.bn3 = norm_layer(planes, momentum=bn_momentum)
+ self.conv4 = nn.Conv3d(
+ planes,
+ planes,
+ kernel_size=(3, 1, 1),
+ stride=(stride, 1, 1),
+ dilation=(dilation[2], 1, 1),
+ padding=(dilation[2], 0, 0),
+ bias=False,
+ )
+ self.bn4 = norm_layer(planes, momentum=bn_momentum)
+ self.conv5 = nn.Conv3d(
+ planes, planes * self.expansion, kernel_size=(1, 1, 1), bias=False
+ )
+ self.bn5 = norm_layer(planes * self.expansion, momentum=bn_momentum)
+
+ self.relu = nn.ReLU(inplace=False)
+ self.relu_inplace = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.dilation = dilation
+ self.stride = stride
+
+ self.downsample2 = nn.Sequential(
+ nn.AvgPool3d(kernel_size=(1, stride, 1), stride=(1, stride, 1)),
+ nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False),
+ norm_layer(planes, momentum=bn_momentum),
+ )
+ self.downsample3 = nn.Sequential(
+ nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)),
+ nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False),
+ norm_layer(planes, momentum=bn_momentum),
+ )
+ self.downsample4 = nn.Sequential(
+ nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)),
+ nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False),
+ norm_layer(planes, momentum=bn_momentum),
+ )
+
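+    # The 3x3x3 convolution is factored into three 1D convolutions (one per axis),
+    # with intermediate residual additions; downsample2/3/4 realign shapes when stride > 1.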
+ def forward(self, x):
+ residual = x
+
+ out1 = self.relu(self.bn1(self.conv1(x)))
+ out2 = self.bn2(self.conv2(out1))
+ out2_relu = self.relu(out2)
+
+ out3 = self.bn3(self.conv3(out2_relu))
+ if self.stride != 1:
+ out2 = self.downsample2(out2)
+ out3 = out3 + out2
+ out3_relu = self.relu(out3)
+
+ out4 = self.bn4(self.conv4(out3_relu))
+ if self.stride != 1:
+ out2 = self.downsample3(out2)
+ out3 = self.downsample4(out3)
+ out4 = out4 + out2 + out3
+
+ out4_relu = self.relu(out4)
+ out5 = self.bn5(self.conv5(out4_relu))
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out = out5 + residual
+ out_relu = self.relu(out)
+
+ return out_relu
diff --git a/monoscene/__init__.py b/monoscene/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/monoscene/app.py b/monoscene/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e70631e75313a28bc978ac3d3bd5df28b61a552
--- /dev/null
+++ b/monoscene/app.py
@@ -0,0 +1,138 @@
+from pytorch_lightning import Trainer
+from monoscene.models.monoscene import MonoScene
+from monoscene.data.NYU.nyu_dm import NYUDataModule
+from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
+from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
+# import hydra
+from omegaconf import DictConfig
+import torch
+import numpy as np
+import os
+from hydra.utils import get_original_cwd
+import gradio as gr
+import plotly.express as px
+import pandas as pd
+
+
+# @hydra.main(config_name="../config/monoscene.yaml")
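+# NOTE: this standalone demo loads the checkpoint but currently returns a dummy
+# 3D scatter plot; the commented-out blocks show the original dataset / inference flow.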
+def plot(input_img):
+ torch.set_grad_enabled(False)
+
+ # Setup dataloader
+ # if config.dataset == "kitti" or config.dataset == "kitti_360":
+ feature = 64
+ project_scale = 2
+ full_scene_size = (256, 256, 32)
+
+ # if config.dataset == "kitti":
+ # data_module = KittiDataModule(
+ # root=config.kitti_root,
+ # preprocess_root=config.kitti_preprocess_root,
+ # frustum_size=config.frustum_size,
+ # batch_size=int(config.batch_size / config.n_gpus),
+ # num_workers=int(config.num_workers_per_gpu * config.n_gpus),
+ # )
+ # data_module.setup()
+ # data_loader = data_module.val_dataloader()
+ # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
+ # else:
+ # data_module = Kitti360DataModule(
+ # root=config.kitti_360_root,
+ # sequences=[config.kitti_360_sequence],
+ # n_scans=2000,
+ # batch_size=1,
+ # num_workers=3,
+ # )
+ # data_module.setup()
+ # data_loader = data_module.dataloader()
+
+ # elif config.dataset == "NYU":
+ # project_scale = 1
+ # feature = 200
+ # full_scene_size = (60, 36, 60)
+ # data_module = NYUDataModule(
+ # root=config.NYU_root,
+ # preprocess_root=config.NYU_preprocess_root,
+ # n_relations=config.n_relations,
+ # frustum_size=config.frustum_size,
+ # batch_size=int(config.batch_size / config.n_gpus),
+ # num_workers=int(config.num_workers_per_gpu * config.n_gpus),
+ # )
+ # data_module.setup()
+ # data_loader = data_module.val_dataloader()
+ # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
+ # else:
+ # print("dataset not support")
+
+ # Load pretrained models
+ # if config.dataset == "NYU":
+ # model_path = os.path.join(
+ # get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
+ # )
+ # else:
+ # model_path = os.path.join(
+ # get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
+ # )
+ model_path = "trained_models/monoscene_kitti.ckpt"
+
+ model = MonoScene.load_from_checkpoint(
+ model_path,
+ feature=feature,
+ project_scale=project_scale,
+ fp_loss=False,
+ full_scene_size=full_scene_size,
+ )
+ model.cuda()
+ model.eval()
+
+ print(input_img.shape)
+
+ x = np.arange(12).reshape(4, 3) / 12
+ data = pd.DataFrame(data=x, columns=['x', 'y', 'z'])
+ fig = px.scatter_3d(data, x="x", y="y", z="z")
+ return fig
+
+demo = gr.Interface(plot, gr.Image(shape=(200, 200)), gr.Plot())
+demo.launch()
+
+
+
+ # Save prediction and additional data
+ # to draw the viewing frustum and remove scene outside the room for NYUv2
+ # output_path = os.path.join(config.output_path, config.dataset)
+ # with torch.no_grad():
+ # for batch in tqdm(data_loader):
+ # batch["img"] = batch["img"].cuda()
+ # pred = model(batch)
+ # y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
+ # y_pred = np.argmax(y_pred, axis=1)
+ # for i in range(config.batch_size):
+ # out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
+ # if "target" in batch:
+ # out_dict["target"] = (
+ # batch["target"][i].detach().cpu().numpy().astype(np.uint16)
+ # )
+
+ # if config.dataset == "NYU":
+ # write_path = output_path
+ # filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
+ # out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
+ # out_dict["vox_origin"] = (
+ # batch["vox_origin"][i].detach().cpu().numpy()
+ # )
+ # else:
+ # write_path = os.path.join(output_path, batch["sequence"][i])
+ # filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
+ # out_dict["fov_mask_1"] = (
+ # batch["fov_mask_1"][i].detach().cpu().numpy()
+ # )
+ # out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
+ # out_dict["T_velo_2_cam"] = (
+ # batch["T_velo_2_cam"][i].detach().cpu().numpy()
+ # )
+
+ # os.makedirs(write_path, exist_ok=True)
+ # with open(filepath, "wb") as handle:
+ # pickle.dump(out_dict, handle)
+ # print("wrote to", filepath)
\ No newline at end of file
diff --git a/monoscene/config.py b/monoscene/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..e03e806ad5e0c7ea4c439e3e82d955e3c0b3038f
--- /dev/null
+++ b/monoscene/config.py
@@ -0,0 +1,26 @@
+from transformers import PretrainedConfig
+from typing import List
+
+
+class MonoSceneConfig(PretrainedConfig):
+
+ def __init__(
+ self,
+ dataset="kitti",
+ n_classes=20,
+ feature=64,
+ project_scale=2,
+ full_scene_size=(256, 256, 32),
+ **kwargs,
+ ):
+ self.dataset = dataset
+ self.n_classes = n_classes
+ self.feature = feature
+ self.project_scale = project_scale
+ self.full_scene_size = full_scene_size
+ super().__init__(**kwargs)
+
+
+
+
+
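+# Illustrative usage (assumes the KITTI defaults above together with
+# monoscene/monoscene_model.py):
+#
+#   config = MonoSceneConfig(dataset="kitti", n_classes=20)
+#   model = MonoSceneModel(config)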
diff --git a/monoscene/flosp.py b/monoscene/flosp.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d502197a72ee120773a47f239e86743f5a1e2d4
--- /dev/null
+++ b/monoscene/flosp.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+
+class FLoSP(nn.Module):
+ def __init__(self, scene_size, dataset, project_scale):
+ super().__init__()
+ self.scene_size = scene_size
+ self.dataset = dataset
+ self.project_scale = project_scale
+
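+    # FLoSP: lift 2D features into the 3D scene by gathering, for every voxel, the
+    # feature of the pixel it projects to; out-of-FOV voxels index the appended zero column.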
+ def forward(self, x2d, projected_pix, fov_mask):
+ c, h, w = x2d.shape
+
+ src = x2d.view(c, -1)
+ zeros_vec = torch.zeros(c, 1).type_as(src)
+ src = torch.cat([src, zeros_vec], 1)
+
+ pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
+ img_indices = pix_y * w + pix_x
+ img_indices[~fov_mask] = h * w
+ img_indices = img_indices.expand(c, -1).long() # c, HWD
+ src_feature = torch.gather(src, 1, img_indices)
+
+ if self.dataset == "NYU":
+ x3d = src_feature.reshape(
+ c,
+ self.scene_size[0] // self.project_scale,
+ self.scene_size[2] // self.project_scale,
+ self.scene_size[1] // self.project_scale,
+ )
+ x3d = x3d.permute(0, 1, 3, 2)
+ elif self.dataset == "kitti":
+ x3d = src_feature.reshape(
+ c,
+ self.scene_size[0] // self.project_scale,
+ self.scene_size[1] // self.project_scale,
+ self.scene_size[2] // self.project_scale,
+ )
+
+ return x3d
diff --git a/monoscene/modules.py b/monoscene/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e8bf875ccd6dffb51bb5acb25f0302fe0032d6c
--- /dev/null
+++ b/monoscene/modules.py
@@ -0,0 +1,194 @@
+import torch
+import torch.nn as nn
+from monoscene.DDR import Bottleneck3D
+
+
+class ASPP(nn.Module):
+ """
+ ASPP 3D
+    Adapted from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+ """
+
+ def __init__(self, planes, dilations_conv_list):
+ super().__init__()
+
+ # ASPP Block
+ self.conv_list = dilations_conv_list
+ self.conv1 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn1 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.conv2 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn2 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.relu = nn.ReLU()
+
+ def forward(self, x_in):
+
+ y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+ for i in range(1, len(self.conv_list)):
+ y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+ x_in = self.relu(y + x_in) # modified
+
+ return x_in
+
+
+class SegmentationHead(nn.Module):
+ """
+ 3D Segmentation heads to retrieve semantic segmentation at each scale.
+ Formed by Dim expansion, Conv3D, ASPP block, Conv3D.
+ Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+ """
+
+ def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list):
+ super().__init__()
+
+ # First convolution
+ self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1)
+
+ # ASPP Block
+ self.conv_list = dilations_conv_list
+ self.conv1 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn1 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.conv2 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn2 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.relu = nn.ReLU()
+
+ self.conv_classes = nn.Conv3d(
+ planes, nbr_classes, kernel_size=3, padding=1, stride=1
+ )
+
+ def forward(self, x_in):
+
+ # Convolution to go from inplanes to planes features...
+ x_in = self.relu(self.conv0(x_in))
+
+ y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+ for i in range(1, len(self.conv_list)):
+ y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+ x_in = self.relu(y + x_in) # modified
+
+ x_in = self.conv_classes(x_in)
+
+ return x_in
+
+
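+# Same residual bottleneck stack as Process below (appears unused in this repository).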
+class ProcessKitti(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+        super(ProcessKitti, self).__init__()
+ self.main = nn.Sequential(
+ *[
+ Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ norm_layer=norm_layer,
+ dilation=[i, i, i],
+ )
+ for i in dilations
+ ]
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
+class Process(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+ super(Process, self).__init__()
+ self.main = nn.Sequential(
+ *[
+ Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ norm_layer=norm_layer,
+ dilation=[i, i, i],
+ )
+ for i in dilations
+ ]
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
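+# Transposed 3D convolution that doubles every spatial dimension.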
+class Upsample(nn.Module):
+ def __init__(self, in_channels, out_channels, norm_layer, bn_momentum):
+ super(Upsample, self).__init__()
+ self.main = nn.Sequential(
+ nn.ConvTranspose3d(
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ dilation=1,
+ output_padding=1,
+ ),
+ norm_layer(out_channels, momentum=bn_momentum),
+ nn.ReLU(),
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
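+# Stride-2 Bottleneck3D that halves every spatial dimension; with the default
+# expansion=8 the channel count doubles (feature -> 2 * feature).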
+class Downsample(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, expansion=8):
+ super(Downsample, self).__init__()
+ self.main = Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ expansion=expansion,
+ stride=2,
+ downsample=nn.Sequential(
+ nn.AvgPool3d(kernel_size=2, stride=2),
+ nn.Conv3d(
+ feature,
+ int(feature * expansion / 4),
+ kernel_size=1,
+ stride=1,
+ bias=False,
+ ),
+ norm_layer(int(feature * expansion / 4), momentum=bn_momentum),
+ ),
+ norm_layer=norm_layer,
+ )
+
+ def forward(self, x):
+ return self.main(x)
diff --git a/monoscene/monoscene.py b/monoscene/monoscene.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8dd444c86ac9b38494e7fc0f685504ae2f25a56
--- /dev/null
+++ b/monoscene/monoscene.py
@@ -0,0 +1,125 @@
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
+from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
+from monoscene.flosp import FLoSP
+import numpy as np
+import torch.nn.functional as F
+from monoscene.unet2d import UNet2D
+
+
+class MonoScene(pl.LightningModule):
+ def __init__(
+ self,
+ n_classes,
+ feature,
+ project_scale,
+ full_scene_size,
+ dataset,
+ project_res=["1", "2", "4", "8"],
+ n_relations=4,
+ context_prior=True,
+ fp_loss=True,
+ frustum_size=4,
+ relation_loss=False,
+ CE_ssc_loss=True,
+ geo_scal_loss=True,
+ sem_scal_loss=True,
+ lr=1e-4,
+ weight_decay=1e-4,
+ ):
+ super().__init__()
+
+ self.project_res = project_res
+ self.fp_loss = fp_loss
+ self.dataset = dataset
+ self.context_prior = context_prior
+ self.frustum_size = frustum_size
+ self.relation_loss = relation_loss
+ self.CE_ssc_loss = CE_ssc_loss
+ self.sem_scal_loss = sem_scal_loss
+ self.geo_scal_loss = geo_scal_loss
+ self.project_scale = project_scale
+ self.lr = lr
+ self.weight_decay = weight_decay
+
+ self.projects = {}
+ self.scale_2ds = [1, 2, 4, 8] # 2D scales
+ for scale_2d in self.scale_2ds:
+ self.projects[str(scale_2d)] = FLoSP(
+ full_scene_size, project_scale=self.project_scale, dataset=self.dataset
+ )
+ self.projects = nn.ModuleDict(self.projects)
+
+ self.n_classes = n_classes
+ if self.dataset == "NYU":
+ self.net_3d_decoder = UNet3DNYU(
+ self.n_classes,
+ nn.BatchNorm3d,
+ n_relations=n_relations,
+ feature=feature,
+ full_scene_size=full_scene_size,
+ context_prior=context_prior,
+ )
+ elif self.dataset == "kitti":
+ self.net_3d_decoder = UNet3DKitti(
+ self.n_classes,
+ nn.BatchNorm3d,
+ project_scale=project_scale,
+ feature=feature,
+ full_scene_size=full_scene_size,
+ context_prior=context_prior,
+ )
+ self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
+
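+    # Inference-only forward pass: encode the RGB image, lift multi-scale 2D features
+    # into one 3D volume with FLoSP, decode with the 3D UNet and return the per-voxel
+    # argmax class map as a numpy array.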
+ def forward(self, batch):
+
+ img = batch["img"]
+ bs = len(img)
+
+ out = {}
+
+ x_rgb = self.net_rgb(img)
+
+ x3ds = []
+ for i in range(bs):
+ x3d = None
+ for scale_2d in self.project_res:
+
+ # project features at each 2D scale to target 3D scale
+ scale_2d = int(scale_2d)
+ projected_pix = batch["projected_pix_{}".format(self.project_scale)][i]#.cuda()
+ fov_mask = batch["fov_mask_{}".format(self.project_scale)][i]#.cuda()
+
+ # Sum all the 3D features
+ if x3d is None:
+ x3d = self.projects[str(scale_2d)](
+ x_rgb["1_" + str(scale_2d)][i],
+ # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
+ projected_pix // scale_2d,
+ fov_mask,
+ )
+ else:
+ x3d += self.projects[str(scale_2d)](
+ x_rgb["1_" + str(scale_2d)][i],
+ # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
+ projected_pix // scale_2d,
+ fov_mask,
+ )
+ x3ds.append(x3d)
+
+ input_dict = {
+ "x3d": torch.stack(x3ds),
+ }
+
+ out_dict = self.net_3d_decoder(input_dict)
+
+ ssc_pred = out_dict["ssc_logit"]
+
+ y_pred = ssc_pred.detach().cpu().numpy()
+ y_pred = np.argmax(y_pred, axis=1)
+
+ return y_pred
+
+
diff --git a/monoscene/monoscene_model.py b/monoscene/monoscene_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a5207f3d03de86192c5d41a8bdfe3ce32e672ab
--- /dev/null
+++ b/monoscene/monoscene_model.py
@@ -0,0 +1,21 @@
+from transformers import PreTrainedModel
+from .config import MonoSceneConfig
+from monoscene.monoscene import MonoScene
+
+
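+# Thin wrapper exposing the MonoScene network through the transformers PreTrainedModel API.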
+class MonoSceneModel(PreTrainedModel):
+ config_class = MonoSceneConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = MonoScene(
+ dataset=config.dataset,
+ n_classes=config.n_classes,
+ feature=config.feature,
+ project_scale=config.project_scale,
+ full_scene_size=config.full_scene_size
+ )
+
+
+ def forward(self, tensor):
+ return self.model.forward(tensor)
\ No newline at end of file
diff --git a/monoscene/unet2d.py b/monoscene/unet2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1c9e45553b1c7e083436778c6e963545446d008
--- /dev/null
+++ b/monoscene/unet2d.py
@@ -0,0 +1,198 @@
+"""
+Code adapted from https://github.com/shariqfarooq123/AdaBins/blob/main/models/unet_adaptive_bins.py
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import os
+
+
+class UpSampleBN(nn.Module):
+ def __init__(self, skip_input, output_features):
+ super(UpSampleBN, self).__init__()
+ self._net = nn.Sequential(
+ nn.Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1),
+ nn.BatchNorm2d(output_features),
+ nn.LeakyReLU(),
+ nn.Conv2d(
+ output_features, output_features, kernel_size=3, stride=1, padding=1
+ ),
+ nn.BatchNorm2d(output_features),
+ nn.LeakyReLU(),
+ )
+
+ def forward(self, x, concat_with):
+ up_x = F.interpolate(
+ x,
+ size=(concat_with.shape[2], concat_with.shape[3]),
+ mode="bilinear",
+ align_corners=True,
+ )
+ f = torch.cat([up_x, concat_with], dim=1)
+ return self._net(f)
+
+
+class DecoderBN(nn.Module):
+ def __init__(
+ self, num_features, bottleneck_features, out_feature, use_decoder=True
+ ):
+ super(DecoderBN, self).__init__()
+ features = int(num_features)
+ self.use_decoder = use_decoder
+
+ self.conv2 = nn.Conv2d(
+ bottleneck_features, features, kernel_size=1, stride=1, padding=1
+ )
+
+ self.out_feature_1_1 = out_feature
+ self.out_feature_1_2 = out_feature
+ self.out_feature_1_4 = out_feature
+ self.out_feature_1_8 = out_feature
+ self.out_feature_1_16 = out_feature
+ self.feature_1_16 = features // 2
+ self.feature_1_8 = features // 4
+ self.feature_1_4 = features // 8
+ self.feature_1_2 = features // 16
+ self.feature_1_1 = features // 32
+
+ if self.use_decoder:
+ self.resize_output_1_1 = nn.Conv2d(
+ self.feature_1_1, self.out_feature_1_1, kernel_size=1
+ )
+ self.resize_output_1_2 = nn.Conv2d(
+ self.feature_1_2, self.out_feature_1_2, kernel_size=1
+ )
+ self.resize_output_1_4 = nn.Conv2d(
+ self.feature_1_4, self.out_feature_1_4, kernel_size=1
+ )
+ self.resize_output_1_8 = nn.Conv2d(
+ self.feature_1_8, self.out_feature_1_8, kernel_size=1
+ )
+ self.resize_output_1_16 = nn.Conv2d(
+ self.feature_1_16, self.out_feature_1_16, kernel_size=1
+ )
+
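+            # skip_input adds the channel count of the encoder feature map concatenated
+            # in UpSampleBN (224, 80, 48, 32, and 3 channels for the raw RGB image)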
+ self.up16 = UpSampleBN(
+ skip_input=features + 224, output_features=self.feature_1_16
+ )
+ self.up8 = UpSampleBN(
+ skip_input=self.feature_1_16 + 80, output_features=self.feature_1_8
+ )
+ self.up4 = UpSampleBN(
+ skip_input=self.feature_1_8 + 48, output_features=self.feature_1_4
+ )
+ self.up2 = UpSampleBN(
+ skip_input=self.feature_1_4 + 32, output_features=self.feature_1_2
+ )
+ self.up1 = UpSampleBN(
+ skip_input=self.feature_1_2 + 3, output_features=self.feature_1_1
+ )
+ else:
+ self.resize_output_1_1 = nn.Conv2d(3, out_feature, kernel_size=1)
+ self.resize_output_1_2 = nn.Conv2d(32, out_feature * 2, kernel_size=1)
+ self.resize_output_1_4 = nn.Conv2d(48, out_feature * 4, kernel_size=1)
+
+ def forward(self, features):
+ x_block0, x_block1, x_block2, x_block3, x_block4 = (
+ features[4],
+ features[5],
+ features[6],
+ features[8],
+ features[11],
+ )
+ bs = x_block0.shape[0]
+ x_d0 = self.conv2(x_block4)
+
+ if self.use_decoder:
+ x_1_16 = self.up16(x_d0, x_block3)
+ x_1_8 = self.up8(x_1_16, x_block2)
+ x_1_4 = self.up4(x_1_8, x_block1)
+ x_1_2 = self.up2(x_1_4, x_block0)
+ x_1_1 = self.up1(x_1_2, features[0])
+ return {
+ "1_1": self.resize_output_1_1(x_1_1),
+ "1_2": self.resize_output_1_2(x_1_2),
+ "1_4": self.resize_output_1_4(x_1_4),
+ "1_8": self.resize_output_1_8(x_1_8),
+ "1_16": self.resize_output_1_16(x_1_16),
+ }
+ else:
+ x_1_1 = features[0]
+ x_1_2, x_1_4, x_1_8, x_1_16 = (
+ features[4],
+ features[5],
+ features[6],
+ features[8],
+ )
+ x_global = features[-1].reshape(bs, 2560, -1).mean(2)
+ return {
+ "1_1": self.resize_output_1_1(x_1_1),
+ "1_2": self.resize_output_1_2(x_1_2),
+ "1_4": self.resize_output_1_4(x_1_4),
+ "global": x_global,
+ }
+
+
+class Encoder(nn.Module):
+ def __init__(self, backend):
+ super(Encoder, self).__init__()
+ self.original_model = backend
+
+ def forward(self, x):
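+    # Run the backbone module by module and keep every intermediate feature map,
+    # including the output of each individual EfficientNet block.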
+ features = [x]
+ for k, v in self.original_model._modules.items():
+ if k == "blocks":
+ for ki, vi in v._modules.items():
+ features.append(vi(features[-1]))
+ else:
+ features.append(v(features[-1]))
+ return features
+
+
+class UNet2D(nn.Module):
+ def __init__(self, backend, num_features, out_feature, use_decoder=True):
+ super(UNet2D, self).__init__()
+ self.use_decoder = use_decoder
+ self.encoder = Encoder(backend)
+ self.decoder = DecoderBN(
+ out_feature=out_feature,
+ use_decoder=use_decoder,
+ bottleneck_features=num_features,
+ num_features=num_features,
+ )
+
+ def forward(self, x, **kwargs):
+ encoded_feats = self.encoder(x)
+ unet_out = self.decoder(encoded_feats, **kwargs)
+ return unet_out
+
+ def get_encoder_params(self): # lr/10 learning rate
+ return self.encoder.parameters()
+
+ def get_decoder_params(self): # lr learning rate
+ return self.decoder.parameters()
+
+ @classmethod
+ def build(cls, **kwargs):
+ basemodel_name = "tf_efficientnet_b7_ns"
+ num_features = 2560
+
+ print("Loading base model ()...".format(basemodel_name), end="")
+ basemodel = torch.hub.load(
+ "rwightman/gen-efficientnet-pytorch", basemodel_name, pretrained=True
+ )
+ print("Done.")
+
+ # Remove last layer
+ print("Removing last two layers (global_pool & classifier).")
+ basemodel.global_pool = nn.Identity()
+ basemodel.classifier = nn.Identity()
+
+ # Building Encoder-Decoder model
+ print("Building Encoder-Decoder model..", end="")
+ m = cls(basemodel, num_features=num_features, **kwargs)
+ print("Done.")
+ return m
+
+if __name__ == '__main__':
+ model = UNet2D.build(out_feature=256, use_decoder=True)
diff --git a/monoscene/unet3d_kitti.py b/monoscene/unet3d_kitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..91d5339fbdf34e28d017d7e4e29ce4923169bef5
--- /dev/null
+++ b/monoscene/unet3d_kitti.py
@@ -0,0 +1,88 @@
+# encoding: utf-8
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from monoscene.modules import SegmentationHead
+from monoscene.CRP3D import CPMegaVoxels
+from monoscene.modules import Process, Upsample, Downsample
+
+
+class UNet3D(nn.Module):
+ def __init__(
+ self,
+ class_num,
+ norm_layer,
+ full_scene_size,
+ feature,
+ project_scale,
+ context_prior=None,
+ bn_momentum=0.1,
+ ):
+ super(UNet3D, self).__init__()
+ self.business_layer = []
+ self.project_scale = project_scale
+ self.full_scene_size = full_scene_size
+ self.feature = feature
+
+ size_l1 = (
+ int(self.full_scene_size[0] / project_scale),
+ int(self.full_scene_size[1] / project_scale),
+ int(self.full_scene_size[2] / project_scale),
+ )
+ size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2)
+ size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2)
+
+ dilations = [1, 2, 3]
+ self.process_l1 = nn.Sequential(
+ Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+ Downsample(self.feature, norm_layer, bn_momentum),
+ )
+ self.process_l2 = nn.Sequential(
+ Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+ Downsample(self.feature * 2, norm_layer, bn_momentum),
+ )
+
+ self.up_13_l2 = Upsample(
+ self.feature * 4, self.feature * 2, norm_layer, bn_momentum
+ )
+ self.up_12_l1 = Upsample(
+ self.feature * 2, self.feature, norm_layer, bn_momentum
+ )
+ self.up_l1_lfull = Upsample(
+ self.feature, self.feature // 2, norm_layer, bn_momentum
+ )
+
+ self.ssc_head = SegmentationHead(
+ self.feature // 2, self.feature // 2, class_num, dilations
+ )
+
+ self.context_prior = context_prior
+ if context_prior:
+ self.CP_mega_voxels = CPMegaVoxels(
+ self.feature * 4, size_l3, bn_momentum=bn_momentum
+ )
+
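+    # Encoder-decoder on the 3D feature volume: two downsampling stages, an optional
+    # context prior (CPMegaVoxels) at the coarsest level, then additive skip
+    # connections while upsampling back to the full scene resolution for the SSC head.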
+ def forward(self, input_dict):
+ res = {}
+
+ x3d_l1 = input_dict["x3d"]
+
+ x3d_l2 = self.process_l1(x3d_l1)
+
+ x3d_l3 = self.process_l2(x3d_l2)
+
+ if self.context_prior:
+ ret = self.CP_mega_voxels(x3d_l3)
+ x3d_l3 = ret["x"]
+ for k in ret.keys():
+ res[k] = ret[k]
+
+ x3d_up_l2 = self.up_13_l2(x3d_l3) + x3d_l2
+ x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1
+ x3d_up_lfull = self.up_l1_lfull(x3d_up_l1)
+
+ ssc_logit_full = self.ssc_head(x3d_up_lfull)
+
+ res["ssc_logit"] = ssc_logit_full
+
+ return res
diff --git a/monoscene/unet3d_nyu.py b/monoscene/unet3d_nyu.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9e3b3718999248efa1b2925658465ba59801b13
--- /dev/null
+++ b/monoscene/unet3d_nyu.py
@@ -0,0 +1,90 @@
+# encoding: utf-8
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from monoscene.CRP3D import CPMegaVoxels
+from monoscene.modules import (
+ Process,
+ Upsample,
+ Downsample,
+ SegmentationHead,
+ ASPP,
+)
+
+
+class UNet3D(nn.Module):
+ def __init__(
+ self,
+ class_num,
+ norm_layer,
+ feature,
+ full_scene_size,
+ n_relations=4,
+ project_res=[],
+ context_prior=True,
+ bn_momentum=0.1,
+ ):
+ super(UNet3D, self).__init__()
+ self.business_layer = []
+ self.project_res = project_res
+
+ self.feature_1_4 = feature
+ self.feature_1_8 = feature * 2
+ self.feature_1_16 = feature * 4
+
+ self.feature_1_16_dec = self.feature_1_16
+ self.feature_1_8_dec = self.feature_1_8
+ self.feature_1_4_dec = self.feature_1_4
+
+ self.process_1_4 = nn.Sequential(
+ Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+ Downsample(self.feature_1_4, norm_layer, bn_momentum),
+ )
+ self.process_1_8 = nn.Sequential(
+ Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+ Downsample(self.feature_1_8, norm_layer, bn_momentum),
+ )
+ self.up_1_16_1_8 = Upsample(
+ self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum
+ )
+ self.up_1_8_1_4 = Upsample(
+ self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum
+ )
+ self.ssc_head_1_4 = SegmentationHead(
+ self.feature_1_4_dec, self.feature_1_4_dec, class_num, [1, 2, 3]
+ )
+
+ self.context_prior = context_prior
+ size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size)
+
+ if context_prior:
+ self.CP_mega_voxels = CPMegaVoxels(
+ self.feature_1_16,
+ size_1_16,
+ n_relations=n_relations,
+ bn_momentum=bn_momentum,
+ )
+
+    # NYU variant: projected features are processed at the 1_4, 1_8 and 1_16 scales
+    # and the SSC head predicts the logits at the 1_4 level.
+ def forward(self, input_dict):
+ res = {}
+
+ x3d_1_4 = input_dict["x3d"]
+ x3d_1_8 = self.process_1_4(x3d_1_4)
+ x3d_1_16 = self.process_1_8(x3d_1_8)
+
+ if self.context_prior:
+ ret = self.CP_mega_voxels(x3d_1_16)
+ x3d_1_16 = ret["x"]
+ for k in ret.keys():
+ res[k] = ret[k]
+
+ x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8
+ x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4
+
+ ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4)
+
+ res["ssc_logit"] = ssc_logit_1_4
+
+ return res
diff --git a/monoscene_kitti.ckpt b/monoscene_kitti.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..cd302ef17432a3b5c8b58ab5a63bb52e2c166976
--- /dev/null
+++ b/monoscene_kitti.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f0d1324885166f17949bf2dcfc0ee1eb2d2aedd0f48e75b56bb2beb87c1ce3a
+size 1796467007
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..81946126e893d56bb066160e9465ef42bc43d9d9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+scikit-image==0.18.1
+PyYAML==5.3.1
+tqdm==4.49.0
+scikit-learn==0.24.0
+pytorch-lightning==1.4.9
+opencv-python==4.5.1.48
+hydra-core==1.0.5
+numpy==1.20.3
+numba==0.53
+imageio
+protobuf~=3.19.0
+transformers
+plotly
+torch
+torchvision
+torchmetrics==0.6.0
\ No newline at end of file