diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..9d7dd240381bd4ef025168212e15f1f0b7478660 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,28 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zstandard filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +monoscene_kitti.ckpt filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..46769a0f0371bf73afac0874aa640b65361b0b19 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +__pycache__ +.ipynb_checkpoints +*.ckpt +gradio* \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3a7f97e80109bd91f0ed142e6eb9aac585326881 --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +--- +title: MonoScene +emoji: 🚘🏙️ +colorFrom: purple +colorTo: pink +sdk: gradio +sdk_version: 3.0.20 +app_file: app.py +pinned: true +license: apache-2.0 +duplicated_from: CVPR/MonoScene +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..01225a388b656d95f22d55a02c47d4d62772c3c3 --- /dev/null +++ b/app.py @@ -0,0 +1,126 @@ +import gradio as gr +import numpy as np +from torchvision import transforms +import torch +from helpers import * +import sys +import csv +from monoscene.monoscene import MonoScene + +csv.field_size_limit(sys.maxsize) +torch.set_grad_enabled(False) + +# pipeline = pipeline(model="anhquancao/monoscene_kitti") +# model = AutoModel.from_pretrained( +# "anhquancao/monoscene_kitti", trust_remote_code=True, revision='bf033f87c2a86b60903ab811b790a1532c1ae313' +# )#.cuda() +model = MonoScene.load_from_checkpoint( + "monoscene_kitti.ckpt", + dataset="kitti", + n_classes=20, + feature = 64, + project_scale = 2, + full_scene_size = (256, 256, 32), + ) + +img_W, img_H = 1220, 370 + + +def predict(img): + img = np.array(img, dtype=np.float32, copy=False) / 255.0 + + normalize_rgb = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + img = normalize_rgb(img) + + batch = get_projections(img_W, img_H) + batch["img"] = img + for k in batch: + batch[k] = batch[k].unsqueeze(0)#.cuda() + + pred = model(batch).squeeze() + # print(pred.shape) + pred = 
majority_pooling(pred, k_size=2) + fig = draw(pred, batch['fov_mask_2']) + + + return fig + + +description = """ +MonoScene Demo on SemanticKITTI Validation Set (Sequence 08), which uses the camera parameters of Sequence 08. +Due to the CPU-only inference, it might take up to 20s to predict a scene. \n +The output is downsampled by 2 for faster rendering. Darker colors represent the scenery outside the Field of View, i.e. not visible on the image. +
+Project page
+""" +title = "MonoScene: Monocular 3D Semantic Scene Completion" +article=""" +
+We also released a smaller MonoScene model (Half resolution - w/o 3D CRP) at: https://huggingface.co/spaces/CVPR/monoscene_lite
+
+""" + +examples = [ + 'images/08/001385.jpg', + 'images/08/000295.jpg', + 'images/08/002505.jpg', + 'images/08/000085.jpg', + 'images/08/000290.jpg', + 'images/08/000465.jpg', + 'images/08/000790.jpg', + 'images/08/001005.jpg', + 'images/08/001380.jpg', + 'images/08/001530.jpg', + 'images/08/002360.jpg', + 'images/08/004059.jpg', + 'images/08/003149.jpg', + 'images/08/001446.jpg', + 'images/08/000010.jpg', + 'images/08/001122.jpg', + 'images/08/003533.jpg', + 'images/08/003365.jpg', + 'images/08/002944.jpg', + 'images/08/000822.jpg', + 'images/08/000103.jpg', + 'images/08/002716.jpg', + 'images/08/000187.jpg', + 'images/08/002128.jpg', + 'images/08/000511.jpg', + 'images/08/000618.jpg', + 'images/08/002010.jpg', + 'images/08/000234.jpg', + 'images/08/001842.jpg', + 'images/08/001687.jpg', + 'images/08/003929.jpg', + 'images/08/002272.jpg', +] + + + +demo = gr.Interface( + predict, + gr.Image(shape=(1220, 370)), + gr.Plot(), + article=article, + title=title, + enable_queue=True, + cache_examples=False, + live=False, + examples=examples, + description=description) + + +demo.launch(enable_queue=True, debug=False) \ No newline at end of file diff --git a/calib.txt b/calib.txt new file mode 100644 index 0000000000000000000000000000000000000000..793946dabbfa14421b0ab261d69fca372137b76e --- /dev/null +++ b/calib.txt @@ -0,0 +1,5 @@ +P0: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 0.000000000000e+00 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P1: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.861448000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00 +P2: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 4.538225000000e+01 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 -1.130887000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 3.779761000000e-03 +P3: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.372877000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 2.369057000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 4.915215000000e-03 +Tr: 4.276802385584e-04 -9.999672484946e-01 -8.084491683471e-03 -1.198459927713e-02 -7.210626507497e-03 8.081198471645e-03 -9.999413164504e-01 -5.403984729748e-02 9.999738645903e-01 4.859485810390e-04 -7.206933692422e-03 -2.921968648686e-01 diff --git a/fusion.py b/fusion.py new file mode 100644 index 0000000000000000000000000000000000000000..aecd5cba3b1e3dd1e0534cda347eca8956657926 --- /dev/null +++ b/fusion.py @@ -0,0 +1,507 @@ +""" +Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py + +@inproceedings{zeng20163dmatch, + title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions}, + author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas}, + booktitle={CVPR}, + year={2017} +} +""" + +import numpy as np + +from numba import njit, prange +from skimage import measure + +FUSION_GPU_MODE = 0 + + +class TSDFVolume: + """Volumetric TSDF Fusion of RGB-D Images.""" + + def __init__(self, vol_bnds, voxel_size, use_gpu=True): + """Constructor. + + Args: + vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the + xyz bounds (min/max) in meters. + voxel_size (float): The volume discretization in meters. 
+ """ + vol_bnds = np.asarray(vol_bnds) + assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)." + + # Define voxel volume parameters + self._vol_bnds = vol_bnds + self._voxel_size = float(voxel_size) + self._trunc_margin = 5 * self._voxel_size # truncation on SDF + # self._trunc_margin = 10 # truncation on SDF + self._color_const = 256 * 256 + + # Adjust volume bounds and ensure C-order contiguous + self._vol_dim = ( + np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size) + .copy(order="C") + .astype(int) + ) + self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size + self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32) + + print( + "Voxel volume size: {} x {} x {} - # points: {:,}".format( + self._vol_dim[0], + self._vol_dim[1], + self._vol_dim[2], + self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2], + ) + ) + + # Initialize pointers to voxel volume in CPU memory + self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) + # for computing the cumulative moving average of observations per voxel + self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) + self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32) + + self.gpu_mode = use_gpu and FUSION_GPU_MODE + + # Copy voxel volumes to GPU + if self.gpu_mode: + self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes) + cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu) + self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes) + cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu) + self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes) + cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu) + + # Cuda kernel function (C++) + self._cuda_src_mod = SourceModule( + """ + __global__ void integrate(float * tsdf_vol, + float * weight_vol, + float * color_vol, + float * vol_dim, + float * vol_origin, + float * cam_intr, + float * cam_pose, + float * other_params, + float * color_im, + float * depth_im) { + // Get voxel index + int gpu_loop_idx = (int) other_params[0]; + int max_threads_per_block = blockDim.x; + int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x; + int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x; + int vol_dim_x = (int) vol_dim[0]; + int vol_dim_y = (int) vol_dim[1]; + int vol_dim_z = (int) vol_dim[2]; + if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z) + return; + // Get voxel grid coordinates (note: be careful when casting) + float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z))); + float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z)); + float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z); + // Voxel grid coordinates to world coordinates + float voxel_size = other_params[1]; + float pt_x = vol_origin[0]+voxel_x*voxel_size; + float pt_y = vol_origin[1]+voxel_y*voxel_size; + float pt_z = vol_origin[2]+voxel_z*voxel_size; + // World coordinates to camera coordinates + float tmp_pt_x = pt_x-cam_pose[0*4+3]; + float tmp_pt_y = pt_y-cam_pose[1*4+3]; + float tmp_pt_z = pt_z-cam_pose[2*4+3]; + float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z; + float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z; + float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z; + 
// Camera coordinates to image pixels + int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]); + int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]); + // Skip if outside view frustum + int im_h = (int) other_params[2]; + int im_w = (int) other_params[3]; + if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0) + return; + // Skip invalid depth + float depth_value = depth_im[pixel_y*im_w+pixel_x]; + if (depth_value == 0) + return; + // Integrate TSDF + float trunc_margin = other_params[4]; + float depth_diff = depth_value-cam_pt_z; + if (depth_diff < -trunc_margin) + return; + float dist = fmin(1.0f,depth_diff/trunc_margin); + float w_old = weight_vol[voxel_idx]; + float obs_weight = other_params[5]; + float w_new = w_old + obs_weight; + weight_vol[voxel_idx] = w_new; + tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new; + // Integrate color + float old_color = color_vol[voxel_idx]; + float old_b = floorf(old_color/(256*256)); + float old_g = floorf((old_color-old_b*256*256)/256); + float old_r = old_color-old_b*256*256-old_g*256; + float new_color = color_im[pixel_y*im_w+pixel_x]; + float new_b = floorf(new_color/(256*256)); + float new_g = floorf((new_color-new_b*256*256)/256); + float new_r = new_color-new_b*256*256-new_g*256; + new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f); + new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f); + new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f); + color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r; + }""" + ) + + self._cuda_integrate = self._cuda_src_mod.get_function("integrate") + + # Determine block/grid size on GPU + gpu_dev = cuda.Device(0) + self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK + n_blocks = int( + np.ceil( + float(np.prod(self._vol_dim)) + / float(self._max_gpu_threads_per_block) + ) + ) + grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks)))) + grid_dim_y = min( + gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x))) + ) + grid_dim_z = min( + gpu_dev.MAX_GRID_DIM_Z, + int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))), + ) + self._max_gpu_grid_dim = np.array( + [grid_dim_x, grid_dim_y, grid_dim_z] + ).astype(int) + self._n_gpu_loops = int( + np.ceil( + float(np.prod(self._vol_dim)) + / float( + np.prod(self._max_gpu_grid_dim) + * self._max_gpu_threads_per_block + ) + ) + ) + + else: + # Get voxel grid coordinates + xv, yv, zv = np.meshgrid( + range(self._vol_dim[0]), + range(self._vol_dim[1]), + range(self._vol_dim[2]), + indexing="ij", + ) + self.vox_coords = ( + np.concatenate( + [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0 + ) + .astype(int) + .T + ) + + @staticmethod + @njit(parallel=True) + def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)): + """Convert voxel grid coordinates to world coordinates.""" + vol_origin = vol_origin.astype(np.float32) + vox_coords = vox_coords.astype(np.float32) + # print(np.min(vox_coords)) + cam_pts = np.empty_like(vox_coords, dtype=np.float32) + + for i in prange(vox_coords.shape[0]): + for j in range(3): + cam_pts[i, j] = ( + vol_origin[j] + + (vox_size * vox_coords[i, j]) + + vox_size * offsets[j] + ) + return cam_pts + + @staticmethod + @njit(parallel=True) + def cam2pix(cam_pts, intr): + """Convert camera coordinates to pixel coordinates.""" + intr = intr.astype(np.float32) + fx, fy = intr[0, 0], intr[1, 1] + cx, cy = intr[0, 2], intr[1, 
2] + pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64) + for i in prange(cam_pts.shape[0]): + pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx)) + pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy)) + return pix + + @staticmethod + @njit(parallel=True) + def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight): + """Integrate the TSDF volume.""" + tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32) + # print(tsdf_vol.shape) + w_new = np.empty_like(w_old, dtype=np.float32) + for i in prange(len(tsdf_vol)): + w_new[i] = w_old[i] + obs_weight + tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i] + return tsdf_vol_int, w_new + + def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0): + """Integrate an RGB-D frame into the TSDF volume. + + Args: + color_im (ndarray): An RGB image of shape (H, W, 3). + depth_im (ndarray): A depth image of shape (H, W). + cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3). + cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4). + obs_weight (float): The weight to assign for the current observation. A higher + value + """ + im_h, im_w = depth_im.shape + + # Fold RGB color image into a single channel image + color_im = color_im.astype(np.float32) + color_im = np.floor( + color_im[..., 2] * self._color_const + + color_im[..., 1] * 256 + + color_im[..., 0] + ) + + if self.gpu_mode: # GPU mode: integrate voxel volume (calls CUDA kernel) + for gpu_loop_idx in range(self._n_gpu_loops): + self._cuda_integrate( + self._tsdf_vol_gpu, + self._weight_vol_gpu, + self._color_vol_gpu, + cuda.InOut(self._vol_dim.astype(np.float32)), + cuda.InOut(self._vol_origin.astype(np.float32)), + cuda.InOut(cam_intr.reshape(-1).astype(np.float32)), + cuda.InOut(cam_pose.reshape(-1).astype(np.float32)), + cuda.InOut( + np.asarray( + [ + gpu_loop_idx, + self._voxel_size, + im_h, + im_w, + self._trunc_margin, + obs_weight, + ], + np.float32, + ) + ), + cuda.InOut(color_im.reshape(-1).astype(np.float32)), + cuda.InOut(depth_im.reshape(-1).astype(np.float32)), + block=(self._max_gpu_threads_per_block, 1, 1), + grid=( + int(self._max_gpu_grid_dim[0]), + int(self._max_gpu_grid_dim[1]), + int(self._max_gpu_grid_dim[2]), + ), + ) + else: # CPU mode: integrate voxel volume (vectorized implementation) + # Convert voxel grid coordinates to pixel coordinates + cam_pts = self.vox2world( + self._vol_origin, self.vox_coords, self._voxel_size + ) + cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose)) + pix_z = cam_pts[:, 2] + pix = self.cam2pix(cam_pts, cam_intr) + pix_x, pix_y = pix[:, 0], pix[:, 1] + + # Eliminate pixels outside view frustum + valid_pix = np.logical_and( + pix_x >= 0, + np.logical_and( + pix_x < im_w, + np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)), + ), + ) + depth_val = np.zeros(pix_x.shape) + depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]] + + # Integrate TSDF + depth_diff = depth_val - pix_z + + valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10) + dist = depth_diff + + valid_vox_x = self.vox_coords[valid_pts, 0] + valid_vox_y = self.vox_coords[valid_pts, 1] + valid_vox_z = self.vox_coords[valid_pts, 2] + w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] + tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] + valid_dist = dist[valid_pts] + tsdf_vol_new, w_new = self.integrate_tsdf( + tsdf_vals, valid_dist, w_old, obs_weight + ) + self._weight_vol_cpu[valid_vox_x, valid_vox_y, 
valid_vox_z] = w_new + self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new + + # Integrate color + old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] + old_b = np.floor(old_color / self._color_const) + old_g = np.floor((old_color - old_b * self._color_const) / 256) + old_r = old_color - old_b * self._color_const - old_g * 256 + new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]] + new_b = np.floor(new_color / self._color_const) + new_g = np.floor((new_color - new_b * self._color_const) / 256) + new_r = new_color - new_b * self._color_const - new_g * 256 + new_b = np.minimum( + 255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new) + ) + new_g = np.minimum( + 255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new) + ) + new_r = np.minimum( + 255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new) + ) + self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = ( + new_b * self._color_const + new_g * 256 + new_r + ) + + def get_volume(self): + if self.gpu_mode: + cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu) + cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu) + return self._tsdf_vol_cpu, self._color_vol_cpu + + def get_point_cloud(self): + """Extract a point cloud from the voxel volume.""" + tsdf_vol, color_vol = self.get_volume() + + # Marching cubes + verts = measure.marching_cubes_lewiner(tsdf_vol, level=0)[0] + verts_ind = np.round(verts).astype(int) + verts = verts * self._voxel_size + self._vol_origin + + # Get vertex colors + rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]] + colors_b = np.floor(rgb_vals / self._color_const) + colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256) + colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256 + colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T + colors = colors.astype(np.uint8) + + pc = np.hstack([verts, colors]) + return pc + + def get_mesh(self): + """Compute a mesh from the voxel volume using marching cubes.""" + tsdf_vol, color_vol = self.get_volume() + + # Marching cubes + verts, faces, norms, vals = measure.marching_cubes_lewiner(tsdf_vol, level=0) + verts_ind = np.round(verts).astype(int) + verts = ( + verts * self._voxel_size + self._vol_origin + ) # voxel grid coordinates to world coordinates + + # Get vertex colors + rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]] + colors_b = np.floor(rgb_vals / self._color_const) + colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256) + colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256 + colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T + colors = colors.astype(np.uint8) + return verts, faces, norms, colors + + +def rigid_transform(xyz, transform): + """Applies a rigid transform to an (N, 3) pointcloud.""" + xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)]) + xyz_t_h = np.dot(transform, xyz_h.T).T + return xyz_t_h[:, :3] + + +def get_view_frustum(depth_im, cam_intr, cam_pose): + """Get corners of 3D camera view frustum of depth image""" + im_h = depth_im.shape[0] + im_w = depth_im.shape[1] + max_depth = np.max(depth_im) + view_frust_pts = np.array( + [ + (np.array([0, 0, 0, im_w, im_w]) - cam_intr[0, 2]) + * np.array([0, max_depth, max_depth, max_depth, max_depth]) + / cam_intr[0, 0], + (np.array([0, 0, im_h, 0, im_h]) - cam_intr[1, 2]) + * np.array([0, max_depth, max_depth, max_depth, max_depth]) + / cam_intr[1, 1], + np.array([0, max_depth, max_depth, 
max_depth, max_depth]), + ] + ) + view_frust_pts = rigid_transform(view_frust_pts.T, cam_pose).T + return view_frust_pts + + +def meshwrite(filename, verts, faces, norms, colors): + """Save a 3D mesh to a polygon .ply file.""" + # Write header + ply_file = open(filename, "w") + ply_file.write("ply\n") + ply_file.write("format ascii 1.0\n") + ply_file.write("element vertex %d\n" % (verts.shape[0])) + ply_file.write("property float x\n") + ply_file.write("property float y\n") + ply_file.write("property float z\n") + ply_file.write("property float nx\n") + ply_file.write("property float ny\n") + ply_file.write("property float nz\n") + ply_file.write("property uchar red\n") + ply_file.write("property uchar green\n") + ply_file.write("property uchar blue\n") + ply_file.write("element face %d\n" % (faces.shape[0])) + ply_file.write("property list uchar int vertex_index\n") + ply_file.write("end_header\n") + + # Write vertex list + for i in range(verts.shape[0]): + ply_file.write( + "%f %f %f %f %f %f %d %d %d\n" + % ( + verts[i, 0], + verts[i, 1], + verts[i, 2], + norms[i, 0], + norms[i, 1], + norms[i, 2], + colors[i, 0], + colors[i, 1], + colors[i, 2], + ) + ) + + # Write face list + for i in range(faces.shape[0]): + ply_file.write("3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2])) + + ply_file.close() + + +def pcwrite(filename, xyzrgb): + """Save a point cloud to a polygon .ply file.""" + xyz = xyzrgb[:, :3] + rgb = xyzrgb[:, 3:].astype(np.uint8) + + # Write header + ply_file = open(filename, "w") + ply_file.write("ply\n") + ply_file.write("format ascii 1.0\n") + ply_file.write("element vertex %d\n" % (xyz.shape[0])) + ply_file.write("property float x\n") + ply_file.write("property float y\n") + ply_file.write("property float z\n") + ply_file.write("property uchar red\n") + ply_file.write("property uchar green\n") + ply_file.write("property uchar blue\n") + ply_file.write("end_header\n") + + # Write vertex list + for i in range(xyz.shape[0]): + ply_file.write( + "%f %f %f %d %d %d\n" + % ( + xyz[i, 0], + xyz[i, 1], + xyz[i, 2], + rgb[i, 0], + rgb[i, 1], + rgb[i, 2], + ) + ) diff --git a/helpers.py b/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..e0c04e38af0140bd37508becf644802b098ae2e2 --- /dev/null +++ b/helpers.py @@ -0,0 +1,336 @@ +import numpy as np +import torch +import fusion +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go + +def read_calib(calib_path): + """ + Modify from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68 + :param calib_path: Path to a calibration text file. + :return: dict with calibration matrices. 
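+
+    Illustrative usage (mirrors how get_projections() below uses the result):
+        calib = read_calib("calib.txt")
+        cam_k = calib["P2"][:3, :3]   # 3x3 left-camera intrinsics
+        T_velo_2_cam = calib["Tr"]    # 4x4 lidar-to-camera transform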
+ """ + calib_all = {} + with open(calib_path, "r") as f: + for line in f.readlines(): + if line == "\n": + break + key, value = line.split(":", 1) + calib_all[key] = np.array([float(x) for x in value.split()]) + + # reshape matrices + calib_out = {} + # 3x4 projection matrix for left camera + calib_out["P2"] = calib_all["P2"].reshape(3, 4) + calib_out["Tr"] = np.identity(4) # 4x4 matrix + calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4) + return calib_out + + +def vox2pix(cam_E, cam_k, + vox_origin, voxel_size, + img_W, img_H, + scene_size): + """ + compute the 2D projection of voxels centroids + + Parameters: + ---------- + cam_E: 4x4 + =camera pose in case of NYUv2 dataset + =Transformation from camera to lidar coordinate in case of SemKITTI + cam_k: 3x3 + camera intrinsics + vox_origin: (3,) + world(NYU)/lidar(SemKITTI) cooridnates of the voxel at index (0, 0, 0) + img_W: int + image width + img_H: int + image height + scene_size: (3,) + scene size in meter: (51.2, 51.2, 6.4) for SemKITTI and (4.8, 4.8, 2.88) for NYUv2 + + Returns + ------- + projected_pix: (N, 2) + Projected 2D positions of voxels + fov_mask: (N,) + Voxels mask indice voxels inside image's FOV + pix_z: (N,) + Voxels'distance to the sensor in meter + """ + # Compute the x, y, z bounding of the scene in meter + vol_bnds = np.zeros((3,2)) + vol_bnds[:,0] = vox_origin + vol_bnds[:,1] = vox_origin + np.array(scene_size) + + # Compute the voxels centroids in lidar cooridnates + vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int) + xv, yv, zv = np.meshgrid( + range(vol_dim[0]), + range(vol_dim[1]), + range(vol_dim[2]), + indexing='ij' + ) + vox_coords = np.concatenate([ + xv.reshape(1,-1), + yv.reshape(1,-1), + zv.reshape(1,-1) + ], axis=0).astype(int).T + + # Project voxels'centroid from lidar coordinates to camera coordinates + cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size) + cam_pts = fusion.rigid_transform(cam_pts, cam_E) + + # Project camera coordinates to pixel positions + projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k) + pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1] + + # Eliminate pixels outside view frustum + pix_z = cam_pts[:, 2] + fov_mask = np.logical_and(pix_x >= 0, + np.logical_and(pix_x < img_W, + np.logical_and(pix_y >= 0, + np.logical_and(pix_y < img_H, + pix_z > 0)))) + + + return torch.from_numpy(projected_pix), torch.from_numpy(fov_mask), torch.from_numpy(pix_z) + + + +def get_grid_coords(dims, resolution): + """ + :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32]) + :return coords_grid: is the center coords of voxels in the grid + """ + + g_xx = np.arange(0, dims[0] + 1) + g_yy = np.arange(0, dims[1] + 1) + sensor_pose = 10 + g_zz = np.arange(0, dims[2] + 1) + + # Obtaining the grid with coords... 
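+    # (each voxel centre ends up at (index + 0.5) * resolution; the x and y
+    # columns are swapped afterwards to match the plotting orientation)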
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1]) + coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T + coords_grid = coords_grid.astype(np.float) + + coords_grid = (coords_grid * resolution) + resolution / 2 + + temp = np.copy(coords_grid) + temp[:, 0] = coords_grid[:, 1] + temp[:, 1] = coords_grid[:, 0] + coords_grid = np.copy(temp) + + return coords_grid + +def get_projections(img_W, img_H): + scale_3ds = [1, 2] + data = {} + for scale_3d in scale_3ds: + scene_size = (51.2, 51.2, 6.4) + vox_origin = np.array([0, -25.6, -2]) + voxel_size = 0.2 + + calib = read_calib("calib.txt") + cam_k = calib["P2"][:3, :3] + T_velo_2_cam = calib["Tr"] + + # compute the 3D-2D mapping + projected_pix, fov_mask, pix_z = vox2pix( + T_velo_2_cam, + cam_k, + vox_origin, + voxel_size * scale_3d, + img_W, + img_H, + scene_size, + ) + + data["projected_pix_{}".format(scale_3d)] = projected_pix + data["pix_z_{}".format(scale_3d)] = pix_z + data["fov_mask_{}".format(scale_3d)] = fov_mask + return data + + +def majority_pooling(grid, k_size=2): + result = np.zeros( + (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size) + ) + for xx in range(0, int(np.floor(grid.shape[0] / k_size))): + for yy in range(0, int(np.floor(grid.shape[1] / k_size))): + for zz in range(0, int(np.floor(grid.shape[2] / k_size))): + + sub_m = grid[ + (xx * k_size) : (xx * k_size) + k_size, + (yy * k_size) : (yy * k_size) + k_size, + (zz * k_size) : (zz * k_size) + k_size, + ] + unique, counts = np.unique(sub_m, return_counts=True) + if True in ((unique != 0) & (unique != 255)): + # Remove counts with 0 and 255 + counts = counts[((unique != 0) & (unique != 255))] + unique = unique[((unique != 0) & (unique != 255))] + else: + if True in (unique == 0): + counts = counts[(unique != 255)] + unique = unique[(unique != 255)] + value = unique[np.argmax(counts)] + result[xx, yy, zz] = value + return result + + +def draw( + voxels, + # T_velo_2_cam, + # vox_origin, + fov_mask, + # img_size, + # f, + voxel_size=0.4, + # d=7, # 7m - determine the size of the mesh representing the camera +): + + fov_mask = fov_mask.reshape(-1) + # Compute the voxels coordinates + grid_coords = get_grid_coords( + [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size + ) + + + # Attach the predicted class to every voxel + grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T + + # Get the voxels inside FOV + fov_grid_coords = grid_coords[fov_mask, :] + + # Get the voxels outside FOV + outfov_grid_coords = grid_coords[~fov_mask, :] + + # Remove empty and unknown voxels + fov_voxels = fov_grid_coords[ + (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255), : + ] + # print(np.unique(fov_voxels[:, 3], return_counts=True)) + outfov_voxels = outfov_grid_coords[ + (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255), : + ] + + # figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1)) + colors = np.array( + [ + [0,0,0], + [100, 150, 245], + [100, 230, 245], + [30, 60, 150], + [80, 30, 180], + [100, 80, 250], + [255, 30, 30], + [255, 40, 200], + [150, 30, 90], + [255, 0, 255], + [255, 150, 255], + [75, 0, 75], + [175, 0, 75], + [255, 200, 0], + [255, 120, 50], + [0, 175, 0], + [135, 60, 0], + [150, 240, 80], + [255, 240, 150], + [255, 0, 0], + ] + ).astype(np.uint8) + + pts_colors = [f'rgb({colors[int(i)][0]}, {colors[int(i)][1]}, {colors[int(i)][2]})' for i in fov_voxels[:, 3]] + out_fov_colors = [f'rgb({colors[int(i)][0]//3*2}, {colors[int(i)][1]//3*2}, {colors[int(i)][2]//3*2})' for i in 
outfov_voxels[:, 3]] + pts_colors = pts_colors + out_fov_colors + + fov_voxels = np.concatenate([fov_voxels, outfov_voxels], axis=0) + x = fov_voxels[:, 0].flatten() + y = fov_voxels[:, 1].flatten() + z = fov_voxels[:, 2].flatten() + # label = fov_voxels[:, 3].flatten() + fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z,mode='markers', + marker=dict( + size=2, + color=pts_colors, # set color to an array/list of desired values + # colorscale='Viridis', # choose a colorscale + opacity=1.0, + symbol='square' + ))]) + fig.update_layout( + scene = dict( + aspectmode='data', + xaxis = dict( + backgroundcolor="rgb(255, 255, 255)", + gridcolor="black", + showbackground=True, + zerolinecolor="black", + nticks=4, + visible=False, + range=[-1,55],), + yaxis = dict( + backgroundcolor="rgb(255, 255, 255)", + gridcolor="black", + showbackground=True, + zerolinecolor="black", + visible=False, + nticks=4, range=[-1,55],), + zaxis = dict( + backgroundcolor="rgb(255, 255, 255)", + gridcolor="black", + showbackground=True, + zerolinecolor="black", + visible=False, + nticks=4, range=[-1,7],), + bgcolor="black", + ), + + ) + + # fig = px.scatter_3d( + # fov_voxels, + # x=fov_voxels[:, 0], y="y", z="z", color="label") + # Draw occupied inside FOV voxels + # plt_plot_fov = mlab.points3d( + # fov_voxels[:, 0], + # fov_voxels[:, 1], + # fov_voxels[:, 2], + # fov_voxels[:, 3], + # colormap="viridis", + # scale_factor=voxel_size - 0.05 * voxel_size, + # mode="cube", + # opacity=1.0, + # vmin=1, + # vmax=19, + # ) + + # # Draw occupied outside FOV voxels + # plt_plot_outfov = mlab.points3d( + # outfov_voxels[:, 0], + # outfov_voxels[:, 1], + # outfov_voxels[:, 2], + # outfov_voxels[:, 3], + # colormap="viridis", + # scale_factor=voxel_size - 0.05 * voxel_size, + # mode="cube", + # opacity=1.0, + # vmin=1, + # vmax=19, + # ) + + + + # plt_plot_fov.glyph.scale_mode = "scale_by_vector" + # plt_plot_outfov.glyph.scale_mode = "scale_by_vector" + + # plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors + + # outfov_colors = colors + # outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2 + # plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors + + # mlab.show() + return fig \ No newline at end of file diff --git a/images/08/000010.jpg b/images/08/000010.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dce9bd4712215f082178d79da224fedcd7d1f324 Binary files /dev/null and b/images/08/000010.jpg differ diff --git a/images/08/000085.jpg b/images/08/000085.jpg new file mode 100644 index 0000000000000000000000000000000000000000..92841f53fa1c483d5537341d2052c6a6921a8c07 Binary files /dev/null and b/images/08/000085.jpg differ diff --git a/images/08/000103.jpg b/images/08/000103.jpg new file mode 100644 index 0000000000000000000000000000000000000000..14460f856654a3ad525c80c9aa28a9b2e59ff7e7 Binary files /dev/null and b/images/08/000103.jpg differ diff --git a/images/08/000187.jpg b/images/08/000187.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3f1027f581c2d1ea8cd9fb7fdce028a3db1c2105 Binary files /dev/null and b/images/08/000187.jpg differ diff --git a/images/08/000234.jpg b/images/08/000234.jpg new file mode 100644 index 0000000000000000000000000000000000000000..2399753ecd614b6b0d1239e22c0d422e434f28ee Binary files /dev/null and b/images/08/000234.jpg differ diff --git a/images/08/000290.jpg b/images/08/000290.jpg new file mode 100644 index 0000000000000000000000000000000000000000..d09734b6689231c78216da5fdbe48c1e075c5b91 Binary files /dev/null 
and b/images/08/000290.jpg differ diff --git a/images/08/000295.jpg b/images/08/000295.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9113dad82318a509db95201dbab9b6ff834ecd9d Binary files /dev/null and b/images/08/000295.jpg differ diff --git a/images/08/000465.jpg b/images/08/000465.jpg new file mode 100644 index 0000000000000000000000000000000000000000..917d465fd5b9ec6065719e88d9d0cdf39fce5823 Binary files /dev/null and b/images/08/000465.jpg differ diff --git a/images/08/000511.jpg b/images/08/000511.jpg new file mode 100644 index 0000000000000000000000000000000000000000..378e6bd8023e5f643e0f9df25d0ef27e6c286498 Binary files /dev/null and b/images/08/000511.jpg differ diff --git a/images/08/000618.jpg b/images/08/000618.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6d036b40463933033371cbd8b821d8875dff2ae0 Binary files /dev/null and b/images/08/000618.jpg differ diff --git a/images/08/000790.jpg b/images/08/000790.jpg new file mode 100644 index 0000000000000000000000000000000000000000..77aea4cbaec5ec03f6cd583a77da7a7e9e4fee88 Binary files /dev/null and b/images/08/000790.jpg differ diff --git a/images/08/000822.jpg b/images/08/000822.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0139ce00faac15151767978b4c850b957f7f9298 Binary files /dev/null and b/images/08/000822.jpg differ diff --git a/images/08/001005.jpg b/images/08/001005.jpg new file mode 100644 index 0000000000000000000000000000000000000000..117d1da2374d1b1107acd6ea4fac0a11193ccde7 Binary files /dev/null and b/images/08/001005.jpg differ diff --git a/images/08/001122.jpg b/images/08/001122.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e1d8b7d6970c781e416ff1d3decd99241f568bd4 Binary files /dev/null and b/images/08/001122.jpg differ diff --git a/images/08/001380.jpg b/images/08/001380.jpg new file mode 100644 index 0000000000000000000000000000000000000000..9683401a216e810cd633971365b38fefaa827061 Binary files /dev/null and b/images/08/001380.jpg differ diff --git a/images/08/001385.jpg b/images/08/001385.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a53a048254d36d4ccab94611fef4b33188a49bb4 Binary files /dev/null and b/images/08/001385.jpg differ diff --git a/images/08/001446.jpg b/images/08/001446.jpg new file mode 100644 index 0000000000000000000000000000000000000000..a058286e4be7273cac49ce3e600036b758c38b5e Binary files /dev/null and b/images/08/001446.jpg differ diff --git a/images/08/001530.jpg b/images/08/001530.jpg new file mode 100644 index 0000000000000000000000000000000000000000..69deae8b12a066d3159bff13006b0c8fd3cd123c Binary files /dev/null and b/images/08/001530.jpg differ diff --git a/images/08/001687.jpg b/images/08/001687.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6e6eb1f062bed3357d5465b38d1409e030eba8bf Binary files /dev/null and b/images/08/001687.jpg differ diff --git a/images/08/001842.jpg b/images/08/001842.jpg new file mode 100644 index 0000000000000000000000000000000000000000..482996d026e61f51f1ecf860395cf37a95efa833 Binary files /dev/null and b/images/08/001842.jpg differ diff --git a/images/08/002010.jpg b/images/08/002010.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4bf5de984449de372b6ae8fe57bb7cf6dfa12d95 Binary files /dev/null and b/images/08/002010.jpg differ diff --git a/images/08/002128.jpg b/images/08/002128.jpg new file mode 100644 index 
0000000000000000000000000000000000000000..2d0b1a7b0cff4997b202ed1cb322285aee208de2 Binary files /dev/null and b/images/08/002128.jpg differ diff --git a/images/08/002272.jpg b/images/08/002272.jpg new file mode 100644 index 0000000000000000000000000000000000000000..e0d2ebc9427cf5dc313f0d0c42b3e4301b328535 Binary files /dev/null and b/images/08/002272.jpg differ diff --git a/images/08/002360.jpg b/images/08/002360.jpg new file mode 100644 index 0000000000000000000000000000000000000000..566632a905ecd04ae94b0ac599e56673ef92b92e Binary files /dev/null and b/images/08/002360.jpg differ diff --git a/images/08/002505.jpg b/images/08/002505.jpg new file mode 100644 index 0000000000000000000000000000000000000000..77b8cc2fb0fdc13604771274750bccc6ca05eae1 Binary files /dev/null and b/images/08/002505.jpg differ diff --git a/images/08/002716.jpg b/images/08/002716.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5385ac65b015c458028b94d76d86a26cded7b203 Binary files /dev/null and b/images/08/002716.jpg differ diff --git a/images/08/002944.jpg b/images/08/002944.jpg new file mode 100644 index 0000000000000000000000000000000000000000..6ad1e8a50cab8643acc1b7f238d16aeeb55611bf Binary files /dev/null and b/images/08/002944.jpg differ diff --git a/images/08/003149.jpg b/images/08/003149.jpg new file mode 100644 index 0000000000000000000000000000000000000000..bab4c49d2515086351188e69af2cdd5db5e32fce Binary files /dev/null and b/images/08/003149.jpg differ diff --git a/images/08/003365.jpg b/images/08/003365.jpg new file mode 100644 index 0000000000000000000000000000000000000000..5d77465687501909179664683b2e0432ab5dedf1 Binary files /dev/null and b/images/08/003365.jpg differ diff --git a/images/08/003533.jpg b/images/08/003533.jpg new file mode 100644 index 0000000000000000000000000000000000000000..4b963deee51da8231a4be14854c571c1b1a822d0 Binary files /dev/null and b/images/08/003533.jpg differ diff --git a/images/08/003790.jpg b/images/08/003790.jpg new file mode 100644 index 0000000000000000000000000000000000000000..0e9ea9359144d5f6479a855dbc9d82d8bb527097 Binary files /dev/null and b/images/08/003790.jpg differ diff --git a/images/08/003929.jpg b/images/08/003929.jpg new file mode 100644 index 0000000000000000000000000000000000000000..74cab6e5f72cdaa1b9badc124f33e19d2bef1540 Binary files /dev/null and b/images/08/003929.jpg differ diff --git a/images/08/004059.jpg b/images/08/004059.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1620e3db907758dd12b4b4dc6ac7853916916540 Binary files /dev/null and b/images/08/004059.jpg differ diff --git a/monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py b/monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..c88b7b309e6fe66f597cafe2a5eb8c6d29343b7e --- /dev/null +++ b/monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py @@ -0,0 +1,97 @@ +import torch +import torch.nn as nn +from monoscene.modules import ( + Process, + ASPP, +) + + +class CPMegaVoxels(nn.Module): + def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003): + super().__init__() + self.size = size + self.n_relations = n_relations + print("n_relations", self.n_relations) + self.flatten_size = size[0] * size[1] * size[2] + self.feature = feature + self.context_feature = feature * 2 + self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2) + padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2) + + self.mega_context = nn.Sequential( + nn.Conv3d( + feature, 
self.context_feature, stride=2, padding=padding, kernel_size=3 + ), + ) + self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2) + + self.context_prior_logits = nn.ModuleList( + [ + nn.Sequential( + nn.Conv3d( + self.feature, + self.flatten_context_size, + padding=0, + kernel_size=1, + ), + ) + for i in range(n_relations) + ] + ) + self.aspp = ASPP(feature, [1, 2, 3]) + + self.resize = nn.Sequential( + nn.Conv3d( + self.context_feature * self.n_relations + feature, + feature, + kernel_size=1, + padding=0, + bias=False, + ), + Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]), + ) + + def forward(self, input): + ret = {} + bs = input.shape[0] + + x_agg = self.aspp(input) + + # get the mega context + x_mega_context_raw = self.mega_context(x_agg) + x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1) + x_mega_context = x_mega_context.permute(0, 2, 1) + + # get context prior map + x_context_prior_logits = [] + x_context_rels = [] + for rel in range(self.n_relations): + + # Compute the relation matrices + x_context_prior_logit = self.context_prior_logits[rel](x_agg) + x_context_prior_logit = x_context_prior_logit.reshape( + bs, self.flatten_context_size, self.flatten_size + ) + x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1)) + + x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1) + x_context_prior = torch.sigmoid(x_context_prior_logit) + + # Multiply the relation matrices with the mega context to gather context features + x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f + x_context_rels.append(x_context_rel) + + x_context = torch.cat(x_context_rels, dim=2) + x_context = x_context.permute(0, 2, 1) + x_context = x_context.reshape( + bs, x_context.shape[1], self.size[0], self.size[1], self.size[2] + ) + + x = torch.cat([input, x_context], dim=1) + x = self.resize(x) + + x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1) + ret["P_logits"] = x_context_prior_logits + ret["x"] = x + + return ret diff --git a/monoscene/.ipynb_checkpoints/config-checkpoint.py b/monoscene/.ipynb_checkpoints/config-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..7cb935d3d3a41c8973e72210323205607aff2dc5 --- /dev/null +++ b/monoscene/.ipynb_checkpoints/config-checkpoint.py @@ -0,0 +1,34 @@ +from transformers import PretrainedConfig +from typing import List + + +class MonoSceneConfig(PretrainedConfig): + + def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, + ): + self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) + + + + + diff --git a/monoscene/.ipynb_checkpoints/modules-checkpoint.py b/monoscene/.ipynb_checkpoints/modules-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..3e8bf875ccd6dffb51bb5acb25f0302fe0032d6c --- /dev/null +++ b/monoscene/.ipynb_checkpoints/modules-checkpoint.py @@ -0,0 +1,194 @@ +import torch +import torch.nn as nn +from monoscene.DDR import Bottleneck3D + + +class ASPP(nn.Module): + """ + ASPP 3D + Adapt from 
https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7 + """ + + def __init__(self, planes, dilations_conv_list): + super().__init__() + + # ASPP Block + self.conv_list = dilations_conv_list + self.conv1 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn1 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.conv2 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn2 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.relu = nn.ReLU() + + def forward(self, x_in): + + y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in))))) + for i in range(1, len(self.conv_list)): + y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in))))) + x_in = self.relu(y + x_in) # modified + + return x_in + + +class SegmentationHead(nn.Module): + """ + 3D Segmentation heads to retrieve semantic segmentation at each scale. + Formed by Dim expansion, Conv3D, ASPP block, Conv3D. + Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7 + """ + + def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list): + super().__init__() + + # First convolution + self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1) + + # ASPP Block + self.conv_list = dilations_conv_list + self.conv1 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn1 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.conv2 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn2 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.relu = nn.ReLU() + + self.conv_classes = nn.Conv3d( + planes, nbr_classes, kernel_size=3, padding=1, stride=1 + ) + + def forward(self, x_in): + + # Convolution to go from inplanes to planes features... 
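+        # (the dilated ASPP branches computed below are summed and added back
+        # to x_in as a residual before the final class convolution)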
+ x_in = self.relu(self.conv0(x_in)) + + y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in))))) + for i in range(1, len(self.conv_list)): + y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in))))) + x_in = self.relu(y + x_in) # modified + + x_in = self.conv_classes(x_in) + + return x_in + + +class ProcessKitti(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]): + super(Process, self).__init__() + self.main = nn.Sequential( + *[ + Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + norm_layer=norm_layer, + dilation=[i, i, i], + ) + for i in dilations + ] + ) + + def forward(self, x): + return self.main(x) + + +class Process(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]): + super(Process, self).__init__() + self.main = nn.Sequential( + *[ + Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + norm_layer=norm_layer, + dilation=[i, i, i], + ) + for i in dilations + ] + ) + + def forward(self, x): + return self.main(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, out_channels, norm_layer, bn_momentum): + super(Upsample, self).__init__() + self.main = nn.Sequential( + nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + dilation=1, + output_padding=1, + ), + norm_layer(out_channels, momentum=bn_momentum), + nn.ReLU(), + ) + + def forward(self, x): + return self.main(x) + + +class Downsample(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, expansion=8): + super(Downsample, self).__init__() + self.main = Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + expansion=expansion, + stride=2, + downsample=nn.Sequential( + nn.AvgPool3d(kernel_size=2, stride=2), + nn.Conv3d( + feature, + int(feature * expansion / 4), + kernel_size=1, + stride=1, + bias=False, + ), + norm_layer(int(feature * expansion / 4), momentum=bn_momentum), + ), + norm_layer=norm_layer, + ) + + def forward(self, x): + return self.main(x) diff --git a/monoscene/.ipynb_checkpoints/monoscene-checkpoint.py b/monoscene/.ipynb_checkpoints/monoscene-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..bc4d020729b6698887055771439f87a491572bd1 --- /dev/null +++ b/monoscene/.ipynb_checkpoints/monoscene-checkpoint.py @@ -0,0 +1,123 @@ +import pytorch_lightning as pl +import torch +import torch.nn as nn +from monoscene.unet3d_nyu import UNet3D as UNet3DNYU +from monoscene.unet3d_kitti import UNet3D as UNet3DKitti +from monoscene.flosp import FLoSP +import numpy as np +import torch.nn.functional as F +from monoscene.unet2d import UNet2D + + +class MonoScene(pl.LightningModule): + def __init__( + self, + n_classes, + feature, + project_scale, + full_scene_size, + dataset, + n_relations=4, + context_prior=True, + fp_loss=True, + project_res=[], + frustum_size=4, + relation_loss=False, + CE_ssc_loss=True, + geo_scal_loss=True, + sem_scal_loss=True, + lr=1e-4, + weight_decay=1e-4, + ): + super().__init__() + + self.project_res = project_res + self.fp_loss = fp_loss + self.dataset = dataset + self.context_prior = context_prior + self.frustum_size = frustum_size + self.relation_loss = relation_loss + self.CE_ssc_loss = CE_ssc_loss + self.sem_scal_loss = sem_scal_loss + self.geo_scal_loss = geo_scal_loss + self.project_scale = project_scale + self.lr = lr + self.weight_decay = weight_decay + + self.projects = {} + self.scale_2ds = [1, 2, 4, 8] # 2D scales + for scale_2d in 
self.scale_2ds: + self.projects[str(scale_2d)] = FLoSP( + full_scene_size, project_scale=self.project_scale, dataset=self.dataset + ) + self.projects = nn.ModuleDict(self.projects) + + self.n_classes = n_classes + if self.dataset == "NYU": + self.net_3d_decoder = UNet3DNYU( + self.n_classes, + nn.BatchNorm3d, + n_relations=n_relations, + feature=feature, + full_scene_size=full_scene_size, + context_prior=context_prior, + ) + elif self.dataset == "kitti": + self.net_3d_decoder = UNet3DKitti( + self.n_classes, + nn.BatchNorm3d, + project_scale=project_scale, + feature=feature, + full_scene_size=full_scene_size, + context_prior=context_prior, + ) + self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True) + + def forward(self, batch): + + img = batch["img"] + bs = len(img) + + out = {} + + x_rgb = self.net_rgb(img) + + x3ds = [] + for i in range(bs): + x3d = None + for scale_2d in self.project_res: + + # project features at each 2D scale to target 3D scale + scale_2d = int(scale_2d) + projected_pix = batch["projected_pix_{}".format(self.project_scale)][i].cuda() + fov_mask = batch["fov_mask_{}".format(self.project_scale)][i].cuda() + + # Sum all the 3D features + if x3d is None: + x3d = self.projects[str(scale_2d)]( + x_rgb["1_" + str(scale_2d)][i], + projected_pix // scale_2d, + fov_mask, + ) + else: + x3d += self.projects[str(scale_2d)]( + x_rgb["1_" + str(scale_2d)][i], + projected_pix // scale_2d, + fov_mask, + ) + x3ds.append(x3d) + + input_dict = { + "x3d": torch.stack(x3ds), + } + + out_dict = self.net_3d_decoder(input_dict) + + ssc_pred = out_dict["ssc_logit"] + + y_pred = ssc_pred.detach().cpu().numpy() + y_pred = np.argmax(y_pred, axis=1) + + return y_pred + + diff --git a/monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py b/monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..1bf3d80ea531ff02b3229b862b7a4cd0aec8ec58 --- /dev/null +++ b/monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py @@ -0,0 +1,22 @@ +from transformers import PreTrainedModel +from .config import MonoSceneConfig +from monoscene.monoscene import MonoScene + + + +class MonoSceneModel(PreTrainedModel): + config_class = ResnetConfig + + def __init__(self, config): + super().__init__(config) + self.model = MonoScene( + dataset=config.dataset, + n_classes=config.n_classes, + feature=config.feature, + project_scale=config.project_scale, + full_scene_size=config.full_scene_size + ) + + + def forward(self, tensor): + return self.model.forward(tensor) \ No newline at end of file diff --git a/monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py b/monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..91d5339fbdf34e28d017d7e4e29ce4923169bef5 --- /dev/null +++ b/monoscene/.ipynb_checkpoints/unet3d_kitti-checkpoint.py @@ -0,0 +1,88 @@ +# encoding: utf-8 +import torch +import torch.nn as nn +import torch.nn.functional as F +from monoscene.modules import SegmentationHead +from monoscene.CRP3D import CPMegaVoxels +from monoscene.modules import Process, Upsample, Downsample + + +class UNet3D(nn.Module): + def __init__( + self, + class_num, + norm_layer, + full_scene_size, + feature, + project_scale, + context_prior=None, + bn_momentum=0.1, + ): + super(UNet3D, self).__init__() + self.business_layer = [] + self.project_scale = project_scale + self.full_scene_size = full_scene_size + self.feature = feature + + size_l1 = ( + int(self.full_scene_size[0] / 
project_scale), + int(self.full_scene_size[1] / project_scale), + int(self.full_scene_size[2] / project_scale), + ) + size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2) + size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2) + + dilations = [1, 2, 3] + self.process_l1 = nn.Sequential( + Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature, norm_layer, bn_momentum), + ) + self.process_l2 = nn.Sequential( + Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature * 2, norm_layer, bn_momentum), + ) + + self.up_13_l2 = Upsample( + self.feature * 4, self.feature * 2, norm_layer, bn_momentum + ) + self.up_12_l1 = Upsample( + self.feature * 2, self.feature, norm_layer, bn_momentum + ) + self.up_l1_lfull = Upsample( + self.feature, self.feature // 2, norm_layer, bn_momentum + ) + + self.ssc_head = SegmentationHead( + self.feature // 2, self.feature // 2, class_num, dilations + ) + + self.context_prior = context_prior + if context_prior: + self.CP_mega_voxels = CPMegaVoxels( + self.feature * 4, size_l3, bn_momentum=bn_momentum + ) + + def forward(self, input_dict): + res = {} + + x3d_l1 = input_dict["x3d"] + + x3d_l2 = self.process_l1(x3d_l1) + + x3d_l3 = self.process_l2(x3d_l2) + + if self.context_prior: + ret = self.CP_mega_voxels(x3d_l3) + x3d_l3 = ret["x"] + for k in ret.keys(): + res[k] = ret[k] + + x3d_up_l2 = self.up_13_l2(x3d_l3) + x3d_l2 + x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1 + x3d_up_lfull = self.up_l1_lfull(x3d_up_l1) + + ssc_logit_full = self.ssc_head(x3d_up_lfull) + + res["ssc_logit"] = ssc_logit_full + + return res diff --git a/monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py b/monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..e9e3b3718999248efa1b2925658465ba59801b13 --- /dev/null +++ b/monoscene/.ipynb_checkpoints/unet3d_nyu-checkpoint.py @@ -0,0 +1,90 @@ +# encoding: utf-8 +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from monoscene.CRP3D import CPMegaVoxels +from monoscene.modules import ( + Process, + Upsample, + Downsample, + SegmentationHead, + ASPP, +) + + +class UNet3D(nn.Module): + def __init__( + self, + class_num, + norm_layer, + feature, + full_scene_size, + n_relations=4, + project_res=[], + context_prior=True, + bn_momentum=0.1, + ): + super(UNet3D, self).__init__() + self.business_layer = [] + self.project_res = project_res + + self.feature_1_4 = feature + self.feature_1_8 = feature * 2 + self.feature_1_16 = feature * 4 + + self.feature_1_16_dec = self.feature_1_16 + self.feature_1_8_dec = self.feature_1_8 + self.feature_1_4_dec = self.feature_1_4 + + self.process_1_4 = nn.Sequential( + Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature_1_4, norm_layer, bn_momentum), + ) + self.process_1_8 = nn.Sequential( + Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature_1_8, norm_layer, bn_momentum), + ) + self.up_1_16_1_8 = Upsample( + self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum + ) + self.up_1_8_1_4 = Upsample( + self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum + ) + self.ssc_head_1_4 = SegmentationHead( + self.feature_1_4_dec, self.feature_1_4_dec, class_num, [1, 2, 3] + ) + + self.context_prior = context_prior + size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size) + + if context_prior: + 
self.CP_mega_voxels = CPMegaVoxels( + self.feature_1_16, + size_1_16, + n_relations=n_relations, + bn_momentum=bn_momentum, + ) + + # + def forward(self, input_dict): + res = {} + + x3d_1_4 = input_dict["x3d"] + x3d_1_8 = self.process_1_4(x3d_1_4) + x3d_1_16 = self.process_1_8(x3d_1_8) + + if self.context_prior: + ret = self.CP_mega_voxels(x3d_1_16) + x3d_1_16 = ret["x"] + for k in ret.keys(): + res[k] = ret[k] + + x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8 + x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4 + + ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4) + + res["ssc_logit"] = ssc_logit_1_4 + + return res diff --git a/monoscene/CRP3D.py b/monoscene/CRP3D.py new file mode 100644 index 0000000000000000000000000000000000000000..c88b7b309e6fe66f597cafe2a5eb8c6d29343b7e --- /dev/null +++ b/monoscene/CRP3D.py @@ -0,0 +1,97 @@ +import torch +import torch.nn as nn +from monoscene.modules import ( + Process, + ASPP, +) + + +class CPMegaVoxels(nn.Module): + def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003): + super().__init__() + self.size = size + self.n_relations = n_relations + print("n_relations", self.n_relations) + self.flatten_size = size[0] * size[1] * size[2] + self.feature = feature + self.context_feature = feature * 2 + self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2) + padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2) + + self.mega_context = nn.Sequential( + nn.Conv3d( + feature, self.context_feature, stride=2, padding=padding, kernel_size=3 + ), + ) + self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2) + + self.context_prior_logits = nn.ModuleList( + [ + nn.Sequential( + nn.Conv3d( + self.feature, + self.flatten_context_size, + padding=0, + kernel_size=1, + ), + ) + for i in range(n_relations) + ] + ) + self.aspp = ASPP(feature, [1, 2, 3]) + + self.resize = nn.Sequential( + nn.Conv3d( + self.context_feature * self.n_relations + feature, + feature, + kernel_size=1, + padding=0, + bias=False, + ), + Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]), + ) + + def forward(self, input): + ret = {} + bs = input.shape[0] + + x_agg = self.aspp(input) + + # get the mega context + x_mega_context_raw = self.mega_context(x_agg) + x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1) + x_mega_context = x_mega_context.permute(0, 2, 1) + + # get context prior map + x_context_prior_logits = [] + x_context_rels = [] + for rel in range(self.n_relations): + + # Compute the relation matrices + x_context_prior_logit = self.context_prior_logits[rel](x_agg) + x_context_prior_logit = x_context_prior_logit.reshape( + bs, self.flatten_context_size, self.flatten_size + ) + x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1)) + + x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1) + x_context_prior = torch.sigmoid(x_context_prior_logit) + + # Multiply the relation matrices with the mega context to gather context features + x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f + x_context_rels.append(x_context_rel) + + x_context = torch.cat(x_context_rels, dim=2) + x_context = x_context.permute(0, 2, 1) + x_context = x_context.reshape( + bs, x_context.shape[1], self.size[0], self.size[1], self.size[2] + ) + + x = torch.cat([input, x_context], dim=1) + x = self.resize(x) + + x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1) + ret["P_logits"] = x_context_prior_logits + ret["x"] = x + + return ret diff --git a/monoscene/DDR.py 
b/monoscene/DDR.py new file mode 100644 index 0000000000000000000000000000000000000000..0d0928c0741433dc24523a2c26bfad9ef1ff920e --- /dev/null +++ b/monoscene/DDR.py @@ -0,0 +1,139 @@ +""" +Most of the code in this file is taken from https://github.com/waterljwant/SSC/blob/master/models/DDR.py +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class SimpleRB(nn.Module): + def __init__(self, in_channel, norm_layer, bn_momentum): + super(SimpleRB, self).__init__() + self.path = nn.Sequential( + nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False), + norm_layer(in_channel, momentum=bn_momentum), + nn.ReLU(), + nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False), + norm_layer(in_channel, momentum=bn_momentum), + ) + self.relu = nn.ReLU() + + def forward(self, x): + residual = x + conv_path = self.path(x) + out = residual + conv_path + out = self.relu(out) + return out + + +""" +3D Residual Block,3x3x3 conv ==> 3 smaller 3D conv, refered from DDRNet +""" + + +class Bottleneck3D(nn.Module): + def __init__( + self, + inplanes, + planes, + norm_layer, + stride=1, + dilation=[1, 1, 1], + expansion=4, + downsample=None, + fist_dilation=1, + multi_grid=1, + bn_momentum=0.0003, + ): + super(Bottleneck3D, self).__init__() + # often,planes = inplanes // 4 + self.expansion = expansion + self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False) + self.bn1 = norm_layer(planes, momentum=bn_momentum) + self.conv2 = nn.Conv3d( + planes, + planes, + kernel_size=(1, 1, 3), + stride=(1, 1, stride), + dilation=(1, 1, dilation[0]), + padding=(0, 0, dilation[0]), + bias=False, + ) + self.bn2 = norm_layer(planes, momentum=bn_momentum) + self.conv3 = nn.Conv3d( + planes, + planes, + kernel_size=(1, 3, 1), + stride=(1, stride, 1), + dilation=(1, dilation[1], 1), + padding=(0, dilation[1], 0), + bias=False, + ) + self.bn3 = norm_layer(planes, momentum=bn_momentum) + self.conv4 = nn.Conv3d( + planes, + planes, + kernel_size=(3, 1, 1), + stride=(stride, 1, 1), + dilation=(dilation[2], 1, 1), + padding=(dilation[2], 0, 0), + bias=False, + ) + self.bn4 = norm_layer(planes, momentum=bn_momentum) + self.conv5 = nn.Conv3d( + planes, planes * self.expansion, kernel_size=(1, 1, 1), bias=False + ) + self.bn5 = norm_layer(planes * self.expansion, momentum=bn_momentum) + + self.relu = nn.ReLU(inplace=False) + self.relu_inplace = nn.ReLU(inplace=True) + self.downsample = downsample + self.dilation = dilation + self.stride = stride + + self.downsample2 = nn.Sequential( + nn.AvgPool3d(kernel_size=(1, stride, 1), stride=(1, stride, 1)), + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False), + norm_layer(planes, momentum=bn_momentum), + ) + self.downsample3 = nn.Sequential( + nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)), + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False), + norm_layer(planes, momentum=bn_momentum), + ) + self.downsample4 = nn.Sequential( + nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)), + nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False), + norm_layer(planes, momentum=bn_momentum), + ) + + def forward(self, x): + residual = x + + out1 = self.relu(self.bn1(self.conv1(x))) + out2 = self.bn2(self.conv2(out1)) + out2_relu = self.relu(out2) + + out3 = self.bn3(self.conv3(out2_relu)) + if self.stride != 1: + out2 = self.downsample2(out2) + out3 = out3 + out2 + out3_relu = self.relu(out3) + + out4 = self.bn4(self.conv4(out3_relu)) + if self.stride != 1: + out2 = 
self.downsample3(out2) + out3 = self.downsample4(out3) + out4 = out4 + out2 + out3 + + out4_relu = self.relu(out4) + out5 = self.bn5(self.conv5(out4_relu)) + + if self.downsample is not None: + residual = self.downsample(x) + + out = out5 + residual + out_relu = self.relu(out) + + return out_relu diff --git a/monoscene/__init__.py b/monoscene/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/monoscene/app.py b/monoscene/app.py new file mode 100644 index 0000000000000000000000000000000000000000..8e70631e75313a28bc978ac3d3bd5df28b61a552 --- /dev/null +++ b/monoscene/app.py @@ -0,0 +1,138 @@ +from pytorch_lightning import Trainer +from monoscene.models.monoscene import MonoScene +from monoscene.data.NYU.nyu_dm import NYUDataModule +from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule +from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule +# import hydra +from omegaconf import DictConfig +import torch +import numpy as np +import os +from hydra.utils import get_original_cwd +import gradio as gr +import numpy as np +import plotly.express as px +import pandas as pd + + +# @hydra.main(config_name="../config/monoscene.yaml") +def plot(input_img): + torch.set_grad_enabled(False) + + # Setup dataloader + # if config.dataset == "kitti" or config.dataset == "kitti_360": + feature = 64 + project_scale = 2 + full_scene_size = (256, 256, 32) + + # if config.dataset == "kitti": + # data_module = KittiDataModule( + # root=config.kitti_root, + # preprocess_root=config.kitti_preprocess_root, + # frustum_size=config.frustum_size, + # batch_size=int(config.batch_size / config.n_gpus), + # num_workers=int(config.num_workers_per_gpu * config.n_gpus), + # ) + # data_module.setup() + # data_loader = data_module.val_dataloader() + # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set + # else: + # data_module = Kitti360DataModule( + # root=config.kitti_360_root, + # sequences=[config.kitti_360_sequence], + # n_scans=2000, + # batch_size=1, + # num_workers=3, + # ) + # data_module.setup() + # data_loader = data_module.dataloader() + + # elif config.dataset == "NYU": + # project_scale = 1 + # feature = 200 + # full_scene_size = (60, 36, 60) + # data_module = NYUDataModule( + # root=config.NYU_root, + # preprocess_root=config.NYU_preprocess_root, + # n_relations=config.n_relations, + # frustum_size=config.frustum_size, + # batch_size=int(config.batch_size / config.n_gpus), + # num_workers=int(config.num_workers_per_gpu * config.n_gpus), + # ) + # data_module.setup() + # data_loader = data_module.val_dataloader() + # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set + # else: + # print("dataset not support") + + # Load pretrained models + # if config.dataset == "NYU": + # model_path = os.path.join( + # get_original_cwd(), "trained_models", "monoscene_nyu.ckpt" + # ) + # else: + # model_path = os.path.join( + # get_original_cwd(), "trained_models", "monoscene_kitti.ckpt" + # ) + model_path = "trained_models/monoscene_kitti.ckpt" + + model = MonoScene.load_from_checkpoint( + model_path, + feature=feature, + project_scale=project_scale, + fp_loss=False, + full_scene_size=full_scene_size, + ) + model.cuda() + model.eval() + + print(input_img.shape) + + x = np.arange(12).reshape(4, 3) / 12 + data = pd.DataFrame(data=x, columns=['x', 'y', 'z']) + fig = px.scatter_3d(data, x="x", y="y", z="z") + return fig + +demo = gr.Interface(plot, 
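+ # input component: an image (shape 200x200); output component: the Plotly figure returned by plot()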
gr.Image(shape=(200, 200)), gr.Plot()) +demo.launch() + + + + # Save prediction and additional data + # to draw the viewing frustum and remove scene outside the room for NYUv2 + # output_path = os.path.join(config.output_path, config.dataset) + # with torch.no_grad(): + # for batch in tqdm(data_loader): + # batch["img"] = batch["img"].cuda() + # pred = model(batch) + # y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy() + # y_pred = np.argmax(y_pred, axis=1) + # for i in range(config.batch_size): + # out_dict = {"y_pred": y_pred[i].astype(np.uint16)} + # if "target" in batch: + # out_dict["target"] = ( + # batch["target"][i].detach().cpu().numpy().astype(np.uint16) + # ) + + # if config.dataset == "NYU": + # write_path = output_path + # filepath = os.path.join(write_path, batch["name"][i] + ".pkl") + # out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy() + # out_dict["vox_origin"] = ( + # batch["vox_origin"][i].detach().cpu().numpy() + # ) + # else: + # write_path = os.path.join(output_path, batch["sequence"][i]) + # filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl") + # out_dict["fov_mask_1"] = ( + # batch["fov_mask_1"][i].detach().cpu().numpy() + # ) + # out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy() + # out_dict["T_velo_2_cam"] = ( + # batch["T_velo_2_cam"][i].detach().cpu().numpy() + # ) + + # os.makedirs(write_path, exist_ok=True) + # with open(filepath, "wb") as handle: + # pickle.dump(out_dict, handle) + # print("wrote to", filepath) \ No newline at end of file diff --git a/monoscene/config.py b/monoscene/config.py new file mode 100644 index 0000000000000000000000000000000000000000..e03e806ad5e0c7ea4c439e3e82d955e3c0b3038f --- /dev/null +++ b/monoscene/config.py @@ -0,0 +1,26 @@ +from transformers import PretrainedConfig +from typing import List + + +class MonoSceneConfig(PretrainedConfig): + + def __init__( + self, + dataset="kitti", + n_classes=20, + feature=64, + project_scale=2, + full_scene_size=(256, 256, 32), + **kwargs, + ): + self.dataset = dataset + self.n_classes = n_classes + self.feature = feature + self.project_scale = project_scale + self.full_scene_size = full_scene_size + super().__init__(**kwargs) + + + + + diff --git a/monoscene/flosp.py b/monoscene/flosp.py new file mode 100644 index 0000000000000000000000000000000000000000..2d502197a72ee120773a47f239e86743f5a1e2d4 --- /dev/null +++ b/monoscene/flosp.py @@ -0,0 +1,41 @@ +import torch +import torch.nn as nn + + +class FLoSP(nn.Module): + def __init__(self, scene_size, dataset, project_scale): + super().__init__() + self.scene_size = scene_size + self.dataset = dataset + self.project_scale = project_scale + + def forward(self, x2d, projected_pix, fov_mask): + c, h, w = x2d.shape + + src = x2d.view(c, -1) + zeros_vec = torch.zeros(c, 1).type_as(src) + src = torch.cat([src, zeros_vec], 1) + + pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1] + img_indices = pix_y * w + pix_x + img_indices[~fov_mask] = h * w + img_indices = img_indices.expand(c, -1).long() # c, HWD + src_feature = torch.gather(src, 1, img_indices) + + if self.dataset == "NYU": + x3d = src_feature.reshape( + c, + self.scene_size[0] // self.project_scale, + self.scene_size[2] // self.project_scale, + self.scene_size[1] // self.project_scale, + ) + x3d = x3d.permute(0, 1, 3, 2) + elif self.dataset == "kitti": + x3d = src_feature.reshape( + c, + self.scene_size[0] // self.project_scale, + self.scene_size[1] // self.project_scale, + self.scene_size[2] // self.project_scale, + ) + + 
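+ # each voxel of the (scene_size // project_scale) grid now holds the 2D feature it projects onto; voxels outside the camera FOV were mapped to the appended zero column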
return x3d diff --git a/monoscene/modules.py b/monoscene/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..3e8bf875ccd6dffb51bb5acb25f0302fe0032d6c --- /dev/null +++ b/monoscene/modules.py @@ -0,0 +1,194 @@ +import torch +import torch.nn as nn +from monoscene.DDR import Bottleneck3D + + +class ASPP(nn.Module): + """ + ASPP 3D + Adapt from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7 + """ + + def __init__(self, planes, dilations_conv_list): + super().__init__() + + # ASPP Block + self.conv_list = dilations_conv_list + self.conv1 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn1 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.conv2 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn2 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.relu = nn.ReLU() + + def forward(self, x_in): + + y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in))))) + for i in range(1, len(self.conv_list)): + y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in))))) + x_in = self.relu(y + x_in) # modified + + return x_in + + +class SegmentationHead(nn.Module): + """ + 3D Segmentation heads to retrieve semantic segmentation at each scale. + Formed by Dim expansion, Conv3D, ASPP block, Conv3D. + Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7 + """ + + def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list): + super().__init__() + + # First convolution + self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1) + + # ASPP Block + self.conv_list = dilations_conv_list + self.conv1 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn1 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.conv2 = nn.ModuleList( + [ + nn.Conv3d( + planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False + ) + for dil in dilations_conv_list + ] + ) + self.bn2 = nn.ModuleList( + [nn.BatchNorm3d(planes) for dil in dilations_conv_list] + ) + self.relu = nn.ReLU() + + self.conv_classes = nn.Conv3d( + planes, nbr_classes, kernel_size=3, padding=1, stride=1 + ) + + def forward(self, x_in): + + # Convolution to go from inplanes to planes features... 
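+ # ...then an ASPP block whose dilated branches are summed residually, followed by a final 3D conv producing nbr_classes channels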
+ x_in = self.relu(self.conv0(x_in)) + + y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in))))) + for i in range(1, len(self.conv_list)): + y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in))))) + x_in = self.relu(y + x_in) # modified + + x_in = self.conv_classes(x_in) + + return x_in + + +class ProcessKitti(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]): + super(Process, self).__init__() + self.main = nn.Sequential( + *[ + Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + norm_layer=norm_layer, + dilation=[i, i, i], + ) + for i in dilations + ] + ) + + def forward(self, x): + return self.main(x) + + +class Process(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]): + super(Process, self).__init__() + self.main = nn.Sequential( + *[ + Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + norm_layer=norm_layer, + dilation=[i, i, i], + ) + for i in dilations + ] + ) + + def forward(self, x): + return self.main(x) + + +class Upsample(nn.Module): + def __init__(self, in_channels, out_channels, norm_layer, bn_momentum): + super(Upsample, self).__init__() + self.main = nn.Sequential( + nn.ConvTranspose3d( + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + dilation=1, + output_padding=1, + ), + norm_layer(out_channels, momentum=bn_momentum), + nn.ReLU(), + ) + + def forward(self, x): + return self.main(x) + + +class Downsample(nn.Module): + def __init__(self, feature, norm_layer, bn_momentum, expansion=8): + super(Downsample, self).__init__() + self.main = Bottleneck3D( + feature, + feature // 4, + bn_momentum=bn_momentum, + expansion=expansion, + stride=2, + downsample=nn.Sequential( + nn.AvgPool3d(kernel_size=2, stride=2), + nn.Conv3d( + feature, + int(feature * expansion / 4), + kernel_size=1, + stride=1, + bias=False, + ), + norm_layer(int(feature * expansion / 4), momentum=bn_momentum), + ), + norm_layer=norm_layer, + ) + + def forward(self, x): + return self.main(x) diff --git a/monoscene/monoscene.py b/monoscene/monoscene.py new file mode 100644 index 0000000000000000000000000000000000000000..d8dd444c86ac9b38494e7fc0f685504ae2f25a56 --- /dev/null +++ b/monoscene/monoscene.py @@ -0,0 +1,125 @@ +import pytorch_lightning as pl +import torch +import torch.nn as nn +from monoscene.unet3d_nyu import UNet3D as UNet3DNYU +from monoscene.unet3d_kitti import UNet3D as UNet3DKitti +from monoscene.flosp import FLoSP +import numpy as np +import torch.nn.functional as F +from monoscene.unet2d import UNet2D + + +class MonoScene(pl.LightningModule): + def __init__( + self, + n_classes, + feature, + project_scale, + full_scene_size, + dataset, + project_res=["1", "2", "4", "8"], + n_relations=4, + context_prior=True, + fp_loss=True, + frustum_size=4, + relation_loss=False, + CE_ssc_loss=True, + geo_scal_loss=True, + sem_scal_loss=True, + lr=1e-4, + weight_decay=1e-4, + ): + super().__init__() + + self.project_res = project_res + self.fp_loss = fp_loss + self.dataset = dataset + self.context_prior = context_prior + self.frustum_size = frustum_size + self.relation_loss = relation_loss + self.CE_ssc_loss = CE_ssc_loss + self.sem_scal_loss = sem_scal_loss + self.geo_scal_loss = geo_scal_loss + self.project_scale = project_scale + self.lr = lr + self.weight_decay = weight_decay + + self.projects = {} + self.scale_2ds = [1, 2, 4, 8] # 2D scales + for scale_2d in self.scale_2ds: + self.projects[str(scale_2d)] = FLoSP( + 
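+ # one FLoSP module per 2D scale, lifting that scale's features onto the 3D voxel grid at project_scale resolution (see monoscene/flosp.py)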
full_scene_size, project_scale=self.project_scale, dataset=self.dataset + ) + self.projects = nn.ModuleDict(self.projects) + + self.n_classes = n_classes + if self.dataset == "NYU": + self.net_3d_decoder = UNet3DNYU( + self.n_classes, + nn.BatchNorm3d, + n_relations=n_relations, + feature=feature, + full_scene_size=full_scene_size, + context_prior=context_prior, + ) + elif self.dataset == "kitti": + self.net_3d_decoder = UNet3DKitti( + self.n_classes, + nn.BatchNorm3d, + project_scale=project_scale, + feature=feature, + full_scene_size=full_scene_size, + context_prior=context_prior, + ) + self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True) + + def forward(self, batch): + + img = batch["img"] + bs = len(img) + + out = {} + + x_rgb = self.net_rgb(img) + + x3ds = [] + for i in range(bs): + x3d = None + for scale_2d in self.project_res: + + # project features at each 2D scale to target 3D scale + scale_2d = int(scale_2d) + projected_pix = batch["projected_pix_{}".format(self.project_scale)][i]#.cuda() + fov_mask = batch["fov_mask_{}".format(self.project_scale)][i]#.cuda() + + # Sum all the 3D features + if x3d is None: + x3d = self.projects[str(scale_2d)]( + x_rgb["1_" + str(scale_2d)][i], + # torch.div(projected_pix, scale_2d, rounding_mode='floor'), + projected_pix // scale_2d, + fov_mask, + ) + else: + x3d += self.projects[str(scale_2d)]( + x_rgb["1_" + str(scale_2d)][i], + # torch.div(projected_pix, scale_2d, rounding_mode='floor'), + projected_pix // scale_2d, + fov_mask, + ) + x3ds.append(x3d) + + input_dict = { + "x3d": torch.stack(x3ds), + } + + out_dict = self.net_3d_decoder(input_dict) + + ssc_pred = out_dict["ssc_logit"] + + y_pred = ssc_pred.detach().cpu().numpy() + y_pred = np.argmax(y_pred, axis=1) + + return y_pred + + diff --git a/monoscene/monoscene_model.py b/monoscene/monoscene_model.py new file mode 100644 index 0000000000000000000000000000000000000000..8a5207f3d03de86192c5d41a8bdfe3ce32e672ab --- /dev/null +++ b/monoscene/monoscene_model.py @@ -0,0 +1,21 @@ +from transformers import PreTrainedModel +from .config import MonoSceneConfig +from monoscene.monoscene import MonoScene + + +class MonoSceneModel(PreTrainedModel): + config_class = MonoSceneConfig + + def __init__(self, config): + super().__init__(config) + self.model = MonoScene( + dataset=config.dataset, + n_classes=config.n_classes, + feature=config.feature, + project_scale=config.project_scale, + full_scene_size=config.full_scene_size + ) + + + def forward(self, tensor): + return self.model.forward(tensor) \ No newline at end of file diff --git a/monoscene/unet2d.py b/monoscene/unet2d.py new file mode 100644 index 0000000000000000000000000000000000000000..a1c9e45553b1c7e083436778c6e963545446d008 --- /dev/null +++ b/monoscene/unet2d.py @@ -0,0 +1,198 @@ +""" +Code adapted from https://github.com/shariqfarooq123/AdaBins/blob/main/models/unet_adaptive_bins.py +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +import os + + +class UpSampleBN(nn.Module): + def __init__(self, skip_input, output_features): + super(UpSampleBN, self).__init__() + self._net = nn.Sequential( + nn.Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1), + nn.BatchNorm2d(output_features), + nn.LeakyReLU(), + nn.Conv2d( + output_features, output_features, kernel_size=3, stride=1, padding=1 + ), + nn.BatchNorm2d(output_features), + nn.LeakyReLU(), + ) + + def forward(self, x, concat_with): + up_x = F.interpolate( + x, + size=(concat_with.shape[2], concat_with.shape[3]), + mode="bilinear", + 
align_corners=True, + ) + f = torch.cat([up_x, concat_with], dim=1) + return self._net(f) + + +class DecoderBN(nn.Module): + def __init__( + self, num_features, bottleneck_features, out_feature, use_decoder=True + ): + super(DecoderBN, self).__init__() + features = int(num_features) + self.use_decoder = use_decoder + + self.conv2 = nn.Conv2d( + bottleneck_features, features, kernel_size=1, stride=1, padding=1 + ) + + self.out_feature_1_1 = out_feature + self.out_feature_1_2 = out_feature + self.out_feature_1_4 = out_feature + self.out_feature_1_8 = out_feature + self.out_feature_1_16 = out_feature + self.feature_1_16 = features // 2 + self.feature_1_8 = features // 4 + self.feature_1_4 = features // 8 + self.feature_1_2 = features // 16 + self.feature_1_1 = features // 32 + + if self.use_decoder: + self.resize_output_1_1 = nn.Conv2d( + self.feature_1_1, self.out_feature_1_1, kernel_size=1 + ) + self.resize_output_1_2 = nn.Conv2d( + self.feature_1_2, self.out_feature_1_2, kernel_size=1 + ) + self.resize_output_1_4 = nn.Conv2d( + self.feature_1_4, self.out_feature_1_4, kernel_size=1 + ) + self.resize_output_1_8 = nn.Conv2d( + self.feature_1_8, self.out_feature_1_8, kernel_size=1 + ) + self.resize_output_1_16 = nn.Conv2d( + self.feature_1_16, self.out_feature_1_16, kernel_size=1 + ) + + self.up16 = UpSampleBN( + skip_input=features + 224, output_features=self.feature_1_16 + ) + self.up8 = UpSampleBN( + skip_input=self.feature_1_16 + 80, output_features=self.feature_1_8 + ) + self.up4 = UpSampleBN( + skip_input=self.feature_1_8 + 48, output_features=self.feature_1_4 + ) + self.up2 = UpSampleBN( + skip_input=self.feature_1_4 + 32, output_features=self.feature_1_2 + ) + self.up1 = UpSampleBN( + skip_input=self.feature_1_2 + 3, output_features=self.feature_1_1 + ) + else: + self.resize_output_1_1 = nn.Conv2d(3, out_feature, kernel_size=1) + self.resize_output_1_2 = nn.Conv2d(32, out_feature * 2, kernel_size=1) + self.resize_output_1_4 = nn.Conv2d(48, out_feature * 4, kernel_size=1) + + def forward(self, features): + x_block0, x_block1, x_block2, x_block3, x_block4 = ( + features[4], + features[5], + features[6], + features[8], + features[11], + ) + bs = x_block0.shape[0] + x_d0 = self.conv2(x_block4) + + if self.use_decoder: + x_1_16 = self.up16(x_d0, x_block3) + x_1_8 = self.up8(x_1_16, x_block2) + x_1_4 = self.up4(x_1_8, x_block1) + x_1_2 = self.up2(x_1_4, x_block0) + x_1_1 = self.up1(x_1_2, features[0]) + return { + "1_1": self.resize_output_1_1(x_1_1), + "1_2": self.resize_output_1_2(x_1_2), + "1_4": self.resize_output_1_4(x_1_4), + "1_8": self.resize_output_1_8(x_1_8), + "1_16": self.resize_output_1_16(x_1_16), + } + else: + x_1_1 = features[0] + x_1_2, x_1_4, x_1_8, x_1_16 = ( + features[4], + features[5], + features[6], + features[8], + ) + x_global = features[-1].reshape(bs, 2560, -1).mean(2) + return { + "1_1": self.resize_output_1_1(x_1_1), + "1_2": self.resize_output_1_2(x_1_2), + "1_4": self.resize_output_1_4(x_1_4), + "global": x_global, + } + + +class Encoder(nn.Module): + def __init__(self, backend): + super(Encoder, self).__init__() + self.original_model = backend + + def forward(self, x): + features = [x] + for k, v in self.original_model._modules.items(): + if k == "blocks": + for ki, vi in v._modules.items(): + features.append(vi(features[-1])) + else: + features.append(v(features[-1])) + return features + + +class UNet2D(nn.Module): + def __init__(self, backend, num_features, out_feature, use_decoder=True): + super(UNet2D, self).__init__() + self.use_decoder = use_decoder + 
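+ # encoder wraps the EfficientNet backend passed in by build(); DecoderBN upsamples it back to multi-scale (1/1 to 1/16) feature maps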
self.encoder = Encoder(backend) + self.decoder = DecoderBN( + out_feature=out_feature, + use_decoder=use_decoder, + bottleneck_features=num_features, + num_features=num_features, + ) + + def forward(self, x, **kwargs): + encoded_feats = self.encoder(x) + unet_out = self.decoder(encoded_feats, **kwargs) + return unet_out + + def get_encoder_params(self): # lr/10 learning rate + return self.encoder.parameters() + + def get_decoder_params(self): # lr learning rate + return self.decoder.parameters() + + @classmethod + def build(cls, **kwargs): + basemodel_name = "tf_efficientnet_b7_ns" + num_features = 2560 + + print("Loading base model ()...".format(basemodel_name), end="") + basemodel = torch.hub.load( + "rwightman/gen-efficientnet-pytorch", basemodel_name, pretrained=True + ) + print("Done.") + + # Remove last layer + print("Removing last two layers (global_pool & classifier).") + basemodel.global_pool = nn.Identity() + basemodel.classifier = nn.Identity() + + # Building Encoder-Decoder model + print("Building Encoder-Decoder model..", end="") + m = cls(basemodel, num_features=num_features, **kwargs) + print("Done.") + return m + +if __name__ == '__main__': + model = UNet2D.build(out_feature=256, use_decoder=True) diff --git a/monoscene/unet3d_kitti.py b/monoscene/unet3d_kitti.py new file mode 100644 index 0000000000000000000000000000000000000000..91d5339fbdf34e28d017d7e4e29ce4923169bef5 --- /dev/null +++ b/monoscene/unet3d_kitti.py @@ -0,0 +1,88 @@ +# encoding: utf-8 +import torch +import torch.nn as nn +import torch.nn.functional as F +from monoscene.modules import SegmentationHead +from monoscene.CRP3D import CPMegaVoxels +from monoscene.modules import Process, Upsample, Downsample + + +class UNet3D(nn.Module): + def __init__( + self, + class_num, + norm_layer, + full_scene_size, + feature, + project_scale, + context_prior=None, + bn_momentum=0.1, + ): + super(UNet3D, self).__init__() + self.business_layer = [] + self.project_scale = project_scale + self.full_scene_size = full_scene_size + self.feature = feature + + size_l1 = ( + int(self.full_scene_size[0] / project_scale), + int(self.full_scene_size[1] / project_scale), + int(self.full_scene_size[2] / project_scale), + ) + size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2) + size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2) + + dilations = [1, 2, 3] + self.process_l1 = nn.Sequential( + Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature, norm_layer, bn_momentum), + ) + self.process_l2 = nn.Sequential( + Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature * 2, norm_layer, bn_momentum), + ) + + self.up_13_l2 = Upsample( + self.feature * 4, self.feature * 2, norm_layer, bn_momentum + ) + self.up_12_l1 = Upsample( + self.feature * 2, self.feature, norm_layer, bn_momentum + ) + self.up_l1_lfull = Upsample( + self.feature, self.feature // 2, norm_layer, bn_momentum + ) + + self.ssc_head = SegmentationHead( + self.feature // 2, self.feature // 2, class_num, dilations + ) + + self.context_prior = context_prior + if context_prior: + self.CP_mega_voxels = CPMegaVoxels( + self.feature * 4, size_l3, bn_momentum=bn_momentum + ) + + def forward(self, input_dict): + res = {} + + x3d_l1 = input_dict["x3d"] + + x3d_l2 = self.process_l1(x3d_l1) + + x3d_l3 = self.process_l2(x3d_l2) + + if self.context_prior: + ret = self.CP_mega_voxels(x3d_l3) + x3d_l3 = ret["x"] + for k in ret.keys(): + res[k] = ret[k] + + x3d_up_l2 = self.up_13_l2(x3d_l3) + 
x3d_l2 + x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1 + x3d_up_lfull = self.up_l1_lfull(x3d_up_l1) + + ssc_logit_full = self.ssc_head(x3d_up_lfull) + + res["ssc_logit"] = ssc_logit_full + + return res diff --git a/monoscene/unet3d_nyu.py b/monoscene/unet3d_nyu.py new file mode 100644 index 0000000000000000000000000000000000000000..e9e3b3718999248efa1b2925658465ba59801b13 --- /dev/null +++ b/monoscene/unet3d_nyu.py @@ -0,0 +1,90 @@ +# encoding: utf-8 +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from monoscene.CRP3D import CPMegaVoxels +from monoscene.modules import ( + Process, + Upsample, + Downsample, + SegmentationHead, + ASPP, +) + + +class UNet3D(nn.Module): + def __init__( + self, + class_num, + norm_layer, + feature, + full_scene_size, + n_relations=4, + project_res=[], + context_prior=True, + bn_momentum=0.1, + ): + super(UNet3D, self).__init__() + self.business_layer = [] + self.project_res = project_res + + self.feature_1_4 = feature + self.feature_1_8 = feature * 2 + self.feature_1_16 = feature * 4 + + self.feature_1_16_dec = self.feature_1_16 + self.feature_1_8_dec = self.feature_1_8 + self.feature_1_4_dec = self.feature_1_4 + + self.process_1_4 = nn.Sequential( + Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature_1_4, norm_layer, bn_momentum), + ) + self.process_1_8 = nn.Sequential( + Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]), + Downsample(self.feature_1_8, norm_layer, bn_momentum), + ) + self.up_1_16_1_8 = Upsample( + self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum + ) + self.up_1_8_1_4 = Upsample( + self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum + ) + self.ssc_head_1_4 = SegmentationHead( + self.feature_1_4_dec, self.feature_1_4_dec, class_num, [1, 2, 3] + ) + + self.context_prior = context_prior + size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size) + + if context_prior: + self.CP_mega_voxels = CPMegaVoxels( + self.feature_1_16, + size_1_16, + n_relations=n_relations, + bn_momentum=bn_momentum, + ) + + # + def forward(self, input_dict): + res = {} + + x3d_1_4 = input_dict["x3d"] + x3d_1_8 = self.process_1_4(x3d_1_4) + x3d_1_16 = self.process_1_8(x3d_1_8) + + if self.context_prior: + ret = self.CP_mega_voxels(x3d_1_16) + x3d_1_16 = ret["x"] + for k in ret.keys(): + res[k] = ret[k] + + x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8 + x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4 + + ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4) + + res["ssc_logit"] = ssc_logit_1_4 + + return res diff --git a/monoscene_kitti.ckpt b/monoscene_kitti.ckpt new file mode 100644 index 0000000000000000000000000000000000000000..cd302ef17432a3b5c8b58ab5a63bb52e2c166976 --- /dev/null +++ b/monoscene_kitti.ckpt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f0d1324885166f17949bf2dcfc0ee1eb2d2aedd0f48e75b56bb2beb87c1ce3a +size 1796467007 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..81946126e893d56bb066160e9465ef42bc43d9d9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +scikit-image==0.18.1 +PyYAML==5.3.1 +tqdm==4.49.0 +scikit-learn==0.24.0 +pytorch-lightning==1.4.9 +opencv-python==4.5.1.48 +hydra-core==1.0.5 +numpy==1.20.3 +numba==0.53 +imageio +protobuf~=3.19.0 +transformers +plotly +torch +torchvision +torchmetrics==0.6.0 \ No newline at end of file
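A minimal usage sketch of the MonoSceneConfig / MonoSceneModel wrapper defined above (monoscene/config.py, monoscene/monoscene_model.py). It assumes network access to fetch the EfficientNet-B7 backbone via torch.hub; the zero-filled projection tensors are placeholders standing in for the real per-voxel pixel coordinates and FOV masks, and the weights stay random unless a checkpoint is loaded, so the snippet only illustrates the expected batch keys and output shape, not the repository's own test code.

import torch
from monoscene.config import MonoSceneConfig
from monoscene.monoscene_model import MonoSceneModel

# KITTI configuration matching the defaults used throughout this repo
config = MonoSceneConfig(
    dataset="kitti",
    n_classes=20,
    feature=64,
    project_scale=2,
    full_scene_size=(256, 256, 32),
)
model = MonoSceneModel(config).eval()

# MonoScene.forward reads the image plus, for project_scale=2, one pixel
# coordinate pair and one FOV flag per voxel of the half-resolution grid.
n_vox = (256 // 2) * (256 // 2) * (32 // 2)  # 128 * 128 * 16 voxels
batch = {
    "img": torch.rand(1, 3, 370, 1220),                             # KITTI-sized RGB
    "projected_pix_2": torch.zeros(1, n_vox, 2, dtype=torch.long),  # placeholder projections
    "fov_mask_2": torch.zeros(1, n_vox, dtype=torch.bool),          # placeholder FOV mask
}
with torch.no_grad():
    y_pred = model(batch)  # numpy array of per-voxel class ids, expected shape (1, 256, 256, 32)

For real predictions, loading the trained weights (e.g. MonoScene.load_from_checkpoint with monoscene_kitti.ckpt, as done elsewhere in this repo) replaces the randomly initialised model above.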