diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..9d7dd240381bd4ef025168212e15f1f0b7478660
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,28 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zstandard filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+monoscene_kitti.ckpt filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..46769a0f0371bf73afac0874aa640b65361b0b19
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+__pycache__
+.ipynb_checkpoints
+*.ckpt
+gradio*
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..3a7f97e80109bd91f0ed142e6eb9aac585326881
--- /dev/null
+++ b/README.md
@@ -0,0 +1,14 @@
+---
+title: MonoScene
+emoji: 🚘🏙️
+colorFrom: purple
+colorTo: pink
+sdk: gradio
+sdk_version: 3.0.20
+app_file: app.py
+pinned: true
+license: apache-2.0
+duplicated_from: CVPR/MonoScene
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..01225a388b656d95f22d55a02c47d4d62772c3c3
--- /dev/null
+++ b/app.py
@@ -0,0 +1,126 @@
+import gradio as gr
+import numpy as np
+from torchvision import transforms
+import torch
+from helpers import *
+import sys
+import csv
+from monoscene.monoscene import MonoScene
+
+csv.field_size_limit(sys.maxsize)
+torch.set_grad_enabled(False)
+
+# pipeline = pipeline(model="anhquancao/monoscene_kitti")
+# model = AutoModel.from_pretrained(
+# "anhquancao/monoscene_kitti", trust_remote_code=True, revision='bf033f87c2a86b60903ab811b790a1532c1ae313'
+# )#.cuda()
+model = MonoScene.load_from_checkpoint(
+ "monoscene_kitti.ckpt",
+ dataset="kitti",
+ n_classes=20,
+    feature=64,
+    project_scale=2,
+    full_scene_size=(256, 256, 32),
+)
+
+img_W, img_H = 1220, 370
+
+
+def predict(img):
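+    # img: RGB image (H, W, 3) in [0, 255]; scaled to [0, 1] and normalized with
+    # ImageNet statistics before inference.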
+ img = np.array(img, dtype=np.float32, copy=False) / 255.0
+
+ normalize_rgb = transforms.Compose(
+ [
+ transforms.ToTensor(),
+ transforms.Normalize(
+ mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+ ),
+ ]
+ )
+ img = normalize_rgb(img)
+
+ batch = get_projections(img_W, img_H)
+ batch["img"] = img
+ for k in batch:
+ batch[k] = batch[k].unsqueeze(0)#.cuda()
+
+ pred = model(batch).squeeze()
+ # print(pred.shape)
+ pred = majority_pooling(pred, k_size=2)
+ fig = draw(pred, batch['fov_mask_2'])
+
+
+ return fig
+
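+# Quick local sanity check (illustrative only; assumes Pillow is installed and the
+# bundled example images are present):
+#   from PIL import Image
+#   fig = predict(Image.open("images/08/000010.jpg"))
+#   fig.write_html("prediction.html")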
+
+description = """
+MonoScene demo on the SemanticKITTI validation set (Sequence 08); predictions use the camera parameters of Sequence 08.
+Because inference runs on CPU only, predicting a scene can take up to 20s. \n
+The output is downsampled by 2 for faster rendering. Darker colors mark voxels outside the camera field of view, i.e. not visible in the image.
+
+
+
+
+
+
+
+"""
+title = "MonoScene: Monocular 3D Semantic Scene Completion"
+article="""
+
+We also released a smaller MonoScene model (half resolution, without the 3D CRP) at: https://huggingface.co/spaces/CVPR/monoscene_lite
+
+
+"""
+
+examples = [
+ 'images/08/001385.jpg',
+ 'images/08/000295.jpg',
+ 'images/08/002505.jpg',
+ 'images/08/000085.jpg',
+ 'images/08/000290.jpg',
+ 'images/08/000465.jpg',
+ 'images/08/000790.jpg',
+ 'images/08/001005.jpg',
+ 'images/08/001380.jpg',
+ 'images/08/001530.jpg',
+ 'images/08/002360.jpg',
+ 'images/08/004059.jpg',
+ 'images/08/003149.jpg',
+ 'images/08/001446.jpg',
+ 'images/08/000010.jpg',
+ 'images/08/001122.jpg',
+ 'images/08/003533.jpg',
+ 'images/08/003365.jpg',
+ 'images/08/002944.jpg',
+ 'images/08/000822.jpg',
+ 'images/08/000103.jpg',
+ 'images/08/002716.jpg',
+ 'images/08/000187.jpg',
+ 'images/08/002128.jpg',
+ 'images/08/000511.jpg',
+ 'images/08/000618.jpg',
+ 'images/08/002010.jpg',
+ 'images/08/000234.jpg',
+ 'images/08/001842.jpg',
+ 'images/08/001687.jpg',
+ 'images/08/003929.jpg',
+ 'images/08/002272.jpg',
+]
+
+
+
+demo = gr.Interface(
+ predict,
+ gr.Image(shape=(1220, 370)),
+ gr.Plot(),
+ article=article,
+ title=title,
+ enable_queue=True,
+ cache_examples=False,
+ live=False,
+ examples=examples,
+ description=description)
+
+
+demo.launch(enable_queue=True, debug=False)
\ No newline at end of file
diff --git a/calib.txt b/calib.txt
new file mode 100644
index 0000000000000000000000000000000000000000..793946dabbfa14421b0ab261d69fca372137b76e
--- /dev/null
+++ b/calib.txt
@@ -0,0 +1,5 @@
+P0: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 0.000000000000e+00 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P1: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.861448000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 0.000000000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 0.000000000000e+00
+P2: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 4.538225000000e+01 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 -1.130887000000e-01 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 3.779761000000e-03
+P3: 7.188560000000e+02 0.000000000000e+00 6.071928000000e+02 -3.372877000000e+02 0.000000000000e+00 7.188560000000e+02 1.852157000000e+02 2.369057000000e+00 0.000000000000e+00 0.000000000000e+00 1.000000000000e+00 4.915215000000e-03
+Tr: 4.276802385584e-04 -9.999672484946e-01 -8.084491683471e-03 -1.198459927713e-02 -7.210626507497e-03 8.081198471645e-03 -9.999413164504e-01 -5.403984729748e-02 9.999738645903e-01 4.859485810390e-04 -7.206933692422e-03 -2.921968648686e-01
diff --git a/fusion.py b/fusion.py
new file mode 100644
index 0000000000000000000000000000000000000000..aecd5cba3b1e3dd1e0534cda347eca8956657926
--- /dev/null
+++ b/fusion.py
@@ -0,0 +1,507 @@
+"""
+Most of the code is taken from https://github.com/andyzeng/tsdf-fusion-python/blob/master/fusion.py
+
+@inproceedings{zeng20163dmatch,
+ title={3DMatch: Learning Local Geometric Descriptors from RGB-D Reconstructions},
+ author={Zeng, Andy and Song, Shuran and Nie{\ss}ner, Matthias and Fisher, Matthew and Xiao, Jianxiong and Funkhouser, Thomas},
+ booktitle={CVPR},
+ year={2017}
+}
+"""
+
+import numpy as np
+
+from numba import njit, prange
+from skimage import measure
+
+FUSION_GPU_MODE = 0  # GPU path disabled: pycuda (cuda, SourceModule) is not imported in this CPU-only build
+
+
+class TSDFVolume:
+ """Volumetric TSDF Fusion of RGB-D Images."""
+
+ def __init__(self, vol_bnds, voxel_size, use_gpu=True):
+ """Constructor.
+
+ Args:
+ vol_bnds (ndarray): An ndarray of shape (3, 2). Specifies the
+ xyz bounds (min/max) in meters.
+ voxel_size (float): The volume discretization in meters.
+ """
+ vol_bnds = np.asarray(vol_bnds)
+ assert vol_bnds.shape == (3, 2), "[!] `vol_bnds` should be of shape (3, 2)."
+
+ # Define voxel volume parameters
+ self._vol_bnds = vol_bnds
+ self._voxel_size = float(voxel_size)
+ self._trunc_margin = 5 * self._voxel_size # truncation on SDF
+ # self._trunc_margin = 10 # truncation on SDF
+ self._color_const = 256 * 256
+
+ # Adjust volume bounds and ensure C-order contiguous
+ self._vol_dim = (
+ np.ceil((self._vol_bnds[:, 1] - self._vol_bnds[:, 0]) / self._voxel_size)
+ .copy(order="C")
+ .astype(int)
+ )
+ self._vol_bnds[:, 1] = self._vol_bnds[:, 0] + self._vol_dim * self._voxel_size
+ self._vol_origin = self._vol_bnds[:, 0].copy(order="C").astype(np.float32)
+
+ print(
+ "Voxel volume size: {} x {} x {} - # points: {:,}".format(
+ self._vol_dim[0],
+ self._vol_dim[1],
+ self._vol_dim[2],
+ self._vol_dim[0] * self._vol_dim[1] * self._vol_dim[2],
+ )
+ )
+
+ # Initialize pointers to voxel volume in CPU memory
+ self._tsdf_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
+ # for computing the cumulative moving average of observations per voxel
+ self._weight_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
+ self._color_vol_cpu = np.zeros(self._vol_dim).astype(np.float32)
+
+ self.gpu_mode = use_gpu and FUSION_GPU_MODE
+
+ # Copy voxel volumes to GPU
+ if self.gpu_mode:
+ self._tsdf_vol_gpu = cuda.mem_alloc(self._tsdf_vol_cpu.nbytes)
+ cuda.memcpy_htod(self._tsdf_vol_gpu, self._tsdf_vol_cpu)
+ self._weight_vol_gpu = cuda.mem_alloc(self._weight_vol_cpu.nbytes)
+ cuda.memcpy_htod(self._weight_vol_gpu, self._weight_vol_cpu)
+ self._color_vol_gpu = cuda.mem_alloc(self._color_vol_cpu.nbytes)
+ cuda.memcpy_htod(self._color_vol_gpu, self._color_vol_cpu)
+
+ # Cuda kernel function (C++)
+ self._cuda_src_mod = SourceModule(
+ """
+ __global__ void integrate(float * tsdf_vol,
+ float * weight_vol,
+ float * color_vol,
+ float * vol_dim,
+ float * vol_origin,
+ float * cam_intr,
+ float * cam_pose,
+ float * other_params,
+ float * color_im,
+ float * depth_im) {
+ // Get voxel index
+ int gpu_loop_idx = (int) other_params[0];
+ int max_threads_per_block = blockDim.x;
+ int block_idx = blockIdx.z*gridDim.y*gridDim.x+blockIdx.y*gridDim.x+blockIdx.x;
+ int voxel_idx = gpu_loop_idx*gridDim.x*gridDim.y*gridDim.z*max_threads_per_block+block_idx*max_threads_per_block+threadIdx.x;
+ int vol_dim_x = (int) vol_dim[0];
+ int vol_dim_y = (int) vol_dim[1];
+ int vol_dim_z = (int) vol_dim[2];
+ if (voxel_idx > vol_dim_x*vol_dim_y*vol_dim_z)
+ return;
+ // Get voxel grid coordinates (note: be careful when casting)
+ float voxel_x = floorf(((float)voxel_idx)/((float)(vol_dim_y*vol_dim_z)));
+ float voxel_y = floorf(((float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z))/((float)vol_dim_z));
+ float voxel_z = (float)(voxel_idx-((int)voxel_x)*vol_dim_y*vol_dim_z-((int)voxel_y)*vol_dim_z);
+ // Voxel grid coordinates to world coordinates
+ float voxel_size = other_params[1];
+ float pt_x = vol_origin[0]+voxel_x*voxel_size;
+ float pt_y = vol_origin[1]+voxel_y*voxel_size;
+ float pt_z = vol_origin[2]+voxel_z*voxel_size;
+ // World coordinates to camera coordinates
+ float tmp_pt_x = pt_x-cam_pose[0*4+3];
+ float tmp_pt_y = pt_y-cam_pose[1*4+3];
+ float tmp_pt_z = pt_z-cam_pose[2*4+3];
+ float cam_pt_x = cam_pose[0*4+0]*tmp_pt_x+cam_pose[1*4+0]*tmp_pt_y+cam_pose[2*4+0]*tmp_pt_z;
+ float cam_pt_y = cam_pose[0*4+1]*tmp_pt_x+cam_pose[1*4+1]*tmp_pt_y+cam_pose[2*4+1]*tmp_pt_z;
+ float cam_pt_z = cam_pose[0*4+2]*tmp_pt_x+cam_pose[1*4+2]*tmp_pt_y+cam_pose[2*4+2]*tmp_pt_z;
+ // Camera coordinates to image pixels
+ int pixel_x = (int) roundf(cam_intr[0*3+0]*(cam_pt_x/cam_pt_z)+cam_intr[0*3+2]);
+ int pixel_y = (int) roundf(cam_intr[1*3+1]*(cam_pt_y/cam_pt_z)+cam_intr[1*3+2]);
+ // Skip if outside view frustum
+ int im_h = (int) other_params[2];
+ int im_w = (int) other_params[3];
+ if (pixel_x < 0 || pixel_x >= im_w || pixel_y < 0 || pixel_y >= im_h || cam_pt_z<0)
+ return;
+ // Skip invalid depth
+ float depth_value = depth_im[pixel_y*im_w+pixel_x];
+ if (depth_value == 0)
+ return;
+ // Integrate TSDF
+ float trunc_margin = other_params[4];
+ float depth_diff = depth_value-cam_pt_z;
+ if (depth_diff < -trunc_margin)
+ return;
+ float dist = fmin(1.0f,depth_diff/trunc_margin);
+ float w_old = weight_vol[voxel_idx];
+ float obs_weight = other_params[5];
+ float w_new = w_old + obs_weight;
+ weight_vol[voxel_idx] = w_new;
+ tsdf_vol[voxel_idx] = (tsdf_vol[voxel_idx]*w_old+obs_weight*dist)/w_new;
+ // Integrate color
+ float old_color = color_vol[voxel_idx];
+ float old_b = floorf(old_color/(256*256));
+ float old_g = floorf((old_color-old_b*256*256)/256);
+ float old_r = old_color-old_b*256*256-old_g*256;
+ float new_color = color_im[pixel_y*im_w+pixel_x];
+ float new_b = floorf(new_color/(256*256));
+ float new_g = floorf((new_color-new_b*256*256)/256);
+ float new_r = new_color-new_b*256*256-new_g*256;
+ new_b = fmin(roundf((old_b*w_old+obs_weight*new_b)/w_new),255.0f);
+ new_g = fmin(roundf((old_g*w_old+obs_weight*new_g)/w_new),255.0f);
+ new_r = fmin(roundf((old_r*w_old+obs_weight*new_r)/w_new),255.0f);
+ color_vol[voxel_idx] = new_b*256*256+new_g*256+new_r;
+ }"""
+ )
+
+ self._cuda_integrate = self._cuda_src_mod.get_function("integrate")
+
+ # Determine block/grid size on GPU
+ gpu_dev = cuda.Device(0)
+ self._max_gpu_threads_per_block = gpu_dev.MAX_THREADS_PER_BLOCK
+ n_blocks = int(
+ np.ceil(
+ float(np.prod(self._vol_dim))
+ / float(self._max_gpu_threads_per_block)
+ )
+ )
+ grid_dim_x = min(gpu_dev.MAX_GRID_DIM_X, int(np.floor(np.cbrt(n_blocks))))
+ grid_dim_y = min(
+ gpu_dev.MAX_GRID_DIM_Y, int(np.floor(np.sqrt(n_blocks / grid_dim_x)))
+ )
+ grid_dim_z = min(
+ gpu_dev.MAX_GRID_DIM_Z,
+ int(np.ceil(float(n_blocks) / float(grid_dim_x * grid_dim_y))),
+ )
+ self._max_gpu_grid_dim = np.array(
+ [grid_dim_x, grid_dim_y, grid_dim_z]
+ ).astype(int)
+ self._n_gpu_loops = int(
+ np.ceil(
+ float(np.prod(self._vol_dim))
+ / float(
+ np.prod(self._max_gpu_grid_dim)
+ * self._max_gpu_threads_per_block
+ )
+ )
+ )
+
+ else:
+ # Get voxel grid coordinates
+ xv, yv, zv = np.meshgrid(
+ range(self._vol_dim[0]),
+ range(self._vol_dim[1]),
+ range(self._vol_dim[2]),
+ indexing="ij",
+ )
+ self.vox_coords = (
+ np.concatenate(
+ [xv.reshape(1, -1), yv.reshape(1, -1), zv.reshape(1, -1)], axis=0
+ )
+ .astype(int)
+ .T
+ )
+
+ @staticmethod
+ @njit(parallel=True)
+ def vox2world(vol_origin, vox_coords, vox_size, offsets=(0.5, 0.5, 0.5)):
+ """Convert voxel grid coordinates to world coordinates."""
+ vol_origin = vol_origin.astype(np.float32)
+ vox_coords = vox_coords.astype(np.float32)
+ # print(np.min(vox_coords))
+ cam_pts = np.empty_like(vox_coords, dtype=np.float32)
+
+ for i in prange(vox_coords.shape[0]):
+ for j in range(3):
+ cam_pts[i, j] = (
+ vol_origin[j]
+ + (vox_size * vox_coords[i, j])
+ + vox_size * offsets[j]
+ )
+ return cam_pts
+
+ @staticmethod
+ @njit(parallel=True)
+ def cam2pix(cam_pts, intr):
+ """Convert camera coordinates to pixel coordinates."""
+ intr = intr.astype(np.float32)
+ fx, fy = intr[0, 0], intr[1, 1]
+ cx, cy = intr[0, 2], intr[1, 2]
+ pix = np.empty((cam_pts.shape[0], 2), dtype=np.int64)
+ for i in prange(cam_pts.shape[0]):
+ pix[i, 0] = int(np.round((cam_pts[i, 0] * fx / cam_pts[i, 2]) + cx))
+ pix[i, 1] = int(np.round((cam_pts[i, 1] * fy / cam_pts[i, 2]) + cy))
+ return pix
+
+ @staticmethod
+ @njit(parallel=True)
+ def integrate_tsdf(tsdf_vol, dist, w_old, obs_weight):
+ """Integrate the TSDF volume."""
+ tsdf_vol_int = np.empty_like(tsdf_vol, dtype=np.float32)
+ # print(tsdf_vol.shape)
+ w_new = np.empty_like(w_old, dtype=np.float32)
+ for i in prange(len(tsdf_vol)):
+ w_new[i] = w_old[i] + obs_weight
+ tsdf_vol_int[i] = (w_old[i] * tsdf_vol[i] + obs_weight * dist[i]) / w_new[i]
+ return tsdf_vol_int, w_new
+
+ def integrate(self, color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0):
+ """Integrate an RGB-D frame into the TSDF volume.
+
+ Args:
+ color_im (ndarray): An RGB image of shape (H, W, 3).
+ depth_im (ndarray): A depth image of shape (H, W).
+ cam_intr (ndarray): The camera intrinsics matrix of shape (3, 3).
+ cam_pose (ndarray): The camera pose (i.e. extrinsics) of shape (4, 4).
+            obs_weight (float): The weight to assign to the current observation. A higher
+                value gives this observation more influence in the running average.
+        """
+ im_h, im_w = depth_im.shape
+
+ # Fold RGB color image into a single channel image
+ color_im = color_im.astype(np.float32)
+ color_im = np.floor(
+ color_im[..., 2] * self._color_const
+ + color_im[..., 1] * 256
+ + color_im[..., 0]
+ )
+
+ if self.gpu_mode: # GPU mode: integrate voxel volume (calls CUDA kernel)
+ for gpu_loop_idx in range(self._n_gpu_loops):
+ self._cuda_integrate(
+ self._tsdf_vol_gpu,
+ self._weight_vol_gpu,
+ self._color_vol_gpu,
+ cuda.InOut(self._vol_dim.astype(np.float32)),
+ cuda.InOut(self._vol_origin.astype(np.float32)),
+ cuda.InOut(cam_intr.reshape(-1).astype(np.float32)),
+ cuda.InOut(cam_pose.reshape(-1).astype(np.float32)),
+ cuda.InOut(
+ np.asarray(
+ [
+ gpu_loop_idx,
+ self._voxel_size,
+ im_h,
+ im_w,
+ self._trunc_margin,
+ obs_weight,
+ ],
+ np.float32,
+ )
+ ),
+ cuda.InOut(color_im.reshape(-1).astype(np.float32)),
+ cuda.InOut(depth_im.reshape(-1).astype(np.float32)),
+ block=(self._max_gpu_threads_per_block, 1, 1),
+ grid=(
+ int(self._max_gpu_grid_dim[0]),
+ int(self._max_gpu_grid_dim[1]),
+ int(self._max_gpu_grid_dim[2]),
+ ),
+ )
+ else: # CPU mode: integrate voxel volume (vectorized implementation)
+ # Convert voxel grid coordinates to pixel coordinates
+ cam_pts = self.vox2world(
+ self._vol_origin, self.vox_coords, self._voxel_size
+ )
+ cam_pts = rigid_transform(cam_pts, np.linalg.inv(cam_pose))
+ pix_z = cam_pts[:, 2]
+ pix = self.cam2pix(cam_pts, cam_intr)
+ pix_x, pix_y = pix[:, 0], pix[:, 1]
+
+ # Eliminate pixels outside view frustum
+ valid_pix = np.logical_and(
+ pix_x >= 0,
+ np.logical_and(
+ pix_x < im_w,
+ np.logical_and(pix_y >= 0, np.logical_and(pix_y < im_h, pix_z > 0)),
+ ),
+ )
+ depth_val = np.zeros(pix_x.shape)
+ depth_val[valid_pix] = depth_im[pix_y[valid_pix], pix_x[valid_pix]]
+
+ # Integrate TSDF
+ depth_diff = depth_val - pix_z
+
+ valid_pts = np.logical_and(depth_val > 0, depth_diff >= -10)
+ dist = depth_diff
+
+ valid_vox_x = self.vox_coords[valid_pts, 0]
+ valid_vox_y = self.vox_coords[valid_pts, 1]
+ valid_vox_z = self.vox_coords[valid_pts, 2]
+ w_old = self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
+ tsdf_vals = self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
+ valid_dist = dist[valid_pts]
+ tsdf_vol_new, w_new = self.integrate_tsdf(
+ tsdf_vals, valid_dist, w_old, obs_weight
+ )
+ self._weight_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = w_new
+ self._tsdf_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = tsdf_vol_new
+
+ # Integrate color
+ old_color = self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z]
+ old_b = np.floor(old_color / self._color_const)
+ old_g = np.floor((old_color - old_b * self._color_const) / 256)
+ old_r = old_color - old_b * self._color_const - old_g * 256
+ new_color = color_im[pix_y[valid_pts], pix_x[valid_pts]]
+ new_b = np.floor(new_color / self._color_const)
+ new_g = np.floor((new_color - new_b * self._color_const) / 256)
+ new_r = new_color - new_b * self._color_const - new_g * 256
+ new_b = np.minimum(
+ 255.0, np.round((w_old * old_b + obs_weight * new_b) / w_new)
+ )
+ new_g = np.minimum(
+ 255.0, np.round((w_old * old_g + obs_weight * new_g) / w_new)
+ )
+ new_r = np.minimum(
+ 255.0, np.round((w_old * old_r + obs_weight * new_r) / w_new)
+ )
+ self._color_vol_cpu[valid_vox_x, valid_vox_y, valid_vox_z] = (
+ new_b * self._color_const + new_g * 256 + new_r
+ )
+
+ def get_volume(self):
+ if self.gpu_mode:
+ cuda.memcpy_dtoh(self._tsdf_vol_cpu, self._tsdf_vol_gpu)
+ cuda.memcpy_dtoh(self._color_vol_cpu, self._color_vol_gpu)
+ return self._tsdf_vol_cpu, self._color_vol_cpu
+
+ def get_point_cloud(self):
+ """Extract a point cloud from the voxel volume."""
+ tsdf_vol, color_vol = self.get_volume()
+
+ # Marching cubes
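+        # NOTE: measure.marching_cubes_lewiner is unavailable in newer scikit-image
+        # releases; there, measure.marching_cubes(tsdf_vol, level=0) is the equivalent call.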
+ verts = measure.marching_cubes_lewiner(tsdf_vol, level=0)[0]
+ verts_ind = np.round(verts).astype(int)
+ verts = verts * self._voxel_size + self._vol_origin
+
+ # Get vertex colors
+ rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
+ colors_b = np.floor(rgb_vals / self._color_const)
+ colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
+ colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
+ colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
+ colors = colors.astype(np.uint8)
+
+ pc = np.hstack([verts, colors])
+ return pc
+
+ def get_mesh(self):
+ """Compute a mesh from the voxel volume using marching cubes."""
+ tsdf_vol, color_vol = self.get_volume()
+
+ # Marching cubes
+ verts, faces, norms, vals = measure.marching_cubes_lewiner(tsdf_vol, level=0)
+ verts_ind = np.round(verts).astype(int)
+ verts = (
+ verts * self._voxel_size + self._vol_origin
+ ) # voxel grid coordinates to world coordinates
+
+ # Get vertex colors
+ rgb_vals = color_vol[verts_ind[:, 0], verts_ind[:, 1], verts_ind[:, 2]]
+ colors_b = np.floor(rgb_vals / self._color_const)
+ colors_g = np.floor((rgb_vals - colors_b * self._color_const) / 256)
+ colors_r = rgb_vals - colors_b * self._color_const - colors_g * 256
+ colors = np.floor(np.asarray([colors_r, colors_g, colors_b])).T
+ colors = colors.astype(np.uint8)
+ return verts, faces, norms, colors
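+
+# Usage sketch (illustrative only; color_im, depth_im, cam_intr and cam_pose are
+# hypothetical placeholders, not data shipped with this repo):
+#   vol = TSDFVolume(np.array([[0., 5.], [0., 5.], [0., 3.]]), voxel_size=0.05, use_gpu=False)
+#   vol.integrate(color_im, depth_im, cam_intr, cam_pose, obs_weight=1.0)
+#   verts, faces, norms, colors = vol.get_mesh()
+#   meshwrite("mesh.ply", verts, faces, norms, colors)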
+
+
+def rigid_transform(xyz, transform):
+ """Applies a rigid transform to an (N, 3) pointcloud."""
+ xyz_h = np.hstack([xyz, np.ones((len(xyz), 1), dtype=np.float32)])
+ xyz_t_h = np.dot(transform, xyz_h.T).T
+ return xyz_t_h[:, :3]
+
+
+def get_view_frustum(depth_im, cam_intr, cam_pose):
+ """Get corners of 3D camera view frustum of depth image"""
+ im_h = depth_im.shape[0]
+ im_w = depth_im.shape[1]
+ max_depth = np.max(depth_im)
+ view_frust_pts = np.array(
+ [
+ (np.array([0, 0, 0, im_w, im_w]) - cam_intr[0, 2])
+ * np.array([0, max_depth, max_depth, max_depth, max_depth])
+ / cam_intr[0, 0],
+ (np.array([0, 0, im_h, 0, im_h]) - cam_intr[1, 2])
+ * np.array([0, max_depth, max_depth, max_depth, max_depth])
+ / cam_intr[1, 1],
+ np.array([0, max_depth, max_depth, max_depth, max_depth]),
+ ]
+ )
+ view_frust_pts = rigid_transform(view_frust_pts.T, cam_pose).T
+ return view_frust_pts
+
+
+def meshwrite(filename, verts, faces, norms, colors):
+ """Save a 3D mesh to a polygon .ply file."""
+ # Write header
+ ply_file = open(filename, "w")
+ ply_file.write("ply\n")
+ ply_file.write("format ascii 1.0\n")
+ ply_file.write("element vertex %d\n" % (verts.shape[0]))
+ ply_file.write("property float x\n")
+ ply_file.write("property float y\n")
+ ply_file.write("property float z\n")
+ ply_file.write("property float nx\n")
+ ply_file.write("property float ny\n")
+ ply_file.write("property float nz\n")
+ ply_file.write("property uchar red\n")
+ ply_file.write("property uchar green\n")
+ ply_file.write("property uchar blue\n")
+ ply_file.write("element face %d\n" % (faces.shape[0]))
+ ply_file.write("property list uchar int vertex_index\n")
+ ply_file.write("end_header\n")
+
+ # Write vertex list
+ for i in range(verts.shape[0]):
+ ply_file.write(
+ "%f %f %f %f %f %f %d %d %d\n"
+ % (
+ verts[i, 0],
+ verts[i, 1],
+ verts[i, 2],
+ norms[i, 0],
+ norms[i, 1],
+ norms[i, 2],
+ colors[i, 0],
+ colors[i, 1],
+ colors[i, 2],
+ )
+ )
+
+ # Write face list
+ for i in range(faces.shape[0]):
+ ply_file.write("3 %d %d %d\n" % (faces[i, 0], faces[i, 1], faces[i, 2]))
+
+ ply_file.close()
+
+
+def pcwrite(filename, xyzrgb):
+ """Save a point cloud to a polygon .ply file."""
+ xyz = xyzrgb[:, :3]
+ rgb = xyzrgb[:, 3:].astype(np.uint8)
+
+ # Write header
+ ply_file = open(filename, "w")
+ ply_file.write("ply\n")
+ ply_file.write("format ascii 1.0\n")
+ ply_file.write("element vertex %d\n" % (xyz.shape[0]))
+ ply_file.write("property float x\n")
+ ply_file.write("property float y\n")
+ ply_file.write("property float z\n")
+ ply_file.write("property uchar red\n")
+ ply_file.write("property uchar green\n")
+ ply_file.write("property uchar blue\n")
+ ply_file.write("end_header\n")
+
+ # Write vertex list
+ for i in range(xyz.shape[0]):
+ ply_file.write(
+ "%f %f %f %d %d %d\n"
+ % (
+ xyz[i, 0],
+ xyz[i, 1],
+ xyz[i, 2],
+ rgb[i, 0],
+ rgb[i, 1],
+ rgb[i, 2],
+ )
+ )
diff --git a/helpers.py b/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0c04e38af0140bd37508becf644802b098ae2e2
--- /dev/null
+++ b/helpers.py
@@ -0,0 +1,336 @@
+import numpy as np
+import torch
+import fusion
+import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
+
+def read_calib(calib_path):
+ """
+    Modified from https://github.com/utiasSTARS/pykitti/blob/d3e1bb81676e831886726cc5ed79ce1f049aef2c/pykitti/utils.py#L68
+ :param calib_path: Path to a calibration text file.
+ :return: dict with calibration matrices.
+ """
+ calib_all = {}
+ with open(calib_path, "r") as f:
+ for line in f.readlines():
+ if line == "\n":
+ break
+ key, value = line.split(":", 1)
+ calib_all[key] = np.array([float(x) for x in value.split()])
+
+ # reshape matrices
+ calib_out = {}
+ # 3x4 projection matrix for left camera
+ calib_out["P2"] = calib_all["P2"].reshape(3, 4)
+ calib_out["Tr"] = np.identity(4) # 4x4 matrix
+ calib_out["Tr"][:3, :4] = calib_all["Tr"].reshape(3, 4)
+ return calib_out
+
+
+def vox2pix(cam_E, cam_k,
+ vox_origin, voxel_size,
+ img_W, img_H,
+ scene_size):
+ """
+    Compute the 2D projection of voxel centroids.
+
+    Parameters
+    ----------
+    cam_E: 4x4
+        camera pose for the NYUv2 dataset;
+        transformation from camera to lidar coordinates for SemanticKITTI
+    cam_k: 3x3
+        camera intrinsics
+    vox_origin: (3,)
+        world (NYU) / lidar (SemanticKITTI) coordinates of the voxel at index (0, 0, 0)
+    voxel_size: float
+        side length of a voxel in meters
+    img_W: int
+        image width
+    img_H: int
+        image height
+    scene_size: (3,)
+        scene size in meters: (51.2, 51.2, 6.4) for SemanticKITTI and (4.8, 4.8, 2.88) for NYUv2
+
+    Returns
+    -------
+    projected_pix: (N, 2)
+        projected 2D pixel positions of the voxel centroids
+    fov_mask: (N,)
+        boolean mask indicating which voxels fall inside the image FOV
+    pix_z: (N,)
+        voxel distances to the sensor in meters
+ """
+ # Compute the x, y, z bounding of the scene in meter
+ vol_bnds = np.zeros((3,2))
+ vol_bnds[:,0] = vox_origin
+ vol_bnds[:,1] = vox_origin + np.array(scene_size)
+
+    # Compute the voxel centroids in lidar coordinates
+ vol_dim = np.ceil((vol_bnds[:,1]- vol_bnds[:,0])/ voxel_size).copy(order='C').astype(int)
+ xv, yv, zv = np.meshgrid(
+ range(vol_dim[0]),
+ range(vol_dim[1]),
+ range(vol_dim[2]),
+ indexing='ij'
+ )
+ vox_coords = np.concatenate([
+ xv.reshape(1,-1),
+ yv.reshape(1,-1),
+ zv.reshape(1,-1)
+ ], axis=0).astype(int).T
+
+    # Project voxel centroids from lidar coordinates to camera coordinates
+ cam_pts = fusion.TSDFVolume.vox2world(vox_origin, vox_coords, voxel_size)
+ cam_pts = fusion.rigid_transform(cam_pts, cam_E)
+
+ # Project camera coordinates to pixel positions
+ projected_pix = fusion.TSDFVolume.cam2pix(cam_pts, cam_k)
+ pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
+
+ # Eliminate pixels outside view frustum
+ pix_z = cam_pts[:, 2]
+ fov_mask = np.logical_and(pix_x >= 0,
+ np.logical_and(pix_x < img_W,
+ np.logical_and(pix_y >= 0,
+ np.logical_and(pix_y < img_H,
+ pix_z > 0))))
+
+
+ return torch.from_numpy(projected_pix), torch.from_numpy(fov_mask), torch.from_numpy(pix_z)
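+# Example with the SemanticKITTI settings used in get_projections below:
+#   projected_pix, fov_mask, pix_z = vox2pix(
+#       T_velo_2_cam, cam_k, np.array([0, -25.6, -2]), 0.2, 1220, 370, (51.2, 51.2, 6.4))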
+
+
+
+def get_grid_coords(dims, resolution):
+ """
+ :param dims: the dimensions of the grid [x, y, z] (i.e. [256, 256, 32])
+    :return coords_grid: center coordinates of the voxels in the grid
+ """
+
+ g_xx = np.arange(0, dims[0] + 1)
+ g_yy = np.arange(0, dims[1] + 1)
+ sensor_pose = 10
+ g_zz = np.arange(0, dims[2] + 1)
+
+ # Obtaining the grid with coords...
+ xx, yy, zz = np.meshgrid(g_xx[:-1], g_yy[:-1], g_zz[:-1])
+ coords_grid = np.array([xx.flatten(), yy.flatten(), zz.flatten()]).T
+    coords_grid = coords_grid.astype(float)  # np.float was removed in NumPy 1.24+
+
+ coords_grid = (coords_grid * resolution) + resolution / 2
+
+ temp = np.copy(coords_grid)
+ temp[:, 0] = coords_grid[:, 1]
+ temp[:, 1] = coords_grid[:, 0]
+ coords_grid = np.copy(temp)
+
+ return coords_grid
+
+def get_projections(img_W, img_H):
+ scale_3ds = [1, 2]
+ data = {}
+ for scale_3d in scale_3ds:
+ scene_size = (51.2, 51.2, 6.4)
+ vox_origin = np.array([0, -25.6, -2])
+ voxel_size = 0.2
+
+ calib = read_calib("calib.txt")
+ cam_k = calib["P2"][:3, :3]
+ T_velo_2_cam = calib["Tr"]
+
+ # compute the 3D-2D mapping
+ projected_pix, fov_mask, pix_z = vox2pix(
+ T_velo_2_cam,
+ cam_k,
+ vox_origin,
+ voxel_size * scale_3d,
+ img_W,
+ img_H,
+ scene_size,
+ )
+
+ data["projected_pix_{}".format(scale_3d)] = projected_pix
+ data["pix_z_{}".format(scale_3d)] = pix_z
+ data["fov_mask_{}".format(scale_3d)] = fov_mask
+ return data
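+# The returned dict holds projected_pix_{s}, pix_z_{s} and fov_mask_{s} for s in {1, 2},
+# which are the keys consumed by MonoScene.forward and by app.predict.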
+
+
+def majority_pooling(grid, k_size=2):
+ result = np.zeros(
+ (grid.shape[0] // k_size, grid.shape[1] // k_size, grid.shape[2] // k_size)
+ )
+ for xx in range(0, int(np.floor(grid.shape[0] / k_size))):
+ for yy in range(0, int(np.floor(grid.shape[1] / k_size))):
+ for zz in range(0, int(np.floor(grid.shape[2] / k_size))):
+
+ sub_m = grid[
+ (xx * k_size) : (xx * k_size) + k_size,
+ (yy * k_size) : (yy * k_size) + k_size,
+ (zz * k_size) : (zz * k_size) + k_size,
+ ]
+ unique, counts = np.unique(sub_m, return_counts=True)
+ if True in ((unique != 0) & (unique != 255)):
+ # Remove counts with 0 and 255
+ counts = counts[((unique != 0) & (unique != 255))]
+ unique = unique[((unique != 0) & (unique != 255))]
+ else:
+ if True in (unique == 0):
+ counts = counts[(unique != 255)]
+ unique = unique[(unique != 255)]
+ value = unique[np.argmax(counts)]
+ result[xx, yy, zz] = value
+ return result
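+# e.g. the (256, 256, 32) prediction produced in app.predict, pooled with k_size=2,
+# becomes a (128, 128, 16) grid before rendering.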
+
+
+def draw(
+ voxels,
+ # T_velo_2_cam,
+ # vox_origin,
+ fov_mask,
+ # img_size,
+ # f,
+ voxel_size=0.4,
+ # d=7, # 7m - determine the size of the mesh representing the camera
+):
+
+ fov_mask = fov_mask.reshape(-1)
+ # Compute the voxels coordinates
+ grid_coords = get_grid_coords(
+ [voxels.shape[0], voxels.shape[1], voxels.shape[2]], voxel_size
+ )
+
+
+ # Attach the predicted class to every voxel
+ grid_coords = np.vstack([grid_coords.T, voxels.reshape(-1)]).T
+
+ # Get the voxels inside FOV
+ fov_grid_coords = grid_coords[fov_mask, :]
+
+ # Get the voxels outside FOV
+ outfov_grid_coords = grid_coords[~fov_mask, :]
+
+ # Remove empty and unknown voxels
+ fov_voxels = fov_grid_coords[
+ (fov_grid_coords[:, 3] > 0) & (fov_grid_coords[:, 3] < 255), :
+ ]
+ # print(np.unique(fov_voxels[:, 3], return_counts=True))
+ outfov_voxels = outfov_grid_coords[
+ (outfov_grid_coords[:, 3] > 0) & (outfov_grid_coords[:, 3] < 255), :
+ ]
+
+ # figure = mlab.figure(size=(1400, 1400), bgcolor=(1, 1, 1))
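+    # Per-class RGB palette indexed by predicted label; row 0 is unused here since
+    # empty (0) and unknown (255) voxels are filtered out above.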
+ colors = np.array(
+ [
+ [0,0,0],
+ [100, 150, 245],
+ [100, 230, 245],
+ [30, 60, 150],
+ [80, 30, 180],
+ [100, 80, 250],
+ [255, 30, 30],
+ [255, 40, 200],
+ [150, 30, 90],
+ [255, 0, 255],
+ [255, 150, 255],
+ [75, 0, 75],
+ [175, 0, 75],
+ [255, 200, 0],
+ [255, 120, 50],
+ [0, 175, 0],
+ [135, 60, 0],
+ [150, 240, 80],
+ [255, 240, 150],
+ [255, 0, 0],
+ ]
+ ).astype(np.uint8)
+
+ pts_colors = [f'rgb({colors[int(i)][0]}, {colors[int(i)][1]}, {colors[int(i)][2]})' for i in fov_voxels[:, 3]]
+ out_fov_colors = [f'rgb({colors[int(i)][0]//3*2}, {colors[int(i)][1]//3*2}, {colors[int(i)][2]//3*2})' for i in outfov_voxels[:, 3]]
+ pts_colors = pts_colors + out_fov_colors
+
+ fov_voxels = np.concatenate([fov_voxels, outfov_voxels], axis=0)
+ x = fov_voxels[:, 0].flatten()
+ y = fov_voxels[:, 1].flatten()
+ z = fov_voxels[:, 2].flatten()
+ # label = fov_voxels[:, 3].flatten()
+ fig = go.Figure(data=[go.Scatter3d(x=x, y=y, z=z,mode='markers',
+ marker=dict(
+ size=2,
+ color=pts_colors, # set color to an array/list of desired values
+ # colorscale='Viridis', # choose a colorscale
+ opacity=1.0,
+ symbol='square'
+ ))])
+ fig.update_layout(
+ scene = dict(
+ aspectmode='data',
+ xaxis = dict(
+ backgroundcolor="rgb(255, 255, 255)",
+ gridcolor="black",
+ showbackground=True,
+ zerolinecolor="black",
+ nticks=4,
+ visible=False,
+ range=[-1,55],),
+ yaxis = dict(
+ backgroundcolor="rgb(255, 255, 255)",
+ gridcolor="black",
+ showbackground=True,
+ zerolinecolor="black",
+ visible=False,
+ nticks=4, range=[-1,55],),
+ zaxis = dict(
+ backgroundcolor="rgb(255, 255, 255)",
+ gridcolor="black",
+ showbackground=True,
+ zerolinecolor="black",
+ visible=False,
+ nticks=4, range=[-1,7],),
+ bgcolor="black",
+ ),
+
+ )
+
+ # fig = px.scatter_3d(
+ # fov_voxels,
+ # x=fov_voxels[:, 0], y="y", z="z", color="label")
+ # Draw occupied inside FOV voxels
+ # plt_plot_fov = mlab.points3d(
+ # fov_voxels[:, 0],
+ # fov_voxels[:, 1],
+ # fov_voxels[:, 2],
+ # fov_voxels[:, 3],
+ # colormap="viridis",
+ # scale_factor=voxel_size - 0.05 * voxel_size,
+ # mode="cube",
+ # opacity=1.0,
+ # vmin=1,
+ # vmax=19,
+ # )
+
+ # # Draw occupied outside FOV voxels
+ # plt_plot_outfov = mlab.points3d(
+ # outfov_voxels[:, 0],
+ # outfov_voxels[:, 1],
+ # outfov_voxels[:, 2],
+ # outfov_voxels[:, 3],
+ # colormap="viridis",
+ # scale_factor=voxel_size - 0.05 * voxel_size,
+ # mode="cube",
+ # opacity=1.0,
+ # vmin=1,
+ # vmax=19,
+ # )
+
+
+
+ # plt_plot_fov.glyph.scale_mode = "scale_by_vector"
+ # plt_plot_outfov.glyph.scale_mode = "scale_by_vector"
+
+ # plt_plot_fov.module_manager.scalar_lut_manager.lut.table = colors
+
+ # outfov_colors = colors
+ # outfov_colors[:, :3] = outfov_colors[:, :3] // 3 * 2
+ # plt_plot_outfov.module_manager.scalar_lut_manager.lut.table = outfov_colors
+
+ # mlab.show()
+ return fig
\ No newline at end of file
diff --git a/images/08/000010.jpg b/images/08/000010.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..dce9bd4712215f082178d79da224fedcd7d1f324
Binary files /dev/null and b/images/08/000010.jpg differ
diff --git a/images/08/000085.jpg b/images/08/000085.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..92841f53fa1c483d5537341d2052c6a6921a8c07
Binary files /dev/null and b/images/08/000085.jpg differ
diff --git a/images/08/000103.jpg b/images/08/000103.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..14460f856654a3ad525c80c9aa28a9b2e59ff7e7
Binary files /dev/null and b/images/08/000103.jpg differ
diff --git a/images/08/000187.jpg b/images/08/000187.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..3f1027f581c2d1ea8cd9fb7fdce028a3db1c2105
Binary files /dev/null and b/images/08/000187.jpg differ
diff --git a/images/08/000234.jpg b/images/08/000234.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2399753ecd614b6b0d1239e22c0d422e434f28ee
Binary files /dev/null and b/images/08/000234.jpg differ
diff --git a/images/08/000290.jpg b/images/08/000290.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..d09734b6689231c78216da5fdbe48c1e075c5b91
Binary files /dev/null and b/images/08/000290.jpg differ
diff --git a/images/08/000295.jpg b/images/08/000295.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9113dad82318a509db95201dbab9b6ff834ecd9d
Binary files /dev/null and b/images/08/000295.jpg differ
diff --git a/images/08/000465.jpg b/images/08/000465.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..917d465fd5b9ec6065719e88d9d0cdf39fce5823
Binary files /dev/null and b/images/08/000465.jpg differ
diff --git a/images/08/000511.jpg b/images/08/000511.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..378e6bd8023e5f643e0f9df25d0ef27e6c286498
Binary files /dev/null and b/images/08/000511.jpg differ
diff --git a/images/08/000618.jpg b/images/08/000618.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6d036b40463933033371cbd8b821d8875dff2ae0
Binary files /dev/null and b/images/08/000618.jpg differ
diff --git a/images/08/000790.jpg b/images/08/000790.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..77aea4cbaec5ec03f6cd583a77da7a7e9e4fee88
Binary files /dev/null and b/images/08/000790.jpg differ
diff --git a/images/08/000822.jpg b/images/08/000822.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0139ce00faac15151767978b4c850b957f7f9298
Binary files /dev/null and b/images/08/000822.jpg differ
diff --git a/images/08/001005.jpg b/images/08/001005.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..117d1da2374d1b1107acd6ea4fac0a11193ccde7
Binary files /dev/null and b/images/08/001005.jpg differ
diff --git a/images/08/001122.jpg b/images/08/001122.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e1d8b7d6970c781e416ff1d3decd99241f568bd4
Binary files /dev/null and b/images/08/001122.jpg differ
diff --git a/images/08/001380.jpg b/images/08/001380.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..9683401a216e810cd633971365b38fefaa827061
Binary files /dev/null and b/images/08/001380.jpg differ
diff --git a/images/08/001385.jpg b/images/08/001385.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a53a048254d36d4ccab94611fef4b33188a49bb4
Binary files /dev/null and b/images/08/001385.jpg differ
diff --git a/images/08/001446.jpg b/images/08/001446.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..a058286e4be7273cac49ce3e600036b758c38b5e
Binary files /dev/null and b/images/08/001446.jpg differ
diff --git a/images/08/001530.jpg b/images/08/001530.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..69deae8b12a066d3159bff13006b0c8fd3cd123c
Binary files /dev/null and b/images/08/001530.jpg differ
diff --git a/images/08/001687.jpg b/images/08/001687.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6e6eb1f062bed3357d5465b38d1409e030eba8bf
Binary files /dev/null and b/images/08/001687.jpg differ
diff --git a/images/08/001842.jpg b/images/08/001842.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..482996d026e61f51f1ecf860395cf37a95efa833
Binary files /dev/null and b/images/08/001842.jpg differ
diff --git a/images/08/002010.jpg b/images/08/002010.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4bf5de984449de372b6ae8fe57bb7cf6dfa12d95
Binary files /dev/null and b/images/08/002010.jpg differ
diff --git a/images/08/002128.jpg b/images/08/002128.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..2d0b1a7b0cff4997b202ed1cb322285aee208de2
Binary files /dev/null and b/images/08/002128.jpg differ
diff --git a/images/08/002272.jpg b/images/08/002272.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..e0d2ebc9427cf5dc313f0d0c42b3e4301b328535
Binary files /dev/null and b/images/08/002272.jpg differ
diff --git a/images/08/002360.jpg b/images/08/002360.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..566632a905ecd04ae94b0ac599e56673ef92b92e
Binary files /dev/null and b/images/08/002360.jpg differ
diff --git a/images/08/002505.jpg b/images/08/002505.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..77b8cc2fb0fdc13604771274750bccc6ca05eae1
Binary files /dev/null and b/images/08/002505.jpg differ
diff --git a/images/08/002716.jpg b/images/08/002716.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5385ac65b015c458028b94d76d86a26cded7b203
Binary files /dev/null and b/images/08/002716.jpg differ
diff --git a/images/08/002944.jpg b/images/08/002944.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..6ad1e8a50cab8643acc1b7f238d16aeeb55611bf
Binary files /dev/null and b/images/08/002944.jpg differ
diff --git a/images/08/003149.jpg b/images/08/003149.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bab4c49d2515086351188e69af2cdd5db5e32fce
Binary files /dev/null and b/images/08/003149.jpg differ
diff --git a/images/08/003365.jpg b/images/08/003365.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..5d77465687501909179664683b2e0432ab5dedf1
Binary files /dev/null and b/images/08/003365.jpg differ
diff --git a/images/08/003533.jpg b/images/08/003533.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4b963deee51da8231a4be14854c571c1b1a822d0
Binary files /dev/null and b/images/08/003533.jpg differ
diff --git a/images/08/003790.jpg b/images/08/003790.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0e9ea9359144d5f6479a855dbc9d82d8bb527097
Binary files /dev/null and b/images/08/003790.jpg differ
diff --git a/images/08/003929.jpg b/images/08/003929.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..74cab6e5f72cdaa1b9badc124f33e19d2bef1540
Binary files /dev/null and b/images/08/003929.jpg differ
diff --git a/images/08/004059.jpg b/images/08/004059.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1620e3db907758dd12b4b4dc6ac7853916916540
Binary files /dev/null and b/images/08/004059.jpg differ
diff --git a/monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py b/monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..c88b7b309e6fe66f597cafe2a5eb8c6d29343b7e
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/CRP3D-checkpoint.py
@@ -0,0 +1,97 @@
+import torch
+import torch.nn as nn
+from monoscene.modules import (
+ Process,
+ ASPP,
+)
+
+
+class CPMegaVoxels(nn.Module):
+ def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003):
+ super().__init__()
+ self.size = size
+ self.n_relations = n_relations
+ print("n_relations", self.n_relations)
+ self.flatten_size = size[0] * size[1] * size[2]
+ self.feature = feature
+ self.context_feature = feature * 2
+ self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+ padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2)
+
+ self.mega_context = nn.Sequential(
+ nn.Conv3d(
+ feature, self.context_feature, stride=2, padding=padding, kernel_size=3
+ ),
+ )
+ self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+
+ self.context_prior_logits = nn.ModuleList(
+ [
+ nn.Sequential(
+ nn.Conv3d(
+ self.feature,
+ self.flatten_context_size,
+ padding=0,
+ kernel_size=1,
+ ),
+ )
+ for i in range(n_relations)
+ ]
+ )
+ self.aspp = ASPP(feature, [1, 2, 3])
+
+ self.resize = nn.Sequential(
+ nn.Conv3d(
+ self.context_feature * self.n_relations + feature,
+ feature,
+ kernel_size=1,
+ padding=0,
+ bias=False,
+ ),
+ Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]),
+ )
+
+ def forward(self, input):
+ ret = {}
+ bs = input.shape[0]
+
+ x_agg = self.aspp(input)
+
+ # get the mega context
+ x_mega_context_raw = self.mega_context(x_agg)
+ x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1)
+ x_mega_context = x_mega_context.permute(0, 2, 1)
+
+ # get context prior map
+ x_context_prior_logits = []
+ x_context_rels = []
+ for rel in range(self.n_relations):
+
+ # Compute the relation matrices
+ x_context_prior_logit = self.context_prior_logits[rel](x_agg)
+ x_context_prior_logit = x_context_prior_logit.reshape(
+ bs, self.flatten_context_size, self.flatten_size
+ )
+ x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1))
+
+ x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1)
+ x_context_prior = torch.sigmoid(x_context_prior_logit)
+
+ # Multiply the relation matrices with the mega context to gather context features
+ x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f
+ x_context_rels.append(x_context_rel)
+
+ x_context = torch.cat(x_context_rels, dim=2)
+ x_context = x_context.permute(0, 2, 1)
+ x_context = x_context.reshape(
+ bs, x_context.shape[1], self.size[0], self.size[1], self.size[2]
+ )
+
+ x = torch.cat([input, x_context], dim=1)
+ x = self.resize(x)
+
+ x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1)
+ ret["P_logits"] = x_context_prior_logits
+ ret["x"] = x
+
+ return ret
diff --git a/monoscene/.ipynb_checkpoints/config-checkpoint.py b/monoscene/.ipynb_checkpoints/config-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cb935d3d3a41c8973e72210323205607aff2dc5
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/config-checkpoint.py
@@ -0,0 +1,34 @@
+from transformers import PretrainedConfig
+from typing import List
+
+
+class MonoSceneConfig(PretrainedConfig):
+
+ def __init__(
+ self,
+ block_type="bottleneck",
+ layers: List[int] = [3, 4, 6, 3],
+ num_classes: int = 1000,
+ input_channels: int = 3,
+ cardinality: int = 1,
+ base_width: int = 64,
+ stem_width: int = 64,
+ stem_type: str = "",
+ avg_down: bool = False,
+ **kwargs,
+ ):
+ self.block_type = block_type
+ self.layers = layers
+ self.num_classes = num_classes
+ self.input_channels = input_channels
+ self.cardinality = cardinality
+ self.base_width = base_width
+ self.stem_width = stem_width
+ self.stem_type = stem_type
+ self.avg_down = avg_down
+ super().__init__(**kwargs)
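+        # NOTE: these fields mirror the Hugging Face custom-model (ResNet) template; the
+        # MonoScene hyperparameters read by MonoSceneModel (dataset, n_classes, feature,
+        # project_scale, full_scene_size) would be supplied through **kwargs.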
+
+
+
+
+
diff --git a/monoscene/.ipynb_checkpoints/modules-checkpoint.py b/monoscene/.ipynb_checkpoints/modules-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e8bf875ccd6dffb51bb5acb25f0302fe0032d6c
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/modules-checkpoint.py
@@ -0,0 +1,194 @@
+import torch
+import torch.nn as nn
+from monoscene.DDR import Bottleneck3D
+
+
+class ASPP(nn.Module):
+ """
+ ASPP 3D
+ Adapt from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+ """
+
+ def __init__(self, planes, dilations_conv_list):
+ super().__init__()
+
+ # ASPP Block
+ self.conv_list = dilations_conv_list
+ self.conv1 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn1 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.conv2 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn2 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.relu = nn.ReLU()
+
+ def forward(self, x_in):
+
+ y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+ for i in range(1, len(self.conv_list)):
+ y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+ x_in = self.relu(y + x_in) # modified
+
+ return x_in
+
+
+class SegmentationHead(nn.Module):
+ """
+ 3D Segmentation heads to retrieve semantic segmentation at each scale.
+ Formed by Dim expansion, Conv3D, ASPP block, Conv3D.
+ Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+ """
+
+ def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list):
+ super().__init__()
+
+ # First convolution
+ self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1)
+
+ # ASPP Block
+ self.conv_list = dilations_conv_list
+ self.conv1 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn1 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.conv2 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn2 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.relu = nn.ReLU()
+
+ self.conv_classes = nn.Conv3d(
+ planes, nbr_classes, kernel_size=3, padding=1, stride=1
+ )
+
+ def forward(self, x_in):
+
+ # Convolution to go from inplanes to planes features...
+ x_in = self.relu(self.conv0(x_in))
+
+ y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+ for i in range(1, len(self.conv_list)):
+ y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+ x_in = self.relu(y + x_in) # modified
+
+ x_in = self.conv_classes(x_in)
+
+ return x_in
+
+
+class ProcessKitti(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+        super(ProcessKitti, self).__init__()
+ self.main = nn.Sequential(
+ *[
+ Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ norm_layer=norm_layer,
+ dilation=[i, i, i],
+ )
+ for i in dilations
+ ]
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
+class Process(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+ super(Process, self).__init__()
+ self.main = nn.Sequential(
+ *[
+ Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ norm_layer=norm_layer,
+ dilation=[i, i, i],
+ )
+ for i in dilations
+ ]
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
+class Upsample(nn.Module):
+ def __init__(self, in_channels, out_channels, norm_layer, bn_momentum):
+ super(Upsample, self).__init__()
+ self.main = nn.Sequential(
+ nn.ConvTranspose3d(
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ dilation=1,
+ output_padding=1,
+ ),
+ norm_layer(out_channels, momentum=bn_momentum),
+ nn.ReLU(),
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
+class Downsample(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, expansion=8):
+ super(Downsample, self).__init__()
+ self.main = Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ expansion=expansion,
+ stride=2,
+ downsample=nn.Sequential(
+ nn.AvgPool3d(kernel_size=2, stride=2),
+ nn.Conv3d(
+ feature,
+ int(feature * expansion / 4),
+ kernel_size=1,
+ stride=1,
+ bias=False,
+ ),
+ norm_layer(int(feature * expansion / 4), momentum=bn_momentum),
+ ),
+ norm_layer=norm_layer,
+ )
+
+ def forward(self, x):
+ return self.main(x)
diff --git a/monoscene/.ipynb_checkpoints/monoscene-checkpoint.py b/monoscene/.ipynb_checkpoints/monoscene-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc4d020729b6698887055771439f87a491572bd1
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/monoscene-checkpoint.py
@@ -0,0 +1,123 @@
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
+from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
+from monoscene.flosp import FLoSP
+import numpy as np
+import torch.nn.functional as F
+from monoscene.unet2d import UNet2D
+
+
+class MonoScene(pl.LightningModule):
+ def __init__(
+ self,
+ n_classes,
+ feature,
+ project_scale,
+ full_scene_size,
+ dataset,
+ n_relations=4,
+ context_prior=True,
+ fp_loss=True,
+ project_res=[],
+ frustum_size=4,
+ relation_loss=False,
+ CE_ssc_loss=True,
+ geo_scal_loss=True,
+ sem_scal_loss=True,
+ lr=1e-4,
+ weight_decay=1e-4,
+ ):
+ super().__init__()
+
+ self.project_res = project_res
+ self.fp_loss = fp_loss
+ self.dataset = dataset
+ self.context_prior = context_prior
+ self.frustum_size = frustum_size
+ self.relation_loss = relation_loss
+ self.CE_ssc_loss = CE_ssc_loss
+ self.sem_scal_loss = sem_scal_loss
+ self.geo_scal_loss = geo_scal_loss
+ self.project_scale = project_scale
+ self.lr = lr
+ self.weight_decay = weight_decay
+
+ self.projects = {}
+ self.scale_2ds = [1, 2, 4, 8] # 2D scales
+ for scale_2d in self.scale_2ds:
+ self.projects[str(scale_2d)] = FLoSP(
+ full_scene_size, project_scale=self.project_scale, dataset=self.dataset
+ )
+ self.projects = nn.ModuleDict(self.projects)
+
+ self.n_classes = n_classes
+ if self.dataset == "NYU":
+ self.net_3d_decoder = UNet3DNYU(
+ self.n_classes,
+ nn.BatchNorm3d,
+ n_relations=n_relations,
+ feature=feature,
+ full_scene_size=full_scene_size,
+ context_prior=context_prior,
+ )
+ elif self.dataset == "kitti":
+ self.net_3d_decoder = UNet3DKitti(
+ self.n_classes,
+ nn.BatchNorm3d,
+ project_scale=project_scale,
+ feature=feature,
+ full_scene_size=full_scene_size,
+ context_prior=context_prior,
+ )
+ self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
+
+ def forward(self, batch):
+
+ img = batch["img"]
+ bs = len(img)
+
+ out = {}
+
+ x_rgb = self.net_rgb(img)
+
+ x3ds = []
+ for i in range(bs):
+ x3d = None
+ for scale_2d in self.project_res:
+
+ # project features at each 2D scale to target 3D scale
+ scale_2d = int(scale_2d)
+ projected_pix = batch["projected_pix_{}".format(self.project_scale)][i].cuda()
+ fov_mask = batch["fov_mask_{}".format(self.project_scale)][i].cuda()
+
+ # Sum all the 3D features
+ if x3d is None:
+ x3d = self.projects[str(scale_2d)](
+ x_rgb["1_" + str(scale_2d)][i],
+ projected_pix // scale_2d,
+ fov_mask,
+ )
+ else:
+ x3d += self.projects[str(scale_2d)](
+ x_rgb["1_" + str(scale_2d)][i],
+ projected_pix // scale_2d,
+ fov_mask,
+ )
+ x3ds.append(x3d)
+
+ input_dict = {
+ "x3d": torch.stack(x3ds),
+ }
+
+ out_dict = self.net_3d_decoder(input_dict)
+
+ ssc_pred = out_dict["ssc_logit"]
+
+ y_pred = ssc_pred.detach().cpu().numpy()
+ y_pred = np.argmax(y_pred, axis=1)
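+        # With the configuration used in app.py (kitti, full_scene_size=(256, 256, 32),
+        # n_classes=20), y_pred has shape (batch, 256, 256, 32) with class indices in [0, 19].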
+
+ return y_pred
+
+
diff --git a/monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py b/monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bf3d80ea531ff02b3229b862b7a4cd0aec8ec58
--- /dev/null
+++ b/monoscene/.ipynb_checkpoints/monoscene_model-checkpoint.py
@@ -0,0 +1,22 @@
+from transformers import PreTrainedModel
+from .config import MonoSceneConfig
+from monoscene.monoscene import MonoScene
+
+
+
+class MonoSceneModel(PreTrainedModel):
+    config_class = MonoSceneConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = MonoScene(
+ dataset=config.dataset,
+ n_classes=config.n_classes,
+ feature=config.feature,
+ project_scale=config.project_scale,
+ full_scene_size=config.full_scene_size
+ )
+
+
+ def forward(self, tensor):
+ return self.model.forward(tensor)
\ No newline at end of file
diff --git a/monoscene/CRP3D.py b/monoscene/CRP3D.py
new file mode 100644
index 0000000000000000000000000000000000000000..c88b7b309e6fe66f597cafe2a5eb8c6d29343b7e
--- /dev/null
+++ b/monoscene/CRP3D.py
@@ -0,0 +1,97 @@
+import torch
+import torch.nn as nn
+from monoscene.modules import (
+ Process,
+ ASPP,
+)
+
+
+class CPMegaVoxels(nn.Module):
+ def __init__(self, feature, size, n_relations=4, bn_momentum=0.0003):
+ super().__init__()
+ self.size = size
+ self.n_relations = n_relations
+ print("n_relations", self.n_relations)
+ self.flatten_size = size[0] * size[1] * size[2]
+ self.feature = feature
+ self.context_feature = feature * 2
+ self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
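+        # pad even-sized axes by 1 so the stride-2 conv below outputs exactly
+        # size // 2 voxels per dimension, matching flatten_context_size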
+ padding = ((size[0] + 1) % 2, (size[1] + 1) % 2, (size[2] + 1) % 2)
+
+ self.mega_context = nn.Sequential(
+ nn.Conv3d(
+ feature, self.context_feature, stride=2, padding=padding, kernel_size=3
+ ),
+ )
+ self.flatten_context_size = (size[0] // 2) * (size[1] // 2) * (size[2] // 2)
+
+ self.context_prior_logits = nn.ModuleList(
+ [
+ nn.Sequential(
+ nn.Conv3d(
+ self.feature,
+ self.flatten_context_size,
+ padding=0,
+ kernel_size=1,
+ ),
+ )
+ for i in range(n_relations)
+ ]
+ )
+ self.aspp = ASPP(feature, [1, 2, 3])
+
+ self.resize = nn.Sequential(
+ nn.Conv3d(
+ self.context_feature * self.n_relations + feature,
+ feature,
+ kernel_size=1,
+ padding=0,
+ bias=False,
+ ),
+ Process(feature, nn.BatchNorm3d, bn_momentum, dilations=[1]),
+ )
+
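+    # Aggregate with ASPP, downsample to "mega context" voxels, predict one relation
+    # matrix per head, then gather context features back onto every voxel.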
+ def forward(self, input):
+ ret = {}
+ bs = input.shape[0]
+
+ x_agg = self.aspp(input)
+
+ # get the mega context
+ x_mega_context_raw = self.mega_context(x_agg)
+ x_mega_context = x_mega_context_raw.reshape(bs, self.context_feature, -1)
+ x_mega_context = x_mega_context.permute(0, 2, 1)
+
+ # get context prior map
+ x_context_prior_logits = []
+ x_context_rels = []
+ for rel in range(self.n_relations):
+
+ # Compute the relation matrices
+ x_context_prior_logit = self.context_prior_logits[rel](x_agg)
+ x_context_prior_logit = x_context_prior_logit.reshape(
+ bs, self.flatten_context_size, self.flatten_size
+ )
+ x_context_prior_logits.append(x_context_prior_logit.unsqueeze(1))
+
+ x_context_prior_logit = x_context_prior_logit.permute(0, 2, 1)
+ x_context_prior = torch.sigmoid(x_context_prior_logit)
+
+ # Multiply the relation matrices with the mega context to gather context features
+ x_context_rel = torch.bmm(x_context_prior, x_mega_context) # bs, N, f
+ x_context_rels.append(x_context_rel)
+
+ x_context = torch.cat(x_context_rels, dim=2)
+ x_context = x_context.permute(0, 2, 1)
+ x_context = x_context.reshape(
+ bs, x_context.shape[1], self.size[0], self.size[1], self.size[2]
+ )
+
+ x = torch.cat([input, x_context], dim=1)
+ x = self.resize(x)
+
+ x_context_prior_logits = torch.cat(x_context_prior_logits, dim=1)
+ ret["P_logits"] = x_context_prior_logits
+ ret["x"] = x
+
+ return ret
diff --git a/monoscene/DDR.py b/monoscene/DDR.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d0928c0741433dc24523a2c26bfad9ef1ff920e
--- /dev/null
+++ b/monoscene/DDR.py
@@ -0,0 +1,139 @@
+"""
+Most of the code in this file is taken from https://github.com/waterljwant/SSC/blob/master/models/DDR.py
+"""
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class SimpleRB(nn.Module):
+ def __init__(self, in_channel, norm_layer, bn_momentum):
+ super(SimpleRB, self).__init__()
+ self.path = nn.Sequential(
+ nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False),
+ norm_layer(in_channel, momentum=bn_momentum),
+ nn.ReLU(),
+ nn.Conv3d(in_channel, in_channel, kernel_size=3, padding=1, bias=False),
+ norm_layer(in_channel, momentum=bn_momentum),
+ )
+ self.relu = nn.ReLU()
+
+ def forward(self, x):
+ residual = x
+ conv_path = self.path(x)
+ out = residual + conv_path
+ out = self.relu(out)
+ return out
+
+
+"""
+3D Residual Block: a 3x3x3 conv decomposed into three smaller 3D convs, adapted from DDRNet
+"""
+
+
+class Bottleneck3D(nn.Module):
+ def __init__(
+ self,
+ inplanes,
+ planes,
+ norm_layer,
+ stride=1,
+ dilation=[1, 1, 1],
+ expansion=4,
+ downsample=None,
+ fist_dilation=1,
+ multi_grid=1,
+ bn_momentum=0.0003,
+ ):
+ super(Bottleneck3D, self).__init__()
+        # often, planes = inplanes // 4
+ self.expansion = expansion
+ self.conv1 = nn.Conv3d(inplanes, planes, kernel_size=1, bias=False)
+ self.bn1 = norm_layer(planes, momentum=bn_momentum)
+ self.conv2 = nn.Conv3d(
+ planes,
+ planes,
+ kernel_size=(1, 1, 3),
+ stride=(1, 1, stride),
+ dilation=(1, 1, dilation[0]),
+ padding=(0, 0, dilation[0]),
+ bias=False,
+ )
+ self.bn2 = norm_layer(planes, momentum=bn_momentum)
+ self.conv3 = nn.Conv3d(
+ planes,
+ planes,
+ kernel_size=(1, 3, 1),
+ stride=(1, stride, 1),
+ dilation=(1, dilation[1], 1),
+ padding=(0, dilation[1], 0),
+ bias=False,
+ )
+ self.bn3 = norm_layer(planes, momentum=bn_momentum)
+ self.conv4 = nn.Conv3d(
+ planes,
+ planes,
+ kernel_size=(3, 1, 1),
+ stride=(stride, 1, 1),
+ dilation=(dilation[2], 1, 1),
+ padding=(dilation[2], 0, 0),
+ bias=False,
+ )
+ self.bn4 = norm_layer(planes, momentum=bn_momentum)
+ self.conv5 = nn.Conv3d(
+ planes, planes * self.expansion, kernel_size=(1, 1, 1), bias=False
+ )
+ self.bn5 = norm_layer(planes * self.expansion, momentum=bn_momentum)
+
+ self.relu = nn.ReLU(inplace=False)
+ self.relu_inplace = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.dilation = dilation
+ self.stride = stride
+
+ self.downsample2 = nn.Sequential(
+ nn.AvgPool3d(kernel_size=(1, stride, 1), stride=(1, stride, 1)),
+ nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False),
+ norm_layer(planes, momentum=bn_momentum),
+ )
+ self.downsample3 = nn.Sequential(
+ nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)),
+ nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False),
+ norm_layer(planes, momentum=bn_momentum),
+ )
+ self.downsample4 = nn.Sequential(
+ nn.AvgPool3d(kernel_size=(stride, 1, 1), stride=(stride, 1, 1)),
+ nn.Conv3d(planes, planes, kernel_size=1, stride=1, bias=False),
+ norm_layer(planes, momentum=bn_momentum),
+ )
+
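+    # The 3x3x3 convolution is factored into three 1D convolutions (one per axis),
+    # with intermediate residual additions; downsample2/3/4 realign shapes when stride > 1.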
+ def forward(self, x):
+ residual = x
+
+ out1 = self.relu(self.bn1(self.conv1(x)))
+ out2 = self.bn2(self.conv2(out1))
+ out2_relu = self.relu(out2)
+
+ out3 = self.bn3(self.conv3(out2_relu))
+ if self.stride != 1:
+ out2 = self.downsample2(out2)
+ out3 = out3 + out2
+ out3_relu = self.relu(out3)
+
+ out4 = self.bn4(self.conv4(out3_relu))
+ if self.stride != 1:
+ out2 = self.downsample3(out2)
+ out3 = self.downsample4(out3)
+ out4 = out4 + out2 + out3
+
+ out4_relu = self.relu(out4)
+ out5 = self.bn5(self.conv5(out4_relu))
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out = out5 + residual
+ out_relu = self.relu(out)
+
+ return out_relu
diff --git a/monoscene/__init__.py b/monoscene/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/monoscene/app.py b/monoscene/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e70631e75313a28bc978ac3d3bd5df28b61a552
--- /dev/null
+++ b/monoscene/app.py
@@ -0,0 +1,138 @@
+from pytorch_lightning import Trainer
+from monoscene.models.monoscene import MonoScene
+from monoscene.data.NYU.nyu_dm import NYUDataModule
+from monoscene.data.semantic_kitti.kitti_dm import KittiDataModule
+from monoscene.data.kitti_360.kitti_360_dm import Kitti360DataModule
+# import hydra
+from omegaconf import DictConfig
+import torch
+import numpy as np
+import os
+from hydra.utils import get_original_cwd
+import gradio as gr
+import plotly.express as px
+import pandas as pd
+
+
+# @hydra.main(config_name="../config/monoscene.yaml")
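+# NOTE: this standalone demo loads the checkpoint but currently returns a dummy
+# 3D scatter plot; the commented-out blocks show the original dataset / inference flow.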
+def plot(input_img):
+ torch.set_grad_enabled(False)
+
+ # Setup dataloader
+ # if config.dataset == "kitti" or config.dataset == "kitti_360":
+ feature = 64
+ project_scale = 2
+ full_scene_size = (256, 256, 32)
+
+ # if config.dataset == "kitti":
+ # data_module = KittiDataModule(
+ # root=config.kitti_root,
+ # preprocess_root=config.kitti_preprocess_root,
+ # frustum_size=config.frustum_size,
+ # batch_size=int(config.batch_size / config.n_gpus),
+ # num_workers=int(config.num_workers_per_gpu * config.n_gpus),
+ # )
+ # data_module.setup()
+ # data_loader = data_module.val_dataloader()
+ # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
+ # else:
+ # data_module = Kitti360DataModule(
+ # root=config.kitti_360_root,
+ # sequences=[config.kitti_360_sequence],
+ # n_scans=2000,
+ # batch_size=1,
+ # num_workers=3,
+ # )
+ # data_module.setup()
+ # data_loader = data_module.dataloader()
+
+ # elif config.dataset == "NYU":
+ # project_scale = 1
+ # feature = 200
+ # full_scene_size = (60, 36, 60)
+ # data_module = NYUDataModule(
+ # root=config.NYU_root,
+ # preprocess_root=config.NYU_preprocess_root,
+ # n_relations=config.n_relations,
+ # frustum_size=config.frustum_size,
+ # batch_size=int(config.batch_size / config.n_gpus),
+ # num_workers=int(config.num_workers_per_gpu * config.n_gpus),
+ # )
+ # data_module.setup()
+ # data_loader = data_module.val_dataloader()
+ # # data_loader = data_module.test_dataloader() # use this if you want to infer on test set
+ # else:
+ # print("dataset not support")
+
+ # Load pretrained models
+ # if config.dataset == "NYU":
+ # model_path = os.path.join(
+ # get_original_cwd(), "trained_models", "monoscene_nyu.ckpt"
+ # )
+ # else:
+ # model_path = os.path.join(
+ # get_original_cwd(), "trained_models", "monoscene_kitti.ckpt"
+ # )
+ model_path = "trained_models/monoscene_kitti.ckpt"
+
+ model = MonoScene.load_from_checkpoint(
+ model_path,
+ feature=feature,
+ project_scale=project_scale,
+ fp_loss=False,
+ full_scene_size=full_scene_size,
+ )
+ model.cuda()
+ model.eval()
+
+ print(input_img.shape)
+
+ x = np.arange(12).reshape(4, 3) / 12
+ data = pd.DataFrame(data=x, columns=['x', 'y', 'z'])
+ fig = px.scatter_3d(data, x="x", y="y", z="z")
+ return fig
+
+demo = gr.Interface(plot, gr.Image(shape=(200, 200)), gr.Plot())
+demo.launch()
+
+
+
+ # Save prediction and additional data
+ # to draw the viewing frustum and remove scene outside the room for NYUv2
+ # output_path = os.path.join(config.output_path, config.dataset)
+ # with torch.no_grad():
+ # for batch in tqdm(data_loader):
+ # batch["img"] = batch["img"].cuda()
+ # pred = model(batch)
+ # y_pred = torch.softmax(pred["ssc_logit"], dim=1).detach().cpu().numpy()
+ # y_pred = np.argmax(y_pred, axis=1)
+ # for i in range(config.batch_size):
+ # out_dict = {"y_pred": y_pred[i].astype(np.uint16)}
+ # if "target" in batch:
+ # out_dict["target"] = (
+ # batch["target"][i].detach().cpu().numpy().astype(np.uint16)
+ # )
+
+ # if config.dataset == "NYU":
+ # write_path = output_path
+ # filepath = os.path.join(write_path, batch["name"][i] + ".pkl")
+ # out_dict["cam_pose"] = batch["cam_pose"][i].detach().cpu().numpy()
+ # out_dict["vox_origin"] = (
+ # batch["vox_origin"][i].detach().cpu().numpy()
+ # )
+ # else:
+ # write_path = os.path.join(output_path, batch["sequence"][i])
+ # filepath = os.path.join(write_path, batch["frame_id"][i] + ".pkl")
+ # out_dict["fov_mask_1"] = (
+ # batch["fov_mask_1"][i].detach().cpu().numpy()
+ # )
+ # out_dict["cam_k"] = batch["cam_k"][i].detach().cpu().numpy()
+ # out_dict["T_velo_2_cam"] = (
+ # batch["T_velo_2_cam"][i].detach().cpu().numpy()
+ # )
+
+ # os.makedirs(write_path, exist_ok=True)
+ # with open(filepath, "wb") as handle:
+ # pickle.dump(out_dict, handle)
+ # print("wrote to", filepath)
\ No newline at end of file
diff --git a/monoscene/config.py b/monoscene/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..e03e806ad5e0c7ea4c439e3e82d955e3c0b3038f
--- /dev/null
+++ b/monoscene/config.py
@@ -0,0 +1,26 @@
+from transformers import PretrainedConfig
+from typing import List
+
+
+class MonoSceneConfig(PretrainedConfig):
+
+ def __init__(
+ self,
+ dataset="kitti",
+ n_classes=20,
+ feature=64,
+ project_scale=2,
+ full_scene_size=(256, 256, 32),
+ **kwargs,
+ ):
+ self.dataset = dataset
+ self.n_classes = n_classes
+ self.feature = feature
+ self.project_scale = project_scale
+ self.full_scene_size = full_scene_size
+ super().__init__(**kwargs)
+
+
+
+
+
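+# Illustrative usage (assumes the KITTI defaults above together with
+# monoscene/monoscene_model.py):
+#
+#   config = MonoSceneConfig(dataset="kitti", n_classes=20)
+#   model = MonoSceneModel(config)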
diff --git a/monoscene/flosp.py b/monoscene/flosp.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d502197a72ee120773a47f239e86743f5a1e2d4
--- /dev/null
+++ b/monoscene/flosp.py
@@ -0,0 +1,41 @@
+import torch
+import torch.nn as nn
+
+
+class FLoSP(nn.Module):
+ def __init__(self, scene_size, dataset, project_scale):
+ super().__init__()
+ self.scene_size = scene_size
+ self.dataset = dataset
+ self.project_scale = project_scale
+
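+    # FLoSP: lift 2D features into the 3D scene by gathering, for every voxel, the
+    # feature of the pixel it projects to; out-of-FOV voxels index the appended zero column.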
+ def forward(self, x2d, projected_pix, fov_mask):
+ c, h, w = x2d.shape
+
+ src = x2d.view(c, -1)
+ zeros_vec = torch.zeros(c, 1).type_as(src)
+ src = torch.cat([src, zeros_vec], 1)
+
+ pix_x, pix_y = projected_pix[:, 0], projected_pix[:, 1]
+ img_indices = pix_y * w + pix_x
+ img_indices[~fov_mask] = h * w
+ img_indices = img_indices.expand(c, -1).long() # c, HWD
+ src_feature = torch.gather(src, 1, img_indices)
+
+ if self.dataset == "NYU":
+ x3d = src_feature.reshape(
+ c,
+ self.scene_size[0] // self.project_scale,
+ self.scene_size[2] // self.project_scale,
+ self.scene_size[1] // self.project_scale,
+ )
+ x3d = x3d.permute(0, 1, 3, 2)
+ elif self.dataset == "kitti":
+ x3d = src_feature.reshape(
+ c,
+ self.scene_size[0] // self.project_scale,
+ self.scene_size[1] // self.project_scale,
+ self.scene_size[2] // self.project_scale,
+ )
+
+ return x3d
diff --git a/monoscene/modules.py b/monoscene/modules.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e8bf875ccd6dffb51bb5acb25f0302fe0032d6c
--- /dev/null
+++ b/monoscene/modules.py
@@ -0,0 +1,194 @@
+import torch
+import torch.nn as nn
+from monoscene.DDR import Bottleneck3D
+
+
+class ASPP(nn.Module):
+ """
+ ASPP 3D
+    Adapted from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+ """
+
+ def __init__(self, planes, dilations_conv_list):
+ super().__init__()
+
+ # ASPP Block
+ self.conv_list = dilations_conv_list
+ self.conv1 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn1 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.conv2 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn2 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.relu = nn.ReLU()
+
+ def forward(self, x_in):
+
+ y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+ for i in range(1, len(self.conv_list)):
+ y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+ x_in = self.relu(y + x_in) # modified
+
+ return x_in
+
+
+class SegmentationHead(nn.Module):
+ """
+ 3D Segmentation heads to retrieve semantic segmentation at each scale.
+ Formed by Dim expansion, Conv3D, ASPP block, Conv3D.
+ Taken from https://github.com/cv-rits/LMSCNet/blob/main/LMSCNet/models/LMSCNet.py#L7
+ """
+
+ def __init__(self, inplanes, planes, nbr_classes, dilations_conv_list):
+ super().__init__()
+
+ # First convolution
+ self.conv0 = nn.Conv3d(inplanes, planes, kernel_size=3, padding=1, stride=1)
+
+ # ASPP Block
+ self.conv_list = dilations_conv_list
+ self.conv1 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn1 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.conv2 = nn.ModuleList(
+ [
+ nn.Conv3d(
+ planes, planes, kernel_size=3, padding=dil, dilation=dil, bias=False
+ )
+ for dil in dilations_conv_list
+ ]
+ )
+ self.bn2 = nn.ModuleList(
+ [nn.BatchNorm3d(planes) for dil in dilations_conv_list]
+ )
+ self.relu = nn.ReLU()
+
+ self.conv_classes = nn.Conv3d(
+ planes, nbr_classes, kernel_size=3, padding=1, stride=1
+ )
+
+ def forward(self, x_in):
+
+ # Convolution to go from inplanes to planes features...
+ x_in = self.relu(self.conv0(x_in))
+
+ y = self.bn2[0](self.conv2[0](self.relu(self.bn1[0](self.conv1[0](x_in)))))
+ for i in range(1, len(self.conv_list)):
+ y += self.bn2[i](self.conv2[i](self.relu(self.bn1[i](self.conv1[i](x_in)))))
+ x_in = self.relu(y + x_in) # modified
+
+ x_in = self.conv_classes(x_in)
+
+ return x_in
+
+
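+# Same residual bottleneck stack as Process below (appears unused in this repository).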
+class ProcessKitti(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+        super(ProcessKitti, self).__init__()
+ self.main = nn.Sequential(
+ *[
+ Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ norm_layer=norm_layer,
+ dilation=[i, i, i],
+ )
+ for i in dilations
+ ]
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
+class Process(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, dilations=[1, 2, 3]):
+ super(Process, self).__init__()
+ self.main = nn.Sequential(
+ *[
+ Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ norm_layer=norm_layer,
+ dilation=[i, i, i],
+ )
+ for i in dilations
+ ]
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
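+# Transposed 3D convolution that doubles every spatial dimension.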
+class Upsample(nn.Module):
+ def __init__(self, in_channels, out_channels, norm_layer, bn_momentum):
+ super(Upsample, self).__init__()
+ self.main = nn.Sequential(
+ nn.ConvTranspose3d(
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ dilation=1,
+ output_padding=1,
+ ),
+ norm_layer(out_channels, momentum=bn_momentum),
+ nn.ReLU(),
+ )
+
+ def forward(self, x):
+ return self.main(x)
+
+
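+# Stride-2 Bottleneck3D that halves every spatial dimension; with the default
+# expansion=8 the channel count doubles (feature -> 2 * feature).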
+class Downsample(nn.Module):
+ def __init__(self, feature, norm_layer, bn_momentum, expansion=8):
+ super(Downsample, self).__init__()
+ self.main = Bottleneck3D(
+ feature,
+ feature // 4,
+ bn_momentum=bn_momentum,
+ expansion=expansion,
+ stride=2,
+ downsample=nn.Sequential(
+ nn.AvgPool3d(kernel_size=2, stride=2),
+ nn.Conv3d(
+ feature,
+ int(feature * expansion / 4),
+ kernel_size=1,
+ stride=1,
+ bias=False,
+ ),
+ norm_layer(int(feature * expansion / 4), momentum=bn_momentum),
+ ),
+ norm_layer=norm_layer,
+ )
+
+ def forward(self, x):
+ return self.main(x)
diff --git a/monoscene/monoscene.py b/monoscene/monoscene.py
new file mode 100644
index 0000000000000000000000000000000000000000..d8dd444c86ac9b38494e7fc0f685504ae2f25a56
--- /dev/null
+++ b/monoscene/monoscene.py
@@ -0,0 +1,125 @@
+import pytorch_lightning as pl
+import torch
+import torch.nn as nn
+from monoscene.unet3d_nyu import UNet3D as UNet3DNYU
+from monoscene.unet3d_kitti import UNet3D as UNet3DKitti
+from monoscene.flosp import FLoSP
+import numpy as np
+import torch.nn.functional as F
+from monoscene.unet2d import UNet2D
+
+
+class MonoScene(pl.LightningModule):
+ def __init__(
+ self,
+ n_classes,
+ feature,
+ project_scale,
+ full_scene_size,
+ dataset,
+ project_res=["1", "2", "4", "8"],
+ n_relations=4,
+ context_prior=True,
+ fp_loss=True,
+ frustum_size=4,
+ relation_loss=False,
+ CE_ssc_loss=True,
+ geo_scal_loss=True,
+ sem_scal_loss=True,
+ lr=1e-4,
+ weight_decay=1e-4,
+ ):
+ super().__init__()
+
+ self.project_res = project_res
+ self.fp_loss = fp_loss
+ self.dataset = dataset
+ self.context_prior = context_prior
+ self.frustum_size = frustum_size
+ self.relation_loss = relation_loss
+ self.CE_ssc_loss = CE_ssc_loss
+ self.sem_scal_loss = sem_scal_loss
+ self.geo_scal_loss = geo_scal_loss
+ self.project_scale = project_scale
+ self.lr = lr
+ self.weight_decay = weight_decay
+
+ self.projects = {}
+ self.scale_2ds = [1, 2, 4, 8] # 2D scales
+ for scale_2d in self.scale_2ds:
+ self.projects[str(scale_2d)] = FLoSP(
+ full_scene_size, project_scale=self.project_scale, dataset=self.dataset
+ )
+ self.projects = nn.ModuleDict(self.projects)
+
+ self.n_classes = n_classes
+ if self.dataset == "NYU":
+ self.net_3d_decoder = UNet3DNYU(
+ self.n_classes,
+ nn.BatchNorm3d,
+ n_relations=n_relations,
+ feature=feature,
+ full_scene_size=full_scene_size,
+ context_prior=context_prior,
+ )
+ elif self.dataset == "kitti":
+ self.net_3d_decoder = UNet3DKitti(
+ self.n_classes,
+ nn.BatchNorm3d,
+ project_scale=project_scale,
+ feature=feature,
+ full_scene_size=full_scene_size,
+ context_prior=context_prior,
+ )
+ self.net_rgb = UNet2D.build(out_feature=feature, use_decoder=True)
+
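+    # Inference-only forward pass: encode the RGB image, lift multi-scale 2D features
+    # into one 3D volume with FLoSP, decode with the 3D UNet and return the per-voxel
+    # argmax class map as a numpy array.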
+ def forward(self, batch):
+
+ img = batch["img"]
+ bs = len(img)
+
+ out = {}
+
+ x_rgb = self.net_rgb(img)
+
+ x3ds = []
+ for i in range(bs):
+ x3d = None
+ for scale_2d in self.project_res:
+
+ # project features at each 2D scale to target 3D scale
+ scale_2d = int(scale_2d)
+ projected_pix = batch["projected_pix_{}".format(self.project_scale)][i]#.cuda()
+ fov_mask = batch["fov_mask_{}".format(self.project_scale)][i]#.cuda()
+
+ # Sum all the 3D features
+ if x3d is None:
+ x3d = self.projects[str(scale_2d)](
+ x_rgb["1_" + str(scale_2d)][i],
+ # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
+ projected_pix // scale_2d,
+ fov_mask,
+ )
+ else:
+ x3d += self.projects[str(scale_2d)](
+ x_rgb["1_" + str(scale_2d)][i],
+ # torch.div(projected_pix, scale_2d, rounding_mode='floor'),
+ projected_pix // scale_2d,
+ fov_mask,
+ )
+ x3ds.append(x3d)
+
+ input_dict = {
+ "x3d": torch.stack(x3ds),
+ }
+
+ out_dict = self.net_3d_decoder(input_dict)
+
+ ssc_pred = out_dict["ssc_logit"]
+
+ y_pred = ssc_pred.detach().cpu().numpy()
+ y_pred = np.argmax(y_pred, axis=1)
+
+ return y_pred
+
+
diff --git a/monoscene/monoscene_model.py b/monoscene/monoscene_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a5207f3d03de86192c5d41a8bdfe3ce32e672ab
--- /dev/null
+++ b/monoscene/monoscene_model.py
@@ -0,0 +1,21 @@
+from transformers import PreTrainedModel
+from .config import MonoSceneConfig
+from monoscene.monoscene import MonoScene
+
+
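+# Thin wrapper exposing the MonoScene network through the transformers PreTrainedModel API.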
+class MonoSceneModel(PreTrainedModel):
+ config_class = MonoSceneConfig
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = MonoScene(
+ dataset=config.dataset,
+ n_classes=config.n_classes,
+ feature=config.feature,
+ project_scale=config.project_scale,
+ full_scene_size=config.full_scene_size
+ )
+
+
+ def forward(self, tensor):
+ return self.model.forward(tensor)
\ No newline at end of file
diff --git a/monoscene/unet2d.py b/monoscene/unet2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1c9e45553b1c7e083436778c6e963545446d008
--- /dev/null
+++ b/monoscene/unet2d.py
@@ -0,0 +1,198 @@
+"""
+Code adapted from https://github.com/shariqfarooq123/AdaBins/blob/main/models/unet_adaptive_bins.py
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import os
+
+
+class UpSampleBN(nn.Module):
+ def __init__(self, skip_input, output_features):
+ super(UpSampleBN, self).__init__()
+ self._net = nn.Sequential(
+ nn.Conv2d(skip_input, output_features, kernel_size=3, stride=1, padding=1),
+ nn.BatchNorm2d(output_features),
+ nn.LeakyReLU(),
+ nn.Conv2d(
+ output_features, output_features, kernel_size=3, stride=1, padding=1
+ ),
+ nn.BatchNorm2d(output_features),
+ nn.LeakyReLU(),
+ )
+
+ def forward(self, x, concat_with):
+ up_x = F.interpolate(
+ x,
+ size=(concat_with.shape[2], concat_with.shape[3]),
+ mode="bilinear",
+ align_corners=True,
+ )
+ f = torch.cat([up_x, concat_with], dim=1)
+ return self._net(f)
+
+
+class DecoderBN(nn.Module):
+ def __init__(
+ self, num_features, bottleneck_features, out_feature, use_decoder=True
+ ):
+ super(DecoderBN, self).__init__()
+ features = int(num_features)
+ self.use_decoder = use_decoder
+
+ self.conv2 = nn.Conv2d(
+ bottleneck_features, features, kernel_size=1, stride=1, padding=1
+ )
+
+ self.out_feature_1_1 = out_feature
+ self.out_feature_1_2 = out_feature
+ self.out_feature_1_4 = out_feature
+ self.out_feature_1_8 = out_feature
+ self.out_feature_1_16 = out_feature
+ self.feature_1_16 = features // 2
+ self.feature_1_8 = features // 4
+ self.feature_1_4 = features // 8
+ self.feature_1_2 = features // 16
+ self.feature_1_1 = features // 32
+
+ if self.use_decoder:
+ self.resize_output_1_1 = nn.Conv2d(
+ self.feature_1_1, self.out_feature_1_1, kernel_size=1
+ )
+ self.resize_output_1_2 = nn.Conv2d(
+ self.feature_1_2, self.out_feature_1_2, kernel_size=1
+ )
+ self.resize_output_1_4 = nn.Conv2d(
+ self.feature_1_4, self.out_feature_1_4, kernel_size=1
+ )
+ self.resize_output_1_8 = nn.Conv2d(
+ self.feature_1_8, self.out_feature_1_8, kernel_size=1
+ )
+ self.resize_output_1_16 = nn.Conv2d(
+ self.feature_1_16, self.out_feature_1_16, kernel_size=1
+ )
+
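+            # skip_input adds the channel count of the encoder feature map concatenated
+            # in UpSampleBN (224, 80, 48, 32, and 3 channels for the raw RGB image)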
+ self.up16 = UpSampleBN(
+ skip_input=features + 224, output_features=self.feature_1_16
+ )
+ self.up8 = UpSampleBN(
+ skip_input=self.feature_1_16 + 80, output_features=self.feature_1_8
+ )
+ self.up4 = UpSampleBN(
+ skip_input=self.feature_1_8 + 48, output_features=self.feature_1_4
+ )
+ self.up2 = UpSampleBN(
+ skip_input=self.feature_1_4 + 32, output_features=self.feature_1_2
+ )
+ self.up1 = UpSampleBN(
+ skip_input=self.feature_1_2 + 3, output_features=self.feature_1_1
+ )
+ else:
+ self.resize_output_1_1 = nn.Conv2d(3, out_feature, kernel_size=1)
+ self.resize_output_1_2 = nn.Conv2d(32, out_feature * 2, kernel_size=1)
+ self.resize_output_1_4 = nn.Conv2d(48, out_feature * 4, kernel_size=1)
+
+ def forward(self, features):
+ x_block0, x_block1, x_block2, x_block3, x_block4 = (
+ features[4],
+ features[5],
+ features[6],
+ features[8],
+ features[11],
+ )
+ bs = x_block0.shape[0]
+ x_d0 = self.conv2(x_block4)
+
+ if self.use_decoder:
+ x_1_16 = self.up16(x_d0, x_block3)
+ x_1_8 = self.up8(x_1_16, x_block2)
+ x_1_4 = self.up4(x_1_8, x_block1)
+ x_1_2 = self.up2(x_1_4, x_block0)
+ x_1_1 = self.up1(x_1_2, features[0])
+ return {
+ "1_1": self.resize_output_1_1(x_1_1),
+ "1_2": self.resize_output_1_2(x_1_2),
+ "1_4": self.resize_output_1_4(x_1_4),
+ "1_8": self.resize_output_1_8(x_1_8),
+ "1_16": self.resize_output_1_16(x_1_16),
+ }
+ else:
+ x_1_1 = features[0]
+ x_1_2, x_1_4, x_1_8, x_1_16 = (
+ features[4],
+ features[5],
+ features[6],
+ features[8],
+ )
+ x_global = features[-1].reshape(bs, 2560, -1).mean(2)
+ return {
+ "1_1": self.resize_output_1_1(x_1_1),
+ "1_2": self.resize_output_1_2(x_1_2),
+ "1_4": self.resize_output_1_4(x_1_4),
+ "global": x_global,
+ }
+
+
+class Encoder(nn.Module):
+ def __init__(self, backend):
+ super(Encoder, self).__init__()
+ self.original_model = backend
+
+ def forward(self, x):
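+    # Run the backbone module by module and keep every intermediate feature map,
+    # including the output of each individual EfficientNet block.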
+ features = [x]
+ for k, v in self.original_model._modules.items():
+ if k == "blocks":
+ for ki, vi in v._modules.items():
+ features.append(vi(features[-1]))
+ else:
+ features.append(v(features[-1]))
+ return features
+
+
+class UNet2D(nn.Module):
+ def __init__(self, backend, num_features, out_feature, use_decoder=True):
+ super(UNet2D, self).__init__()
+ self.use_decoder = use_decoder
+ self.encoder = Encoder(backend)
+ self.decoder = DecoderBN(
+ out_feature=out_feature,
+ use_decoder=use_decoder,
+ bottleneck_features=num_features,
+ num_features=num_features,
+ )
+
+ def forward(self, x, **kwargs):
+ encoded_feats = self.encoder(x)
+ unet_out = self.decoder(encoded_feats, **kwargs)
+ return unet_out
+
+ def get_encoder_params(self): # lr/10 learning rate
+ return self.encoder.parameters()
+
+ def get_decoder_params(self): # lr learning rate
+ return self.decoder.parameters()
+
+ @classmethod
+ def build(cls, **kwargs):
+ basemodel_name = "tf_efficientnet_b7_ns"
+ num_features = 2560
+
+ print("Loading base model ()...".format(basemodel_name), end="")
+ basemodel = torch.hub.load(
+ "rwightman/gen-efficientnet-pytorch", basemodel_name, pretrained=True
+ )
+ print("Done.")
+
+ # Remove last layer
+ print("Removing last two layers (global_pool & classifier).")
+ basemodel.global_pool = nn.Identity()
+ basemodel.classifier = nn.Identity()
+
+ # Building Encoder-Decoder model
+ print("Building Encoder-Decoder model..", end="")
+ m = cls(basemodel, num_features=num_features, **kwargs)
+ print("Done.")
+ return m
+
+if __name__ == '__main__':
+ model = UNet2D.build(out_feature=256, use_decoder=True)
diff --git a/monoscene/unet3d_kitti.py b/monoscene/unet3d_kitti.py
new file mode 100644
index 0000000000000000000000000000000000000000..91d5339fbdf34e28d017d7e4e29ce4923169bef5
--- /dev/null
+++ b/monoscene/unet3d_kitti.py
@@ -0,0 +1,88 @@
+# encoding: utf-8
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from monoscene.modules import SegmentationHead
+from monoscene.CRP3D import CPMegaVoxels
+from monoscene.modules import Process, Upsample, Downsample
+
+
+class UNet3D(nn.Module):
+ def __init__(
+ self,
+ class_num,
+ norm_layer,
+ full_scene_size,
+ feature,
+ project_scale,
+ context_prior=None,
+ bn_momentum=0.1,
+ ):
+ super(UNet3D, self).__init__()
+ self.business_layer = []
+ self.project_scale = project_scale
+ self.full_scene_size = full_scene_size
+ self.feature = feature
+
+ size_l1 = (
+ int(self.full_scene_size[0] / project_scale),
+ int(self.full_scene_size[1] / project_scale),
+ int(self.full_scene_size[2] / project_scale),
+ )
+ size_l2 = (size_l1[0] // 2, size_l1[1] // 2, size_l1[2] // 2)
+ size_l3 = (size_l2[0] // 2, size_l2[1] // 2, size_l2[2] // 2)
+
+ dilations = [1, 2, 3]
+ self.process_l1 = nn.Sequential(
+ Process(self.feature, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+ Downsample(self.feature, norm_layer, bn_momentum),
+ )
+ self.process_l2 = nn.Sequential(
+ Process(self.feature * 2, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+ Downsample(self.feature * 2, norm_layer, bn_momentum),
+ )
+
+ self.up_13_l2 = Upsample(
+ self.feature * 4, self.feature * 2, norm_layer, bn_momentum
+ )
+ self.up_12_l1 = Upsample(
+ self.feature * 2, self.feature, norm_layer, bn_momentum
+ )
+ self.up_l1_lfull = Upsample(
+ self.feature, self.feature // 2, norm_layer, bn_momentum
+ )
+
+ self.ssc_head = SegmentationHead(
+ self.feature // 2, self.feature // 2, class_num, dilations
+ )
+
+ self.context_prior = context_prior
+ if context_prior:
+ self.CP_mega_voxels = CPMegaVoxels(
+ self.feature * 4, size_l3, bn_momentum=bn_momentum
+ )
+
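+    # Encoder-decoder on the 3D feature volume: two downsampling stages, an optional
+    # context prior (CPMegaVoxels) at the coarsest level, then additive skip
+    # connections while upsampling back to the full scene resolution for the SSC head.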
+ def forward(self, input_dict):
+ res = {}
+
+ x3d_l1 = input_dict["x3d"]
+
+ x3d_l2 = self.process_l1(x3d_l1)
+
+ x3d_l3 = self.process_l2(x3d_l2)
+
+ if self.context_prior:
+ ret = self.CP_mega_voxels(x3d_l3)
+ x3d_l3 = ret["x"]
+ for k in ret.keys():
+ res[k] = ret[k]
+
+ x3d_up_l2 = self.up_13_l2(x3d_l3) + x3d_l2
+ x3d_up_l1 = self.up_12_l1(x3d_up_l2) + x3d_l1
+ x3d_up_lfull = self.up_l1_lfull(x3d_up_l1)
+
+ ssc_logit_full = self.ssc_head(x3d_up_lfull)
+
+ res["ssc_logit"] = ssc_logit_full
+
+ return res
diff --git a/monoscene/unet3d_nyu.py b/monoscene/unet3d_nyu.py
new file mode 100644
index 0000000000000000000000000000000000000000..e9e3b3718999248efa1b2925658465ba59801b13
--- /dev/null
+++ b/monoscene/unet3d_nyu.py
@@ -0,0 +1,90 @@
+# encoding: utf-8
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from monoscene.CRP3D import CPMegaVoxels
+from monoscene.modules import (
+ Process,
+ Upsample,
+ Downsample,
+ SegmentationHead,
+ ASPP,
+)
+
+
+class UNet3D(nn.Module):
+ def __init__(
+ self,
+ class_num,
+ norm_layer,
+ feature,
+ full_scene_size,
+ n_relations=4,
+ project_res=[],
+ context_prior=True,
+ bn_momentum=0.1,
+ ):
+ super(UNet3D, self).__init__()
+ self.business_layer = []
+ self.project_res = project_res
+
+ self.feature_1_4 = feature
+ self.feature_1_8 = feature * 2
+ self.feature_1_16 = feature * 4
+
+ self.feature_1_16_dec = self.feature_1_16
+ self.feature_1_8_dec = self.feature_1_8
+ self.feature_1_4_dec = self.feature_1_4
+
+ self.process_1_4 = nn.Sequential(
+ Process(self.feature_1_4, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+ Downsample(self.feature_1_4, norm_layer, bn_momentum),
+ )
+ self.process_1_8 = nn.Sequential(
+ Process(self.feature_1_8, norm_layer, bn_momentum, dilations=[1, 2, 3]),
+ Downsample(self.feature_1_8, norm_layer, bn_momentum),
+ )
+ self.up_1_16_1_8 = Upsample(
+ self.feature_1_16_dec, self.feature_1_8_dec, norm_layer, bn_momentum
+ )
+ self.up_1_8_1_4 = Upsample(
+ self.feature_1_8_dec, self.feature_1_4_dec, norm_layer, bn_momentum
+ )
+ self.ssc_head_1_4 = SegmentationHead(
+ self.feature_1_4_dec, self.feature_1_4_dec, class_num, [1, 2, 3]
+ )
+
+ self.context_prior = context_prior
+ size_1_16 = tuple(np.ceil(i / 4).astype(int) for i in full_scene_size)
+
+ if context_prior:
+ self.CP_mega_voxels = CPMegaVoxels(
+ self.feature_1_16,
+ size_1_16,
+ n_relations=n_relations,
+ bn_momentum=bn_momentum,
+ )
+
+    # NYU variant: projected features are processed at the 1_4, 1_8 and 1_16 scales
+    # and the SSC head predicts the logits at the 1_4 level.
+ def forward(self, input_dict):
+ res = {}
+
+ x3d_1_4 = input_dict["x3d"]
+ x3d_1_8 = self.process_1_4(x3d_1_4)
+ x3d_1_16 = self.process_1_8(x3d_1_8)
+
+ if self.context_prior:
+ ret = self.CP_mega_voxels(x3d_1_16)
+ x3d_1_16 = ret["x"]
+ for k in ret.keys():
+ res[k] = ret[k]
+
+ x3d_up_1_8 = self.up_1_16_1_8(x3d_1_16) + x3d_1_8
+ x3d_up_1_4 = self.up_1_8_1_4(x3d_up_1_8) + x3d_1_4
+
+ ssc_logit_1_4 = self.ssc_head_1_4(x3d_up_1_4)
+
+ res["ssc_logit"] = ssc_logit_1_4
+
+ return res
diff --git a/monoscene_kitti.ckpt b/monoscene_kitti.ckpt
new file mode 100644
index 0000000000000000000000000000000000000000..cd302ef17432a3b5c8b58ab5a63bb52e2c166976
--- /dev/null
+++ b/monoscene_kitti.ckpt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6f0d1324885166f17949bf2dcfc0ee1eb2d2aedd0f48e75b56bb2beb87c1ce3a
+size 1796467007
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..81946126e893d56bb066160e9465ef42bc43d9d9
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,16 @@
+scikit-image==0.18.1
+PyYAML==5.3.1
+tqdm==4.49.0
+scikit-learn==0.24.0
+pytorch-lightning==1.4.9
+opencv-python==4.5.1.48
+hydra-core==1.0.5
+numpy==1.20.3
+numba==0.53
+imageio
+protobuf~=3.19.0
+transformers
+plotly
+torch
+torchvision
+torchmetrics==0.6.0
\ No newline at end of file