diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..49510808e7b6d8e7902c9e34bf50700fa8eb243d --- /dev/null +++ b/.gitignore @@ -0,0 +1,28 @@ +# folders or files +detectron2/ +pytorch3d/ +datasets/* +testing/image_2 +training/image_2 +# .vscode/ +.ipynb_checkpoints/ +.idea/ +output/ +cubercnn/external/ +wandb/ +hpc_logs/ +depth/checkpoints/ +ProposalNetwork/proposals/network_out.pkl +.vscode/settings.json +submit.sh +profiling/ + +# filetypes +*.pyc +*.mexa64 +*/output/* +*/output*/* +*~ +*.so +#*.ipynb +ProposalNetwork/proposals/figs/* \ No newline at end of file diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..fce8b9f4c93588ba438dfe215a88ecf43671abb6 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,9 @@ +[submodule "GroundingDINO"] + path = GroundingDINO + url = https://github.com/AndreasLH/GroundingDINO +[submodule "sam-hq"] + path = sam-hq + url = https://github.com/SysCV/sam-hq.git +[submodule "Depth-Anything-V2"] + path = Depth-Anything-V2 + url = https://github.com/DepthAnything/Depth-Anything-V2 diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000000000000000000000000000000000000..3183a617972c54483ddceafd2323832dc6edd460 --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,100 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. 
+ // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + + { + "name": "Python: Current File", + "type": "python" , + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "justMyCode": true, + "args": [] + }, + { + "name": "Cube R-CNN Demo", + "type": "python", + "request": "launch", + "program": "demo/demo.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": ["--config-file", "cubercnn://omni3d/cubercnn_DLA34_FPN.yaml", "--input-folder", "datasets/title", "--threshold", "0.25", "MODEL.WEIGHTS", "cubercnn://omni3d/cubercnn_DLA34_FPN.pth", "OUTPUT_DIR", "output/demo"] + }, + { + "name": "Cube R-CNN 2D only", + "type": "python", + "request": "launch", + "program": "tools/train_net.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": ["--config-file", "configs/Base_Omni3D_2D_only.yaml", "MODEL.WEIGHTS", "output/omni3d-2d-only/model_recent.pth", "OUTPUT_DIR", "output/omni3d-2d-only", "log", "False"] + }, + { + "name": "Cube R-CNN Time equalised Demo", + "type": "python", + "request": "launch", + "program": "demo/demo.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": ["--config-file", "configs/Base_Omni3D.yaml", "--input-folder", "datasets/coco_examples", "--threshold", "0.25", "MODEL.WEIGHTS", "output/omni_equalised/model_final.pth", "OUTPUT_DIR", "output/demo_time_equal"] + }, + { + "name": "Cube R-CNN pseudo gt demo", + "type": "python", + "request": "launch", + "program": "demo/demo.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": ["--config-file", "configs/Base_Omni3D.yaml", "--input-folder", "datasets/title", "--threshold", "0.25", "MODEL.WEIGHTS", "output/omni_pseudo_gt/model_final.pth", "OUTPUT_DIR", "output/demo_pseudogt"] + }, + { + "name": "train", + "type": "python", + "request": "launch", + "program": "tools/train_net.py", + "console": "integratedTerminal", + "justMyCode": true, 
+ "args": ["--config-file", "configs/Base_Omni3D.yaml", "OUTPUT_DIR", "output/omni3d_example_run"] + }, + { + "name": "resume train", + "type": "python", + "request": "launch", + "program": "tools/train_net.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": ["--config-file", "configs/Base_Omni3D.yaml", "--resume", "OUTPUT_DIR", "output/Baseline_sgd"] + }, + { + "name": "eval, train_net pretrained", + "type": "python", + "request": "launch", + "program": "tools/train_net.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": ["--eval-only", "--config-file", "cubercnn://omni3d/cubercnn_DLA34_FPN.yaml", "MODEL.WEIGHTS", "cubercnn://omni3d/cubercnn_DLA34_FPN.pth"] + }, + { + "name": "eval, train_net locally trained", + "type": "python", + "request": "launch", + "program": "tools/train_net.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": ["--eval-only", "--config-file", "configs/Base_Omni3D.yaml", "MODEL.WEIGHTS", "output/Baseline_sgd/model_final.pth"] + }, + { + "name": "train Cube R-CNN weak loss", + "type": "python", + "request": "launch", + "program": "tools/train_net.py", + "console": "integratedTerminal", + "justMyCode": true, + "args": ["--config-file", "configs/Omni_combined.yaml", "OUTPUT_DIR", "output/omni3d_combined_test", "log", "False", "loss_functions", "['iou', 'z_pseudo_gt_center', 'pose_alignment', 'pose_ground']"] + }, + + ] +} \ No newline at end of file diff --git a/DATA.md b/DATA.md new file mode 100644 index 0000000000000000000000000000000000000000..1112baee30b1061fb592a8aa8dfb3b3bd4f35227 --- /dev/null +++ b/DATA.md @@ -0,0 +1,219 @@ +- [Data Preparation](#data-preparation) + - [Download Omni3D json](#download-omni3d-json) + - [Download Individual Datasets](#download-individual-datasets) +- [Data Usage](#data-usage) + - [Coordinate System](#coordinate-system) + - [Annotation Format](#annotation-format) + - [Example Loading Data](#example-loading-data) + +# Data Preparation + +The Omni3D 
dataset is comprised of 6 datasets which have been pre-processed into the same annotation format and camera coordinate systems. To use a subset or the full dataset you must download: + +1. The processed Omni3D json files +2. RGB images from each dataset separately + +## Download Omni3D json + +Run + +``` +sh datasets/Omni3D/download_omni3d_json.sh +``` + +to download and extract the Omni3D train, val and test json annotation files. + +## Download Individual Datasets + +Below are the instructions for setting up each individual dataset. It is recommended to download only the data you plan to use. + +### KITTI +Download the left color images from [KITTI's official website](http://www.cvlibs.net/datasets/kitti/eval_object.php?obj_benchmark=3d). Unzip or softlink the images into the root `./Omni3D/` which should have the folder structure as detailed below. Note that we only require the image_2 folder. + +```bash +datasets/KITTI_object +└── training + ├── image_2 +``` + + +### nuScenes + +Download the trainval images from the [official nuScenes website](https://www.nuscenes.org/nuscenes#download). Unzip or softlink the images into the root `./Omni3D/` which should have the folder structure as detailed below. Note that we only require the CAM_FRONT folder. + +```bash +datasets/nuScenes/samples +└── samples + ├── CAM_FRONT +``` + +### Objectron + +Run + +``` +sh datasets/objectron/download_objectron_images.sh +``` + +to download and extract the Objectron pre-processed images (~24 GB). + +### SUN RGB-D + +Download the "SUNRGBD V1" images at [SUN RGB-D's official website](https://rgbd.cs.princeton.edu/). Unzip or softlink the images into the root `./Omni3D/` which should have the folder structure as detailed below. + +```bash +./Omni3D/datasets/SUNRGBD +├── kv1 +├── kv2 +├── realsense +``` + +### ARKitScenes + +Run + +``` +sh datasets/ARKitScenes/download_arkitscenes_images.sh +``` + +to download and extract the ARKitScenes pre-processed images (~28 GB). 
+ +### Hypersim + +Follow the [download instructions](https://github.com/apple/ml-hypersim/tree/main/contrib/99991) from [Thomas Germer](https://github.com/99991) in order to download all \*tonemap.jpg preview images in order to avoid downloading the full Hypersim dataset. For example: + +```bash +git clone https://github.com/apple/ml-hypersim +cd ml-hypersim/ +python contrib/99991/download.py -c .tonemap.jpg -d /path/to/Omni3D/datasets/hypersim --silent +``` + +Then arrange or unzip the downloaded images into the root `./Omni3D/` so that it has the below folder structure. + +```bash +datasets/hypersim/ +├── ai_001_001 +├── ai_001_002 +├── ai_001_003 +├── ai_001_004 +├── ai_001_005 +├── ai_001_006 +... +``` + +# Data Usage + +Below we describe the unified 3D annotation coordinate systems, annotation format, and an example script. + + +## Coordinate System + +All 3D annotations are provided in a shared camera coordinate system with ++x right, +y down, +z toward screen. + +The vertex order of bbox3D_cam: +``` + v4_____________________v5 + /| /| + / | / | + / | / | + /___|_________________/ | + v0| | |v1 | + | | | | + | | | | + | | | | + | |_________________|___| + | / v7 | /v6 + | / | / + | / | / + |/_____________________|/ + v3 v2 +``` + +## Annotation Format +Each dataset is formatted as a dict in python in the below format. 
+ +```python +dataset { + "info" : info, + "images" : [image], + "categories" : [category], + "annotations" : [object], +} + +info { + "id" : str, + "source" : int, + "name" : str, + "split" : str, + "version" : str, + "url" : str, +} + +image { + "id" : int, + "dataset_id" : int, + "width" : int, + "height" : int, + "file_path" : str, + "K" : list (3x3), + "src_90_rotate" : int, # im was rotated X times, 90 deg counterclockwise + "src_flagged" : bool, # flagged as potentially inconsistent sky direction +} + +category { + "id" : int, + "name" : str, + "supercategory" : str +} + +object { + + "id" : int, # unique annotation identifier + "image_id" : int, # identifier for image + "category_id" : int, # identifier for the category + "category_name" : str, # plain name for the category + + # General 2D/3D Box Parameters. + # Values are set to -1 when unavailable. + "valid3D" : bool, # flag for no reliable 3D box + "bbox2D_tight" : [x1, y1, x2, y2], # 2D corners of annotated tight box + "bbox2D_proj" : [x1, y1, x2, y2], # 2D corners projected from bbox3D + "bbox2D_trunc" : [x1, y1, x2, y2], # 2D corners projected from bbox3D then truncated + "bbox3D_cam" : [[x1, y1, z1]...[x8, y8, z8]] # 3D corners in meters and camera coordinates + "center_cam" : [x, y, z], # 3D center in meters and camera coordinates + "dimensions" : [width, height, length], # 3D attributes for object dimensions in meters + "R_cam" : list (3x3), # 3D rotation matrix to the camera frame rotation + + # Optional dataset specific properties, + # used mainly for evaluation and ignore. + # Values are set to -1 when unavailable. 
+ "behind_camera" : bool, # a corner is behind camera + "visibility" : float, # annotated visibility 0 to 1 + "truncation" : float, # computed truncation 0 to 1 + "segmentation_pts" : int, # visible instance segmentation points + "lidar_pts" : int, # visible LiDAR points in the object + "depth_error" : float, # L1 of depth map and rendered object +} +``` + + +## Example Loading Data +Each dataset is named as "Omni3D_{name}_{split}.json" where split can be train, val, or test. + +The annotations are in a COCO-like format such that if you load the json from the Omni3D class which inherits the [COCO class](https://github.com/cocodataset/cocoapi/blob/master/PythonAPI/pycocotools/coco.py#L70), you can use basic COCO dataset functions as demonstrated with the below code. + +```python +from cubercnn import data + +dataset_paths_to_json = ['path/to/Omni3D/{name}_{split}.json', ...] + +# Example 1. load all images +dataset = data.Omni3D(dataset_paths_to_json) +imgIds = dataset.getImgIds() +imgs = dataset.loadImgs(imgIds) + +# Example 2. 
load annotations for image index 0 +annIds = dataset.getAnnIds(imgIds=imgs[0]['id']) +anns = dataset.loadAnns(annIds) +``` \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..2873f343fcf85f1be81fd44462771d1284111004 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,32 @@ +# Base image, have to use the full version to use the git features +FROM python:3.12 +# https://huggingface.co/docs/hub/spaces-sdks-docker-first-demo + +# RUN apt-get install -y git + +WORKDIR /code +COPY ./requirements.txt /code/requirements.txt +COPY ./pre-requirements.txt /code/pre-requirements.txt +COPY ./GroundingDINO /code/GroundingDINO +COPY ./sam-hq /code/sam-hq + +RUN pip install --no-cache-dir -r /code/pre-requirements.txt +RUN pip install --no-cache-dir -r /code/requirements.txt + +# Set up a new user named "user" with user ID 1000 +RUN useradd -m -u 1000 user + +# Switch to the "user" user +USER user + +# Set home to the user's home directory +ENV HOME=/home/user \ + PATH=/home/user/.local/bin:$PATH + +# Set the working directory to the user's home directory +WORKDIR $HOME/app + +# Copy the current directory contents into the container at $HOME/app setting the owner to the user +COPY --chown=user . 
$HOME/app + +CMD ["python", "app.py"] \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..37d6e9d3765fe4a4cfdbfb0e5a1ea21fec95a9ad --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,906 @@ +- [Omni3D and Cube R-CNN License](#omni3d-and-cube-r-cnn-license) +- [ARKitScenes License](#arkitscenes-license) +- [Objectron License](#objectron-license) + +# Omni3D and Cube R-CNN License +https://github.com/facebookresearch/omni3d +https://github.com/facebookresearch/omni3d/blob/main/LICENSE.md + +Attribution-NonCommercial 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. 
Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). 
To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. 
Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + j. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + l. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. 
+ +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. 
+ + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. 
a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. 
+ +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. 
automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. 
Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. + +# ARKitScenes License +https://github.com/apple/ARKitScenes/ +https://github.com/apple/ARKitScenes/blob/main/LICENSE + +Attribution-NonCommercial-ShareAlike 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. 
Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. 
Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International +Public License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial-ShareAlike 4.0 International Public License +("Public License"). To the extent this Public License may be +interpreted as a contract, You are granted the Licensed Rights in +consideration of Your acceptance of these terms and conditions, and the +Licensor grants You such rights in consideration of benefits the +Licensor receives from making the Licensed Material available under +these terms and conditions. + + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. 
Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. BY-NC-SA Compatible License means a license listed at + creativecommons.org/compatiblelicenses, approved by Creative + Commons as essentially the equivalent of this Public License. + + d. Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + + e. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + f. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + g. License Elements means the license attributes listed in the name + of a Creative Commons Public License. The License Elements of this + Public License are Attribution, NonCommercial, and ShareAlike. + + h. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + i. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + j. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + k. 
NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + l. Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + m. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + n. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. 
Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. Additional offer from the Licensor -- Adapted Material. + Every recipient of Adapted Material from You + automatically receives an offer from the Licensor to + exercise the Licensed Rights in the Adapted Material + under the conditions of the Adapter's License You apply. + + c. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. 
Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. 
You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + b. ShareAlike. + + In addition to the conditions in Section 3(a), if You Share + Adapted Material You produce, the following conditions also apply. + + 1. The Adapter's License You apply must be a Creative Commons + license with the same License Elements, this version or + later, or a BY-NC-SA Compatible License. + + 2. You must include the text of, or the URI or hyperlink to, the + Adapter's License You apply. You may satisfy this condition + in any reasonable manner based on the medium, means, and + context in which You Share Adapted Material. + + 3. You may not offer or impose any additional or different terms + or conditions on, or apply any Effective Technological + Measures to, Adapted Material that restrict exercise of the + rights granted under the Adapter's License You apply. + + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material, + including for purposes of Section 3(b); and + + c. 
You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. 
However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + + +Section 7 -- Other Terms and Conditions. + + a. The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. 
No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. + + + +# Objectron License +https://github.com/google-research-datasets/Objectron +https://github.com/google-research-datasets/Objectron/blob/main/LICENSE + + +# Computational Use of Data Agreement v1.0 + +This is the Computational Use of Data Agreement, Version 1.0 (the “C-UDA”). Capitalized terms are defined in Section 5. Data Provider and you agree as follows: + +1. **Provision of the Data** + + 1.1. 
You may use, modify, and distribute the Data made available to you by the Data Provider under this C-UDA for Computational Use if you follow the C-UDA's terms. + + 1.2. Data Provider will not sue you or any Downstream Recipient for any claim arising out of the use, modification, or distribution of the Data provided you meet the terms of the C-UDA. + + 1.3 This C-UDA does not restrict your use, modification, or distribution of any portions of the Data that are in the public domain or that may be used, modified, or distributed under any other legal exception or limitation. + +2. **Restrictions** + + 2.1 You agree that you will use the Data solely for Computational Use. + + 2.2 The C-UDA does not impose any restriction with respect to the use, modification, or distribution of Results. + +3. **Redistribution of Data** + + 3.1. You may redistribute the Data, so long as: + + 3.1.1. You include with any Data you redistribute all credit or attribution information that you received with the Data, and your terms require any Downstream Recipient to do the same; and + + 3.1.2. You bind each recipient to whom you redistribute the Data to the terms of the C-UDA. + +4. **No Warranty, Limitation of Liability** + + 4.1. Data Provider does not represent or warrant that it has any rights whatsoever in the Data. + + 4.2. THE DATA IS PROVIDED ON AN “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + + 4.3. 
NEITHER DATA PROVIDER NOR ANY UPSTREAM DATA PROVIDER SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE DATA OR RESULTS, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. + +5. **Definitions** + + 5.1. “Computational Use” means activities necessary to enable the use of Data (alone or along with other material) for analysis by a computer. + + 5.2.“Data” means the material you receive under the C-UDA in modified or unmodified form, but not including Results. + + 5.3. “Data Provider” means the source from which you receive the Data and with whom you enter into the C-UDA. + + 5.4. “Downstream Recipient” means any person or persons who receives the Data directly or indirectly from you in accordance with the C-UDA. + + 5.5. “Result” means anything that you develop or improve from your use of Data that does not include more than a de minimis portion of the Data on which the use is based. Results may include de minimis portions of the Data necessary to report on or explain use that has been conducted with the Data, such as figures in scientific papers, but do not include more. Artificial intelligence models trained on Data (and which do not include more than a de minimis portion of Data) are Results. + + 5.6. “Upstream Data Providers” means the source or sources from which the Data Provider directly or indirectly received, under the terms of the C-UDA, material that is included in the Data. 
+ + diff --git a/MODEL_ZOO.md b/MODEL_ZOO.md new file mode 100644 index 0000000000000000000000000000000000000000..1f726bab7c3a0d4ee9574acac91ff99e2d658790 --- /dev/null +++ b/MODEL_ZOO.md @@ -0,0 +1,17 @@ +# Cube R-CNN Model Zoo on Omni3D + +## Models + +We provide a model zoo for models trained on Omni3D data splits (see paper for more details). + +| | Omni3D | Omni3D (Indoor only) | Omni3D (Outdoor only) | +|---------|:-------------------------:|:----------------------------:|:----------------------------:| +| `res34` | [omni3d/cubercnn_Res34_FPN.pth][res34_omni] | [indoor/cubercnn_Res34_FPN.pth][res34_in] | [outdoor/cubercnn_Res34_FPN.pth][res34_out] | +| `dla34` | [omni3d/cubercnn_DLA34_FPN.pth][dla34_omni] | [indoor/cubercnn_DLA34_FPN.pth][dla34_in] | [outdoor/cubercnn_DLA34_FPN.pth][dla34_out] | + +[dla34_omni]: https://dl.fbaipublicfiles.com/cubercnn/omni3d/cubercnn_DLA34_FPN.pth +[dla34_in]: https://dl.fbaipublicfiles.com/cubercnn/indoor/cubercnn_DLA34_FPN.pth +[dla34_out]: https://dl.fbaipublicfiles.com/cubercnn/outdoor/cubercnn_DLA34_FPN.pth +[res34_omni]: https://dl.fbaipublicfiles.com/cubercnn/omni3d/cubercnn_Res34_FPN.pth +[res34_in]: https://dl.fbaipublicfiles.com/cubercnn/indoor/cubercnn_Res34_FPN.pth +[res34_out]: https://dl.fbaipublicfiles.com/cubercnn/outdoor/cubercnn_Res34_FPN.pth \ No newline at end of file diff --git a/ProposalNetwork/utils/__init__.py b/ProposalNetwork/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9fd8f57ba4c4eadbded7f3acdb8b53f289dc838b --- /dev/null +++ b/ProposalNetwork/utils/__init__.py @@ -0,0 +1,3 @@ +from .spaces import * +from .conversions import * +from .utils import * \ No newline at end of file diff --git a/ProposalNetwork/utils/conversions.py b/ProposalNetwork/utils/conversions.py new file mode 100644 index 0000000000000000000000000000000000000000..bceca9a2dfb128f91bcc45f91edbbaf9a5a0872a --- /dev/null +++ b/ProposalNetwork/utils/conversions.py @@ -0,0 +1,50 @@ +import torch 
+import numpy as np +from detectron2.structures import Boxes + +def cube_to_box(cube,K): + ''' + Converts a Cube to a Box. + + Args: + cube: A Cube. + K: The 3D camera matrix of the box. + + Returns: + A Box. + ''' + bube_corners = cube.get_bube_corners(K) + + min_x = torch.min(bube_corners[:,0]) + max_x = torch.max(bube_corners[:,0]) + min_y = torch.min(bube_corners[:,1]) + max_y = torch.max(bube_corners[:,1]) + + return Boxes(torch.tensor([[min_x, min_y, max_x, max_y]], device=cube.tensor.device)) + +def cubes_to_box(cubes, K, im_shape): + ''' + Converts a Cubes to a Boxes. + + Args: + cubes: A Cubes. + K: The 3D camera matrix of the box. + im_shape: The shape of the image (width, height). + + Returns: + A Box. + ''' + bube_corners = cubes.get_bube_corners(K, im_shape) + min_x, _ = torch.min(bube_corners[:, :, :, 0], 2) + max_x, _ = torch.max(bube_corners[:, :, :, 0], 2) + min_y, _ = torch.min(bube_corners[:, :, :, 1], 2) + max_y, _ = torch.max(bube_corners[:, :, :, 1], 2) + + values = torch.stack((min_x, min_y, max_x, max_y),dim=2) + box_list = [] + for i in range(cubes.num_instances): + box_list.append(Boxes(values[i])) + + return box_list + + \ No newline at end of file diff --git a/ProposalNetwork/utils/plane.py b/ProposalNetwork/utils/plane.py new file mode 100644 index 0000000000000000000000000000000000000000..2cc8cf8749596f748a6edff1d86e589e02a2f9ad --- /dev/null +++ b/ProposalNetwork/utils/plane.py @@ -0,0 +1,209 @@ +import random +import torch +import numpy as np + +class Plane_torch: + """ + Implementation of planar RANSAC. + + Class for Plane object, which finds the equation of a infinite plane using RANSAC algorithim. + + Call `fit(.)` to randomly take 3 points of pointcloud to verify inliers based on a threshold. 
+ + ![Plane](https://raw.githubusercontent.com/leomariga/pyRANSAC-3D/master/doc/plano.gif "Plane") + + --- + """ + + def __init__(self): + self.inliers = [] + self.equation = [] + + def fit(self, pts, thresh=0.05, minPoints=100, maxIteration=1000): + """ + Find the best equation for a plane. + + :param pts: 3D point cloud as a `torch.Tensor (N,3)`. + :param thresh: Threshold distance from the plane which is considered inlier. + :param maxIteration: Number of maximum iteration which RANSAC will loop over. + :returns: + - `self.equation`: Parameters of the plane using Ax+By+Cy+D `torch.Tensor(4)` + - `self.inliers`: points from the dataset considered inliers + + --- + """ + n_points = pts.shape[0] + best_eq = [] + best_inliers = [] + + for it in range(maxIteration): + + # Samples 3 random points + id_samples = torch.randperm(n_points)[:3] + pt_samples = pts[id_samples] + + # We have to find the plane equation described by those 3 points + # We find first 2 vectors that are part of this plane + # A = pt2 - pt1 + # B = pt3 - pt1 + + vecA = pt_samples[1, :] - pt_samples[0, :] + vecB = pt_samples[2, :] - pt_samples[0, :] + + # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane + vecC = torch.cross(vecA, vecB) + + # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[0]*z = -k + # We have to use a point to find k + vecC = vecC / torch.norm(vecC, p=2) + k = -torch.sum(torch.mul(vecC, pt_samples[1, :])) + plane_eq = torch.tensor([vecC[0], vecC[1], vecC[2], k]) + + # Distance from a point to a plane + # https://mathworld.wolfram.com/Point-PlaneDistance.html + pt_id_inliers = [] # list of inliers ids + dist_pt = ( + plane_eq[0] * pts[:, 0] + plane_eq[1] * pts[:, 1] + plane_eq[2] * pts[:, 2] + plane_eq[3] + ) / torch.sqrt(plane_eq[0] ** 2 + plane_eq[1] ** 2 + plane_eq[2] ** 2) + + # Select indexes where distance is smaller than the threshold + pt_id_inliers = torch.where(torch.abs(dist_pt) <= thresh)[0] + if len(pt_id_inliers) > 
len(best_inliers): + best_eq = plane_eq + best_inliers = pt_id_inliers + self.inliers = best_inliers + self.equation = best_eq + + return -self.equation, self.inliers + + def fit_parallel(self, pts:torch.Tensor, thresh=0.05, minPoints=100, maxIteration=1000): + """ + Find the best equation for a plane. + + :param pts: 3D point cloud as a `torch.Tensor (N,3)`. + :param thresh: Threshold distance from the plane which is considered inlier. + :param maxIteration: Number of maximum iteration which RANSAC will loop over. + :returns: + - `self.equation`: Parameters of the plane using Ax+By+Cy+D `torch.Tensor(4)` + - `self.inliers`: points from the dataset considered inliers + + --- + """ + n_points = pts.shape[0] + + # Samples shape (maxIteration, 3) random points + id_samples = torch.tensor([random.sample(range(0, n_points), 3) for _ in range(maxIteration)],device=pts.device) + pt_samples = pts[id_samples] + + # We have to find the plane equation described by those 3 points + # We find first 2 vectors that are part of this plane + # A = pt2 - pt1 + # B = pt3 - pt1 + + vecA = pt_samples[:, 1, :] - pt_samples[:, 0, :] + vecB = pt_samples[:, 2, :] - pt_samples[:, 0, :] + + # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane + vecC = torch.cross(vecA, vecB, dim=-1) + + # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[0]*z = -k + # We have to use a point to find k + vecC = vecC / torch.norm(vecC, p=2, dim=1, keepdim=True) + k = -torch.sum(torch.mul(vecC, pt_samples[:, 1, :]), dim=1) + plane_eqs = torch.column_stack([vecC[:, 0], vecC[:, 1], vecC[:, 2], k]) + + # Distance from a point to a plane + # https://mathworld.wolfram.com/Point-PlaneDistance.html + dist_pt = ( + plane_eqs[:,0].unsqueeze(1) * pts[:, 0] + plane_eqs[:,1].unsqueeze(1) * pts[:, 1] + plane_eqs[:,2].unsqueeze(1) * pts[:, 2] + plane_eqs[:,3].unsqueeze(1) + ) / torch.sqrt(plane_eqs[:,0] ** 2 + plane_eqs[:,1] ** 2 + plane_eqs[:,2] ** 2).unsqueeze(1) + + # Select 
indexes where distance is smaller than the threshold + # maxIteration x n_points + # row with most inliers + + pt_id_inliers = torch.abs(dist_pt) <= thresh + counts = torch.sum(pt_id_inliers, dim=1) + + best_eq = plane_eqs[torch.argmax(counts)] + best_inliers_id = pt_id_inliers[torch.argmax(counts)] + # convert boolean tensor to indices + best_inliers = torch.where(best_inliers_id)[0] + self.inliers = best_inliers + self.equation = best_eq + return -self.equation, self.inliers + + +class Plane_np: + """ + Implementation of planar RANSAC. + + Class for Plane object, which finds the equation of a infinite plane using RANSAC algorithim. + + Call `fit(.)` to randomly take 3 points of pointcloud to verify inliers based on a threshold. + + ![Plane](https://raw.githubusercontent.com/leomariga/pyRANSAC-3D/master/doc/plano.gif "Plane") + + --- + """ + + def __init__(self): + self.inliers = [] + self.equation = [] + + def fit(self, pts, thresh=0.05, minPoints=100, maxIteration=1000): + """ + Find the best equation for a plane. + + :param pts: 3D point cloud as a `np.array (N,3)`. + :param thresh: Threshold distance from the plane which is considered inlier. + :param maxIteration: Number of maximum iteration which RANSAC will loop over. 
+ :returns: + - `self.equation`: Parameters of the plane using Ax+By+Cy+D `np.array (1, 4)` + - `self.inliers`: points from the dataset considered inliers + + --- + """ + n_points = pts.shape[0] + best_eq = [] + best_inliers = [] + + for it in range(maxIteration): + + # Samples 3 random points + id_samples = random.sample(range(0, n_points), 3) + pt_samples = pts[id_samples] + + # We have to find the plane equation described by those 3 points + # We find first 2 vectors that are part of this plane + # A = pt2 - pt1 + # B = pt3 - pt1 + + vecA = pt_samples[1, :] - pt_samples[0, :] + vecB = pt_samples[2, :] - pt_samples[0, :] + + # Now we compute the cross product of vecA and vecB to get vecC which is normal to the plane + vecC = np.cross(vecA, vecB) + + # The plane equation will be vecC[0]*x + vecC[1]*y + vecC[0]*z = -k + # We have to use a point to find k + vecC = vecC / np.linalg.norm(vecC) + k = -np.sum(np.multiply(vecC, pt_samples[1, :])) + plane_eq = [vecC[0], vecC[1], vecC[2], k] + + # Distance from a point to a plane + # https://mathworld.wolfram.com/Point-PlaneDistance.html + pt_id_inliers = [] # list of inliers ids + dist_pt = ( + plane_eq[0] * pts[:, 0] + plane_eq[1] * pts[:, 1] + plane_eq[2] * pts[:, 2] + plane_eq[3] + ) / np.sqrt(plane_eq[0] ** 2 + plane_eq[1] ** 2 + plane_eq[2] ** 2) + + # Select indexes where distance is biggers than the threshold + pt_id_inliers = np.where(np.abs(dist_pt) <= thresh)[0] + if len(pt_id_inliers) > len(best_inliers): + best_eq = plane_eq + best_inliers = pt_id_inliers + self.inliers = best_inliers + self.equation = best_eq + + return self.equation, self.inliers diff --git a/ProposalNetwork/utils/spaces.py b/ProposalNetwork/utils/spaces.py new file mode 100644 index 0000000000000000000000000000000000000000..264cea8b76fa8c82457d087d6e4c47f7a0c9d7d1 --- /dev/null +++ b/ProposalNetwork/utils/spaces.py @@ -0,0 +1,328 @@ +import numpy as np +import torch +from cubercnn import util + +''' +coordinate system is assumed to have 
origin in the upper left +(0,0) _________________(N,0) +| +| +| +| +| +(0,M) +''' +""" +class Cube: + ''' + 3D box in the format [c1, c2, c3, w, h, l, R] + + Args: + c1: The x coordinate of the center of the box. + c2: The y coordinate of the center of the box. + c3: The z coordinate of the center of the box. + w: The width of the box in meters. + h: The height of the box in meters. + l: The length of the box in meters. + R: The 3D rotation matrix of the box. + ``` + + _____________________ + /| /| + / | / | + / | / | + /___|_________________/ | + | | | | h + | | | | + | | | | + | | (c1,c2,c3) | | + | |_________________|___| + | / | / + | / | / + | / | / l + |/_____________________|/ + w + ``` + ''' + def __init__(self,tensor: torch.Tensor, R: torch.Tensor, score=None, label=None) -> None: + self.tensor = tensor + self.center = tensor[:3] + self.dimensions = tensor[3:6] + self.rotation = R + + # score and label are meant as auxiliary information + self.score = score + self.label = label + + def get_cube(self): + color = [c/255.0 for c in util.get_color()] + return util.mesh_cuboid(torch.cat((self.center,self.dimensions)), self.rotation, color=color) + + def get_all_corners(self): + '''wrap ``util.get_cuboid_verts_faces`` + + Returns: + verts: the 3D vertices of the cuboid in camera space''' + verts, _ = util.get_cuboid_verts_faces(torch.cat((self.center,self.dimensions)), self.rotation) + return verts + + def get_bube_corners(self,K) -> torch.Tensor: + cube_corners = self.get_all_corners() + cube_corners = torch.mm(K, cube_corners.t()).t() + return cube_corners[:,:2]/cube_corners[:,2].unsqueeze(1) + + def get_volume(self) -> float: + return self.dimensions.prod().item() + + + def __repr__(self) -> str: + return f'Cube({self.center}, {self.dimensions}, {self.rotation})' + + def to_device(self, device): + ''' + Move all tensors of the instantiated class to the specified device. + + Args: + device: The device to move the tensors to (e.g., 'cuda', 'cpu'). 
+ ''' + self.tensor = self.tensor.to(device) + self.center = self.center.to(device) + self.dimensions = self.dimensions.to(device) + self.rotation = self.rotation.to(device) + return self +""" + +class Cubes: + ''' + 3D boxes in the format [[c1, c2, c3, w, h, l, R1...R9]] + + inspired by `detectron2.structures.Boxes` + + Args: + tensor: torch.tensor( + c1: The x coordinates of the center of the boxes. + c2: The y coordinates of the center of the boxes. + c3: The z coordinates of the center of the boxes. + w: The width of the boxes in meters. + h: The height of the boxes in meters. + l: The length of the boxes in meters. + R: The flattened 3D rotation matrix of the boxes (i.e. the rows are next to each other). + ) + of shape (N, 15). + ``` + _____________________ + /| /| + / | / | + / | / | + /___|_________________/ | + | | | | h + | | | | + | | | | + | | (c1,c2,c3) | | + | |_________________|___| + | / | / + | / | / + | / | / l + |/_____________________|/ + w + ``` + ''' + def __init__(self,tensor: torch.Tensor, scores=None, labels=None) -> None: + + # score and label are meant as auxiliary information + if scores is not None: + assert scores.ndim == 2, f"scores.shape must be (n_instances, n_proposals), but was {scores.shape}" + self.scores = scores + self.labels = labels + + if not isinstance(tensor, torch.Tensor): + if not isinstance(tensor, np.ndarray): + tensor = np.asarray(tensor) + tensor = torch.as_tensor(tensor, dtype=torch.float32, device=torch.device("cpu")) + else: + tensor = tensor.to(torch.float32) + if tensor.numel() == 0: + tensor = tensor.reshape((-1, 15)).to(dtype=torch.float32) + self.tensor = tensor + if self.tensor.dim() == 1: + self.tensor = self.tensor.unsqueeze(0) + if self.tensor.dim() == 2: + self.tensor = self.tensor.unsqueeze(0) + + @property + def centers(self): + return self.tensor[:, :, :3] + + @property + def dimensions(self): + return self.tensor[:, :, 3:6] + + @property + def rotations(self): + shape = self.tensor.shape + return 
self.tensor[:, :, 6:].reshape(shape[0],shape[1], 3, 3) + + @property + def device(self): + return self.tensor.device + + @property + def num_instances(self): + return self.tensor.shape[0] + + @property + def shape(self): + return self.tensor.shape + + def clone(self) -> "Cubes": + """ + Clone the Cubes. + + Returns: + Cubes + """ + return Cubes(self.tensor.clone()) + + + def get_cubes(self): + color = [c/255.0 for c in util.get_color()] + return util.mesh_cuboid(torch.cat((self.centers.squeeze(0),self.dimensions.squeeze(0)),dim=1), self.rotations.squeeze(0), color=color) + + + def get_all_corners(self): + '''wrap ``util.get_cuboid_verts_faces`` + + Returns: + verts: the 3D vertices of the cuboid in camera space''' + + verts_list = [] + for i in range(self.num_instances): + verts_next_instance, _ = util.get_cuboid_verts_faces(self.tensor[i, :, :6], self.rotations[i]) + verts_list.append(verts_next_instance) + verts = torch.stack(verts_list, dim=0) + + return verts + + def get_cuboids_verts_faces(self): + '''wrap ``util.get_cuboid_verts_faces`` + + Returns: + verts: the 3D vertices of the cuboid in camera space + faces: the faces of the cuboid in camera space''' + + verts_list = [] + faces_list = [] + for i in range(self.num_instances): + verts_next_instance, faces = util.get_cuboid_verts_faces(self.tensor[i, :, :6], self.rotations[i]) + verts_list.append(verts_next_instance) + faces_list.append(faces) + verts = torch.stack(verts_list, dim=0) + faces = torch.stack(faces_list, dim=0) + + return verts, faces + + def get_bube_corners(self, K, clamp:tuple=None) -> torch.Tensor: + '''This assumes that all the cubes have the same camera intrinsic matrix K + + clamp is a typically the image shape (width, height) to truncate the boxes to image frame, this avoids huge projected boxes + Returns: + num_instances x N x 8 x 2''' + cube_corners = self.get_all_corners() # num_instances x N x 8 x 3 + num_prop = cube_corners.shape[1] + cube_corners = 
cube_corners.reshape(self.num_instances * num_prop, 8, 3) + K_repeated = K.repeat(self.num_instances * num_prop,1,1) + cube_corners = torch.matmul(K_repeated, cube_corners.transpose(2,1)) + cube_corners = cube_corners[:, :2, :]/cube_corners[:, 2, :].unsqueeze(-2) + cube_corners = cube_corners.transpose(2,1) + cube_corners = cube_corners.reshape(self.num_instances, num_prop, 8, 2) + + # we must clamp and then stack, otherwise the gradient is fucked + if clamp is not None: + x = torch.clamp(cube_corners[..., 0], int(-clamp[0]/2+1), int(clamp[0]-1+clamp[0])) + y = torch.clamp(cube_corners[..., 1], int(-clamp[1]/2+1), int(clamp[1]-1+clamp[1])) + cube_corners = torch.stack((x, y), dim=-1) + + return cube_corners # num_instances x num_proposals x 8 x 2 + + def get_volumes(self) -> float: + return self.get_dimensions().prod(1).item() + + def __len__(self) -> int: + return self.tensor.shape[0] + + def __repr__(self) -> str: + return f'Cubes({self.tensor})' + + def to(self, device: torch.device): + # Cubes are assumed float32 and does not support to(dtype) + if isinstance(self.scores, torch.Tensor): + self.scores = self.scores.to(device=device) + if isinstance(self.labels, torch.Tensor): + self.labels = self.labels.to(device=device) + return Cubes(self.tensor.to(device=device), self.scores, self.labels) + + def __getitem__(self, item) -> "Cubes": + """ + Args: + item: int, slice, or a BoolTensor + + Returns: + Cubes: Create a new :class:`Cubes` by indexing. + + The following usage are allowed: + + 1. `new_cubes = cubes[3]`: return a `Cubes` which contains only one box. + 2. `new_cubes = cubes[2:10]`: return a slice of cubes. + 3. `new_cubes = cubes[vector]`, where vector is a torch.BoolTensor + with `length = len(cubes)`. Nonzero elements in the vector will be selected. + + Note that the returned Cubes might share storage with this Cubes, + subject to Pytorch's indexing semantics. 
+ """ + if isinstance(item, int): + prev_n_prop = self.tensor.shape[1] + return Cubes(self.tensor[item].view(1, prev_n_prop, -1)) + elif isinstance(item, tuple): + return Cubes(self.tensor[item[0],item[1]].view(1, 1, -1)) + b = self.tensor[item] + assert b.dim() == 2, "Indexing on Cubes with {} failed to return a matrix!".format(item) + return Cubes(b) + + + @classmethod + def cat(cls, cubes_list: list["Cubes"]) -> "Cubes": + """ + Concatenates a list of Cubes into a single Cubes + + Arguments: + cubes_list (list[Cubes]) + + Returns: + Cubes: the concatenated Cubes + """ + assert isinstance(cubes_list, (list, tuple)) + if len(cubes_list) == 0: + return cls(torch.empty(0)) + assert all([isinstance(box, Cubes) for box in cubes_list]) + + # use torch.cat (v.s. layers.cat) so the returned cubes never share storage with input + cat_cubes = cls(torch.cat([b.tensor for b in cubes_list], dim=0)) + return cat_cubes + + @torch.jit.unused + def __iter__(self): + """ + Yield a cube as a Tensor of shape (15,) at a time. 
+ """ + yield from self.tensor + + def split(self, split_size: int, dim=1) -> tuple["Cubes"]: + """same behaviour as torch.split, return a tuple of chunksize Cubes""" + return tuple(Cubes(x) for x in self.tensor.split(split_size, dim=dim)) + + def reshape(self, *args) -> "Cubes": + """ + Returns: + Cubes: reshaped Cubes + """ + return Cubes(self.tensor.reshape(*args), self.scores, self.labels) \ No newline at end of file diff --git a/ProposalNetwork/utils/utils.py b/ProposalNetwork/utils/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..17cf80e02ee0f7c8e1040278ef5764c9017c97c4 --- /dev/null +++ b/ProposalNetwork/utils/utils.py @@ -0,0 +1,564 @@ +import torch +import numpy as np +import matplotlib.pyplot as plt + +from detectron2.structures import pairwise_iou +from pytorch3d.ops import box3d_overlap + +##### Proposal +def normalize_vector(v): + v_mag = torch.sqrt(v.pow(2).sum()) + v_mag = torch.max(v_mag, torch.tensor([1e-8], device=v.device)) + v_mag = v_mag.view(1,1).expand(1,v.shape[0]) + v = v/v_mag + + return v[0] + +def cross_product(u, v): + i = u[1]*v[2] - u[2]*v[1] + j = u[2]*v[0] - u[0]*v[2] + k = u[0]*v[1] - u[1]*v[0] + out = torch.cat((i.view(1,1), j.view(1,1), k.view(1,1)),1) + + return out[0] + +def compute_rotation_matrix_from_ortho6d(poses): + x_raw = poses[0:3] + y_raw = poses[3:6] + + x = normalize_vector(x_raw) + z = cross_product(x,y_raw) + z = normalize_vector(z) + y = cross_product(z,x) + + x = x.view(-1,3,1) + y = y.view(-1,3,1) + z = z.view(-1,3,1) + matrix = torch.cat((x,y,z), 2)[0] + + return matrix + +def sample_normal_in_range(means, stds, count, threshold_low=None, threshold_high=None): + device = means.device + # Generate samples from a normal distribution + samples = torch.normal(means.unsqueeze(1).expand(-1,count), stds.unsqueeze(1).expand(-1,count)) + + # Ensure that all samples are greater than threshold_low and less than threshold_high + if threshold_high is not None and threshold_low is not None: + 
def randn_orthobasis_torch(num_samples=1, num_instances=1):
    """Draw random orthonormal 3x3 bases.

    Returns a tensor of shape (num_instances, num_samples, 3, 3) whose last
    two axes hold the basis vectors as rows.
    """
    basis = torch.randn(num_instances, num_samples, 3, 3)
    basis = basis / torch.norm(basis, p=2, dim=-1, keepdim=True)
    # Gram-Schmidt via cross products: rebuild row 0 orthogonal to rows 1/2,
    # then rebuild row 1 orthogonal to the other two.
    basis[:, :, 0] = torch.cross(basis[:, :, 1], basis[:, :, 2], dim=-1)
    basis[:, :, 0] = basis[:, :, 0] / torch.norm(basis[:, :, 0], dim=-1, keepdim=True)
    basis[:, :, 1] = torch.cross(basis[:, :, 2], basis[:, :, 0], dim=-1)
    basis[:, :, 1] = basis[:, :, 1] / torch.norm(basis[:, :, 1], dim=-1, keepdim=True)
    return basis

def randn_orthobasis(num_samples=1):
    """NumPy twin of randn_orthobasis_torch: (num_samples, 3, 3) random orthonormal bases."""
    basis = np.random.randn(num_samples, 3, 3)
    basis /= np.linalg.norm(basis, axis=-1, keepdims=True)
    basis[:, 0] = np.cross(basis[:, 1], basis[:, 2], axis=-1)
    basis[:, 0] /= np.linalg.norm(basis[:, 0], axis=-1, keepdims=True)
    basis[:, 1] = np.cross(basis[:, 2], basis[:, 0], axis=-1)
    basis[:, 1] /= np.linalg.norm(basis[:, 1], axis=-1, keepdims=True)
    return basis

# ##things for making rotations
def vec_perp(vec):
    """A vector perpendicular to *vec* in 3-D.

    https://math.stackexchange.com/a/2450825
    NOTE(review): the a == 0 branch returns an unnormalized vector while the
    general branch is normalized — callers appear to normalize afterwards;
    confirm before relying on the length.
    """
    a, b, c = vec
    if a == 0:
        return np.array([0, c, -b])
    return np.array(normalize_vector(torch.tensor([b, -a, 0])))
def orthobasis_from_normal(normal, yaw_angle=0):
    '''Generate an orthonormal (rotation-matrix) basis from a 3-D normal vector.

    Returns a 3x3 matrix with the basis vectors as columns; the **second**
    column is the original normal vector (the original docstring wrongly
    said the 3rd column — the construction is np.array([x, normal, y]).T).
    *yaw_angle* rotates the in-plane x axis around the normal.
    '''
    x = rotate_vector(vec_perp(normal), normal, yaw_angle)
    x = x / np.linalg.norm(x, ord=2)
    y = np.cross(normal, x)
    return np.array([x, normal, y]).T  # the vectors are the columns

def rotate_vector(v, k, theta):
    '''Rotate vector *v* around axis *k* by angle *theta* (radians).

    Rodrigues' rotation formula; *k* is assumed to be a unit vector.
    https://medium.com/@sim30217/rodrigues-rotation-formula-47489db49050
    '''
    cos_theta = np.cos(theta)
    sin_theta = np.sin(theta)

    term1 = v * cos_theta
    term2 = np.cross(k, v) * sin_theta
    term3 = k * np.dot(k, v) * (1 - cos_theta)

    return term1 + term2 + term3

def vec_perp_t(vec):
    '''Torch twin of ``vec_perp``: a vector perpendicular to *vec* in 3-D.

    https://math.stackexchange.com/a/2450825
    NOTE(review): the a == 0 branch is not normalized, unlike the general
    branch — callers appear to normalize afterwards; confirm before relying
    on the length.
    '''
    a, b, c = vec
    if a == 0:
        return torch.tensor([0, c, -b], device=vec.device)
    return normalize_vector(torch.tensor([b, -a, 0], device=vec.device))
def orthobasis_from_normal_t(normal: torch.Tensor, yaw_angles: torch.Tensor = 0):
    '''Generate orthonormal (rotation-matrix) bases from a 3-D normal vector,
    one basis per yaw angle.

    *normal* is assumed to be normalised.

    Returns a (len(yaw_angles), 3, 3) tensor with the basis vectors as
    columns; the **second** column is the original normal vector (the
    original docstring wrongly said the 3rd column).
    '''
    n = len(yaw_angles)
    x = rotate_vector_t(vec_perp_t(normal), normal, yaw_angles)
    y = torch.cross(normal.view(-1, 1), x)
    # stack columns [x, normal, y] per yaw angle
    return torch.cat([x.t(), normal.unsqueeze(0).repeat(n, 1), y.t()], dim=1).reshape(n, 3, 3).transpose(2, 1)

def rotate_vector_t(v, k, theta):
    '''Rotate vector *v* around axis *k* by each angle in *theta*.

    Rodrigues' rotation formula (torch); *k* is assumed to be a unit vector.
    Returns a (3, len(theta)) tensor with one rotated vector per column.
    '''
    cos_theta = torch.cos(theta)
    sin_theta = torch.sin(theta)
    v2 = v.view(-1, 1)

    term1 = v2 * cos_theta
    term2 = torch.cross(k, v).view(-1, 1) * sin_theta
    term3 = (k * (k @ v)).view(-1, 1) * (1 - cos_theta)

    return term1 + term2 + term3

# ########### End rotations
def gt_in_norm_range(range, gt):
    '''Position of *gt* inside *range* = (low, high), normalised to [0, 1].

    The original carried a large unreachable sign-dependent branch after the
    first return (left over from a refactor); it has been removed with
    behaviour unchanged.
    '''
    return (gt - range[0]) / abs(range[1] - range[0])

def vectorized_linspace(start_tensor, end_tensor, number_of_steps):
    '''Row-wise linspace: for every (start, end) pair produce
    *number_of_steps* evenly spaced values.

    Returns a (len(start_tensor), number_of_steps) tensor.
    '''
    # per-row step size
    spacing = (end_tensor - start_tensor) / (number_of_steps - 1)
    steps = torch.arange(start=0, end=number_of_steps, dtype=start_tensor.dtype, device=start_tensor.device)
    steps = steps.repeat(start_tensor.size(0), 1)
    return steps * spacing[:, None] + start_tensor[:, None]
def iou_3d(gt_cube, proposal_cubes):
    """
    Compute the Intersection over Union (IoU) of 3D cubes.

    Parameters:
    - gt_cube: GT Cube.
    - proposal_cubes: List of Proposal Cubes.

    Returns:
    - iou: IoU values of the proposals against the GT.
    """
    gt_corners = gt_cube.get_all_corners()[0]
    proposal_corners = proposal_cubes.get_all_corners()[0]
    vol, iou = box3d_overlap(gt_corners, proposal_corners)
    return iou[0]

def custom_mapping(x, beta=1.7):
    '''
    Map scores in [0, 1] onto an S-shaped curve instead of a linear one.

    Args:
        x: list of floats between and including 0 and 1
        beta: number > 1, higher beta is more aggressive

    Returns:
        list of mapped floats.

    Values <= 0 map to 0.0 and values >= 1 map to 1.0 (the original raised
    ZeroDivisionError at exactly x == 1).
    '''
    mapped_list = []
    for v in x:
        if v <= 0:
            mapped_list.append(0.0)
        elif v >= 1:
            mapped_list.append(1.0)  # limit of the curve; avoids division by zero
        else:
            mapped_list.append(1 / (1 + (v / (1 - v)) ** (-beta)))
    return mapped_list

def mask_iou(segmentation_mask, bube_mask):
    '''
    IoU between a torch segmentation mask and an array-like bube mask
    (converted to a tensor on the segmentation mask's device).
    '''
    bube_mask = torch.tensor(bube_mask, device=segmentation_mask.device)
    # identical arithmetic lived in mask_iou_loss; deduplicated
    return mask_iou_loss(segmentation_mask, bube_mask)

def mod_mask_iou(segmentation_mask, bube_mask):
    '''
    Like mask_iou but returns intersection**5 / union.
    NOTE: not standard IoU — strongly rewards large intersections.
    '''
    bube_mask = torch.tensor(bube_mask, device=segmentation_mask.device)
    intersection = (segmentation_mask * bube_mask).sum()
    if intersection == 0:
        return torch.tensor(0.0)
    union = torch.logical_or(segmentation_mask, bube_mask).to(torch.int).sum()
    return intersection**5 / union

def mask_iou_loss(segmentation_mask, bube_mask):
    '''
    IoU of two mask tensors (no conversion); returns 0.0 when they do not
    overlap.
    '''
    intersection = (segmentation_mask * bube_mask).sum()
    if intersection == 0:
        return torch.tensor(0.0)
    union = torch.logical_or(segmentation_mask, bube_mask).to(torch.int).sum()
    return intersection / union
def is_gt_included(gt_cube, x_range, y_range, z_range, w_prior, h_prior, l_prior):
    '''Check whether the GT cube is reachable by the proposal search space.

    *gt_cube* needs ``.center`` (x, y, z) and ``.dimensions`` (w, h, l);
    each *_range is a (low, high) interval for the center coordinate and
    each *_prior is a (mean, std) pair for the dimension. A dimension counts
    as unachievable when it is more than 1.5 stds from the prior mean.
    Prints the offending axes and returns False, or returns True when
    everything is reachable. (Rewritten from six copy-pasted branches into
    two loops; messages and their order are unchanged.)
    '''
    # Define how far away dimensions need to be to be counted as unachievable
    stds_away = 1.5
    because_of = []

    # Center / depth: each coordinate must lie strictly inside its range;
    # report by how much it misses.
    for axis, rng, tag in ((0, x_range, 'x'), (1, y_range, 'y'), (2, z_range, 'z')):
        coord = gt_cube.center[axis]
        if not (rng[0] < coord < rng[1]):
            val = abs(rng[0] - coord) if coord < rng[0] else abs(coord - rng[1])
            because_of.append(f'{tag} by {val:.1f}')

    # Dimensions: flag values further than stds_away stds from the prior mean.
    for axis, prior, tag in ((0, w_prior, 'w'), (1, h_prior, 'h'), (2, l_prior, 'l')):
        dim = gt_cube.dimensions[axis]
        if dim < prior[0] - stds_away * prior[1]:
            because_of.append(tag + '-')
        if dim > prior[0] + stds_away * prior[1]:
            because_of.append(tag + '+')

    if not because_of:
        return True
    print('GT cannot be found due to', because_of)
    return False

    # rotation nothing yet
+ """ + yaw, pitch, roll = eulers + + # Calculate the components of the unit vector + x = np.cos(yaw) * np.cos(pitch) + y = np.sin(yaw) * np.cos(pitch) + z = np.sin(pitch) + + # Normalize the vector + length = np.sqrt(x**2 + y**2 + z**2) + unit_vector = np.array([x, y, z]) / length + + return unit_vector + + +# helper functions for plotting segmentation masks +def show_mask(mask, ax, random_color=False): + if random_color: + color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) + else: + color = np.array([30/255, 144/255, 255/255, 0.6]) + h, w = mask.shape[-2:] + mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) + ax.imshow(mask_image) + +def show_mask2(masks:np.array, im:np.array, random_color=False): + """ + Display the masks on top of the image. + + Args: + masks (np.array): Array of masks with shape (h, w, 4). + im (np.array): Image with shape (h, w, 3). + random_color (bool, optional): Whether to use random colors for the masks. Defaults to False. + + Returns: + np.array: Image with masks displayed on top. 
+ """ + im_expanded = np.concatenate((im, np.ones((im.shape[0],im.shape[1],1))*255), axis=-1)/255 + + mask_image = np.zeros((im.shape[0],im.shape[1],4)) + for i, mask in enumerate(masks): + if isinstance(random_color, list): + color = random_color[i] + else: + color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) + h, w = mask.shape[-2:] + mask_sub = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) + mask_image = mask_image + mask_sub + mask_binary = (mask_image > 0).astype(bool) + im_out = im_expanded * ~mask_binary + (0.5* mask_image + 0.5 * (im_expanded * mask_binary)) + im_out = im_out.clip(0,1) + return im_out + +def show_points(coords, labels, ax, marker_size=375): + pos_points = coords[labels==1] + neg_points = coords[labels==0] + ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25) + ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25) + +def show_box(box, ax): + x0, y0 = box[0], box[1] + w, h = box[2] - box[0], box[3] - box[1] + ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2)) + + + + + + +# Convex Hull +import torch + +def direction(p1, p2, p3): + return (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]) + +def distance_sq(p1, p2): + return (p2[0] - p1[0])**2 + (p2[1] - p1[1])**2 + +def findDuplicates(arr): + Len = len(arr) + ifPresent = False + a1 = [] + idx = [] + for i in range(Len - 1): + for j in range(i + 1, Len): + # Checking if element is present in the ArrayList or not if present then break + if torch.all(arr[i] == arr[j]): + # if len(a1) == 0: + # a1 arr[i] + # idx.append(i) + # ifPresent = True + # else: + # # if arr[i] in a1: + # # break + # # # If element is not present in the ArrayList then add it to ArrayList and make ifPresent true + # # else: + a1.append(arr[i]) + idx.append(i) + ifPresent = True + + if ifPresent: + return 
set(idx) # lazi inefficient implementation + else: + return None + +def jarvis_march(points): + '''https://algorithmtutor.com/Computational-Geometry/Convex-Hull-Algorithms-Jarvis-s-March/ + https://algorithmtutor.com/Computational-Geometry/Determining-if-two-consecutive-segments-turn-left-or-right/ ''' + # remove duplicates + duplicates = findDuplicates(points) + # this is necessary if there are > 2 duplicates of the same element + if duplicates is not None: + plusone = torch.zeros_like(points) + for i, d in enumerate(duplicates): + plusone[d] += i + 1 + points = points + plusone + + # find the lower left point + min_x = torch.min(points[:, 0]) + candidates = (points[:, 0] == min_x).nonzero(as_tuple=True)[0] + + # If there are multiple points, choose the one with the highest y value + if len(candidates) > 1: + index = candidates[torch.argmax(points[candidates][:, 1])] + else: + index = candidates[0] + + a = points[index] + + # selection sort + l = index + result = [] + result.append(a) + + while (True): + q = (l + 1) % len(points) + for i in range(len(points)): + if i == l: + continue + # find the greatest left turn + # in case of collinearity, consider the farthest point + d = direction(points[l], points[i], points[q]) + if d > 0 or (d == 0 and distance_sq(points[i], points[l]) > distance_sq(points[q], points[l])): + q = i + l = q + if l == index: + break + result.append(points[q]) + + return torch.flip(torch.stack(result), [0,]) + +def fill_polygon(mask, polygon): + ''' + inspired by https://web.archive.org/web/20120323102807/http://local.wasp.uwa.edu.au/~pbourke/geometry/insidepoly/ + ''' + h, w = mask.shape + Y, X = torch.meshgrid(torch.arange(h), torch.arange(w), indexing='ij') # or xy??? 
xy is the numpy was + grid_coords = torch.stack([X.flatten(), Y.flatten()], dim=1).float().to(mask.device) + + new_mask = torch.ones(h, w, device=mask.device) + zeros = torch.zeros(h, w, device=mask.device) + ones = torch.ones(h, w, device=mask.device) + + # For some reason it is easier for me to comprehend the algorithm if we iterate counter-clockwise + for i in range(len(polygon)): + v1 = polygon[i] + v2 = polygon[(i + 1) % len(polygon)] + + # Determine the direction of the edge + edge_direction = v2 - v1 + + # Given a line segment between P0 (x0,y0) and P1 (x1,y1), another point P (x,y) has the following relationship to the line segment. + # Compute + # (y - y0) (x1 - x0) - (x - x0) (y1 - y0) + # Check if the point is to the left of the edge + points = (grid_coords[:, 0] - v1[0]) * edge_direction[1] - (grid_coords[:, 1] - v1[1]) * edge_direction[0] + # we can do the threshold in a clever differentiable way + # this sets all values to be between 0 and 1 + is_left = torch.min(torch.max(points.view(h, w), zeros), ones) + + # do the intersection of the 2 masks, this progressily builds op the polygon + new_mask = new_mask * is_left + + return new_mask + +def convex_hull(mask, coords): + hull = jarvis_march(coords) + new_mask = fill_polygon(mask, hull) + return new_mask + +if __name__ == '__main__': + import matplotlib.pyplot as plt + mask = torch.zeros(700, 700, dtype=torch.bool) + # p = torch.tensor([[5,6],[21.0,7],[21,20],[10,20],[15,20],[5,20],[11,8],[15,15],[17,6],[11,15]]) + + p = torch.tensor([[271.0000, 356.0000], + [ 25.3744, 356.0000], + [ 0.0000, 356.0000], + [ 0.0000, 89.5266], + [271.0000, 159.3112], + [ 95.5653, 201.7484], + [ 0.0000, 0.0000], + [271.0000, 0.0000]]) + + p2 = torch.tensor([[150.3456, 0.0000], + [479.0000, 0.0000], + [ 11.8427, 0.0000], + [ 0.0000, 0.0000], + [121.4681, 232.5976], + [375.6230, 383.9329], + [ 12.8765, 630.0000], + [ 0.0000, 344.7250]]) + + p3 = torch.tensor([[290.9577, 171.1176], + [197.7348, 483.7612], + [383.0000, 
504.0000], + [383.0000, 27.6211], + [ 2.2419, 52.6505], + [ 0.0000, 399.6908], + [ 0.0000, 504.0000], + [ 0.0000, 0.0000]]) + + p4 = torch.tensor([[271.0000, 19.5241], + [271.0000, 356.0000], + [ 0.0000, 0.0000], + [271.0000, 0.0000], + [ 0.0000, 0.0000], + [163.0264, 77.9408], + [164.2467, 321.0222], + [ 0.0000, 356.0000], + [ 0.0000, 0.0000]]) + + p5 = torch.tensor([[272.0000, 1.0000], + [ 0.0000, 173.5156], + [ 74.8860, 141.3913], + [253.8221, 0.0000], + [271.0000, 0.0000], + [271.0000, 356.0000], + [262.5294, 327.9978], + [271.0000, 120.8048]]) + + mask5 = convex_hull(mask, p5) + mask4 = convex_hull(mask, p4) + mask1 = convex_hull(mask, p) + mask2 = convex_hull(mask, p2) + mask3 = convex_hull(mask, p3) + fig, ax = plt.subplots(1,5, figsize=(20,5)) + ax[0].scatter(p[:,0], p[:,1], c='r') + ax[1].scatter(p2[:,0], p2[:,1], c='b') + ax[2].scatter(p3[:,0], p3[:,1], c='g') + ax[3].scatter(p4[:,0], p4[:,1], c='y') + ax[4].scatter(p5[:,0], p5[:,1], c='m') + + ax[0].imshow(mask1) + ax[1].imshow(mask2) + ax[2].imshow(mask3) + ax[3].imshow(mask4) + ax[4].imshow(mask5) + plt.show() + a = 2 diff --git a/README.md b/README.md index 7e64700c68be175eafafec159c3d3b7b023e0e8d..063603b04887dfa5e6caa4167d343eafc81acba8 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,11 @@ --- title: Weak Cube RCNN -emoji: ⚡ +emoji: 🎲 colorFrom: indigo -colorTo: purple +colorTo: yellow sdk: docker pinned: false -license: cc-by-nc-sa-4.0 -short_description: Weak Cube RCNN model +license: apache-2.0 --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +https://github.com/AndreasLH/Weak-Cube-R-CNN \ No newline at end of file diff --git a/VisualiseGT.py b/VisualiseGT.py new file mode 100644 index 0000000000000000000000000000000000000000..fb48f731f6dd37d98ef25bd8061de6df8bdd2c4b --- /dev/null +++ b/VisualiseGT.py @@ -0,0 +1,830 @@ +from pycocotools.coco import COCO +import os +import random +from functools import reduce +from io import StringIO + +from 
detectron2.utils.visualizer import Visualizer +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from scipy import stats + +from cubercnn import data, util, vis +from cubercnn.config import get_cfg_defaults +from cubercnn.data.build import (build_detection_test_loader, + build_detection_train_loader) +from cubercnn.data.dataset_mapper import DatasetMapper3D +from cubercnn.data.datasets import load_omni3d_json, simple_register +from detectron2.config import get_cfg +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.structures.boxes import BoxMode +from detectron2.utils.logger import setup_logger + +color = '#384860' +second_color = '#97a6c4' + +def load_gt(dataset='SUNRGBD', mode='test', single_im=True, filter=False, img_idx=150): + + # we can do this block of code to get the categories reduced number of categories in the sunrgbd dataset as there normally is 83 categories, however we only work with 38. + config_file = 'configs/Base_Omni3D.yaml' + if filter: + cfg, filter_settings = get_config_and_filter_settings(config_file) + else: + filter_settings = None + + if mode == 'test': + dataset_paths_to_json = ['datasets/Omni3D/'+dataset+'_test.json'] + elif mode == 'train': + dataset_paths_to_json = ['datasets/Omni3D/'+dataset+'_train.json'] + + # Get Image and annotations + try: + dataset = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings) + except: + print('Dataset does not exist or is not in the correct format!') + exit() + imgIds = dataset.getImgIds() + imgs = dataset.loadImgs(imgIds) + if single_im: + # img = random.choice(imgs) + # 730 and 150 are used in the report + img = imgs[img_idx] + annIds = dataset.getAnnIds(imgIds=img['id']) + else: + # get all annotations + img = imgs + annIds = dataset.getAnnIds() + + anns = dataset.loadAnns(annIds) + + # Extract necessary annotations + R_cams = [] + center_cams = [] + dimensions_all = [] + cats = [] + bboxes = [] + for instance in anns: + if 
'bbox2D_tight' in instance and instance['bbox2D_tight'][0] != -1: + bboxes.append(instance['bbox2D_tight']) # boxes are XYXY_ABS by default + + elif 'bbox2D_trunc' in instance and not np.all([val==-1 for val in instance['bbox2D_trunc']]): + bboxes.append(instance['bbox2D_trunc']) # boxes are XYXY_ABS by default + + elif 'bbox2D_proj' in instance: + bboxes.append(instance['bbox2D_proj']) # boxes are XYXY_ABS by default + + else: + continue + + R_cams.append(instance['R_cam']) + center_cams.append(instance['center_cam']) + dimensions_all.append(instance['dimensions']) + cats.append(instance['category_name']) + + return img, R_cams, center_cams, dimensions_all, cats, bboxes + + + +def plot_scene(image_path, output_dir, center_cams, dimensions_all, Rs, K, cats, bboxes): + # TODO: currently this function does not filter out invalid annotations, but it should have the option to do so. + # Compute meshes + meshes = [] + meshes_text = [] + for idx, (center_cam, dimensions, pose, cat) in enumerate(zip( + center_cams, dimensions_all, Rs, cats + )): + bbox3D = center_cam + dimensions + meshes_text.append('{}'.format(cat)) + color = [c/255.0 for c in util.get_color(idx)] + box_mesh = util.mesh_cuboid(bbox3D, pose, color=color) + meshes.append(box_mesh) + + image_name = util.file_parts(image_path)[1] + print('File: {} with {} dets'.format(image_name, len(meshes))) + np.random.seed(0) + colors = [np.concatenate([np.random.random(3), np.array([0.6])], axis=0) for _ in range(len(meshes))] + + # Plot + image = util.imread('datasets/'+image_path) + if len(meshes) > 0: + im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(image, np.array(K), meshes, colors=colors, text=meshes_text, scale=image.shape[0], blend_weight=0.5, blend_weight_overlay=0.85) + + if False: + im_concat = np.concatenate((im_drawn_rgb, im_topdown), axis=1) + vis.imshow(im_concat) + + util.imwrite(im_drawn_rgb, os.path.join(output_dir, image_name+'_boxes.jpg')) + util.imwrite(im_topdown, os.path.join(output_dir, 
def show_data(dataset, filter_invalid=False, output_dir='output/playground'):
    """Visualise the ground truth of one image of `dataset` via plot_scene."""
    # Load Image and Ground Truths
    image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, filter=filter_invalid)

    # Create Output Directory
    util.mkdir_if_missing(output_dir)

    plot_scene(image['file_path'], output_dir, center_cams, dimensions_all, Rs, image['K'], cats, bboxes)


def category_distribution(dataset):
    '''Plot a histogram of the category distribution in the dataset.'''
    # Load Image and Ground Truths (whole train and test splits).
    image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
    image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)
    # The COCO api is queried for the filtered category list so that bars exist
    # even for categories with zero annotations after filtering.
    config_file = 'configs/Base_Omni3D.yaml'
    cfg, filter_settings = get_config_and_filter_settings(config_file)
    annotation_file = 'datasets/Omni3D/SUNRGBD_train.json'
    coco_api = COCO(annotation_file)
    # NOTE(review): `meta` and `cfg` are unused below; MetadataCatalog.get may
    # be kept for its registration side effect — confirm before removing.
    meta = MetadataCatalog.get('SUNRGBD')
    cat_ids = sorted(coco_api.getCatIds(filter_settings['category_names']))
    cats_sun = coco_api.loadCats(cat_ids)
    thing_classes = [c["name"] for c in sorted(cats_sun, key=lambda x: x["id"])]

    output_dir = 'output/figures/' + dataset
    util.mkdir_if_missing(output_dir)

    # histogram of categories
    cats_all = cats + cats_t
    # cats_unique = list(set(cats_all))
    cats_unique = thing_classes
    print('cats unique: ', len(cats_unique))
    # make dict with count of each category, sorted by descending count
    cats_count = {cat: cats_all.count(cat) for cat in cats_unique}
    cats_sorted = dict(sorted(cats_count.items(), key=lambda x: x[1], reverse=True))

    plt.figure(figsize=(14,5))
    plt.bar(cats_sorted.keys(), cats_sorted.values())
    plt.xticks(rotation=60, size=9)

    plt.title('Category Distribution')
    plt.savefig(os.path.join(output_dir, 'category_distribution.png'),dpi=300, bbox_inches='tight')
    plt.close()

    # Returned dict is reused by the AP-vs-annotation-time plots.
    return cats_sorted
def spatial_statistics(dataset):
    '''Compute spatial statistics of the dataset.

    Wanted to reproduce fig. 7 from the Omni3D paper; the images must be
    standardised to a common size for the 2D-centre histogram to be meaningful.
    Saves `2d_histogram.png` under output/figures/<dataset>.
    '''
    # this function filters out invalid images if there are no valid annotations
    # in the image; annotations in each image can also be marked is_ignore => True
    image_root = 'datasets'
    cfg, filter_settings = get_config_and_filter_settings()
    dataset_names = ['SUNRGBD_train','SUNRGBD_test','SUNRGBD_val']
    output_dir = 'output/figures/' + dataset

    # this is almost the same as the simple_register function, but it also stores
    # the model metadata which is needed for the load_omni3d_json function
    data.register_and_store_model_metadata(None, output_dir, filter_settings=filter_settings)

    data_dicts = []
    for dataset_name in dataset_names:
        json_file = 'datasets/Omni3D/'+dataset_name+'.json'
        data_dict = load_omni3d_json(json_file, image_root, dataset_name, filter_settings, filter_empty=True)
        data_dicts.extend(data_dict)

    # standardise the images to a fixed size and map the annotations onto it
    std_image_size = (480//4, 640//4)
    tot_outliers = 0
    img = np.zeros(std_image_size)
    for img_dict in data_dicts:
        original_width = img_dict['width']
        original_height = img_dict['height']

        # Calculate the scale factor for resizing
        scale_x = std_image_size[1] / original_width
        scale_y = std_image_size[0] / original_height

        # Update the image size in the annotation
        img_dict['width'] = std_image_size[1]
        img_dict['height'] = std_image_size[0]

        # FIX: the original called img_dict['annotations'].remove(anno) inside
        # `for anno in img_dict['annotations']`, mutating the list while iterating;
        # that skips the element after each removal, so some ignored annotations
        # were processed/kept. We now collect the kept annotations and assign once.
        kept_annotations = []
        for anno in img_dict['annotations']:
            if anno['ignore']:
                # Drop annotations marked as ignore.
                continue
            kept_annotations.append(anno)
            # Update the 2D box coordinates (boxes are XYWH)
            anno['bbox2D_tight'][0] *= scale_x
            anno['bbox2D_tight'][1] *= scale_y
            anno['bbox2D_tight'][2] *= scale_x
            anno['bbox2D_tight'][3] *= scale_y
            # get the centerpoint of the annotation as (x, y)
            # x0, y0, x1, y1 = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
            x0, y0, x1, y1 = anno['bbox2D_tight']
            x_m, y_m = int((x0+x1)/2), int((y0+y1)/2)
            if x_m >= std_image_size[1] or x_m < 0:
                tot_outliers += 1
            elif y_m >= std_image_size[0] or y_m < 0:
                tot_outliers += 1
            else:
                img[y_m, x_m] += 1
        img_dict['annotations'] = kept_annotations

    print('num center points outside frame: ', tot_outliers)
    img = img/img.max()
    # this point is so large that all the points become invisible, so I remove it.
    img[0,0] = 0.00
    img = img/img.max()
    plt.figure()
    plt.imshow(img, cmap='gray_r', vmin=0, vmax=1)
    plt.xticks([]); plt.yticks([])
    plt.title('Histogram of 2D box centre points')
    # plt.box(False)
    plt.savefig(os.path.join(output_dir, '2d_histogram.png'),dpi=300, bbox_inches='tight')
    plt.close()
    return
def AP_vs_no_of_classes(dataset, files:list=['output/Baseline_sgd/log.txt','output/omni_equalised/log.txt','output/omni_pseudo_gt/log.txt','output/proposal_AP/log.txt','output/exp_10_iou_zpseudogt_dims_depthrange_rotalign_ground/log.txt']):
    '''Search the log file for the precision numbers corresponding to the last iteration
    then parse it in as a pd.DataFrame and plot the AP vs number of classes'''
    # search the file from the back until the line
    # cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:
    # is found

    target_line = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
    model_names = ['Base Cube R-CNN', 'Time-eq.', 'Pseudo GT', 'Proposal', 'Weak loss']
    df = []
    for file, model_name in zip(files, model_names):
        # Parse each log's per-category AP table; columns get model-specific names.
        # NOTE(review): if search_file_backwards returns None the .rename raises
        # AttributeError before the assert below fires.
        df_i = search_file_backwards(file, target_line).rename(columns={'AP3D':f'{model_name} AP3D', 'AP2D':f'{model_name} AP2D'})
        assert df_i is not None, 'df not found'
        df.append(df_i)
    # merge df's on the shared category column
    df = reduce(lambda x, y: pd.merge(x, y, on = 'category'), df)
    # sort df by ap3d of model 1
    df = df.sort_values(by='Base Cube R-CNN AP3D', ascending=False)

    # Align the AP table with the per-category annotation counts: both are
    # sorted by category name so the index-wise merge below pairs them up.
    cats = category_distribution(dataset)
    df.sort_values(by='category', inplace=True)
    cats = dict(sorted(cats.items()))
    merged_df = pd.merge(df.reset_index(), pd.DataFrame(cats.values(), columns=['cats']), left_index=True, right_index=True)
    merged_df = merged_df.sort_values(by='cats')
    merged_df = merged_df.drop('index',axis=1)
    merged_df = merged_df.reset_index(drop=True)


    fig, ax = plt.subplots(figsize=(12,8))
    for model_name in model_names:
        # Seconds of annotation time per box: 114 s for full 3D annotation
        # (baseline), 10.15 s for the weak 2D-only annotation schemes.
        if model_name == 'Base Cube R-CNN':
            scale = 114
        else:
            scale = 10.15
        # convert the annotation time to hours
        time = merged_df['cats']*scale / 60 / 60
        # Marker size encodes AP2D so 2D quality is visible in the same plot.
        ax.scatter(time, merged_df[f'{model_name} AP3D'].values, s=merged_df[f'{model_name} AP2D'].values*2, alpha=0.5, label=model_name)

        for i, txt in enumerate(merged_df['category']):
            ax.text(time[i], merged_df[f'{model_name} AP3D'].values[i], txt, fontsize=merged_df[f'{model_name} AP3D'].values[i]*0.3+3)

        correlation_coef = np.corrcoef(time, merged_df[f'{model_name} AP3D'].values)[0, 1]
        line_fit = np.polyfit(time, merged_df[f'{model_name} AP3D'].values, 1)

        # plot the line of best fit
        ax.plot(time, np.poly1d(line_fit)(time), linestyle='--',alpha=0.5, label=f'Linear fit (R={correlation_coef:.2f})')

    # Set labels and title
    ax.set_xlabel('Annotation time (h)')
    ax.set_ylabel('AP3D')
    ax.set_xscale('log')
    ax.set_title('AP3D vs class-wise annotation time')
    ax.legend(title='AP3D scaled by AP2D')

    # Save the plot
    plt.savefig('output/figures/'+dataset+'/AP_vs_no_of_classes_all.png', dpi=300, bbox_inches='tight')
    plt.close()

    return
def AP3D_vs_AP2D(dataset, mode = 'standard', files=['output/Baseline_sgd/log.txt','output/omni_equalised/log.txt','output/omni_pseudo_gt/log.txt','output/proposal_AP/log.txt','output/exp_10_iou_zpseudogt_dims_depthrange_rotalign_ground/log.txt']):
    '''Search the log file for the precision numbers corresponding to the last iteration
    then parse it in as a pd.DataFrame and plot the AP vs number of classes.

    mode: 'standard' uses linear axes with AP2D-scaled markers; 'log' uses
    log-log axes with fixed-size markers.
    '''

    # search the file from the back until the line
    # cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:
    # is found

    target_line = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
    model_names = ['Base Cube R-CNN', 'Time-eq.', 'Pseudo GT', 'Proposal', 'Weak loss']
    df = []
    for file, model_name in zip(files, model_names):
        df_i = search_file_backwards(file, target_line).rename(columns={'AP3D':f'{model_name} AP3D', 'AP2D':f'{model_name} AP2D'})
        assert df_i is not None, 'df not found'
        df.append(df_i)
    # merge df's
    df = reduce(lambda x, y: pd.merge(x, y, on = 'category'), df)
    # sort df by ap3d of model 1
    df = df.sort_values(by='Base Cube R-CNN AP3D', ascending=False)

    # Pair per-category AP with annotation counts (same scheme as
    # AP_vs_no_of_classes: both sides sorted by category, merged index-wise).
    cats = category_distribution(dataset)
    df.sort_values(by='category', inplace=True)
    cats = dict(sorted(cats.items()))
    merged_df = pd.merge(df.reset_index(), pd.DataFrame(cats.values(), columns=['cats']), left_index=True, right_index=True)
    merged_df = merged_df.sort_values(by='cats')
    merged_df = merged_df.drop('index',axis=1)
    merged_df = merged_df.reset_index(drop=True)

    # mode = 'standard' # 'log'

    fig, ax = plt.subplots(figsize=(12,8))
    for model_name in model_names:
        if mode == 'standard':
            s=merged_df[f'{model_name} AP2D'].values*2
        else:
            s = None
        # we have to add 0.001 to the values to avoid log(0) errors
        ax.scatter(merged_df[f'{model_name} AP2D'].values+0.001, merged_df[f'{model_name} AP3D'].values+0.001, alpha=0.5, label=model_name, s=s)
        for i, txt in enumerate(merged_df['category']):
            if mode == 'standard':
                fontsize=merged_df[f'{model_name} AP3D'].values[i]*0.3+3
            else:
                fontsize=7
            ax.text(merged_df[f'{model_name} AP2D'].values[i]+0.001, merged_df[f'{model_name} AP3D'].values[i]+0.001, txt,fontsize=fontsize)
    # plot average line (the AP2D == AP3D diagonal)
    ax.plot((0, 70), (0, 70), linestyle='--', color=color, alpha=0.3, label=f'AP2D=AP3D')

    # Set labels and title
    if mode == 'log':
        ax.set_xscale('log')
        ax.set_yscale('log')
    ax.set_xlabel('AP2D')
    ax.set_ylabel('AP3D')
    # ax.set_xlim(0.1, 75); ax.set_ylim(0.1, 75)
    ax.set_title('AP in 3D vs AP in 2D')
    ax.legend()
    # if mode == 'log':
    #     # for some obscure reason the log plot fails to save
    #     plt.show()
    # # Save the plot
    # else:
    plt.savefig('output/figures/'+dataset+f'/AP3D_vs_AP2D_all_{mode}.png', dpi=300, bbox_inches='tight')
    plt.close()

    return
def search_file_backwards(file_path:str, target_line:str) -> pd.DataFrame:
    '''Search a file backwards for a target line and return the table of the
    performance of the model. The point of this is to parse the part of the log
    file that looks like this

    | category   | AP2D    | AP3D      | category    | AP2D     | AP3D     | category     | AP2D      | AP3D       |
    |:----------:|:--------|:----------|:-----------:|:---------|:---------|:------------:|:----------|:-----------|
    | chair      | 45.9374 | 53.4913   | table       | 34.5982  | 39.7769  | cabinet      | 16.3693   | 14.0878    |
    | lamp       | 24.8081 | 7.67653   | books       | 0.928978 | 0.599711 | sofa         | 49.2354   | 57.9649    |
    ...

    into a pandas DataFrame that has 3 columns: category, AP2D, AP3D.

    Returns None when no line matches `target_line`.
    '''
    import re
    with open(file_path, 'r') as file:
        lines = file.readlines()
    for i, line in enumerate(reversed(lines)):
        is_found = re.search(f'.*{target_line}$', line)
        if is_found:
            # FIX: the original used `table = lines[-i:-i+15]`, which is only
            # correct when the match is more than 15 lines from EOF (i > 15).
            # For i <= 15 the upper bound -i+15 is non-negative and the slice is
            # empty or — for i == 0 — grabs the *head* of the file. Convert the
            # reversed index to a forward index and slice explicitly: the table
            # is the (up to) 15 lines immediately after the target line.
            pos = len(lines) - 1 - i
            table = lines[pos + 1 : pos + 16]
            tab_as_str = ' '.join(table)
            # i know this is really ugly
            df = pd.read_csv( StringIO(tab_as_str.replace(' ', '')),  # Get rid of whitespaces
                              sep='|',).dropna(axis=1, how='all').drop(0)
            # pandas mangles the duplicated column triplets to category.1 etc.;
            # split those suffixes into a second column level and stack them so
            # every (category, AP2D, AP3D) triplet becomes its own row.
            # https://stackoverflow.com/a/65884212
            df.columns = pd.MultiIndex.from_frame(df.columns.str.split('.', expand=True)
                                                  .to_frame().fillna('0'))
            df = df.stack().reset_index(level=1, drop=True).reset_index().drop('index', axis=1)
            df['AP3D'] = df['AP3D'].astype(float)
            df['AP2D'] = df['AP2D'].astype(float)

            return df

    return None


def get_config_and_filter_settings(config_file='configs/Base_Omni3D.yaml'):
    """Load the cubercnn config and derive the annotation filter settings.

    Returns (cfg, filter_settings)."""
    # we must load the config file to get the filter settings
    cfg = get_cfg()
    get_cfg_defaults(cfg)
    cfg.merge_from_file(config_file)
    # must setup logger to get info about filtered out annotations
    setup_logger(output=cfg.OUTPUT_DIR, name="cubercnn")
    filter_settings = data.get_filter_settings_from_cfg(cfg)
    return cfg, filter_settings


def init_dataloader():
    ''' dataloader stuff.
    currently not used anywhere, because I'm not sure what the difference between
    the omni3d dataset and load omni3D json functions are. this is a 3rd
    alternative to this. The train script calls something similar to this.'''
    cfg, filter_settings = get_config_and_filter_settings()

    dataset_names = ['SUNRGBD_train','SUNRGBD_val']
    dataset_paths_to_json = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names]
    for dataset_name in dataset_names:
        simple_register(dataset_name, filter_settings, filter_empty=True)

    # Get Image and annotations
    datasets = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings)
    data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings)

    thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
    dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id

    infos = datasets.dataset['info']

    # For each source dataset, record which contiguous category ids it does NOT
    # annotate, so the mapper can treat them as unknown rather than negatives.
    dataset_id_to_unknown_cats = {}
    possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1))

    dataset_id_to_src = {}

    for info in infos:
        dataset_id = info['id']
        known_category_training_ids = set()

        if not dataset_id in dataset_id_to_src:
            dataset_id_to_src[dataset_id] = info['source']

        for id in info['known_category_ids']:
            if id in dataset_id_to_contiguous_id:
                known_category_training_ids.add(dataset_id_to_contiguous_id[id])

        # determine and store the unknown categories.
        unknown_categories = possible_categories - known_category_training_ids
        dataset_id_to_unknown_cats[dataset_id] = unknown_categories

    from detectron2 import data as d2data
    NoOPaug = d2data.transforms.NoOpTransform()

    # TODO: how to load in images without having them resized?
    # data_mapper = DatasetMapper3D(cfg, augmentations=[NoOPaug], is_train=True)
    data_mapper = DatasetMapper3D(cfg, is_train=True)
    # test loader does resize images, like the train loader does
    # this is the function that filters out the invalid annotations
    data_loader = build_detection_train_loader(cfg, mapper=data_mapper, dataset_id_to_src=dataset_id_to_src, num_workers=1)
    # data_loader = build_detection_test_loader(cfg, dataset_names[1], num_workers=1)

    # this is a detectron 2 thing that we just have to do
    data_mapper.dataset_id_to_unknown_cats = dataset_id_to_unknown_cats

    for item in data_loader:
        print(item)
def vol_over_cat(dataset):
    '''
    Errorbar-style plot of box volume per object category: a marker at the mean
    volume with a vertical bar spanning an approximate 5th-95th percentile
    interval (normal approximation). Saves volume_distribution.png.
    '''
    # Ground truth from both splits (single_im=False -> whole split).
    _, _, _, dims_train, cats_train, _ = load_gt(dataset, mode='train', single_im=False)
    _, _, _, dims_test, cats_test, _ = load_gt(dataset, mode='test', single_im=False)

    output_dir = 'output/figures/' + dataset
    util.mkdir_if_missing(output_dir)

    categories = list(set(cats_train + cats_test))

    # Collect the volume (w*h*l) of every positive-volume box per category.
    volumes = {cat: [] for cat in categories}
    for cat, dims in zip(cats_train + cats_test, dims_train + dims_test):
        vol = np.prod(dims)
        if vol > 0:
            volumes[cat].append(vol)

    # Per-category mean and standard deviation of the volumes.
    keys = np.array(categories)
    means = np.array([np.mean(volumes[cat]) for cat in categories])
    errors = np.array([np.std(volumes[cat]) for cat in categories])

    # Normal-approximation interval mean + z*std for the 5th/95th percentile,
    # clipped below at zero since volumes cannot be negative.
    from scipy.stats import norm
    z_lower = norm.ppf(0.05)
    z_upper = norm.ppf(0.95)
    bounds = [(max(0, m + z_lower * s), m + z_upper * s) for m, s in zip(means, errors)]

    plt.figure(figsize=(14,5))
    for pos, (m, (lo, hi)) in enumerate(zip(means, bounds)):
        plt.vlines(x=pos, ymin=lo, ymax=hi, color='gray', linewidth=2)
        plt.plot([pos], [m], marker='o', color=color)

    plt.xticks(np.arange(len(keys)), keys, rotation=60, size=9)
    plt.xlabel('Category')
    plt.ylabel('Volume')
    plt.title('Category Distribution')
    plt.savefig(os.path.join(output_dir, 'volume_distribution.png'), dpi=300, bbox_inches='tight')
    plt.close()


def gt_stats(dataset):
    '''
    Histograms of ground-truth 3D box centers (x, y, z) and dimensions (w, h, l),
    pooled over the train and test splits. Saves center.png and dimensions.png.
    '''
    _, _, centers_train, dims_train, _, _ = load_gt(dataset, mode='train', single_im=False)
    _, _, centers_test, dims_test, _, _ = load_gt(dataset, mode='test', single_im=False)

    output_dir = 'output/figures/' + dataset
    util.mkdir_if_missing(output_dir)

    def _hist_panel(rows, labels, title, filename):
        # rows: (3, N) array; draws one count-histogram subplot per row.
        fig, axes = plt.subplots(1, len(rows), figsize=(18, 5))
        for ax, values, lab in zip(axes, rows, labels):
            ax.hist(values, color=color, bins=20)
            ax.set_xlabel(lab)
            ax.set_ylabel('Count')
        fig.suptitle(title)
        plt.savefig(os.path.join(output_dir, filename), dpi=300, bbox_inches='tight')
        plt.close()

    # (N, 3) -> (3, N); drop entries whose first coordinate is the -1 sentinel.
    centers = np.transpose(np.array(centers_train + centers_test))
    centers = centers[:, centers[0] != -1]
    _hist_panel(centers, ['x', 'y', 'z'], 'Center Distribution in Meters', 'center.png')

    dims = np.transpose(np.array(dims_train + dims_test))
    dims = dims[:, dims[0] != -1]
    _hist_panel(dims, ['w', 'h', 'l'], 'Dimensions Distribution in Meters', 'dimensions.png')
def report_figures(dataset, filter_invalid=False, output_dir='output/report_images'):
    """Render the qualitative low/high/fail loss example figures used in the
    report: for three loss groups (green: IoU/pseudo-z/projection, red:
    range/segmentation, blue: dim/pose/align) a low-loss, high-loss and
    failure-case box configuration is plotted on the same image (idx 352)."""
    # Create Output Directory (one subfolder per figure variant)
    util.mkdir_if_missing(output_dir)
    util.mkdir_if_missing(output_dir+'/low_green')
    util.mkdir_if_missing(output_dir+'/high_green')
    util.mkdir_if_missing(output_dir+'/fail_green')
    util.mkdir_if_missing(output_dir+'/low_red')
    util.mkdir_if_missing(output_dir+'/high_red')
    util.mkdir_if_missing(output_dir+'/fail_red')
    util.mkdir_if_missing(output_dir+'/low_blue')
    util.mkdir_if_missing(output_dir+'/high_blue')
    util.mkdir_if_missing(output_dir+'/fail_blue')

    # Load Image and Ground Truths
    image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, filter=filter_invalid, img_idx=352)

    # The first annotation of this image is not used; keep the rest.
    gt_center = center_cams[1:]
    gt_dim = dimensions_all[1:]
    gt_Rs = Rs[1:]
    cats = cats[1:]
    gt_bb = bboxes[1:]

    # Make low loss boxes for IoU, ps. z and proj — the unmodified GT box.
    center = gt_center[-1]
    dim = gt_dim[-1]
    R = gt_Rs[-1]
    cat = cats[-1]
    bb = gt_bb[-1]
    plot_scene(image['file_path'], output_dir+'/low_green', [center], [dim], [R], image['K'], [cat], [bb])

    # Make high loss boxes for IoU, ps. z and proj — GT pushed 3 m deeper in z.
    center = [gt_center[-1][0],gt_center[-1][1],gt_center[-1][2]+3]
    dim = gt_dim[-1]
    R = gt_Rs[-1]
    cat = cats[-1]
    bb = gt_bb[-1]
    plot_scene(image['file_path'], output_dir+'/high_green', [center], [dim], [R], image['K'], [cat], [bb])

    # Make fail loss boxes for IoU, ps. z and proj — thin rotated sliver.
    center = [gt_center[-1][0]-0.03,gt_center[-1][1],gt_center[-1][2]]
    dim = [0.05,0.71,0.05]
    R = util.euler2mat(np.array([0,0,45]))
    cat = cats[-1]
    bb = gt_bb[-1]
    plot_scene(image['file_path'], output_dir+'/fail_green', [center], [dim], [R], image['K'], [cat], [bb])

    # Make low loss boxes for range and seg
    center = gt_center[0]
    dim = gt_dim[0]
    R = gt_Rs[0]
    cat = cats[0]
    bb = gt_bb[0]
    plot_scene(image['file_path'], output_dir+'/low_red', [center], [dim], [R], image['K'], [cat], [bb])

    # Make high loss boxes for range and seg — shifted and badly proportioned.
    center = [gt_center[0][0],gt_center[0][1]+0.3,gt_center[0][2]]
    dim = [gt_dim[0][0]+1.5,gt_dim[0][1]-0.6,gt_dim[0][2]]
    R = gt_Rs[0]
    cat = cats[0]
    bb = gt_bb[0]
    plot_scene(image['file_path'], output_dir+'/high_red', [center], [dim], [R], image['K'], [cat], [bb])

    # Make fail loss boxes for range and seg — wrong rotation/category pairing.
    center = [gt_center[0][0]+0.25,gt_center[0][1],gt_center[0][2]]
    dim = [gt_dim[0][0]+0.7,gt_dim[0][1],gt_dim[0][2]]
    R = gt_Rs[-1]
    cat = cats[-1]
    bb = gt_bb[-1]
    plot_scene(image['file_path'], output_dir+'/fail_red', [center], [dim], [R], image['K'], [cat], [bb])

    # Make low loss boxes for dim, pose and align — slightly inflated first box.
    center = gt_center[1:]
    dim = [[gt_dim[1][0]*1.5,gt_dim[1][1],gt_dim[1][2]*1.5], gt_dim[2]]
    R = gt_Rs[1:]
    cat = cats[1:]
    bb = gt_bb[1:]
    plot_scene(image['file_path'], output_dir+'/low_blue', center, dim, R, image['K'], cat, bb)

    # Make high loss boxes for dim, pose and align — +/-20 degree pose error.
    center = gt_center[1:]
    dim = gt_dim[1:]
    R = [util.euler2mat(util.mat2euler(np.array(gt_Rs[1]))+[20,0,0]), util.euler2mat(util.mat2euler(np.array(gt_Rs[2]))+[-20,0,0])]
    cat = cats[1:]
    bb = gt_bb[1:]
    plot_scene(image['file_path'], output_dir+'/high_blue', center, dim, R, image['K'], cat, bb)

    # Make fail loss boxes for dim, pose and align — swapped w/h on second box.
    center = gt_center[1:]
    dim = [[gt_dim[1][0],gt_dim[1][1],gt_dim[1][2]],[gt_dim[2][1],gt_dim[2][0],gt_dim[2][2]]]
    R = [util.euler2mat(util.mat2euler(np.array(gt_Rs[1]))+[1,0,0]), util.euler2mat(util.mat2euler(np.array(gt_Rs[2]))+[1,0,0])]
    cat = cats[1:]
    bb = gt_bb[1:]
    plot_scene(image['file_path'], output_dir+'/fail_blue', center, dim, R, image['K'], cat, bb)

    return True
def gt_stats_in_terms_of_sigma(dataset):
    '''
    Histograms of GT centers and dimensions, with the dimension axes expressed
    in units of the per-axis standard deviation; also overlays a fitted normal
    pdf and an illustrative "pred" line at 1.4 m for the loss figure.
    Saves center.png and dimensions_sigma.png.
    '''
    # Load Image and Ground Truths (both splits)
    image, Rs, center_cams, dimensions_all, cats, bboxes = load_gt(dataset, mode='train', single_im=False)
    image_t, Rs_t, center_cams_t, dimensions_all_t, cats_t, bboxes = load_gt(dataset, mode='test', single_im=False)

    output_dir = 'output/figures/' + dataset
    util.mkdir_if_missing(output_dir)

    # histogram of centers
    center_all = center_cams + center_cams_t
    center_all = np.transpose(np.array(center_all))

    # Filter -1 annotations (sentinel in the first coordinate)
    valid_columns = center_all[0] != -1
    center_all = center_all[:,valid_columns]

    x_label = ['x', 'y', 'z']
    fig, axes = plt.subplots(1, len(center_all), figsize=(18, 5))
    for i in range(len(center_all)):
        axes[i].hist(center_all[i], color=color, bins=20)
        axes[i].set_xlabel(x_label[i])
        axes[i].set_ylabel('Count')
    fig.suptitle('Center Distribution in Meters')
    # NOTE(review): overwrites the center.png also produced by gt_stats().
    plt.savefig(os.path.join(output_dir, 'center.png'), dpi=300, bbox_inches='tight')
    plt.close()

    # histogram of dimensions
    dimensions_all = dimensions_all + dimensions_all_t
    dimensions_all = np.transpose(np.array(dimensions_all))

    # Filter -1 annotations
    valid_columns = dimensions_all[0] != -1
    dimensions_all = dimensions_all[:,valid_columns]

    x_label = ['w', 'h', 'l']
    fig, axes = plt.subplots(1, len(dimensions_all), figsize=(18, 5))
    for i in range(len(dimensions_all)):
        axes[i].hist(dimensions_all[i], color=color, bins=20, density=True)

        # Plot normal distribution fitted to this axis.
        mu, sigma = np.mean(dimensions_all[i]), np.std(dimensions_all[i])
        x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 100)
        axes[i].plot(x, stats.norm.pdf(x, mu, sigma))
        y_lim = axes[i].get_ylim()[1]
        # FIX: label strings changed to raw strings — '$\sigma$' contains the
        # invalid escape '\s', a SyntaxWarning on Python 3.12+ (same value).
        axes[i].vlines(mu+sigma, 0, y_lim, linestyle='--', label=r'$\sigma$', color='gray')
        axes[i].vlines(mu-sigma, 0, y_lim, linestyle='--', label=r'$\sigma$', color='gray')
        axes[i].vlines(1.4, 0, y_lim, linestyle='--', color='red', label='pred')
        if i != 0:
            # Illustrates the loss as the gap between mu+sigma and the pred line.
            axes[i].plot((mu+sigma,1.4), (y_lim/2,y_lim/2), color='c', label='loss')
        axes[i].set_xlabel(x_label[i])
        axes[i].set_ylabel('density')
        # Set xticks in terms of sigma
        xticks = [mu - 3 * sigma, mu - 2 * sigma, mu - sigma, mu, mu + sigma, mu + 2 * sigma, mu + 3 * sigma, mu + 4 * sigma, mu + 5 * sigma, mu + 6 * sigma]
        xticklabels = [r'-3$\sigma$', r'-2$\sigma$', r'-$\sigma$', '0', r'$\sigma$', r'$2\sigma$', r'$3\sigma$', r'$4\sigma$', r'$5\sigma$', r'$6\sigma$']
        axes[i].set_xticks(xticks)
        axes[i].set_xticklabels(xticklabels)
    axes[-1].legend()
    fig.suptitle('Dimensions Distribution in Meters')
    plt.savefig(os.path.join(output_dir, 'dimensions_sigma.png'), dpi=300, bbox_inches='tight')
    plt.close()

    return True
def parallel_coordinate_plot(dataset='SUNRGBD', files:list=['output/Baseline_sgd/log.txt','output/omni_equalised/log.txt','output/omni_pseudo_gt/log.txt','output/proposal_AP/log.txt','output/exp_10_iou_zpseudogt_dims_depthrange_rotalign_ground/log.txt']):
    '''Search the log file for the precision numbers corresponding to the last iteration
    then parse it in as a pd.DataFrame and plot the AP vs number of classes.

    Produces a plotly parallel-coordinates plot with one axis per model (AP3D)
    plus a category axis, written to output/figures/SUNRGBD/.
    '''
    import plotly.graph_objects as go

    # df with each model as a column and performance for each class as rows
    # search the file from the back until the line
    # cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:
    # is found
    target_line = "cubercnn.vis.logperf INFO: Performance for each of 38 categories on SUNRGBD_test:"
    model_names = ['Base Cube R-CNN', 'Time-eq.', 'Pseudo GT', 'Proposal', 'Weak loss']
    df = []
    for file, model_name in zip(files, model_names):
        # Keep only AP3D per model; the column is named after the model.
        df_i = search_file_backwards(file, target_line).drop(['AP2D'], axis=1).rename(columns={'AP3D':model_name})
        assert df_i is not None, 'df not found'
        df.append(df_i)
    # merge df's
    df = reduce(lambda x, y: pd.merge(x, y, on = 'category'), df)
    # sort df by ap3d of model 1
    df = df.sort_values(by='Base Cube R-CNN', ascending=False)
    # encode each category as a number (descending so best category is on top)
    df['category_num'] = list(reversed([i for i in range(len(df))]))

    # https://plotly.com/python/parallel-coordinates-plot/
    fig = go.Figure(data=
        go.Parcoords(
            # Line colour follows the baseline's AP3D (second column).
            line = dict(color = df.iloc[:, 1],
                        # colorscale = [[0,'purple'],[0.5,'lightseagreen'],[1,'gold']]),
                        colorscale = 'Viridis'),
            visible = True,
            dimensions = list([
                dict(tickvals = df['category_num'],
                     ticktext = df['category'],
                     label = 'Categories', values = df['category_num']),
                # Baseline axis has a wider range and a default brush selection.
                dict(range = [0,70],
                     constraintrange = [5,70],
                     label = model_names[0], values = df[model_names[0]]),
                dict(range = [0,40],
                     label = model_names[2], values = df[model_names[2]]),
                dict(range = [0,40],
                     label = model_names[4], values = df[model_names[4]]),
                dict(range = [0,40],
                     label = model_names[1], values = df[model_names[1]]),
                dict(range = [0,40],
                     label = model_names[3], values = df[model_names[3]]),
            ]),
        )
    )

    fig.update_layout(
        plot_bgcolor = 'white',
        paper_bgcolor = 'white',
        title={
            'text': "AP3D per category for each model",
            'y':0.96,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        margin=dict(l=65, r=25, t=80, b=5)
    )
    # static image export needs kaleido: pip install --upgrade "kaleido==0.1.*"
    fig.write_image('output/figures/SUNRGBD/parallel_coordinate_plot.png', scale=3, format='png')
    # fig.show()
if __name__ == '__main__':
    # Entry point: un-comment the analyses to (re)generate the wanted figures.
    # show_data('SUNRGBD', filter_invalid=False, output_dir='output/playground/no_filter') #{SUNRGBD,ARKitScenes,KITTI,nuScenes,Objectron,Hypersim}
    # show_data('SUNRGBD', filter_invalid=True, output_dir='output/playground/with_filter') #{SUNRGBD,ARKitScenes,KITTI,nuScenes,Objectron,Hypersim}
    # _ = category_distribution('SUNRGBD')
    AP_vs_no_of_classes('SUNRGBD')
    #spatial_statistics('SUNRGBD')
    # AP3D_vs_AP2D('SUNRGBD')
    # AP3D_vs_AP2D('SUNRGBD', mode='log')
    # init_dataloader()
    # vol_over_cat('SUNRGBD')
    # gt_stats('SUNRGBD')
    # gt_stats_in_terms_of_sigma('SUNRGBD')
    #gt_stats('SUNRGBD')

    # report_figures('SUNRGBD')

    parallel_coordinate_plot()
import numpy as np
import gradio as gr
import os
import sys
import numpy as np  # NOTE(review): duplicate of the import above
import torch

from detectron2.checkpoint import DetectionCheckpointer
from detectron2.config import get_cfg
from detectron2.data import transforms as T

sys.path.append(os.getcwd())
np.set_printoptions(suppress=True)

from cubercnn.config import get_cfg_defaults
from cubercnn.modeling.meta_arch import build_model
from cubercnn.modeling.backbone import build_dla_from_vision_fpn_backbone  # this must be here even though it is not used

from cubercnn import util, vis


def do_test(im, threshold, model_str):
    """Run Weak Cube R-CNN on one image for the gradio demo.

    Args:
        im: input image from gradio (assumed HxWx3 array — TODO confirm dtype/layout).
        threshold: confidence threshold; detections below it are skipped.
        model_str: unused; bound to the hidden model-name textbox in the UI.

    Returns:
        (im_drawn_rgb, im_topdown): image with blended 3D boxes and a top-down
        view, or (im, None) when nothing is detected, or (None, None) when no
        image was provided.
    """
    if im is None:
        return None, None
    # NOTE(review): the model is rebuilt and reloaded on every request;
    # load_model_config is defined in the __main__ block below — confirm the
    # module is only ever run as a script.
    model = load_model_config()

    model.eval()

    thres = threshold

    min_size = 512
    max_size = 4096
    augmentations = T.AugmentationList([T.ResizeShortestEdge(min_size, max_size, "choice")])

    category_path = 'configs/category_meta.json'

    # store locally if needed
    if category_path.startswith(util.CubeRCNNHandler.PREFIX):
        category_path = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, category_path)

    metadata = util.load_json(category_path)
    cats = metadata['thing_classes']

    image_shape = im.shape[:2]  # h, w

    h, w = image_shape

    # Synthesize intrinsics: focal length from a fixed NDC focal of 4.0,
    # principal point at the image centre (no real calibration available).
    focal_length_ndc = 4.0
    focal_length = focal_length_ndc * h / 2

    px, py = w/2, h/2

    K = np.array([
        [focal_length, 0.0, px],
        [0.0, focal_length, py],
        [0.0, 0.0, 1.0]
    ])

    # dummy
    aug_input = T.AugInput(im)
    tfms = augmentations(aug_input)
    image = aug_input.image
    # model.to(device)
    batched = [{
        'image': torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))),
        'height': image_shape[0], 'width': image_shape[1], 'K': K
    }]
    with torch.no_grad():
        dets = model(batched)[0]['instances']

    n_det = len(dets)

    meshes = []
    meshes_text = []

    if n_det > 0:
        for idx, (corners3D, center_cam, center_2D, dimensions, pose, score, cat_idx) in enumerate(zip(
            dets.pred_bbox3D, dets.pred_center_cam, dets.pred_center_2D, dets.pred_dimensions,
            dets.pred_pose, dets.scores, dets.pred_classes
        )):

            # skip low-confidence detections
            if score < thres:
                continue

            cat = cats[cat_idx]

            bbox3D = center_cam.tolist() + dimensions.tolist()
            meshes_text.append('{} {:.2f}'.format(cat, score))
            color = [c/255.0 for c in util.get_color(idx)]
            box_mesh = util.mesh_cuboid(bbox3D, pose.tolist(), color=color)
            meshes.append(box_mesh)

    # print('File with {} dets'.format(len(meshes)))

    if len(meshes) > 0:
        im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(im, K, meshes, text=meshes_text, scale=im.shape[0], blend_weight=0.5, blend_weight_overlay=0.85)
        im_drawn_rgb, im_topdown = im_drawn_rgb.astype(np.uint8), im_topdown.astype(np.uint8)
    else:
        im_drawn_rgb, im_topdown = im.astype(np.uint8), None
    return im_drawn_rgb, im_topdown


def setup(config_file):
    """
    Create configs and perform basic setups.
    """
    cfg = get_cfg()
    get_cfg_defaults(cfg)

    # store locally if needed
    if config_file.startswith(util.CubeRCNNHandler.PREFIX):
        config_file = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, config_file)

    cfg.merge_from_file(config_file)
    cfg.freeze()
    return cfg
+ """ + cfg = get_cfg() + get_cfg_defaults(cfg) + + # store locally if needed + if config_file.startswith(util.CubeRCNNHandler.PREFIX): + config_file = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, config_file) + + cfg.merge_from_file(config_file) + cfg.freeze() + return cfg + +def main(config_file, weigths=None): + cfg = setup(config_file) + model = build_model(cfg) + + DetectionCheckpointer(model).resume_or_load( + weigths, resume=True + ) + return cfg, model + + +if __name__ == "__main__": + def load_model_config(): + config_file = "configs/Omni_combined.yaml" + MODEL_WEIGHTS = "output/weak_cube_r-cnn/model_final.pth" + cfg, model = main(config_file, MODEL_WEIGHTS) + return model + + title = 'Weak Cube R-CNN' + description = "This showcases the different our model [[`Weak Cube RCNN`](https://arxiv.org/abs/2504.13297). To create Weak Cube RCNN, we modify the framework by replacing its 3D loss functions with ones based solely on 2D annotations. Our methods rely heavily on external, strong generalised deep learning models to infer spatial information in scenes. Experimental results show that all models perform comparably to an annotation time-equalised Cube R-CNN, whereof the pseudo ground truth method achieves the highest accuracy. The results show the methods' ability to understand scenes in 3D, providing satisfactory visual results. Although not precise enough for centimetre accurate measurements, the method provide a solid foundation for further research. 
\n Check out the code on [GitHub](https://github.com/AndreasLH/Weak-Cube-R-CNN)" + + + demo = gr.Interface( + title=title, + fn=do_test, + inputs=[ + gr.Image(label="Input Image"), + gr.Slider(0, 1, value=0.25, label="Threshold", info="Only show predictions with a confidence above this threshold"), + gr.Textbox(value="Weak Cube R-CNN", visible=False, render=False) + ], + outputs=[gr.Image(label="Predictions"), gr.Image(label="Top view")], + description=description, + allow_flagging='never', + examples=[["datasets/examples/ex2.jpg"],[],[],["datasets/examples/ex1.jpg"]], + ) + + + # demo.launch(server_name="0.0.0.0", server_port=7860) + demo.launch() \ No newline at end of file diff --git a/configs/Base.yaml b/configs/Base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d45a91b15c2a592873483d642e99bdf4481fc5a9 --- /dev/null +++ b/configs/Base.yaml @@ -0,0 +1,89 @@ +SOLVER: + TYPE: "sgd" + IMS_PER_BATCH: 32 + BASE_LR: 0.02 + STEPS: (19200, 25600) + MAX_ITER: 32000 + WEIGHT_DECAY: 0.0001 + LR_SCHEDULER_NAME: "WarmupMultiStepLR" +INPUT: + MIN_SIZE_TRAIN: (256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512, 528, 544, 560, 576, 592, 608, 624, 640,) + MIN_SIZE_TEST: 512 + MAX_SIZE_TRAIN: 4096 + MAX_SIZE_TEST: 4096 +TEST: + VISIBILITY_THRES: 0.33333333 + TRUNCATION_THRES: 0.33333333 + EVAL_PERIOD: 16000 +DATASETS: + TRAIN: ('KITTI_train', 'KITTI_val') + TEST: ('KITTI_test',) + CATEGORY_NAMES: ('pedestrian', 'car', 'cyclist', 'van', 'truck', 'tram', 'person') + IGNORE_NAMES: "['dontcare', 'ignore', 'void']" + MIN_HEIGHT_THRES: 0.05 + TRUNCATION_THRES: 0.75 + VISIBILITY_THRES: 0.25 + TRUNC_2D_BOXES: True +VIS_PERIOD: 640 +DATALOADER: + SAMPLER_TRAIN: "RepeatFactorTrainingSampler" + REPEAT_THRESHOLD: 0.1 +MODEL: + PIXEL_MEAN: [103.530, 116.280, 123.675] + PIXEL_STD: [57.375, 57.120, 58.395] + META_ARCHITECTURE: "RCNN3D" + MASK_ON: False + STABILIZE: 0.02 + USE_BN: True + BACKBONE: + FREEZE_AT: 0 + NAME: 
'build_dla_from_vision_fpn_backbone' + DLA: + TYPE: 'dla34' + FPN: + IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] + ANCHOR_GENERATOR: + SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map + ASPECT_RATIOS: [[0.5, 1.0, 2.0]] # Three aspect ratios (same for all in feature maps) + RPN: + HEAD_NAME: "StandardRPNHead" + IN_FEATURES: ['p2', 'p3', 'p4', 'p5', 'p6'] + PRE_NMS_TOPK_TRAIN: 2000 # Per FPN level + PRE_NMS_TOPK_TEST: 1000 # Per FPN level + POST_NMS_TOPK_TRAIN: 1000 + POST_NMS_TOPK_TEST: 1000 + BOUNDARY_THRESH: -1 + OBJECTNESS_UNCERTAINTY: "IoUness" + IOU_THRESHOLDS: [0.05, 0.05] + POSITIVE_FRACTION: 1.0 + PROPOSAL_GENERATOR: + NAME: "RPNWithIgnore" + ROI_HEADS: + NAME: "ROIHeads3D" + IN_FEATURES: ["p2", "p3", "p4", "p5", 'p6'] + BATCH_SIZE_PER_IMAGE: 512 + SCORE_THRESH_TEST: 0.01 + NUM_CLASSES: 43 + ROI_BOX_HEAD: + NAME: "FastRCNNConvFCHead" + NUM_FC: 2 + POOLER_RESOLUTION: 7 + ROI_CUBE_HEAD: + NAME: 'CubeHead' + Z_TYPE: 'direct' + POSE_TYPE: '6d' + NUM_FC: 2 + SHARED_FC: True + USE_CONFIDENCE: 1.0 + LOSS_W_3D: 1.0 + POOLER_TYPE: 'ROIAlignV2' + POOLER_RESOLUTION: 7 + DISENTANGLED_LOSS: True + ALLOCENTRIC_POSE: True + VIRTUAL_FOCAL: 512.0 + VIRTUAL_DEPTH: True + CHAMFER_POSE: True + TEST: 'blasss' + DIMS_PRIORS_ENABLED: True + DIMS_PRIORS_PRECOMPUTED: False +VERSION: 2 \ No newline at end of file diff --git a/configs/Base_Omni3D.yaml b/configs/Base_Omni3D.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5268a95caeba31c2034403d9e9e75d8ea165073d --- /dev/null +++ b/configs/Base_Omni3D.yaml @@ -0,0 +1,18 @@ +_BASE_: "Base.yaml" +SOLVER: + TYPE: "sgd" + IMS_PER_BATCH: 2 #196 -> r=5,6 -> because of dataset size r=5,6 * 10,335/233 = 0,248 + BASE_LR: 0.0214 #0.12 + STEPS: (17280, 23040) + MAX_ITER: 100000 #116000 + WARMUP_ITERS: 0 #3625 +TEST: + EVAL_PERIOD: 7200 #29000 +VIS_PERIOD: 1 #2320 +DATASETS: + TRAIN: ('SUNRGBD_train_mini', 'SUNRGBD_val_mini') + TEST: ('SUNRGBD_test_mini',) + CATEGORY_NAMES: ('chair', 'table', 
'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') +MODEL: + ROI_HEADS: + NUM_CLASSES: 50 \ No newline at end of file diff --git a/configs/Base_Omni3D_2D_only.yaml b/configs/Base_Omni3D_2D_only.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1f31d438e388dc150f1e07b797131db8e3402c2e --- /dev/null +++ b/configs/Base_Omni3D_2D_only.yaml @@ -0,0 +1,20 @@ +_BASE_: "Base.yaml" +SOLVER: + TYPE: "sgd" + IMS_PER_BATCH: 6 #196 -> r=5,6 -> because of dataset size r=5,6 * 10,335/233 = 0,248 + BASE_LR: 0.0214 #0.12 + STEPS: (30000, 40000) + MAX_ITER: 50000 #116000 + WARMUP_ITERS: 0 #3625 +TEST: + EVAL_PERIOD: 25000 #29000 +VIS_PERIOD: 50000 #2320 +DATASETS: + TRAIN: ('SUNRGBD_train', 'SUNRGBD_val') + TEST: ('SUNRGBD_test',) + CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') +MODEL: + ROI_HEADS: + NUM_CLASSES: 50 + ROI_CUBE_HEAD: + LOSS_W_3D: 0.0 \ No newline at end of file diff --git a/configs/Base_Omni3D_in.yaml b/configs/Base_Omni3D_in.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..bb4c33e61509455a9fe90e3f022e3583e6cea86a --- /dev/null +++ b/configs/Base_Omni3D_in.yaml @@ -0,0 +1,18 @@ +_BASE_: "Base.yaml" +SOLVER: + TYPE: "sgd" + IMS_PER_BATCH: 128 + BASE_LR: 0.08 + STEPS: (69600, 92800) + MAX_ITER: 116000 + WARMUP_ITERS: 3625 +TEST: + EVAL_PERIOD: 29000 +VIS_PERIOD: 2320 +DATASETS: + TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val') + TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test') + CATEGORY_NAMES: ('stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet') +MODEL: + ROI_HEADS: + NUM_CLASSES: 38 \ No newline at end of file diff --git a/configs/Base_Omni3D_og.yaml b/configs/Base_Omni3D_og.yaml new file mode 100644 index 0000000000000000000000000000000000000000..261613c63c61eae62967f6b9e4ad01160a28212e --- /dev/null +++ b/configs/Base_Omni3D_og.yaml @@ -0,0 +1,18 @@ +_BASE_: "Base.yaml" +SOLVER: + TYPE: "sgd" + IMS_PER_BATCH: 192 + BASE_LR: 0.12 + STEPS: (69600, 92800) + MAX_ITER: 116000 + WARMUP_ITERS: 3625 +TEST: + EVAL_PERIOD: 29000 +VIS_PERIOD: 2320 +DATASETS: + TRAIN: ('SUNRGBD_train', 'SUNRGBD_val', 'Hypersim_train', 'Hypersim_val', 'ARKitScenes_train', 'ARKitScenes_val', 'Objectron_train', 'Objectron_val', 'nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') + TEST: ('SUNRGBD_test', 'Hypersim_test', 'ARKitScenes_test', 'Objectron_test', 'KITTI_test', 'nuScenes_test') + CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 
'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') +MODEL: + ROI_HEADS: + NUM_CLASSES: 50 \ No newline at end of file diff --git a/configs/Base_Omni3D_out.yaml b/configs/Base_Omni3D_out.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c957e04ff0672bbaf16a7f5eec8ddbeef64dd672 --- /dev/null +++ b/configs/Base_Omni3D_out.yaml @@ -0,0 +1,18 @@ +_BASE_: "Base.yaml" +SOLVER: + TYPE: "sgd" + IMS_PER_BATCH: 32 + BASE_LR: 0.02 + STEPS: (69600, 92800) + MAX_ITER: 116000 + WARMUP_ITERS: 3625 +TEST: + EVAL_PERIOD: 29000 +VIS_PERIOD: 2320 +DATASETS: + TRAIN: ('nuScenes_train', 'nuScenes_val', 'KITTI_train', 'KITTI_val') + TEST: ('nuScenes_test', 'KITTI_test') + CATEGORY_NAMES: ('cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 'truck', 'van', 'traffic cone', 'bicycle') +MODEL: + ROI_HEADS: + NUM_CLASSES: 11 \ No newline at end of file diff --git a/configs/Base_Omni3D_prof.yaml b/configs/Base_Omni3D_prof.yaml new file mode 100644 index 0000000000000000000000000000000000000000..98f5de6a7489fb886467ed7f937830637f494c2f --- /dev/null +++ b/configs/Base_Omni3D_prof.yaml @@ -0,0 +1,18 @@ +_BASE_: "Base.yaml" +SOLVER: + TYPE: "sgd" + IMS_PER_BATCH: 2 #196 -> r=5,6 -> because of dataset size r=5,6 * 10,335/233 = 0,248 + BASE_LR: 0.001224489796 #0.12 + STEPS: (172, 230) + MAX_ITER: 288 #116000 + WARMUP_ITERS: 9 #3625 +TEST: + EVAL_PERIOD: 72 #29000 +VIS_PERIOD: 6 #2320 +DATASETS: + TRAIN: ('SUNRGBD_train', 'SUNRGBD_val') + TEST: ('SUNRGBD_test',) + CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 
'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') +MODEL: + ROI_HEADS: + NUM_CLASSES: 50 \ No newline at end of file diff --git a/configs/Omni_combined.yaml b/configs/Omni_combined.yaml new file mode 100644 index 0000000000000000000000000000000000000000..a6aff4cc0972942826ca21d6baff927b7a853d2a --- /dev/null +++ b/configs/Omni_combined.yaml @@ -0,0 +1,37 @@ +_BASE_: "Base.yaml" +SOLVER: + TYPE: "sgd" + IMS_PER_BATCH: 25 + BASE_LR: 0.015 + STEPS: (35000, 40000) + MAX_ITER: 42001 + WARMUP_ITERS: 0 + CHECKPOINT_PERIOD: 1000 +TEST: + EVAL_PERIOD: 100000 +VIS_PERIOD: 1000 +DATASETS: + TRAIN: ('SUNRGBD_train', 'SUNRGBD_val') #, 'KITTI_train_mini', 'KITTI_val_mini') + TEST: ('SUNRGBD_test',) # 'KITTI_test_mini') + CATEGORY_NAMES: ('chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin') +MODEL: + DEVICE: 'cpu' + DEPTH_ON: False #whether to use the depth anything concated features # if do not use this, then we can use ["p2", "p3", "p4", "p5", "p6"], [[32], [64], [128], [256], [512]], otherwise only ["p2", "p3", "p4", "p5"], [[32], [64], [128], [256]] + FPN: + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + RPN: + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + ANCHOR_GENERATOR: + SIZES: [[32], [64], [128], [256], [512]] # One size for each in feature map + ROI_HEADS: + NAME: 
'ROIHeads3DScore' # name of the class that is the 3d predictor + IN_FEATURES: ["p2", "p3", "p4", "p5", "p6"] + NUM_CLASSES: 50 + POSITIVE_FRACTION: 0.25 # we can use this to control the ratio of positive to negative sampled cubes in + ROI_CUBE_HEAD: + NAME: 'CubeHead' # name of the 3d head + DIMS_PRIORS_ENABLED: True + POOLER_TYPE: 'ROIAlignV2' + POOLER_RESOLUTION: 7 + LOSS_W_3D: 1.0 + META_ARCHITECTURE: 'RCNN3D_combined_features' # name of the overall arch that calls the ROI_HEADS.NAME and ROI_CUBE_HEAD.NAME \ No newline at end of file diff --git a/configs/category_meta.json b/configs/category_meta.json new file mode 100644 index 0000000000000000000000000000000000000000..57092151cd2b98ae75a993fe40b8e940664882dc --- /dev/null +++ b/configs/category_meta.json @@ -0,0 +1 @@ +{"thing_classes": ["pedestrian", "car", "cyclist", "van", "truck", "traffic cone", "barrier", "motorcycle", "bicycle", "bus", "trailer", "books", "bottle", "camera", "cereal box", "chair", "cup", "laptop", "shoes", "towel", "blinds", "window", "lamp", "shelves", "mirror", "sink", "cabinet", "bathtub", "door", "toilet", "desk", "box", "bookcase", "picture", "table", "counter", "bed", "night stand", "pillow", "sofa", "television", "floor mat", "curtain", "clothes", "stationery", "refrigerator", "bin", "stove", "oven", "machine"], "thing_dataset_id_to_contiguous_id": {"0": 0, "1": 1, "3": 2, "4": 3, "5": 4, "8": 5, "9": 6, "10": 7, "11": 8, "12": 9, "13": 10, "14": 11, "15": 12, "16": 13, "17": 14, "18": 15, "19": 16, "20": 17, "21": 18, "22": 19, "23": 20, "24": 21, "25": 22, "26": 23, "27": 24, "28": 25, "29": 26, "30": 27, "31": 28, "32": 29, "33": 30, "34": 31, "35": 32, "36": 33, "37": 34, "38": 35, "39": 36, "40": 37, "42": 38, "43": 39, "44": 40, "45": 41, "46": 42, "47": 43, "48": 44, "49": 45, "52": 46, "53": 47, "57": 48, "61": 49}} \ No newline at end of file diff --git a/configs/cubercnn_DLA34_FPN.yaml b/configs/cubercnn_DLA34_FPN.yaml new file mode 100644 index 
0000000000000000000000000000000000000000..3d851cb7cb609fe35a45e1c007ac21f621671237 --- /dev/null +++ b/configs/cubercnn_DLA34_FPN.yaml @@ -0,0 +1,6 @@ +_BASE_: "Base_Omni3D.yaml" +MODEL: + BACKBONE: + NAME: 'build_dla_from_vision_fpn_backbone' + DLA: + TYPE: 'dla34' \ No newline at end of file diff --git a/configs/cubercnn_ResNet34_FPN.yaml b/configs/cubercnn_ResNet34_FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..139c7a06eb41d0f24602a51aae2af35ac6bba043 --- /dev/null +++ b/configs/cubercnn_ResNet34_FPN.yaml @@ -0,0 +1,7 @@ +_BASE_: "Base_Omni3D.yaml" +MODEL: + BACKBONE: + NAME: 'build_resnet_from_vision_fpn_backbone' + RESNETS: + DEPTH: 34 + TORCHVISION: True \ No newline at end of file diff --git a/configs/cubercnn_densenet_FPN.yaml b/configs/cubercnn_densenet_FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..5c1f4c8ba36d6efd5f505ddefd1f0505e15bfc0d --- /dev/null +++ b/configs/cubercnn_densenet_FPN.yaml @@ -0,0 +1,4 @@ +_BASE_: "Base_Omni3D.yaml" +MODEL: + BACKBONE: + NAME: 'build_densenet_fpn_backbone' \ No newline at end of file diff --git a/configs/cubercnn_mnasnet_FPN.yaml b/configs/cubercnn_mnasnet_FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..27808a8a7801b777cb5c5144ec619396b3c19010 --- /dev/null +++ b/configs/cubercnn_mnasnet_FPN.yaml @@ -0,0 +1,4 @@ +_BASE_: "Base_Omni3D.yaml" +MODEL: + BACKBONE: + NAME: 'build_mnasnet_fpn_backbone' \ No newline at end of file diff --git a/configs/cubercnn_shufflenet_FPN.yaml b/configs/cubercnn_shufflenet_FPN.yaml new file mode 100644 index 0000000000000000000000000000000000000000..02ecc723b8fe2f15d67dc76370faa5801caffe38 --- /dev/null +++ b/configs/cubercnn_shufflenet_FPN.yaml @@ -0,0 +1,4 @@ +_BASE_: "Base_Omni3D.yaml" +MODEL: + BACKBONE: + NAME: 'build_shufflenet_fpn_backbone' \ No newline at end of file diff --git a/cubercnn/config/__init__.py b/cubercnn/config/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..d085c3a958591550fc45d3f1347b7295b4425b70 --- /dev/null +++ b/cubercnn/config/__init__.py @@ -0,0 +1 @@ +from .config import * \ No newline at end of file diff --git a/cubercnn/config/config.py b/cubercnn/config/config.py new file mode 100644 index 0000000000000000000000000000000000000000..0af7b9ad8ad5b327203c012d9f50a2c87f247b9a --- /dev/null +++ b/cubercnn/config/config.py @@ -0,0 +1,187 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +from detectron2.config import CfgNode as CN + +def get_cfg_defaults(cfg): + + # A list of category names which will be used + cfg.DATASETS.CATEGORY_NAMES = [] + + # The category names which will be treated as ignore + # e.g., not counting as background during training + # or as false positives during evaluation. + cfg.DATASETS.IGNORE_NAMES = [] + + # Should the datasets appear with the same probabilty + # in batches (e.g., the imbalance from small and large + # datasets will be accounted for during sampling) + cfg.DATALOADER.BALANCE_DATASETS = False + + # The thresholds for when to treat a known box + # as ignore based on too heavy of truncation or + # too low of visibility in the image. This affects + # both training and evaluation ignores. + cfg.DATASETS.TRUNCATION_THRES = 0.99 + cfg.DATASETS.VISIBILITY_THRES = 0.01 + cfg.DATASETS.MIN_HEIGHT_THRES = 0.00 + cfg.DATASETS.MAX_DEPTH = 1e8 + + # Whether modal 2D boxes should be loaded, + # or if the full 3D projected boxes should be used. + cfg.DATASETS.MODAL_2D_BOXES = False + + # Whether truncated 2D boxes should be loaded, + # or if the 3D full projected boxes should be used. 
+ cfg.DATASETS.TRUNC_2D_BOXES = True + + # Threshold used for matching and filtering boxes + # inside of ignore regions, within the RPN and ROIHeads + cfg.MODEL.RPN.IGNORE_THRESHOLD = 0.5 + + # Configuration for cube head + cfg.MODEL.ROI_CUBE_HEAD = CN() + cfg.MODEL.ROI_CUBE_HEAD.NAME = "CubeHead" + cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION = 7 + cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO = 0 + cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE = "ROIAlignV2" + + # Settings for the cube head features + cfg.MODEL.ROI_CUBE_HEAD.NUM_CONV = 0 + cfg.MODEL.ROI_CUBE_HEAD.CONV_DIM = 256 + cfg.MODEL.ROI_CUBE_HEAD.NUM_FC = 2 + cfg.MODEL.ROI_CUBE_HEAD.FC_DIM = 1024 + # proposal method + cfg.MODEL.ROI_CUBE_HEAD.NUMBER_OF_PROPOSALS = 1000 + + # the style to predict Z with currently supported + # options --> ['direct', 'sigmoid', 'log', 'clusters'] + cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE = "direct" + + # the style to predict pose with currently supported + # options --> ['6d', 'euler', 'quaternion'] + cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE = "6d" + + # Whether to scale all 3D losses by inverse depth + cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT = False + + # Virtual depth puts all predictions of depth into + # a shared virtual space with a shared focal length. + cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH = True + cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL = 512.0 + + # If true, then all losses are computed using the 8 corners + # such that they are all in a shared scale space. + # E.g., their scale correlates with their impact on 3D IoU. + # This way no manual weights need to be set. + cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS = True + + # When > 1, the outputs of the 3D head will be based on + # a 2D scale clustering, based on 2D proposal height/width. + # This parameter describes the number of bins to cluster. + cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS = 1 + + # Whether batch norm is enabled during training. + # If false, all BN weights will be frozen. 
+ cfg.MODEL.USE_BN = True + + # Whether to predict the pose in allocentric space. + # The allocentric space may correlate better with 2D + # images compared to egocentric poses. + cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE = True + + # Whether to use chamfer distance for disentangled losses + # of pose. This avoids periodic issues of rotation but + # may prevent the pose "direction" from being interpretable. + cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE = True + + # Should the prediction heads share FC features or not. + # These include groups of uv, z, whl, pose. + cfg.MODEL.ROI_CUBE_HEAD.SHARED_FC = True + + # Check for stable gradients. When inf is detected, skip the update. + # This prevents an occasional bad sample from exploding the model. + # The threshold below is the allows percent of bad samples. + # 0.0 is off, and 0.01 is recommended for minor robustness to exploding. + cfg.MODEL.STABILIZE = 0.01 + + # Whether or not to use the dimension priors + cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED = True + + # How prior dimensions should be computed? + # The supported modes are ["exp", "sigmoid"] + # where exp is unbounded and sigmoid is bounded + # between +- 3 standard deviations from the mean. + cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC = 'exp' + + # weight for confidence loss. 0 is off. + cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE = 1.0 + + # Loss weights for XY, Z, Dims, Pose + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D = 1.0 + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_XY = 1.0 + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE = 7.0 + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_NORMAL_VEC = 20.0 + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_IOU = 1.0 + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_SEG = 2.5 + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z = 1.0 + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS = 20.0 + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DEPTH = 1.0 + + cfg.MODEL.DLA = CN() + + # Supported types for DLA backbones are... 
+ # dla34, dla46_c, dla46x_c, dla60x_c, dla60, dla60x, dla102x, dla102x2, dla169 + cfg.MODEL.DLA.TYPE = 'dla34' + + # Only available for dla34, dla60, dla102 + cfg.MODEL.DLA.TRICKS = False + + # A joint loss for the disentangled loss. + # All predictions are computed using a corner + # or chamfers loss depending on chamfer_pose! + # Recommened to keep this weight small: [0.05, 0.5] + cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_JOINT = 1.0 + + # sgd, adam, adam+amsgrad, adamw, adamw+amsgrad + cfg.SOLVER.TYPE = 'sgd' + + cfg.MODEL.RESNETS.TORCHVISION = True + cfg.TEST.DETECTIONS_PER_IMAGE = 100 + + cfg.TEST.VISIBILITY_THRES = 1/2.0 + cfg.TEST.TRUNCATION_THRES = 1/2.0 + + cfg.INPUT.RANDOM_FLIP = "horizontal" + + # When True, we will use localization uncertainty + # as the new IoUness score in the RPN. + cfg.MODEL.RPN.OBJECTNESS_UNCERTAINTY = 'IoUness' + + # If > 0.0 this is the scaling factor that will be applied to + # an RoI 2D box before doing any pooling to give more context. + # Ex. 1.5 makes width and height 50% larger. 
+ cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES = 0.0 + + # weight path specifically for pretraining (no checkpointables will be loaded) + cfg.MODEL.WEIGHTS_PRETRAIN = '' + + # ## start of our things + cfg.MODEL.ROI_CUBE_HEAD.TEST = 'bas' + cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_PRECOMPUTED = False + + cfg.PLOT = CN(new_allowed=True) + cfg.PLOT.OUTPUT_DIR = '' + cfg.PLOT.EVAL = '' + cfg.PLOT.MODE2D = '' #either GT or PRED + + cfg.PLOT.SCORING_FUNC = None + cfg.PLOT.PROPOSAL_FUNC = None + cfg.PLOT.number_of_proposals = 1000 + + cfg.TRAIN = CN(new_allowed=True) + cfg.TRAIN.pseudo_gt = 'learn' + + # these are meant to be overwritten as an argument + cfg.log = True + # (these 2 are mutually exclusive) z_pseudo_gt_patch or z_pseudo_gt_center + cfg.loss_functions = ['iou'] + cfg.MODEL.DEPTH_ON = False #whether to use the depth anything concated features \ No newline at end of file diff --git a/cubercnn/data/Omni_to_kitti.py b/cubercnn/data/Omni_to_kitti.py new file mode 100644 index 0000000000000000000000000000000000000000..8523dd1ace8a95f6016566e6ab5d1df98dfb02a6 --- /dev/null +++ b/cubercnn/data/Omni_to_kitti.py @@ -0,0 +1,197 @@ +import torch +from detectron2.data.catalog import MetadataCatalog +from cubercnn import data +from detectron2.structures import Boxes, BoxMode +from cubercnn.util.math_util import estimate_truncation, mat2euler, R_to_allocentric +import os +import numpy as np +from tqdm import tqdm + +def perp_vector(a, b): + return np.array([b, -a]) + +def rotate_vector(x, y, theta): + # Calculate the rotated coordinates + x_rotated = x * np.cos(theta) - y * np.sin(theta) + y_rotated = x * np.sin(theta) + y * np.cos(theta) + + return np.array([x_rotated, y_rotated]) + +def calculate_alpha(location, ry): + ''' + location: x, y, z coordinates + ry: rotation around y-axis, negative counter-clockwise, + + positive x-axis is to the right + calculate the angle from a line perpendicular to the camera to the center of the bounding box''' + + # get vector from camera to 
object + ry = -ry + x, y, z = location + # vector from [0,0,0] to the center of the bounding box + # we can do the whole thing in 2D, top down view + # vector perpendicular to center + perpendicular = perp_vector(x,z) + # vector corresponding to ry + ry_vector = np.array([np.cos(ry), np.sin(ry)]) + # angle between perpendicular and ry_vector + dot = perpendicular[0]*ry_vector[0] + perpendicular[1]*ry_vector[1] # Dot product between [x1, y1] and [x2, y2] + det = perpendicular[0]*ry_vector[1] - perpendicular[1]*ry_vector[0] # Determinant + alpha = -np.arctan2(det, dot) + + # wrap to -pi to pi + if alpha > np.pi: + alpha -= 2*np.pi + if alpha < -np.pi: + alpha += 2*np.pi + return alpha + +def test_calculate_alpha(): + location = [-3.67, 1.67, 6.05] + ry = -1.24 + expected = -0.72 + result1 = calculate_alpha(location, ry) + + location = [-9.48, 2.08, 26.41] + ry = 1.77 + expected = 2.11 + result2 = calculate_alpha(location, ry) + + location = [4.19, 1.46, 44.41] + ry = -1.35 + expected = -1.45 + result3 = calculate_alpha(location, ry) + + location = [-6.41, 2.04, 46.74] + ry = 1.68 + expected = 1.82 + result4 = calculate_alpha(location, ry) + + location = [0.28, 2.08, 17.74] + ry = -1.58 + expected = -1.59 + result5 = calculate_alpha(location, ry) + + location = [-3.21, 1.97, 11.22] + ry = -0.13 + expected = 0.15 + result6 = calculate_alpha(location, ry) + + # assert np.isclose(result, expected, atol=0.01) + return result1 + + +def main(): + alpha = test_calculate_alpha() + + + name = 'KITTI' + split = 'test' + dataset_paths_to_json = [f'datasets/Omni3D/{name}_{split}.json',] + os.makedirs('output/KITTI_formatted_predictions', exist_ok=True) + + # Example 1. load all images + dataset = data.Omni3D(dataset_paths_to_json) + imgIds = dataset.getImgIds() + imgs = dataset.loadImgs(imgIds) + + # Example 2. 
load annotations for image index 0 + annIds = dataset.getAnnIds(imgIds=imgs[0]['id']) + anns = dataset.loadAnns(annIds) + + data.register_and_store_model_metadata(dataset, 'output') + + thing_classes = MetadataCatalog.get('omni3d_model').thing_classes + dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id + cats = {'pedestrian', 'car', 'cyclist', 'van', 'truck'} + + input_folder = 'kitti_omni_eq' + + out_path = 'output/'+input_folder+'/KITTI_formatted_predictions/' + in_path = 'output/'+input_folder+'/KITTI_pred/instances_predictions.pth' + print('saving to', out_path) + data_json = torch.load(in_path) + # + # reference + # https://github.com/ZrrSkywalker/MonoDETR/blob/c724572bddbc067832a0e0d860a411003f36c2fa/lib/helpers/tester_helper.py#L114 + files = {} + for image in tqdm(data_json): + K = image['K'] + K_inv = np.linalg.inv(K) + width, height = image['width'], image['height'] + image_id = image['image_id'] + l = [] + for pred in image['instances']: + + category = thing_classes[pred['category_id']] + if category not in cats: + continue + occluded = 0 + # truncation = estimate_truncation(K, torch.tensor([x3d, y3d, z3d, w3d, h3d, l3d]), pred['pose'], width, height) + truncation = 0.0 # it does not matter + rotation_y = mat2euler(np.array(pred['pose']))[1] + bbox = BoxMode.convert(pred['bbox'], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) # x1, y1, x2, y2 -> convert to left, top, right, bottom + h3d, w3d, l3d = pred['dimensions'] + # unproject, this should yield the same + # cen_2d = np.array(pred['center_2D'] + [1]) + # z3d = pred['center_cam'][2] + # x3d, y3d, z3d = (K_inv @ (z3d*cen_2d)) + + x3d, y3d, z3d = pred['center_cam'] + + location = pred['center_cam'] + score = pred['score'] + alpha = calculate_alpha(location, rotation_y) + + # convert to KITTI format + li = [category, truncation, occluded, alpha, bbox[0], bbox[1], bbox[2], bbox[3], h3d, w3d, l3d, x3d, y3d, z3d, rotation_y, score] + l.append(li) + # sort l by z3d + l 
= sorted(l, key=lambda x: x[13]) + files[image_id] = l + + # 7518 test images + os.makedirs(out_path, exist_ok=True) + for img_id, content in files.items(): + + img_id_str = str(img_id).zfill(6) + with open(out_path+f'{img_id_str}.txt', 'w') as f: + str_i = '' + for i in content: + # t = f'{category} {truncation:.2f} {occluded} {alpha:.2f} {bbox[0]:.2f} {bbox[1]:.2f} {bbox[2]:.2f} {bbox[3]:.2f} {w3d:.2f} {h3d:.2f} {l3d:.2f} {x3d:.2f} {y3d:.2f} {z3d:.2f} {rotation_y:.2f} {score:.2f}\n' + t = f'{i[0][0].upper() + i[0][1:]} {i[1]:.2f} {i[2]} {i[3]:.2f} {i[4]:.2f} {i[5]:.2f} {i[6]:.2f} {i[7]:.2f} {i[8]:.2f} {i[9]:.2f} {i[10]:.2f} {i[11]:.2f} {i[12]:.2f} {i[13]:.2f} {i[14]:.2f} {i[15]:.2f}\n' + str_i += t + f.write(str_i) + +if __name__ == '__main__': + main() + +# write to file +# #Values Name Description +# ---------------------------------------------------------------------------- +# 1 type Describes the type of object: 'Car', 'Van', 'Truck', +# 'Pedestrian', 'Person_sitting', 'Cyclist', 'Tram', +# 'Misc' or 'DontCare' +# 1 truncated Float from 0 (non-truncated) to 1 (truncated), where +# truncated refers to the object leaving image boundaries +# 1 occluded Integer (0,1,2,3) indicating occlusion state: +# 0 = fully visible, 1 = partly occluded +# 2 = largely occluded, 3 = unknown +# 1 alpha Observation angle of object, ranging [-pi..pi] +# 4 bbox 2D bounding box of object in the image (0-based index): +# contains left, top, right, bottom pixel coordinates +# 3 dimensions 3D object dimensions: height, width, length (in meters) +# 3 location 3D object location x,y,z in camera coordinates (in meters) +# 1 rotation_y Rotation ry around Y-axis in camera coordinates [-pi..pi] +# 1 score Only for results: Float, indicating confidence in +# detection, needuhued for p/r curves, higher is better. + +# output to files 000000.txt 000001.txt ... 
+ +# example file +# Car 0.00 0 -1.56 564.62 174.59 616.43 224.74 1.61 1.66 3.20 -0.69 1.69 25.01 -1.59 +# Car 0.00 0 1.71 481.59 180.09 512.55 202.42 1.40 1.51 3.70 -7.43 1.88 47.55 1.55 +# Car 0.00 0 1.64 542.05 175.55 565.27 193.79 1.46 1.66 4.05 -4.71 1.71 60.52 1.56 +# Cyclist 0.00 0 1.89 330.60 176.09 355.61 213.60 1.72 0.50 1.95 -12.63 1.88 34.09 1.54 +# DontCare -1 -1 -10 753.33 164.32 798.00 186.74 -1 -1 -1 -1000 -1000 -1000 -10 +# DontCare -1 -1 -10 738.50 171.32 753.27 184.42 -1 -1 -1 -1000 -1000 -1000 -10 \ No newline at end of file diff --git a/cubercnn/data/__init__.py b/cubercnn/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0ed5cd780f4e689c55aa3a85e05241b77eb37098 --- /dev/null +++ b/cubercnn/data/__init__.py @@ -0,0 +1,5 @@ +from .datasets import * +from .dataset_mapper import * +from .build import * +from .builtin import * +from .Omni_to_kitti import * \ No newline at end of file diff --git a/cubercnn/data/build.py b/cubercnn/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..d5aa7a3352ebfed2694ae48290f655a78ed8e42b --- /dev/null +++ b/cubercnn/data/build.py @@ -0,0 +1,260 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +import itertools +import logging +import numpy as np +import math +from collections import defaultdict +import torch.utils.data + +from detectron2.config import configurable +from detectron2.utils.logger import _log_api_usage + +from detectron2.data.catalog import DatasetCatalog +from detectron2.data.common import DatasetFromList, MapDataset +from detectron2.data.dataset_mapper import DatasetMapper +from detectron2.data.samplers import ( + InferenceSampler, + RepeatFactorTrainingSampler, + TrainingSampler +) +from detectron2.data.build import ( + build_batch_data_loader, + trivial_batch_collator +) + +def filter_images_with_only_crowd_annotations(dataset_dicts): + """ + Filter out images with none annotations or only crowd annotations + (i.e., images without non-crowd annotations). + A common training-time preprocessing on COCO dataset. + + Args: + dataset_dicts (list[dict]): annotations in Detectron2 Dataset format. + + Returns: + list[dict]: the same format, but filtered. + """ + num_before = len(dataset_dicts) + + def valid(anns): + for ann in anns: + if ann.get("iscrowd", 0) == 0: + return True + return False + + dataset_dicts = [x for x in dataset_dicts if valid(x["annotations"])] + num_after = len(dataset_dicts) + logger = logging.getLogger(__name__) + logger.info( + "Removed {} images marked with crowd. 
{} images left.".format( + num_before - num_after, num_after + ) + ) + return dataset_dicts + +def get_detection_dataset_dicts(names, filter_empty=True, **kwargs): + + if isinstance(names, str): + names = [names] + + assert len(names), names + dataset_dicts = [DatasetCatalog.get(dataset_name) for dataset_name in names] + for dataset_name, dicts in zip(names, dataset_dicts): + assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) + + dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) + + has_instances = "annotations" in dataset_dicts[0] + + if filter_empty and has_instances: + dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts) + + assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names)) + return dataset_dicts + + +def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None, dataset_id_to_src=None): + if dataset is None: + dataset = get_detection_dataset_dicts( + cfg.DATASETS.TRAIN, + filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, + min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE + if cfg.MODEL.KEYPOINT_ON + else 0, + proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN if cfg.MODEL.LOAD_PROPOSALS else None, + ) + _log_api_usage("dataset." + cfg.DATASETS.TRAIN[0]) + + if mapper is None: + mapper = DatasetMapper(cfg, True) + + if sampler is None: + sampler_name = cfg.DATALOADER.SAMPLER_TRAIN + balance_datasets = cfg.DATALOADER.BALANCE_DATASETS + logger = logging.getLogger(__name__) + logger.info("Using training sampler {}".format(sampler_name)) + + if balance_datasets: + assert dataset_id_to_src is not None, 'Need dataset sources.' + + dataset_source_to_int = {val:i for i, val in enumerate(set(dataset_id_to_src.values()))} + dataset_ids_per_img = [dataset_source_to_int[dataset_id_to_src[img['dataset_id']]] for img in dataset] + dataset_ids = np.unique(dataset_ids_per_img) + + # only one source? don't re-weight then. 
+ if len(dataset_ids) == 1: + weights_per_img = torch.ones(len(dataset_ids_per_img)).float() + + # compute per-dataset weights. + else: + counts = np.bincount(dataset_ids_per_img) + counts = [counts[id] for id in dataset_ids] + weights = [1 - count/np.sum(counts) for count in counts] + weights = [weight/np.min(weights) for weight in weights] + + weights_per_img = torch.zeros(len(dataset_ids_per_img)).float() + dataset_ids_per_img = torch.FloatTensor(dataset_ids_per_img).long() + + # copy weights + for dataset_id, weight in zip(dataset_ids, weights): + weights_per_img[dataset_ids_per_img == dataset_id] = weight + + # no special sampling whatsoever + if sampler_name == "TrainingSampler" and not balance_datasets: + sampler = TrainingSampler(len(dataset)) + + # balance the weight sampling by datasets + elif sampler_name == "TrainingSampler" and balance_datasets: + sampler = RepeatFactorTrainingSampler(weights_per_img) + + # balance the weight sampling by categories + elif sampler_name == "RepeatFactorTrainingSampler" and not balance_datasets: + repeat_factors = repeat_factors_from_category_frequency( + dataset, cfg.DATALOADER.REPEAT_THRESHOLD + ) + sampler = RepeatFactorTrainingSampler(repeat_factors) + + # balance the weight sampling by categories AND by dataset frequency + elif sampler_name == "RepeatFactorTrainingSampler" and balance_datasets: + repeat_factors = repeat_factors_from_category_frequency( + dataset, cfg.DATALOADER.REPEAT_THRESHOLD + ) + repeat_factors *= weights_per_img + repeat_factors /= repeat_factors.min().item() + sampler = RepeatFactorTrainingSampler(repeat_factors) + else: + raise ValueError("Unknown training sampler: {}".format(sampler_name)) + + return { + "dataset": dataset, + "sampler": sampler, + "mapper": mapper, + "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, + "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, + "num_workers": cfg.DATALOADER.NUM_WORKERS, + } + + +def repeat_factors_from_category_frequency(dataset_dicts, 
repeat_thresh): + """ + Compute (fractional) per-image repeat factors based on category frequency. + The repeat factor for an image is a function of the frequency of the rarest + category labeled in that image. The "frequency of category c" in [0, 1] is defined + as the fraction of images in the training set (without repeats) in which category c + appears. + See :paper:`lvis` (>= v2) Appendix B.2. + + Args: + dataset_dicts (list[dict]): annotations in Detectron2 dataset format. + repeat_thresh (float): frequency threshold below which data is repeated. + If the frequency is half of `repeat_thresh`, the image will be + repeated twice. + + Returns: + torch.Tensor: + the i-th element is the repeat factor for the dataset image at index i. + """ + # 1. For each category c, compute the fraction of images that contain it: f(c) + category_freq = defaultdict(int) + for dataset_dict in dataset_dicts: # For each image (without repeats) + cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} + for cat_id in cat_ids: + if cat_id < 0: continue + category_freq[cat_id] += 1 + num_images = len(dataset_dicts) + for k, v in category_freq.items(): + category_freq[k] = v / num_images + + # 2. For each category c, compute the category-level repeat factor: + # r(c) = max(1, sqrt(t / f(c))) + category_rep = { + cat_id: max(1.0, math.sqrt(repeat_thresh / cat_freq)) + for cat_id, cat_freq in category_freq.items() + } + + # 3. 
For each image I, compute the image-level repeat factor: + # r(I) = max_{c in I} r(c) + rep_factors = [] + for dataset_dict in dataset_dicts: + cat_ids = {ann["category_id"] for ann in dataset_dict["annotations"]} + rep_factor = max({category_rep[cat_id] for cat_id in cat_ids if cat_id >= 0}, default=1.0) + rep_factors.append(rep_factor) + + return torch.tensor(rep_factors, dtype=torch.float32) + +@configurable(from_config=_train_loader_from_config) +def build_detection_train_loader(dataset, *, mapper, sampler=None, total_batch_size, aspect_ratio_grouping=True, num_workers=0): + if isinstance(dataset, list): + dataset = DatasetFromList(dataset, copy=False) + if mapper is not None: + dataset = MapDataset(dataset, mapper) + if sampler is None: + sampler = TrainingSampler(len(dataset)) + assert isinstance(sampler, torch.utils.data.Sampler) + return build_batch_data_loader( + dataset, + sampler, + total_batch_size, + aspect_ratio_grouping=aspect_ratio_grouping, + num_workers=num_workers + ) + +def _test_loader_from_config(cfg, dataset_name, batch_size=1, mapper=None, filter_empty=False): + if isinstance(dataset_name, str): + dataset_name = [dataset_name] + + dataset = get_detection_dataset_dicts( + dataset_name, + filter_empty=filter_empty, + proposal_files=[ + cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name + ] + if cfg.MODEL.LOAD_PROPOSALS + else None, + ) + if mapper is None: + mapper = DatasetMapper(cfg, False) + + return {"dataset": dataset, "mapper": mapper, 'batch_size':batch_size, "num_workers": cfg.DATALOADER.NUM_WORKERS} + +@configurable(from_config=_test_loader_from_config) +def build_detection_test_loader(dataset, *, mapper, batch_size=1, sampler=None, num_workers=0): + + if isinstance(dataset, list): + dataset = DatasetFromList(dataset, copy=False) + if mapper is not None: + dataset = MapDataset(dataset, mapper) + if sampler is None: + sampler = InferenceSampler(len(dataset)) + + # Always use 1 image per worker 
during inference since this is the + # standard when reporting inference time in papers. + batch_sampler = torch.utils.data.BatchSampler(sampler, batch_size=batch_size, drop_last=False) + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=trivial_batch_collator, + ) + return data_loader + diff --git a/cubercnn/data/builtin.py b/cubercnn/data/builtin.py new file mode 100644 index 0000000000000000000000000000000000000000..3a1bdefc76d6b5164162d6e1076c089d9654965a --- /dev/null +++ b/cubercnn/data/builtin.py @@ -0,0 +1,46 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates + +def get_omni3d_categories(dataset="omni3d"): + """ + Returns the Omni3D categories for dataset + Args: + dataset: str + Returns: + cats: set of strings with category names + """ + + if dataset == "omni3d": + cats = set({'chair', 'table', 'cabinet', 'car', 'lamp', 'books', 'sofa', 'pedestrian', 'picture', 'window', 'pillow', 'truck', 'door', 'blinds', 'sink', 'shelves', 'television', 'shoes', 'cup', 'bottle', 'bookcase', 'laptop', 'desk', 'cereal box', 'floor mat', 'traffic cone', 'mirror', 'barrier', 'counter', 'camera', 'bicycle', 'toilet', 'bus', 'bed', 'refrigerator', 'trailer', 'box', 'oven', 'clothes', 'van', 'towel', 'motorcycle', 'night stand', 'stove', 'machine', 'stationery', 'bathtub', 'cyclist', 'curtain', 'bin'}) + assert len(cats) == 50 + elif dataset == "omni3d_in": + cats = set({'stationery', 'sink', 'table', 'floor mat', 'bottle', 'bookcase', 'bin', 'blinds', 'pillow', 'bicycle', 'refrigerator', 'night stand', 'chair', 'sofa', 'books', 'oven', 'towel', 'cabinet', 'window', 'curtain', 'bathtub', 'laptop', 'desk', 'television', 'clothes', 'stove', 'cup', 'shelves', 'box', 'shoes', 'mirror', 'door', 'picture', 'lamp', 'machine', 'counter', 'bed', 'toilet'}) + assert len(cats) == 38 + elif dataset == "omni3d_out": + cats = set({'cyclist', 'pedestrian', 'trailer', 'bus', 'motorcycle', 'car', 'barrier', 
'truck', 'van', 'traffic cone', 'bicycle'}) + assert len(cats) == 11 + elif dataset in ["SUNRGBD_train", "SUNRGBD_val", "SUNRGBD_test", "SUNRGBD_train_mini", "SUNRGBD_val_mini", "SUNRGBD_test_mini", "SUNRGBD_test_mini2", "SUNRGBD_test_mini500"]: + cats = set({'bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine'}) + assert len(cats) == 38 + elif dataset in ["Hypersim_train", "Hypersim_val"]: + cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) + assert len(cats) == 29 + elif dataset == "Hypersim_test": + # Hypersim test annotation does not contain toilet + cats = set({'books', 'chair', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator'}) + assert len(cats) == 28 + elif dataset in ["ARKitScenes_train", "ARKitScenes_val", "ARKitScenes_test"]: + cats = set({'table', 'bed', 'sofa', 'television', 'refrigerator', 'chair', 'oven', 'machine', 'stove', 'shelves', 'sink', 'cabinet', 'bathtub', 'toilet'}) + assert len(cats) == 14 + elif dataset in ["Objectron_train", "Objectron_val", "Objectron_test"]: + cats = set({'bicycle', 'books', 'bottle', 'camera', 'cereal box', 'chair', 'cup', 'laptop', 'shoes'}) + assert len(cats) == 9 + elif dataset in ["KITTI_train", "KITTI_val", 
"KITTI_test"]: + cats = set({'pedestrian', 'car', 'cyclist', 'van', 'truck'}) + assert len(cats) == 5 + elif dataset in ["nuScenes_train", "nuScenes_val", "nuScenes_test"]: + cats = set({'pedestrian', 'car', 'truck', 'traffic cone', 'barrier', 'motorcycle', 'bicycle', 'bus', 'trailer'}) + assert len(cats) == 9 + else: + raise ValueError("%s dataset is not registered." % (dataset)) + + return cats \ No newline at end of file diff --git a/cubercnn/data/dataset_mapper.py b/cubercnn/data/dataset_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..b8ca3027d7b61f931274f0caa745a20f9b61096e --- /dev/null +++ b/cubercnn/data/dataset_mapper.py @@ -0,0 +1,272 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +import copy +import logging +from detectron2.config.config import configurable +from detectron2.data.transforms.augmentation import AugmentationList +import torch +import numpy as np +from detectron2.structures import BoxMode, Keypoints +from detectron2.data import detection_utils +from detectron2.data import transforms as T +from detectron2.data import ( + DatasetMapper +) +from detectron2.structures import ( + Boxes, + BoxMode, + Instances, +) + +from typing import List, Optional, Union + +from PIL import Image + +class DatasetMapper3D(DatasetMapper): + + @configurable + def __init__( + self, + is_train: bool, + *, + augmentations: List[Union[T.Augmentation, T.Transform]], + image_format: str, + mode:str=None, + use_instance_mask: bool = False, + use_keypoint: bool = False, + instance_mask_format: str = "polygon", + keypoint_hflip_indices: Optional[np.ndarray] = None, + precomputed_proposal_topk: Optional[int] = None, + recompute_boxes: bool = False, + only_2d: bool = False, + ): + """ + NOTE: this interface is experimental. 
+ + Args: + is_train: whether it's used in training or inference + mode: 'get_depth_maps' (default), 'cube_rcnn' + augmentations: a list of augmentations or deterministic transforms to apply + image_format: an image format supported by :func:`detection_utils.read_image`. + use_instance_mask: whether to process instance segmentation annotations, if available + use_keypoint: whether to process keypoint annotations if available + instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation + masks into this format. + keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices` + precomputed_proposal_topk: if given, will load pre-computed + proposals from dataset_dict and keep the top k proposals for each image. + recompute_boxes: whether to overwrite bounding box annotations + by computing tight bounding boxes from instance mask annotations. + """ + if recompute_boxes: + assert use_instance_mask, "recompute_boxes requires instance masks" + # fmt: off + self.is_train = is_train + self.augmentations = T.AugmentationList(augmentations) + self.image_format = image_format + self.use_instance_mask = use_instance_mask + self.instance_mask_format = instance_mask_format + self.use_keypoint = use_keypoint + self.keypoint_hflip_indices = keypoint_hflip_indices + self.proposal_topk = precomputed_proposal_topk + self.recompute_boxes = recompute_boxes + self.only_2d = only_2d + self.mode = mode + # fmt: on + logger = logging.getLogger(__name__) + mode_out = "training" if is_train else "inference" + logger.info(f"[DatasetMapper] Augmentations used in {mode_out}: {augmentations}") + + @classmethod + def from_config(cls, cfg, is_train: bool = True, mode='get_depth_maps'): + augs = detection_utils.build_augmentation(cfg, is_train) + if cfg.INPUT.CROP.ENABLED and is_train: + augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) + recompute_boxes = cfg.MODEL.MASK_ON + else: + recompute_boxes = False + + ret = { + "is_train": 
is_train, + "mode": mode, + "augmentations": augs, + "image_format": cfg.INPUT.FORMAT, + "use_instance_mask": cfg.MODEL.MASK_ON, + "instance_mask_format": cfg.INPUT.MASK_FORMAT, + "use_keypoint": cfg.MODEL.KEYPOINT_ON, + "recompute_boxes": recompute_boxes, + "only_2d": cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D == 0.0, + } + + if cfg.MODEL.KEYPOINT_ON: + ret["keypoint_hflip_indices"] = detection_utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN) + + if cfg.MODEL.LOAD_PROPOSALS: + ret["precomputed_proposal_topk"] = ( + cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN + if is_train + else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST + ) + return ret + + def __call__(self, dataset_dict): + + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + + image = detection_utils.read_image(dataset_dict["file_name"], format=self.image_format) + detection_utils.check_image_size(dataset_dict, image) + + aug_input = T.AugInput(image) + # state = torch.get_rng_state() + transforms = self.augmentations(aug_input) + image = aug_input.image + image_shape = image.shape[:2] # h, w + + # dont load ground map and depth map when + if not self.only_2d: + if 'depth_image_path' in dataset_dict: + dp_img = Image.fromarray(np.load(dataset_dict["depth_image_path"])['depth']) + dp_img = np.array(dp_img.resize(image.shape[:2][::-1], Image.NEAREST)) + aug_input_dp = T.AugInput(dp_img) + aug_only_flip = AugmentationList(transforms[-1:]) + # torch.set_rng_state(state) + #transforms_dp = aug_only_flip(aug_input_dp) + dp_image = aug_input_dp.image + dataset_dict["depth_map"] = torch.as_tensor(np.ascontiguousarray(dp_image)) + else: + dataset_dict["depth_map"] = None + + # ground image + if 'ground_image_path' in dataset_dict: + ground_img = Image.fromarray(np.load(dataset_dict["ground_image_path"])['mask']) + ground_img = np.array(ground_img.resize(image.shape[:2][::-1], Image.NEAREST)) + aug_input_gr = T.AugInput(ground_img) + #transforms_gr = aug_only_flip(aug_input_gr) + gr_image = 
aug_input_gr.image + dataset_dict["ground_map"] = torch.as_tensor(np.ascontiguousarray(gr_image)) + else: + dataset_dict["ground_map"] = None + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + + # no need for additional processing at inference + if not self.is_train: + return dataset_dict + + if "annotations" in dataset_dict: + + dataset_id = dataset_dict['dataset_id'] + K = np.array(dataset_dict['K']) + + unknown_categories = self.dataset_id_to_unknown_cats[dataset_id] + + # transform and pop off annotations + annos = [ + transform_instance_annotations(obj, transforms, K=K) + for obj in dataset_dict.pop("annotations") if obj.get("iscrowd", 0) == 0 + ] + + # convert to instance format + instances = annotations_to_instances(annos, image_shape, unknown_categories) + dataset_dict["instances"] = detection_utils.filter_empty_instances(instances) + + return dataset_dict + +''' +Cached for mirroring annotations +''' +_M1 = np.array([ + [1, 0, 0], + [0, -1, 0], + [0, 0, -1] +]) +_M2 = np.array([ + [-1., 0., 0.], + [ 0., -1., 0.], + [ 0., 0., 1.] 
+]) + + +def transform_instance_annotations(annotation, transforms, *, K): + + if isinstance(transforms, (tuple, list)): + transforms = T.TransformList(transforms) + + # bbox is 1d (per-instance bounding box) + bbox = BoxMode.convert(annotation["bbox"], annotation["bbox_mode"], BoxMode.XYXY_ABS) + bbox = transforms.apply_box(np.array([bbox]))[0] + + annotation["bbox"] = bbox + annotation["bbox_mode"] = BoxMode.XYXY_ABS + + if annotation['center_cam'][2] != 0: + + # project the 3D box annotation XYZ_3D to screen + point3D = annotation['center_cam'] + point2D = K @ np.array(point3D) + point2D[:2] = point2D[:2] / point2D[-1] + annotation["center_cam_proj"] = point2D.tolist() + + # apply coords transforms to 2D box + annotation["center_cam_proj"][0:2] = transforms.apply_coords( + point2D[np.newaxis][:, :2] + )[0].tolist() + + keypoints = (K @ np.array(annotation["bbox3D_cam"]).T).T + keypoints[:, 0] /= keypoints[:, -1] + keypoints[:, 1] /= keypoints[:, -1] + + if annotation['ignore']: + # all keypoints marked as not visible + # 0 - unknown, 1 - not visible, 2 visible + keypoints[:, 2] = 1 + else: + + valid_keypoints = keypoints[:, 2] > 0 + + # 0 - unknown, 1 - not visible, 2 visible + keypoints[:, 2] = 2 + keypoints[valid_keypoints, 2] = 2 + + # in place + transforms.apply_coords(keypoints[:, :2]) + annotation["keypoints"] = keypoints.tolist() + + # manually apply mirror for pose + for transform in transforms: + + # horrizontal flip? 
+ if isinstance(transform, T.HFlipTransform): + + pose = _M1 @ np.array(annotation["pose"]) @ _M2 + annotation["pose"] = pose.tolist() + annotation["R_cam"] = pose.tolist() + + return annotation + + +def annotations_to_instances(annos, image_size, unknown_categories): + + # init + target = Instances(image_size) + + # add classes, 2D boxes, 3D boxes and poses + target.gt_classes = torch.tensor([int(obj["category_id"]) for obj in annos], dtype=torch.int64) + target.gt_boxes = Boxes([BoxMode.convert(obj["bbox"], obj["bbox_mode"], BoxMode.XYXY_ABS) for obj in annos]) + target.gt_boxes3D = torch.FloatTensor([anno['center_cam_proj'] + anno['dimensions'] + anno['center_cam'] for anno in annos]) + target.gt_poses = torch.FloatTensor([anno['pose'] for anno in annos]) + + n = len(target.gt_classes) + + # do keypoints? + target.gt_keypoints = Keypoints(torch.FloatTensor([anno['keypoints'] for anno in annos])) + + gt_unknown_category_mask = torch.zeros(max(unknown_categories)+1, dtype=bool) + gt_unknown_category_mask[torch.tensor(list(unknown_categories))] = True + + # include available category indices as tensor with GTs + target.gt_unknown_category_mask = gt_unknown_category_mask.unsqueeze(0).repeat([n, 1]) + + return target diff --git a/cubercnn/data/datasets.py b/cubercnn/data/datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..eb13c9832c42f39ddd52b73d1859b75ca371a645 --- /dev/null +++ b/cubercnn/data/datasets.py @@ -0,0 +1,480 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +import json +import time +import os +import contextlib +import io +import logging +import numpy as np +import pandas as pd +from pycocotools.coco import COCO +from collections import defaultdict +from fvcore.common.timer import Timer +from detectron2.utils.file_io import PathManager +from detectron2.structures import BoxMode +from detectron2.data import MetadataCatalog, DatasetCatalog + +from cubercnn import util + +VERSION = '0.1' + +logger = logging.getLogger(__name__) + +def get_version(): + return VERSION + +def get_global_dataset_stats(path_to_stats=None, reset=False): + + if path_to_stats is None: + path_to_stats = os.path.join('datasets', 'Omni3D', 'stats.json') + + if os.path.exists(path_to_stats) and not reset: + stats = util.load_json(path_to_stats) + + else: + stats = { + 'n_datasets': 0, + 'n_ims': 0, + 'n_anns': 0, + 'categories': [] + } + + return stats + + +def save_global_dataset_stats(stats, path_to_stats=None): + + if path_to_stats is None: + path_to_stats = os.path.join('datasets', 'Omni3D', 'stats.json') + + util.save_json(path_to_stats, stats) + + +def get_filter_settings_from_cfg(cfg=None): + + if cfg is None: + return { + 'category_names': [], + 'ignore_names': [], + 'truncation_thres': 0.99, + 'visibility_thres': 0.01, + 'min_height_thres': 0.00, + 'max_height_thres': 1.50, + 'modal_2D_boxes': False, + 'trunc_2D_boxes': False, + 'max_depth': 1e8, + } + else: + return { + 'category_names': cfg.DATASETS.CATEGORY_NAMES, + 'ignore_names': cfg.DATASETS.IGNORE_NAMES, + 'truncation_thres': cfg.DATASETS.TRUNCATION_THRES, + 'visibility_thres': cfg.DATASETS.VISIBILITY_THRES, + 'min_height_thres': cfg.DATASETS.MIN_HEIGHT_THRES, + 'modal_2D_boxes': cfg.DATASETS.MODAL_2D_BOXES, + 'trunc_2D_boxes': cfg.DATASETS.TRUNC_2D_BOXES, + 'max_depth': cfg.DATASETS.MAX_DEPTH, + + # TODO expose as a config + 'max_height_thres': 1.50, + } + + +def is_ignore(anno, filter_settings, image_height): + + ignore = anno['behind_camera'] + ignore |= (not 
bool(anno['valid3D'])) + + if ignore: + return ignore + + ignore |= anno['dimensions'][0] <= 0.01 + ignore |= anno['dimensions'][1] <= 0.01 + ignore |= anno['dimensions'][2] <= 0.01 + ignore |= anno['center_cam'][2] > filter_settings['max_depth'] + ignore |= (anno['lidar_pts'] == 0) + ignore |= (anno['segmentation_pts'] == 0) + ignore |= (anno['depth_error'] > 0.5) + + # tightly annotated 2D boxes are not always available. + if filter_settings['modal_2D_boxes'] and 'bbox2D_tight' in anno and anno['bbox2D_tight'][0] != -1: + bbox2D = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + # truncated projected 2D boxes are also not always available. + elif filter_settings['trunc_2D_boxes'] and 'bbox2D_trunc' in anno and not np.all([val==-1 for val in anno['bbox2D_trunc']]): + bbox2D = BoxMode.convert(anno['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + # use the projected 3D --> 2D box, which requires a visible 3D cuboid. + elif 'bbox2D_proj' in anno: + bbox2D = BoxMode.convert(anno['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + else: + bbox2D = anno['bbox'] + + ignore |= bbox2D[3] <= filter_settings['min_height_thres']*image_height + ignore |= bbox2D[3] >= filter_settings['max_height_thres']*image_height + + ignore |= (anno['truncation'] >=0 and anno['truncation'] >= filter_settings['truncation_thres']) + ignore |= (anno['visibility'] >= 0 and anno['visibility'] <= filter_settings['visibility_thres']) + + if 'ignore_names' in filter_settings: + ignore |= anno['category_name'] in filter_settings['ignore_names'] + + return ignore + + +def simple_register(dataset_name, filter_settings, filter_empty=True, datasets_root_path=None): + + if datasets_root_path is None: + datasets_root_path = path_to_json = os.path.join('datasets', 'Omni3D',) + + path_to_json = os.path.join(datasets_root_path, dataset_name + '.json') + path_to_image_root = 'datasets' + + DatasetCatalog.register(dataset_name, lambda: load_omni3d_json( + path_to_json, 
path_to_image_root, + dataset_name, filter_settings, filter_empty=filter_empty + )) + + MetadataCatalog.get(dataset_name).set(json_file=path_to_json, image_root=path_to_image_root, evaluator_type="coco") + +class Omni3D(COCO): + ''' + Class for COCO-like dataset object. Not inherently related to + use with Detectron2 or training per se. + ''' + + def __init__(self, annotation_files, filter_settings=None): + + # load dataset + self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() + self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) + + self.idx_without_ground = set(pd.read_csv('datasets/no_ground_idx.csv')['img_id'].values) + + if isinstance(annotation_files, str): + annotation_files = [annotation_files,] + + cats_ids_master = [] + cats_master = [] + + for annotation_file in annotation_files: + + _, name, _ = util.file_parts(annotation_file) + + logger.info('loading {} annotations into memory...'.format(name)) + dataset = json.load(open(annotation_file, 'r')) + assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) + + if type(dataset['info']) == list: + dataset['info'] = dataset['info'][0] + + dataset['info']['known_category_ids'] = [cat['id'] for cat in dataset['categories']] + + # first dataset + if len(self.dataset) == 0: + self.dataset = dataset + + # concatenate datasets + else: + + if type(self.dataset['info']) == dict: + self.dataset['info'] = [self.dataset['info']] + + self.dataset['info'] += [dataset['info']] + self.dataset['annotations'] += dataset['annotations'] + self.dataset['images'] += dataset['images'] + + # sort through categories + for cat in dataset['categories']: + + if not cat['id'] in cats_ids_master: + cats_ids_master.append(cat['id']) + cats_master.append(cat) + + if filter_settings is None: + + # include every category in the master list + self.dataset['categories'] = [ + cats_master[i] + for i in np.argsort(cats_ids_master) + ] + + else: + + # determine which 
categories we may actually use for filtering. + trainable_cats = set(filter_settings['ignore_names']) | set(filter_settings['category_names']) + + # category names are provided to us + if len(filter_settings['category_names']) > 0: + + self.dataset['categories'] = [ + cats_master[i] + for i in np.argsort(cats_ids_master) + if cats_master[i]['name'] in filter_settings['category_names'] + ] + + # no categories are provided, so assume use ALL available. + else: + + self.dataset['categories'] = [ + cats_master[i] + for i in np.argsort(cats_ids_master) + ] + + filter_settings['category_names'] = [cat['name'] for cat in self.dataset['categories']] + + trainable_cats = trainable_cats | set(filter_settings['category_names']) + + valid_anns = [] + im_height_map = {} + + for im_obj in self.dataset['images']: + im_height_map[im_obj['id']] = im_obj['height'] + + # Filter out annotations + for anno_idx, anno in enumerate(self.dataset['annotations']): + + im_height = im_height_map[anno['image_id']] + + ignore = is_ignore(anno, filter_settings, im_height) + + if filter_settings['trunc_2D_boxes'] and 'bbox2D_trunc' in anno and not np.all([val==-1 for val in anno['bbox2D_trunc']]): + bbox2D = BoxMode.convert(anno['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + elif anno['bbox2D_proj'][0] != -1: + bbox2D = BoxMode.convert(anno['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + elif anno['bbox2D_tight'][0] != -1: + bbox2D = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + else: + continue + + width = bbox2D[2] + height = bbox2D[3] + + self.dataset['annotations'][anno_idx]['area'] = width*height + self.dataset['annotations'][anno_idx]['iscrowd'] = False + self.dataset['annotations'][anno_idx]['ignore'] = ignore + self.dataset['annotations'][anno_idx]['ignore2D'] = ignore + self.dataset['annotations'][anno_idx]['ignore3D'] = ignore + + if filter_settings['modal_2D_boxes'] and anno['bbox2D_tight'][0] != -1: + 
self.dataset['annotations'][anno_idx]['bbox'] = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + else: + self.dataset['annotations'][anno_idx]['bbox'] = bbox2D + + self.dataset['annotations'][anno_idx]['bbox3D'] = anno['bbox3D_cam'] + self.dataset['annotations'][anno_idx]['depth'] = anno['center_cam'][2] + + category_name = anno["category_name"] + + # category is part of trainable categories? + if category_name in trainable_cats: + if not ignore: + valid_anns.append(self.dataset['annotations'][anno_idx]) + + self.dataset['annotations'] = valid_anns + + # append depth image path to each image corresponding to the id + # for img in self.dataset['images']: + # img_id = img['id'] + # img['depth_image_path'] = f'datasets/depth_maps/{img_id}.npz' + # if not img_id in self.idx_without_ground: + # img['ground_image_path'] = f'datasets/ground_maps/{img_id}.npz' + + self.createIndex() + + def info(self): + + infos = self.dataset['info'] + if type(infos) == dict: + infos = [infos] + + for i, info in enumerate(infos): + print('Dataset {}/{}'.format(i+1, infos)) + + for key, value in info.items(): + print('{}: {}'.format(key, value)) + + +def register_and_store_model_metadata(datasets, output_dir, filter_settings=None): + + output_file = os.path.join(output_dir, 'category_meta.json') + + if os.path.exists(output_file): + metadata = util.load_json(output_file) + thing_classes = metadata['thing_classes'] + id_map = metadata['thing_dataset_id_to_contiguous_id'] + + # json saves id map as strings rather than ints + id_map = {int(idA):idB for idA, idB in id_map.items()} + + else: + omni3d_stats = util.load_json(os.path.join('datasets', 'Omni3D', 'stats.json')) + thing_classes = filter_settings['category_names'] + + cat_ids = [] + for cat in thing_classes: + cat_idx = omni3d_stats['category_names'].index(cat) + cat_id = omni3d_stats['categories'][cat_idx]['id'] + cat_ids.append(cat_id) + + cat_order = np.argsort(cat_ids) + cat_ids = [cat_ids[i] for i in 
cat_order] + thing_classes = [thing_classes[i] for i in cat_order] + id_map = {id: i for i, id in enumerate(cat_ids)} + + util.save_json(output_file, { + 'thing_classes': thing_classes, + 'thing_dataset_id_to_contiguous_id': id_map, + }) + + MetadataCatalog.get('omni3d_model').thing_classes = thing_classes + MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id = id_map + + +def load_omni3d_json(json_file, image_root, dataset_name, filter_settings, filter_empty=True): + + # read in the dataset + timer = Timer() + json_file = PathManager.get_local_path(json_file) + with contextlib.redirect_stdout(io.StringIO()): + coco_api = COCO(json_file) + ground_map_files = os.listdir('datasets/ground_maps') + ground_idx = [] + for file in ground_map_files: + try: + idx = int(file.split('.')[0]) + ground_idx.append(idx) + except: + pass + depth_map_files = os.listdir('datasets/depth_maps') + depth_idx = [] + for file in depth_map_files: + try: + idx = int(file.split('.')[0]) + depth_idx.append(idx) + except: + pass + if timer.seconds() > 1: + logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) + + # the global meta information for the full dataset + meta_model = MetadataCatalog.get('omni3d_model') + + # load the meta information + meta = MetadataCatalog.get(dataset_name) + cat_ids = sorted(coco_api.getCatIds(filter_settings['category_names'])) + cats = coco_api.loadCats(cat_ids) + thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] + meta.thing_classes = thing_classes + + # the id mapping must be based on the model! 
+ id_map = meta_model.thing_dataset_id_to_contiguous_id + meta.thing_dataset_id_to_contiguous_id = id_map + + # sort indices for reproducible results + img_ids = sorted(coco_api.imgs.keys()) + imgs = coco_api.loadImgs(img_ids) + anns = [coco_api.imgToAnns[img_id] for img_id in img_ids] + total_num_valid_anns = sum([len(x) for x in anns]) + total_num_anns = len(coco_api.anns) + if total_num_valid_anns < total_num_anns: + logger.info( + f"{json_file} contains {total_num_anns} annotations, but only " + f"{total_num_valid_anns} of them match to images in the file." + ) + + imgs_anns = list(zip(imgs, anns)) + logger.info("Loaded {} images in Omni3D format from {}".format(len(imgs_anns), json_file)) + + dataset_dicts = [] + + # annotation keys to pass along + ann_keys = [ + "bbox", "bbox3D_cam", "bbox2D_proj", "bbox2D_trunc", "bbox2D_tight", + "center_cam", "dimensions", "pose", "R_cam", "category_id", + ] + + # optional per image keys to pass if exists + # this property is unique to KITTI. + img_keys_optional = ['p2'] + + invalid_count = 0 + + for img_dict, anno_dict_list in imgs_anns: + + has_valid_annotation = False + + record = {} + record["file_name"] = os.path.join(image_root, img_dict["file_path"]) + record["dataset_id"] = img_dict["dataset_id"] + record["height"] = img_dict["height"] + record["width"] = img_dict["width"] + record["K"] = img_dict["K"] + + # store optional keys when available + for img_key in img_keys_optional: + if img_key in img_dict: + record[img_key] = img_dict[img_key] + + image_id = record["image_id"] = img_dict["id"] + + if image_id in depth_idx: + record["depth_image_path"] = f'datasets/depth_maps/{image_id}.npz' + if image_id in ground_idx: + record["ground_image_path"] = f'datasets/ground_maps/{image_id}.npz' + objs = [] + # where invalid annotations are removed + for anno in anno_dict_list: + assert anno["image_id"] == image_id + + obj = {key: anno[key] for key in ann_keys if key in anno} + + obj["bbox_mode"] = BoxMode.XYWH_ABS + 
annotation_category_id = obj["category_id"] + + # category is not part of ids and is not in the ignore category? + if not (annotation_category_id in id_map) and not (anno['category_name'] in filter_settings['ignore_names']): + continue + + ignore = is_ignore(anno, filter_settings, img_dict["height"]) + + obj['iscrowd'] = False + obj['ignore'] = ignore + + if filter_settings['modal_2D_boxes'] and 'bbox2D_tight' in anno and anno['bbox2D_tight'][0] != -1: + obj['bbox'] = BoxMode.convert(anno['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + elif filter_settings['trunc_2D_boxes'] and 'bbox2D_trunc' in anno and not np.all([val==-1 for val in anno['bbox2D_trunc']]): + obj['bbox'] = BoxMode.convert(anno['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + elif 'bbox2D_proj' in anno: + obj['bbox'] = BoxMode.convert(anno['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + else: + continue + + obj['pose'] = anno['R_cam'] + + # store category as -1 for ignores! + # OLD Logic + obj["category_id"] = -1 if ignore else id_map[annotation_category_id] + + objs.append(obj) + + has_valid_annotation |= (not ignore) + + if has_valid_annotation or (not filter_empty): + record["annotations"] = objs + dataset_dicts.append(record) + + else: + invalid_count += 1 + + logger.info("Filtered out {}/{} images without valid annotations".format(invalid_count, len(imgs_anns))) + + return dataset_dicts \ No newline at end of file diff --git a/cubercnn/data/filter_ground.py b/cubercnn/data/filter_ground.py new file mode 100644 index 0000000000000000000000000000000000000000..6b3309abc8b8c9010b7576843b26a03d76ea5d27 --- /dev/null +++ b/cubercnn/data/filter_ground.py @@ -0,0 +1,26 @@ +# Basically a hotfix script to avoid having to run the ground segmentation script again +# this will filter out empty ground maps and add the indices to the no_ground_idx.csv file +# It removes ground maps with very little ground, because we assume that it has found something wrong +import os +import torch
+import numpy as np +from tqdm import tqdm +import pandas as pd + +files = os.listdir('datasets/ground_maps') +no_ground = [] +for file in tqdm(files): + mask = np.load(f'datasets/ground_maps/{file}')['mask'] + ground_map = torch.as_tensor(mask)[::5,::5] + nnz = torch.count_nonzero(ground_map).item() + # 100 is determined from looking at the pictures + if nnz < 100: + print(nnz) + print('indices', file[:-4]) + no_ground.append(int(file[:-4])) + os.remove(f'datasets/ground_maps/{file}') + +df = pd.DataFrame(no_ground, columns=['img_id']) +df2 = pd.read_csv('datasets/no_ground_idx.csv') +df = pd.concat([df, df2]) +df.to_csv('datasets/no_ground_idx.csv', index=False) \ No newline at end of file diff --git a/cubercnn/data/generate_depth_maps.py b/cubercnn/data/generate_depth_maps.py new file mode 100644 index 0000000000000000000000000000000000000000..d41fd0e1ab5ee0543cfb98a6baccc6f6b07927a3 --- /dev/null +++ b/cubercnn/data/generate_depth_maps.py @@ -0,0 +1,86 @@ +import torch +import cv2 +# might need to export PYTHONPATH=/work3/$username/3dod/ +from depth.metric_depth.depth_anything_v2.dpt import DepthAnythingV2 +def depth_of_images(encoder='vitl', dataset='hypersim', max_depth=20, device='cpu'): + """ + This function takes in a list of images and returns the depth of the images + + encoder = 'vitl' # or 'vits', 'vitb' + dataset = 'hypersim' # 'hypersim' for indoor model, 'vkitti' for outdoor model + max_depth = 20 # 20 for indoor model, 80 for outdoor model + """ + model_configs = { + 'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]}, + 'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]}, + 'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]} + } + + model = DepthAnythingV2(**{**model_configs[encoder], 'max_depth': max_depth}) + model.load_state_dict(torch.load(f'depth/checkpoints/depth_anything_v2_metric_{dataset}_{encoder}.pth', map_location=device, weights_only=False)) + 
model.eval() + model.to(device) + return model + +def init_dataset(): + ''' dataloader stuff. + I'm not sure what the difference between the omni3d dataset and load omni3D json functions are. this is a 3rd alternative to this. The train script calls something similar to this.''' + cfg, filter_settings = get_config_and_filter_settings() + + dataset_names = ['SUNRGBD_train','SUNRGBD_val','SUNRGBD_test', 'KITTI_train', 'KITTI_val', 'KITTI_test',] + dataset_paths_to_json = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names] + # for dataset_name in dataset_names: + # simple_register(dataset_name, filter_settings, filter_empty=True) + + # Get Image and annotations + datasets = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings) + data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings) + + thing_classes = MetadataCatalog.get('omni3d_model').thing_classes + dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id + + infos = datasets.dataset['info'] + + dataset_id_to_unknown_cats = {} + possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1)) + + dataset_id_to_src = {} + + for info in infos: + dataset_id = info['id'] + known_category_training_ids = set() + + if not dataset_id in dataset_id_to_src: + dataset_id_to_src[dataset_id] = info['source'] + + for id in info['known_category_ids']: + if id in dataset_id_to_contiguous_id: + known_category_training_ids.add(dataset_id_to_contiguous_id[id]) + + # determine and store the unknown categories. 
+ unknown_categories = possible_categories - known_category_training_ids + dataset_id_to_unknown_cats[dataset_id] = unknown_categories + + return datasets + +if __name__ == '__main__': + import os + from detectron2.data.catalog import MetadataCatalog + import numpy as np + + from cubercnn import data + from priors import get_config_and_filter_settings + + from tqdm import tqdm + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + datasets = init_dataset() + + os.makedirs('datasets/depth_maps', exist_ok=True) + + model = depth_of_images(device=device) + + for img_id, img_info in tqdm(datasets.imgs.items()): + file_path = img_info['file_path'] + img = cv2.imread('datasets/'+file_path) + depth = model.infer_image(img) # HxW depth map in meters in numpy + np.savez_compressed(f'datasets/depth_maps/{img_id}.npz', depth=depth) \ No newline at end of file diff --git a/cubercnn/data/generate_ground_segmentations.py b/cubercnn/data/generate_ground_segmentations.py new file mode 100644 index 0000000000000000000000000000000000000000..241a59daf1e6d946f7b87fe9f93263cf424b4acc --- /dev/null +++ b/cubercnn/data/generate_ground_segmentations.py @@ -0,0 +1,206 @@ +from segment_anything import sam_model_registry +from segment_anything.modeling import Sam +import os + +def init_segmentation(device='cpu') -> Sam: + # 1) first cd into the segment_anything and pip install -e . 
+ # to get the model, stay in the root folder and run the download_model.sh + # 2) chmod +x download_model.sh && ./download_model.sh + # the largest model: https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth + # this is the smallest model + if os.path.exists('sam-hq/sam_hq_vit_b.pth'): + sam_checkpoint = "sam-hq/sam_hq_vit_b.pth" + model_type = "vit_b" + else: + sam_checkpoint = "sam-hq/sam_hq_vit_tiny.pth" + model_type = "vit_tiny" + print(f'SAM device: {device}, model_type: {model_type}') + sam = sam_model_registry[model_type](checkpoint=sam_checkpoint) + sam.to(device=device) + return sam + + +if __name__ == '__main__': + from segment_anything.utils.transforms import ResizeLongestSide + import numpy as np + import pandas as pd + import torch + import torchvision.transforms as T2 + from matplotlib import pyplot as plt + from PIL import Image + from tqdm import tqdm + from torchvision.ops import box_convert + + import groundingdino.datasets.transforms as T + from cubercnn import data + from detectron2.data.catalog import MetadataCatalog + from groundingdino.util.inference import load_image, load_model, predict + from priors import get_config_and_filter_settings + import supervision as sv + + def init_dataset(): + ''' dataloader stuff. + currently not used anywhere, because I'm not sure what the difference between the omni3d dataset and load omni3D json functions are. this is a 3rd alternative to this.
The train script calls something similar to this.''' + cfg, filter_settings = get_config_and_filter_settings() + + dataset_names = ['SUNRGBD_train','SUNRGBD_val','SUNRGBD_test', 'KITTI_train', 'KITTI_val', 'KITTI_test',] + dataset_paths_to_json = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names] + # for dataset_name in dataset_names: + # simple_register(dataset_name, filter_settings, filter_empty=True) + + # Get Image and annotations + datasets = data.Omni3D(dataset_paths_to_json, filter_settings=filter_settings) + data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings) + + + thing_classes = MetadataCatalog.get('omni3d_model').thing_classes + dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id + + infos = datasets.dataset['info'] + + dataset_id_to_unknown_cats = {} + possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1)) + + dataset_id_to_src = {} + + for info in infos: + dataset_id = info['id'] + known_category_training_ids = set() + + if not dataset_id in dataset_id_to_src: + dataset_id_to_src[dataset_id] = info['source'] + + for id in info['known_category_ids']: + if id in dataset_id_to_contiguous_id: + known_category_training_ids.add(dataset_id_to_contiguous_id[id]) + + # determine and store the unknown categories. 
+ unknown_categories = possible_categories - known_category_training_ids + dataset_id_to_unknown_cats[dataset_id] = unknown_categories + + return datasets + + def load_image(image_path: str, device) -> tuple[torch.Tensor, torch.Tensor]: + transform = T.Compose( + [ + # T.RandomResize([800], max_size=1333), + T.ToTensor(), + T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]), + ] + ) + transform2 = T2.ToTensor() + image_source = Image.open(image_path).convert("RGB") + image = transform2(image_source).to(device) + image_transformed, _ = transform(image_source, None) + return image, image_transformed.to(device) + + + def annotate(image_source: np.ndarray, boxes: torch.Tensor, logits: torch.Tensor, phrases: list[str]) -> np.ndarray: + """ + This function annotates an image with bounding boxes and labels. + + Parameters: + image_source (np.ndarray): The source image to be annotated. + boxes (torch.Tensor): A tensor containing bounding box coordinates. + logits (torch.Tensor): A tensor containing confidence scores for each bounding box. + phrases (List[str]): A list of labels for each bounding box. + + Returns: + np.ndarray: The annotated image. 
+ """ + h, w, _ = image_source.shape + boxes = boxes * torch.Tensor([w, h, w, h]) + xyxy = box_convert(boxes=boxes, in_fmt="cxcywh", out_fmt="xyxy").numpy() + detections = sv.Detections(xyxy=xyxy) + + labels = [ + f"{phrase} {logit:.2f}" + for phrase, logit + in zip(phrases, logits) + ] + + box_annotator = sv.BoxAnnotator() + # annotated_frame = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR) + annotated_frame = image_source.copy() + annotated_frame = box_annotator.annotate(scene=annotated_frame, detections=detections, labels=labels) + return annotated_frame + + + datasets = init_dataset() + + device = 'cuda' if torch.cuda.is_available() else 'cpu' + # model.to(device) + + segmentor = init_segmentation(device=device) + + os.makedirs('datasets/ground_maps', exist_ok=True) + model = load_model("GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py", "GroundingDINO/weights/groundingdino_swint_ogc.pth", device=device) + TEXT_PROMPT = "ground" + BOX_TRESHOLD = 0.35 + TEXT_TRESHOLD = 0.25 + + noground = 0 + no_ground_idx = [] + + # **** to annotate full dataset **** + for img_id, img_info in tqdm(datasets.imgs.items()): + file_path = img_info['file_path'] + w = img_info['width'] + h = img_info['height'] + # **** to annotate full dataset **** + # **** to annotate demo images **** + # for img_id in tqdm(os.listdir('datasets/coco_examples')): + # file_path = 'coco_examples/'+img_id + image_source, image = load_image('datasets/'+file_path, device=device) + # **** to annotate demo images **** + + boxes, logits, phrases = predict( + model=model, + image=image, + caption=TEXT_PROMPT, + box_threshold=BOX_TRESHOLD, + text_threshold=TEXT_TRESHOLD, + device=device + ) + if len(boxes) == 0: + print(f"No ground found for {img_id}") + noground += 1 + # save a ground map that is all zeros + no_ground_idx.append(img_id) + continue + # only want box corresponding to max logit + max_logit_idx = torch.argmax(logits) + logit = logits[max_logit_idx].unsqueeze(0) + box = 
boxes[max_logit_idx].unsqueeze(0) + phrase = [phrases[max_logit_idx]] + + _, h, w = image_source.shape + box = box * torch.tensor([w, h, w, h], device=device) + xyxy = box_convert(boxes=box, in_fmt="cxcywh", out_fmt="xyxy") + + image = image.unsqueeze(0) + org_shape = image.shape[-2:] + resize_transform = ResizeLongestSide(segmentor.image_encoder.img_size) + batched_input = [] + images = resize_transform.apply_image_torch(image*1.0)# .permute(2, 0, 1).contiguous() + for image, boxes in zip(images, xyxy): + transformed_boxes = resize_transform.apply_boxes_torch(boxes, org_shape) # Bx4 + batched_input.append({'image': image, 'boxes': transformed_boxes, 'original_size':org_shape}) + + seg_out = segmentor(batched_input, multimask_output=False) + mask_per_image = seg_out[0]['masks'] + + nnz = torch.count_nonzero(mask_per_image, dim=(-2, -1)) + indices = torch.nonzero(nnz <= 1000).flatten() + if len(indices) > 0: + noground += 1 + # save a ground map that is all zeros + no_ground_idx.append(img_id) + + np.savez_compressed(f'datasets/ground_maps/{img_id}.npz', mask=mask_per_image.cpu()[0,0,:,:].numpy()) + + print(f"Could not find ground for {noground} images") + + + df = pd.DataFrame(no_ground_idx, columns=['img_id']) + df.to_csv('datasets/no_ground_idx.csv', index=False) \ No newline at end of file diff --git a/cubercnn/evaluation/__init__.py b/cubercnn/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fe258d36e153c92a8bb4eab214ae9955dbcc1135 --- /dev/null +++ b/cubercnn/evaluation/__init__.py @@ -0,0 +1 @@ +from .omni3d_evaluation import * \ No newline at end of file diff --git a/cubercnn/evaluation/omni3d_evaluation.py b/cubercnn/evaluation/omni3d_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..b9df152ad0f13078768d031f4b3ef677b466e208 --- /dev/null +++ b/cubercnn/evaluation/omni3d_evaluation.py @@ -0,0 +1,1706 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +import contextlib +import copy +import datetime +import io +import itertools +import json +import logging +import os +import time +from collections import defaultdict +from typing import List, Union +from typing import Tuple + +import numpy as np +import pycocotools.mask as maskUtils +import torch +from detectron2.utils.memory import retry_if_cuda_oom +from detectron2.data import MetadataCatalog, DatasetCatalog +from detectron2.evaluation.coco_evaluation import COCOEvaluator +from detectron2.structures import BoxMode +from detectron2.utils.file_io import PathManager +from detectron2.utils.logger import create_small_table, log_every_n_seconds +from pycocotools.cocoeval import COCOeval +from tabulate import tabulate +from detectron2.utils.comm import get_world_size, is_main_process +import detectron2.utils.comm as comm +from detectron2.evaluation import ( + DatasetEvaluators, inference_context, DatasetEvaluator +) +from collections import OrderedDict, abc +from contextlib import ExitStack, contextmanager +from torch import nn + +import logging +from cubercnn.data import Omni3D +from pytorch3d import _C +import torch.nn.functional as F + +from pytorch3d.ops.iou_box3d import _box_planes, _box_triangles + +import cubercnn.vis.logperf as utils_logperf +from cubercnn.data import ( + get_omni3d_categories, + simple_register +) + +""" +This file contains +* Omni3DEvaluationHelper: a helper object to accumulate and summarize evaluation results +* Omni3DEval: a wrapper around COCOeval to perform 3D bounding evaluation in the detection setting +* Omni3DEvaluator: a wrapper around COCOEvaluator to collect results on each dataset +* Omni3DParams: parameters for the evaluation API +""" + +logger = logging.getLogger(__name__) + +# Defines the max cross of len(dts) * len(gts) +# which we will attempt to compute on a GPU. +# Fallback is safer computation on a CPU. +# 0 is disabled on GPU. 
+MAX_DTS_CROSS_GTS_FOR_IOU3D = 0 + + +def _check_coplanar(boxes: torch.Tensor, eps: float = 1e-4) -> torch.BoolTensor: + """ + Checks that plane vertices are coplanar. + Returns a bool tensor of size B, where True indicates a box is coplanar. + """ + faces = torch.tensor(_box_planes, dtype=torch.int64, device=boxes.device) + verts = boxes.index_select(index=faces.view(-1), dim=1) + B = boxes.shape[0] + P, V = faces.shape + # (B, P, 4, 3) -> (B, P, 3) + v0, v1, v2, v3 = verts.reshape(B, P, V, 3).unbind(2) + + # Compute the normal + e0 = F.normalize(v1 - v0, dim=-1) + e1 = F.normalize(v2 - v0, dim=-1) + normal = F.normalize(torch.cross(e0, e1, dim=-1), dim=-1) + + # Check the fourth vertex is also on the same plane + mat1 = (v3 - v0).view(B, 1, -1) # (B, 1, P*3) + mat2 = normal.view(B, -1, 1) # (B, P*3, 1) + + return (mat1.bmm(mat2).abs() < eps).view(B) + + +def _check_nonzero(boxes: torch.Tensor, eps: float = 1e-8) -> torch.BoolTensor: + """ + Checks that the sides of the box have a non zero area. + Returns a bool tensor of size B, where True indicates a box is nonzero. + """ + faces = torch.tensor(_box_triangles, dtype=torch.int64, device=boxes.device) + verts = boxes.index_select(index=faces.view(-1), dim=1) + B = boxes.shape[0] + T, V = faces.shape + # (B, T, 3, 3) -> (B, T, 3) + v0, v1, v2 = verts.reshape(B, T, V, 3).unbind(2) + + normals = torch.cross(v1 - v0, v2 - v0, dim=-1) # (B, T, 3) + face_areas = normals.norm(dim=-1) / 2 + + return (face_areas > eps).all(1).view(B) + +def box3d_overlap( + boxes_dt: torch.Tensor, boxes_gt: torch.Tensor, + eps_coplanar: float = 1e-4, eps_nonzero: float = 1e-8 +) -> torch.Tensor: + """ + Computes the intersection of 3D boxes_dt and boxes_gt. + + Inputs boxes_dt, boxes_gt are tensors of shape (B, 8, 3) + (where B doesn't have to be the same for boxes_dt and boxes_gt), + containing the 8 corners of the boxes, as follows: + + (4) +---------+. (5) + | ` . | ` . + | (0) +---+-----+ (1) + | | | | + (7) +-----+---+. (6)| + ` . 
| ` . | + (3) ` +---------+ (2) + + + NOTE: Throughout this implementation, we assume that boxes + are defined by their 8 corners exactly in the order specified in the + diagram above for the function to give correct results. In addition + the vertices on each plane must be coplanar. + As an alternative to the diagram, this is a unit bounding + box which has the correct vertex ordering: + + box_corner_vertices = [ + [0, 0, 0], + [1, 0, 0], + [1, 1, 0], + [0, 1, 0], + [0, 0, 1], + [1, 0, 1], + [1, 1, 1], + [0, 1, 1], + ] + + Args: + boxes_dt: tensor of shape (N, 8, 3) of the coordinates of the 1st boxes + boxes_gt: tensor of shape (M, 8, 3) of the coordinates of the 2nd boxes + Returns: + iou: (N, M) tensor of the intersection over union which is + defined as: `iou = vol / (vol1 + vol2 - vol)` + """ + # Make sure predictions are coplanar and nonzero + invalid_coplanar = ~_check_coplanar(boxes_dt, eps=eps_coplanar) + invalid_nonzero = ~_check_nonzero(boxes_dt, eps=eps_nonzero) + + ious = _C.iou_box3d(boxes_dt, boxes_gt)[1] + + # Offending boxes are set to zero IoU + if invalid_coplanar.any(): + ious[invalid_coplanar] = 0 + print('Warning: skipping {:d} non-coplanar boxes at eval.'.format(int(invalid_coplanar.float().sum()))) + + if invalid_nonzero.any(): + ious[invalid_nonzero] = 0 + print('Warning: skipping {:d} zero volume boxes at eval.'.format(int(invalid_nonzero.float().sum()))) + + return ious + +class Omni3DEvaluationHelper: + def __init__(self, + dataset_names, + filter_settings, + output_folder, + iter_label='-', + only_2d=False, + ): + """ + A helper class to initialize, evaluate and summarize Omni3D metrics. + + The evaluator relies on the detectron2 MetadataCatalog for keeping track + of category names and contiguous IDs. Hence, it is important to set + these variables appropriately. + + # (list[str]) the category names in their contiguous order + MetadataCatalog.get('omni3d_model').thing_classes = ... 
+ + # (dict[int: int]) the mapping from Omni3D category IDs to the contiguous order + MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id + + Args: + dataset_names (list[str]): the individual dataset splits for evaluation + filter_settings (dict): the filter settings used for evaluation, see + cubercnn/data/datasets.py get_filter_settings_from_cfg + output_folder (str): the output folder where results can be stored to disk. + iter_label (str): an optional iteration/label used within the summary + only_2d (bool): whether the evaluation mode should be 2D or 2D and 3D. + """ + + self.dataset_names = dataset_names + self.filter_settings = filter_settings + self.output_folder = output_folder + self.iter_label = iter_label + self.only_2d = only_2d + + # Each dataset evaluator is stored here + self.evaluators = OrderedDict() + + # These are the main evaluation results + self.results = OrderedDict() + + # These store per-dataset results to be printed + self.results_analysis = OrderedDict() + self.results_omni3d = OrderedDict() + + self.overall_imgIds = set() + self.overall_catIds = set() + + # These store the evaluations for each category and area, + # concatenated from ALL evaluated datasets. Doing so avoids + # the need to re-compute them when accumulating results.
+ self.evals_per_cat_area2D = {} + self.evals_per_cat_area3D = {} + + self.output_folders = { + dataset_name: os.path.join(self.output_folder, dataset_name) + for dataset_name in dataset_names + } + + for dataset_name in self.dataset_names: + + # register any datasets that need it + if MetadataCatalog.get(dataset_name).get('json_file') is None: + simple_register(dataset_name, filter_settings, filter_empty=False) + + # create an individual dataset evaluator + self.evaluators[dataset_name] = Omni3DEvaluator( + dataset_name, output_dir=self.output_folders[dataset_name], + filter_settings=self.filter_settings, only_2d=self.only_2d, + eval_prox=('Objectron' in dataset_name or 'SUNRGBD' in dataset_name), + distributed=False, # actual evaluation should be single process + ) + + self.evaluators[dataset_name].reset() + self.overall_imgIds.update(set(self.evaluators[dataset_name]._omni_api.getImgIds())) + self.overall_catIds.update(set(self.evaluators[dataset_name]._omni_api.getCatIds())) + + def add_predictions(self, dataset_name, predictions): + """ + Adds predictions to the evaluator for dataset_name. This can be any number of + predictions, including all predictions passed in at once or in batches. 
+ + Args: + dataset_name (str): the dataset split name which the predictions belong to + predictions (list[dict]): each item in the list is a dict as follows: + + { + "image_id": the unique image identifier from Omni3D, + "K": 3x3 intrinsics matrix for the image, + "width": image width, + "height": image height, + "instances": [ + { + "image_id": the unique image identifier from Omni3D, + "category_id": the contiguous category prediction IDs, + which can be mapped from Omni3D's category ID's using + MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id + "bbox": [float] 2D box as [x1, y1, x2, y2] used for IoU2D, + "score": the confidence score for the object, + "depth": the depth of the center of the object, + "bbox3D": list[list[float]] 8x3 corner vertices used for IoU3D, + } + ... + ] + } + """ + # concatenate incoming predictions + self.evaluators[dataset_name]._predictions += predictions + + def save_predictions(self, dataset_name): + """ + Saves the predictions from dataset_name to disk, in a self.output_folder. + + Args: + dataset_name (str): the dataset split name which should be saved. + """ + # save predictions to disk + output_folder_dataset = self.output_folders[dataset_name] + PathManager.mkdirs(output_folder_dataset) + file_path = os.path.join(output_folder_dataset, "instances_predictions.pth") + with PathManager.open(file_path, "wb") as f: + torch.save(self.evaluators[dataset_name]._predictions, f) + + def evaluate(self, dataset_name): + """ + Runs the evaluation for an individual dataset split, assuming all + predictions have been passed in. + + Args: + dataset_name (str): the dataset split name which should be evaluated.
+ """ + + if not dataset_name in self.results: + + # run evaluation and cache + self.results[dataset_name] = self.evaluators[dataset_name].evaluate() + + results = self.results[dataset_name] + + logger.info('\n'+results['log_str_2D'].replace('mode=2D', '{} iter={} mode=2D'.format(dataset_name, self.iter_label))) + + # store the partially accumulated evaluations per category per area + for key, item in results['bbox_2D_evals_per_cat_area'].items(): + if not key in self.evals_per_cat_area2D: + self.evals_per_cat_area2D[key] = [] + self.evals_per_cat_area2D[key] += item + + if not self.only_2d: + # store the partially accumulated evaluations per category per area + for key, item in results['bbox_3D_evals_per_cat_area'].items(): + if not key in self.evals_per_cat_area3D: + self.evals_per_cat_area3D[key] = [] + self.evals_per_cat_area3D[key] += item + + logger.info('\n'+results['log_str_3D'].replace('mode=3D', '{} iter={} mode=3D'.format(dataset_name, self.iter_label))) + + # full model category names + category_names = self.filter_settings['category_names'] + + # The set of categories present in the dataset; there should be no duplicates + categories = {cat for cat in category_names if 'AP-{}'.format(cat) in results['bbox_2D']} + assert len(categories) == len(set(categories)) + + # default are all NaN + general_2D, general_3D, omni_2D, omni_3D = (np.nan,) * 4 + + # 2D and 3D performance for categories in dataset; and log + general_2D = np.mean([results['bbox_2D']['AP-{}'.format(cat)] for cat in categories]) + if not self.only_2d: + general_3D = np.mean([results['bbox_3D']['AP-{}'.format(cat)] for cat in categories]) + + # 2D and 3D performance on Omni3D categories + omni3d_dataset_categories = get_omni3d_categories(dataset_name) # dataset-specific categories + if len(omni3d_dataset_categories - categories) == 0: # omni3d_dataset_categories is a subset of categories + omni_2D = np.mean([results['bbox_2D']['AP-{}'.format(cat)] for cat in omni3d_dataset_categories]) + if 
not self.only_2d: + omni_3D = np.mean([results['bbox_3D']['AP-{}'.format(cat)] for cat in omni3d_dataset_categories]) + + self.results_omni3d[dataset_name] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D} + + # Performance analysis + extras_AP15, extras_AP25, extras_AP50, extras_APn, extras_APm, extras_APf = (np.nan,)*6 + if not self.only_2d: + extras_AP15 = results['bbox_3D']['AP15'] + extras_AP25 = results['bbox_3D']['AP25'] + extras_AP50 = results['bbox_3D']['AP50'] + extras_APn = results['bbox_3D']['APn'] + extras_APm = results['bbox_3D']['APm'] + extras_APf = results['bbox_3D']['APf'] + + self.results_analysis[dataset_name] = { + "iters": self.iter_label, + "AP2D": general_2D, "AP3D": general_3D, + "AP3D@15": extras_AP15, "AP3D@25": extras_AP25, "AP3D@50": extras_AP50, + "AP3D-N": extras_APn, "AP3D-M": extras_APm, "AP3D-F": extras_APf + } + + # Performance per category + results_cat = OrderedDict() + for cat in category_names: + cat_2D, cat_3D = (np.nan,) * 2 + if 'AP-{}'.format(cat) in results['bbox_2D']: + cat_2D = results['bbox_2D']['AP-{}'.format(cat)] + if not self.only_2d: + cat_3D = results['bbox_3D']['AP-{}'.format(cat)] + if not np.isnan(cat_2D) or not np.isnan(cat_3D): + results_cat[cat] = {"AP2D": cat_2D, "AP3D": cat_3D} + utils_logperf.print_ap_category_histogram(dataset_name, results_cat) + + def summarize_all(self,): + ''' + Report collective metrics when possible for the Omni3D dataset. + This uses pre-computed evaluation results from each dataset, + which were aggregated and cached while evaluating individually. + This process simply re-accumulates and summarizes them.
+ ''' + + # First, double check that we have all the evaluations + for dataset_name in self.dataset_names: + if not dataset_name in self.results: + self.evaluate(dataset_name) + + thing_classes = MetadataCatalog.get('omni3d_model').thing_classes + catId2contiguous = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id + ordered_things = [thing_classes[catId2contiguous[cid]] for cid in self.overall_catIds] + categories = set(ordered_things) + + evaluator2D = Omni3Deval(mode='2D') + evaluator2D.params.catIds = list(self.overall_catIds) + evaluator2D.params.imgIds = list(self.overall_imgIds) + evaluator2D.evalImgs = True + evaluator2D.evals_per_cat_area = self.evals_per_cat_area2D + evaluator2D._paramsEval = copy.deepcopy(evaluator2D.params) + evaluator2D.accumulate() + summarize_str2D = evaluator2D.summarize() + + precisions = evaluator2D.eval['precision'] + + metrics = ["AP", "AP50", "AP75", "AP95", "APs", "APm", "APl"] + + results2D = { + metric: float( + evaluator2D.stats[idx] * 100 if evaluator2D.stats[idx] >= 0 else "nan" + ) + for idx, metric in enumerate(metrics) + } + + for idx, name in enumerate(ordered_things): + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float("nan") + results2D.update({"AP-" + "{}".format(name): float(ap * 100)}) + + if not self.only_2d: + evaluator3D = Omni3Deval(mode='3D') + evaluator3D.params.catIds = list(self.overall_catIds) + evaluator3D.params.imgIds = list(self.overall_imgIds) + evaluator3D.evalImgs = True + evaluator3D.evals_per_cat_area = self.evals_per_cat_area3D + evaluator3D._paramsEval = copy.deepcopy(evaluator3D.params) + evaluator3D.accumulate() + summarize_str3D = evaluator3D.summarize() + + precisions = evaluator3D.eval['precision'] + + metrics = ["AP", "AP15", "AP25", "AP50", "APn", "APm", "APf"] + + results3D = { + metric: float( + evaluator3D.stats[idx] * 100 if evaluator3D.stats[idx] >= 0 else "nan" + ) + for idx, 
metric in enumerate(metrics) + } + + for idx, name in enumerate(ordered_things): + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float("nan") + results3D.update({"AP-" + "{}".format(name): float(ap * 100)}) + + + # All concat categories + general_2D, general_3D = (np.nan,) * 2 + + general_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in categories]) + if not self.only_2d: + general_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in categories]) + + # Analysis performance + extras_AP15, extras_AP25, extras_AP50, extras_APn, extras_APm, extras_APf = (np.nan,) * 6 + if not self.only_2d: + extras_AP15 = results3D['AP15'] + extras_AP25 = results3D['AP25'] + extras_AP50 = results3D['AP50'] + extras_APn = results3D['APn'] + extras_APm = results3D['APm'] + extras_APf = results3D['APf'] + + self.results_analysis[""] = { + "iters": self.iter_label, + "AP2D": general_2D, "AP3D": general_3D, + "AP3D@15": extras_AP15, "AP3D@25": extras_AP25, "AP3D@50": extras_AP50, + "AP3D-N": extras_APn, "AP3D-M": extras_APm, "AP3D-F": extras_APf + } + + # Omni3D Outdoor performance + omni_2D, omni_3D = (np.nan,) * 2 + + omni3d_outdoor_categories = get_omni3d_categories("omni3d_out") + if len(omni3d_outdoor_categories - categories) == 0: + omni_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in omni3d_outdoor_categories]) + if not self.only_2d: + omni_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in omni3d_outdoor_categories]) + + self.results_omni3d["Omni3D_Out"] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D} + + # Omni3D Indoor performance + omni_2D, omni_3D = (np.nan,) * 2 + + omni3d_indoor_categories = get_omni3d_categories("omni3d_in") + if len(omni3d_indoor_categories - categories) == 0: + omni_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in omni3d_indoor_categories]) + if not self.only_2d: + omni_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in 
omni3d_indoor_categories]) + + self.results_omni3d["Omni3D_In"] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D} + + # Omni3D performance + omni_2D, omni_3D = (np.nan,) * 2 + + omni3d_categories = get_omni3d_categories("omni3d") + if len(omni3d_categories - categories) == 0: + omni_2D = np.mean([results2D['AP-{}'.format(cat)] for cat in omni3d_categories]) + if not self.only_2d: + omni_3D = np.mean([results3D['AP-{}'.format(cat)] for cat in omni3d_categories]) + + self.results_omni3d["Omni3D"] = {"iters": self.iter_label, "AP2D": omni_2D, "AP3D": omni_3D} + + # Per-category performance for the cumulative datasets + results_cat = OrderedDict() + for cat in self.filter_settings['category_names']: + cat_2D, cat_3D = (np.nan,) * 2 + if 'AP-{}'.format(cat) in results2D: + cat_2D = results2D['AP-{}'.format(cat)] + if not self.only_2d: + cat_3D = results3D['AP-{}'.format(cat)] + if not np.isnan(cat_2D) or not np.isnan(cat_3D): + results_cat[cat] = {"AP2D": cat_2D, "AP3D": cat_3D} + + utils_logperf.print_ap_category_histogram("", results_cat) + utils_logperf.print_ap_analysis_histogram(self.results_analysis) + utils_logperf.print_ap_omni_histogram(self.results_omni3d) + + +def inference_on_dataset(model, data_loader): + """ + Run model on the data_loader. + Also benchmark the inference speed of `model.__call__` accurately. + The model will be used in eval mode. + + Args: + model (callable): a callable which takes an object from + `data_loader` and returns some outputs. + + If it's an nn.Module, it will be temporarily set to `eval` mode. + If you wish to evaluate a model in `training` mode instead, you can + wrap the given model and override its behavior of `.eval()` and `.train()`. + data_loader: an iterable object with a length. + The elements it generates will be the inputs to the model. 
def inference_on_dataset(model, data_loader):
    """
    Run model on the data_loader.
    Also benchmark the inference speed of `model.__call__` accurately.
    The model will be used in eval mode.

    Args:
        model (callable): a callable which takes an object from
            `data_loader` and returns some outputs.

            If it's an nn.Module, it will be temporarily set to `eval` mode.
            If you wish to evaluate a model in `training` mode instead, you can
            wrap the given model and override its behavior of `.eval()` and `.train()`.
        data_loader: an iterable object with a length.
            The elements it generates will be the inputs to the model.

    Returns:
        list[dict]: one COCO-format prediction record per image; when
        distributed, the gathered list on the main process and ``[]`` on
        all other ranks.
    """

    num_devices = get_world_size()
    distributed = num_devices > 1
    logger.info("Start inference on {} batches".format(len(data_loader)))

    total = len(data_loader)  # inference data loader must have a fixed length

    # The first iterations are a warmup and are excluded from the reported
    # timings (all accumulators are reset once idx reaches num_warmup).
    num_warmup = min(5, total - 1)
    start_time = time.perf_counter()
    total_data_time = 0
    total_compute_time = 0
    total_eval_time = 0

    inference_json = []

    with ExitStack() as stack:
        # disable training-mode layers (e.g. dropout) and gradient tracking
        if isinstance(model, nn.Module):
            stack.enter_context(inference_context(model))
        stack.enter_context(torch.no_grad())

        start_data_time = time.perf_counter()
        for idx, inputs in enumerate(data_loader):
            total_data_time += time.perf_counter() - start_data_time
            if idx == num_warmup:
                # warmup done: restart the clock and the accumulators
                start_time = time.perf_counter()
                total_data_time = 0
                total_compute_time = 0
                total_eval_time = 0

            start_compute_time = time.perf_counter()
            outputs = model(inputs)
            if torch.cuda.is_available():
                # wait for queued CUDA kernels so compute time is measured accurately
                torch.cuda.synchronize()
            total_compute_time += time.perf_counter() - start_compute_time

            start_eval_time = time.perf_counter()

            for input, output in zip(inputs, outputs):

                prediction = {
                    "image_id": input["image_id"],
                    "K": input["K"],
                    "width": input["width"],
                    "height": input["height"],
                }

                # convert to json format
                instances = output["instances"].to('cpu')
                prediction["instances"] = instances_to_coco_json(instances, input["image_id"])

                # store in overall predictions
                inference_json.append(prediction)

            total_eval_time += time.perf_counter() - start_eval_time

            # per-iteration averages, excluding the warmup iterations
            iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup)
            data_seconds_per_iter = total_data_time / iters_after_start
            compute_seconds_per_iter = total_compute_time / iters_after_start
            eval_seconds_per_iter = total_eval_time / iters_after_start
            total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start
            if idx >= num_warmup * 2 or compute_seconds_per_iter > 5:
                eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1)))
                log_every_n_seconds(
                    logging.INFO,
                    (
                        f"Inference done {idx + 1}/{total}. "
                        f"Dataloading: {data_seconds_per_iter:.4f} s/iter. "
                        f"Inference: {compute_seconds_per_iter:.4f} s/iter. "
                        f"Eval: {eval_seconds_per_iter:.4f} s/iter. "
                        f"Total: {total_seconds_per_iter:.4f} s/iter. "
                        f"ETA={eta}"
                    ),
                    n=5,
                )
            start_data_time = time.perf_counter()

    # Measure the time only for this worker (before the synchronization barrier)
    total_time = time.perf_counter() - start_time
    total_time_str = str(datetime.timedelta(seconds=total_time))
    # NOTE this format is parsed by grep
    logger.info(
        "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format(
            total_time_str, total_time / (total - num_warmup), num_devices
        )
    )
    total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time)))
    logger.info(
        "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format(
            total_compute_time_str, total_compute_time / (total - num_warmup), num_devices
        )
    )

    if distributed:
        # gather per-rank prediction lists on the main process and flatten
        comm.synchronize()
        inference_json = comm.gather(inference_json, dst=0)
        inference_json = list(itertools.chain(*inference_json))

        if not comm.is_main_process():
            return []

    return inference_json
For now, support only for "bbox". + distributed (True): if True, will collect results from all ranks and run evaluation + in the main process. + Otherwise, will only evaluate the results in the current process. + output_dir (str): optional, an output directory to dump all + results predicted on the dataset. The dump contains two files: + 1. "instances_predictions.pth" a file that can be loaded with `torch.load` and + contains all the results in the format they are produced by the model. + 2. "coco_instances_results.json" a json file in COCO's result format. + max_dets_per_image (int): limit on the maximum number of detections per image. + By default in COCO, this limit is to 100, but this can be customized + to be greater, as is needed in evaluation metrics AP fixed and AP pool + (see https://arxiv.org/pdf/2102.01066.pdf) + This doesn't affect keypoint evaluation. + use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP. + Although the results should be very close to the official implementation in COCO + API, it is still recommended to compute results with the official API for use in + papers. The faster implementation also uses more RAM. + eval_prox (bool): whether to perform proximity evaluation. For datasets that are not + exhaustively annotated. + only_2d (bool): evaluates only 2D performance if set to True + filter_settions: settings for the dataset loader. TBD + """ + + self._logger = logging.getLogger(__name__) + self._distributed = distributed + self._output_dir = output_dir + self._use_fast_impl = use_fast_impl + self._eval_prox = eval_prox + self._only_2d = only_2d + self._filter_settings = filter_settings + + # COCOeval requires the limit on the number of detections per image (maxDets) to be a list + # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the + # 3rd element (100) is used as the limit on the number of detections per image when + # evaluating AP. 
COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval, + # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults. + if max_dets_per_image is None: + max_dets_per_image = [1, 10, 100] + + else: + max_dets_per_image = [1, 10, max_dets_per_image] + + self._max_dets_per_image = max_dets_per_image + + self._tasks = tasks + self._cpu_device = torch.device("cpu") + + self._metadata = MetadataCatalog.get(dataset_name) + + json_file = PathManager.get_local_path(self._metadata.json_file) + with contextlib.redirect_stdout(io.StringIO()): + self._omni_api = Omni3D([json_file], filter_settings) + + # Test set json files do not contain annotations (evaluation must be + # performed using the COCO evaluation server). + self._do_evaluation = "annotations" in self._omni_api.dataset + + def process(self, inputs, outputs): + """ + Args: + inputs: the inputs to a model (e.g., GeneralizedRCNN). + It is a list of dict. Each dict corresponds to an image and + contains keys like "height", "width", "file_name", "image_id". + outputs: the outputs of a model. It is a list of dicts with key + "instances" that contains :class:`Instances`. 
+ """ + + # Optional image keys to keep when available + img_keys_optional = ["p2"] + + for input, output in zip(inputs, outputs): + + prediction = { + "image_id": input["image_id"], + "K": input["K"], + "width": input["width"], + "height": input["height"], + } + + # store optional keys when available + for img_key in img_keys_optional: + if img_key in input: + prediction.update({img_key: input[img_key]}) + + # already in COCO format + if type(output["instances"]) == list: + prediction["instances"] = output["instances"] + + # tensor instances format + else: + instances = output["instances"].to(self._cpu_device) + prediction["instances"] = instances_to_coco_json( + instances, input["image_id"] + ) + + if len(prediction) > 1: + self._predictions.append(prediction) + + def _derive_omni_results(self, omni_eval, iou_type, mode, class_names=None): + """ + Derive the desired score numbers from summarized COCOeval. + Args: + omni_eval (None or Omni3Deval): None represents no predictions from model. + iou_type (str): + mode (str): either "2D" or "3D" + class_names (None or list[str]): if provided, will use it to predict + per-category AP. 
+ Returns: + a dict of {metric name: score} + """ + assert mode in ["2D", "3D"] + + metrics = { + "2D": ["AP", "AP50", "AP75", "AP95", "APs", "APm", "APl"], + "3D": ["AP", "AP15", "AP25", "AP50", "APn", "APm", "APf"], + }[mode] + + if iou_type != "bbox": + raise ValueError("Support only for bbox evaluation.") + + if omni_eval is None: + self._logger.warn("No predictions from the model!") + return {metric: float("nan") for metric in metrics} + + # the standard metrics + results = { + metric: float( + omni_eval.stats[idx] * 100 if omni_eval.stats[idx] >= 0 else "nan" + ) + for idx, metric in enumerate(metrics) + } + self._logger.info( + "Evaluation results for {} in {} mode: \n".format(iou_type, mode) + + create_small_table(results) + ) + if not np.isfinite(sum(results.values())): + self._logger.info("Some metrics cannot be computed and is shown as NaN.") + + if class_names is None or len(class_names) <= 1: + return results + + # Compute per-category AP + # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa + precisions = omni_eval.eval["precision"] + + # precision has dims (iou, recall, cls, area range, max dets) + assert len(class_names) == precisions.shape[2] + + results_per_category = [] + for idx, name in enumerate(class_names): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float("nan") + results_per_category.append(("{}".format(name), float(ap * 100))) + + # tabulate it + N_COLS = min(6, len(results_per_category) * 2) + results_flatten = list(itertools.chain(*results_per_category)) + results_table = itertools.zip_longest( + *[results_flatten[i::N_COLS] for i in range(N_COLS)] + ) + table = tabulate( + results_table, + tablefmt="pipe", + floatfmt=".3f", + headers=["category", "AP"] * 
(N_COLS // 2), + numalign="left", + ) + self._logger.info( + "Per-category {} AP in {} mode: \n".format(iou_type, mode) + table + ) + results.update({"AP-" + name: ap for name, ap in results_per_category}) + return results + + def _eval_predictions(self, predictions, img_ids=None): + """ + Evaluate predictions. Fill self._results with the metrics of the tasks. + """ + self._logger.info("Preparing results for COCO format ...") + omni_results = list(itertools.chain(*[x["instances"] for x in predictions])) + tasks = self._tasks or self._tasks_from_predictions(omni_results) + + omni3d_global_categories = MetadataCatalog.get('omni3d_model').thing_classes + + # the dataset results will store only the categories that are present + # in the corresponding dataset, all others will be dropped. + dataset_results = [] + + # unmap the category ids for COCO + if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): + dataset_id_to_contiguous_id = ( + self._metadata.thing_dataset_id_to_contiguous_id + ) + all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) + num_classes = len(all_contiguous_ids) + assert ( + min(all_contiguous_ids) == 0 + and max(all_contiguous_ids) == num_classes - 1 + ) + + reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} + for result in omni_results: + category_id = result["category_id"] + assert category_id < num_classes, ( + f"A prediction has class={category_id}, " + f"but the dataset only has {num_classes} classes and " + f"predicted class id should be in [0, {num_classes - 1}]." + ) + result["category_id"] = reverse_id_mapping[category_id] + + cat_name = omni3d_global_categories[category_id] + + if cat_name in self._metadata.thing_classes: + dataset_results.append(result) + + # replace the results with the filtered + # instances that are in vocabulary. 
+ omni_results = dataset_results + + if self._output_dir: + file_path = os.path.join(self._output_dir, "omni_instances_results.json") + self._logger.info("Saving results to {}".format(file_path)) + with PathManager.open(file_path, "w") as f: + f.write(json.dumps(omni_results)) + f.flush() + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info( + "Evaluating predictions with {} COCO API...".format( + "unofficial" if self._use_fast_impl else "official" + ) + ) + for task in sorted(tasks): + assert task in {"bbox"}, f"Got unknown task: {task}!" + evals, log_strs = ( + _evaluate_predictions_on_omni( + self._omni_api, + omni_results, + task, + img_ids=img_ids, + only_2d=self._only_2d, + eval_prox=self._eval_prox, + ) + if len(omni_results) > 0 + else None # cocoapi does not handle empty results very well + ) + + modes = evals.keys() + for mode in modes: + res = self._derive_omni_results( + evals[mode], + task, + mode, + class_names=self._metadata.get("thing_classes"), + ) + self._results[task + "_" + format(mode)] = res + self._results[task + "_" + format(mode) + '_evalImgs'] = evals[mode].evalImgs + self._results[task + "_" + format(mode) + '_evals_per_cat_area'] = evals[mode].evals_per_cat_area + + self._results["log_str_2D"] = log_strs["2D"] + + if "3D" in log_strs: + self._results["log_str_3D"] = log_strs["3D"] + + +def _evaluate_predictions_on_omni( + omni_gt, + omni_results, + iou_type, + img_ids=None, + only_2d=False, + eval_prox=False, +): + """ + Evaluate the coco results using COCOEval API. 
+ """ + assert len(omni_results) > 0 + log_strs, evals = {}, {} + + omni_dt = omni_gt.loadRes(omni_results) + + modes = ["2D"] if only_2d else ["2D", "3D"] + + for mode in modes: + omni_eval = Omni3Deval( + omni_gt, omni_dt, iouType=iou_type, mode=mode, eval_prox=eval_prox + ) + if img_ids is not None: + omni_eval.params.imgIds = img_ids + + omni_eval.evaluate() + omni_eval.accumulate() + log_str = omni_eval.summarize() + log_strs[mode] = log_str + evals[mode] = omni_eval + + return evals, log_strs + + +def instances_to_coco_json(instances, img_id): + + num_instances = len(instances) + + if num_instances == 0: + return [] + + boxes = BoxMode.convert( + instances.pred_boxes.tensor.numpy(), BoxMode.XYXY_ABS, BoxMode.XYWH_ABS + ).tolist() + scores = instances.scores.tolist() + classes = instances.pred_classes.tolist() + + if hasattr(instances, "pred_bbox3D"): + bbox3D = instances.pred_bbox3D.tolist() + center_cam = instances.pred_center_cam.tolist() + center_2D = instances.pred_center_2D.tolist() + dimensions = instances.pred_dimensions.tolist() + pose = instances.pred_pose.tolist() + else: + # dummy + bbox3D = np.ones([num_instances, 8, 3]).tolist() + center_cam = np.ones([num_instances, 3]).tolist() + center_2D = np.ones([num_instances, 2]).tolist() + dimensions = np.ones([num_instances, 3]).tolist() + pose = np.ones([num_instances, 3, 3]).tolist() + + results = [] + for k in range(num_instances): + result = { + "image_id": img_id, + "category_id": classes[k], + "bbox": boxes[k], + "score": scores[k], + "depth": np.array(bbox3D[k])[:, 2].mean(), + "bbox3D": bbox3D[k], + "center_cam": center_cam[k], + "center_2D": center_2D[k], + "dimensions": dimensions[k], + "pose": pose[k], + } + + results.append(result) + return results + + +# --------------------------------------------------------------------- +# Omni3DParams +# --------------------------------------------------------------------- +class Omni3DParams: + """ + Params for the Omni evaluation API + """ + + def 
setDet2DParams(self): + self.imgIds = [] + self.catIds = [] + + # np.arange causes trouble. the data point on arange is slightly larger than the true value + self.iouThrs = np.linspace( + 0.5, 0.95, int(np.round((0.95 - 0.5) / 0.05)) + 1, endpoint=True + ) + + self.recThrs = np.linspace( + 0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True + ) + + self.maxDets = [1, 10, 100] + self.areaRng = [ + [0 ** 2, 1e5 ** 2], + [0 ** 2, 32 ** 2], + [32 ** 2, 96 ** 2], + [96 ** 2, 1e5 ** 2], + ] + + self.areaRngLbl = ["all", "small", "medium", "large"] + self.useCats = 1 + + def setDet3DParams(self): + self.imgIds = [] + self.catIds = [] + + # np.arange causes trouble. the data point on arange is slightly larger than the true value + self.iouThrs = np.linspace( + 0.05, 0.5, int(np.round((0.5 - 0.05) / 0.05)) + 1, endpoint=True + ) + + self.recThrs = np.linspace( + 0.0, 1.00, int(np.round((1.00 - 0.0) / 0.01)) + 1, endpoint=True + ) + + self.maxDets = [1, 10, 100] + self.areaRng = [[0, 1e5], [0, 10], [10, 35], [35, 1e5]] + self.areaRngLbl = ["all", "near", "medium", "far"] + self.useCats = 1 + + def __init__(self, mode="2D"): + """ + Args: + iouType (str): defines 2D or 3D evaluation parameters. 
+ One of {"2D", "3D"} + """ + + if mode == "2D": + self.setDet2DParams() + + elif mode == "3D": + self.setDet3DParams() + + else: + raise Exception("mode %s not supported" % (mode)) + + self.iouType = "bbox" + self.mode = mode + # the proximity threshold defines the neighborhood + # when evaluating on non-exhaustively annotated datasets + self.proximity_thresh = 0.3 + + +# --------------------------------------------------------------------- +# Omni3Deval +# --------------------------------------------------------------------- +class Omni3Deval(COCOeval): + """ + Wraps COCOeval for 2D or 3D box evaluation depending on mode + """ + + def __init__( + self, cocoGt=None, cocoDt=None, iouType="bbox", mode="2D", eval_prox=False + ): + """ + Initialize COCOeval using coco APIs for Gt and Dt + Args: + cocoGt: COCO object with ground truth annotations + cocoDt: COCO object with detection results + iouType: (str) defines the evaluation type. Supports only "bbox" now. + mode: (str) defines whether to evaluate 2D or 3D performance. + One of {"2D", "3D"} + eval_prox: (bool) if True, performs "Proximity Evaluation", i.e. + evaluates detections in the proximity of the ground truth2D boxes. + This is used for datasets which are not exhaustively annotated. + """ + if not iouType: + print("iouType not specified. 
use default iouType bbox") + elif iouType != "bbox": + print("no support for %s iouType" % (iouType)) + self.mode = mode + if mode not in ["2D", "3D"]: + raise Exception("mode %s not supported" % (mode)) + self.eval_prox = eval_prox + self.cocoGt = cocoGt # ground truth COCO API + self.cocoDt = cocoDt # detections COCO API + + # per-image per-category evaluation results [KxAxI] elements + self.evalImgs = defaultdict(list) + + self.eval = {} # accumulated evaluation results + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self.params = Omni3DParams(mode) # parameters + self._paramsEval = {} # parameters for evaluation + self.stats = [] # result summarization + self.ious = {} # ious between all gts and dts + + if cocoGt is not None: + self.params.imgIds = sorted(cocoGt.getImgIds()) + self.params.catIds = sorted(cocoGt.getCatIds()) + + self.evals_per_cat_area = None + + def _prepare(self): + """ + Prepare ._gts and ._dts for evaluation based on params + """ + + p = self.params + + if p.useCats: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + + else: + gts = self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) + dts = self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + + # set ignore flag + ignore_flag = "ignore2D" if self.mode == "2D" else "ignore3D" + for gt in gts: + gt[ignore_flag] = gt[ignore_flag] if ignore_flag in gt else 0 + + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + + for gt in gts: + self._gts[gt["image_id"], gt["category_id"]].append(gt) + + for dt in dts: + self._dts[dt["image_id"], dt["category_id"]].append(dt) + + self.evalImgs = defaultdict(list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def accumulate(self, p = None): + ''' + Accumulate 
per image evaluation results and store the result in self.eval + :param p: input params for evaluation + :return: None + ''' + + print('Accumulating evaluation results...') + assert self.evalImgs, 'Please run evaluate() first' + + tic = time.time() + + # allows input customized parameters + if p is None: + p = self.params + + p.catIds = p.catIds if p.useCats == 1 else [-1] + + T = len(p.iouThrs) + R = len(p.recThrs) + K = len(p.catIds) if p.useCats else 1 + A = len(p.areaRng) + M = len(p.maxDets) + + precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories + recall = -np.ones((T,K,A,M)) + scores = -np.ones((T,R,K,A,M)) + + # create dictionary for future indexing + _pe = self._paramsEval + + catIds = _pe.catIds if _pe.useCats else [-1] + setK = set(catIds) + setA = set(map(tuple, _pe.areaRng)) + setM = set(_pe.maxDets) + setI = set(_pe.imgIds) + + # get inds to evaluate + catid_list = [k for n, k in enumerate(p.catIds) if k in setK] + k_list = [n for n, k in enumerate(p.catIds) if k in setK] + m_list = [m for n, m in enumerate(p.maxDets) if m in setM] + a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] + i_list = [n for n, i in enumerate(p.imgIds) if i in setI] + + I0 = len(_pe.imgIds) + A0 = len(_pe.areaRng) + + has_precomputed_evals = not (self.evals_per_cat_area is None) + + if has_precomputed_evals: + evals_per_cat_area = self.evals_per_cat_area + else: + evals_per_cat_area = {} + + # retrieve E at each category, area range, and max number of detections + for k, (k0, catId) in enumerate(zip(k_list, catid_list)): + Nk = k0*A0*I0 + for a, a0 in enumerate(a_list): + Na = a0*I0 + + if has_precomputed_evals: + E = evals_per_cat_area[(catId, a)] + + else: + E = [self.evalImgs[Nk + Na + i] for i in i_list] + E = [e for e in E if not e is None] + evals_per_cat_area[(catId, a)] = E + + if len(E) == 0: + continue + + for m, maxDet in enumerate(m_list): + + dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in 
E]) + + # different sorting method generates slightly different results. + # mergesort is used to be consistent as Matlab implementation. + inds = np.argsort(-dtScores, kind='mergesort') + dtScoresSorted = dtScores[inds] + + dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds] + dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds] + gtIg = np.concatenate([e['gtIgnore'] for e in E]) + npig = np.count_nonzero(gtIg==0) + + if npig == 0: + continue + + tps = np.logical_and( dtm, np.logical_not(dtIg) ) + fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) ) + + tp_sum = np.cumsum(tps, axis=1).astype(dtype=float) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=float) + + for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): + tp = np.array(tp) + fp = np.array(fp) + nd = len(tp) + rc = tp / npig + pr = tp / (fp+tp+np.spacing(1)) + q = np.zeros((R,)) + ss = np.zeros((R,)) + + if nd: + recall[t,k,a,m] = rc[-1] + + else: + recall[t,k,a,m] = 0 + + # numpy is slow without cython optimization for accessing elements + # use python array gets significant speed improvement + pr = pr.tolist(); q = q.tolist() + + for i in range(nd-1, 0, -1): + if pr[i] > pr[i-1]: + pr[i-1] = pr[i] + + inds = np.searchsorted(rc, p.recThrs, side='left') + + try: + for ri, pi in enumerate(inds): + q[ri] = pr[pi] + ss[ri] = dtScoresSorted[pi] + except: + pass + + precision[t,:,k,a,m] = np.array(q) + scores[t,:,k,a,m] = np.array(ss) + + self.evals_per_cat_area = evals_per_cat_area + + self.eval = { + 'params': p, + 'counts': [T, R, K, A, M], + 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'precision': precision, + 'recall': recall, + 'scores': scores, + } + + toc = time.time() + print('DONE (t={:0.2f}s).'.format( toc-tic)) + + def evaluate(self): + """ + Run per image evaluation on given images and store results (a list of dict) in self.evalImgs + """ + + print("Running per image evaluation...") + + p = self.params + 
print("Evaluate annotation type *{}*".format(p.iouType)) + + tic = time.time() + + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + + catIds = p.catIds if p.useCats else [-1] + + # loop through images, area range, max detection number + self.ious = { + (imgId, catId): self.computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds + } + + maxDet = p.maxDets[-1] + + self.evalImgs = [ + self.evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + + self._paramsEval = copy.deepcopy(self.params) + + toc = time.time() + print("DONE (t={:0.2f}s).".format(toc - tic)) + + def computeIoU(self, imgId, catId): + """ + ComputeIoU computes the IoUs by sorting based on "score" + for either 2D boxes (in 2D mode) or 3D boxes (in 3D mode) + """ + + device = (torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")) + + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + + if len(gt) == 0 and len(dt) == 0: + return [] + + inds = np.argsort([-d["score"] for d in dt], kind="mergesort") + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt = dt[0 : p.maxDets[-1]] + + if p.iouType == "bbox": + if self.mode == "2D": + g = [g["bbox"] for g in gt] + d = [d["bbox"] for d in dt] + elif self.mode == "3D": + g = [g["bbox3D"] for g in gt] + d = [d["bbox3D"] for d in dt] + else: + raise Exception("unknown iouType for iou computation") + + # compute iou between each dt and gt region + # iscrowd is required in builtin maskUtils so we + # use a dummy buffer for it + iscrowd = [0 for o in gt] + if self.mode == "2D": + ious = maskUtils.iou(d, g, iscrowd) + + elif len(d) > 0 and len(g) > 0: + + # For 3D 
eval, we want to run IoU in CUDA if available + if torch.cuda.is_available() and len(d) * len(g) < MAX_DTS_CROSS_GTS_FOR_IOU3D: + device = torch.device("cuda:0") + else: + device = torch.device("cpu") + + dd = torch.tensor(d, device=device, dtype=torch.float32) + gg = torch.tensor(g, device=device, dtype=torch.float32) + + ious = box3d_overlap(dd, gg).cpu().numpy() + + else: + ious = [] + + in_prox = None + + if self.eval_prox: + g = [g["bbox"] for g in gt] + d = [d["bbox"] for d in dt] + iscrowd = [0 for o in gt] + ious2d = maskUtils.iou(d, g, iscrowd) + + if type(ious2d) == list: + in_prox = [] + + else: + in_prox = ious2d > p.proximity_thresh + + return ious, in_prox + + def evaluateImg(self, imgId, catId, aRng, maxDet): + """ + Perform evaluation for single category and image + Returns: + dict (single image results) + """ + + p = self.params + if p.useCats: + gt = self._gts[imgId, catId] + dt = self._dts[imgId, catId] + + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId, cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId, cId]] + + if len(gt) == 0 and len(dt) == 0: + return None + + flag_range = "area" if self.mode == "2D" else "depth" + flag_ignore = "ignore2D" if self.mode == "2D" else "ignore3D" + + for g in gt: + if g[flag_ignore] or (g[flag_range] < aRng[0] or g[flag_range] > aRng[1]): + g["_ignore"] = 1 + else: + g["_ignore"] = 0 + + # sort dt highest score first, sort gt ignore last + gtind = np.argsort([g["_ignore"] for g in gt], kind="mergesort") + gt = [gt[i] for i in gtind] + dtind = np.argsort([-d["score"] for d in dt], kind="mergesort") + dt = [dt[i] for i in dtind[0:maxDet]] + + # load computed ious + ious = ( + self.ious[imgId, catId][0][:, gtind] + if len(self.ious[imgId, catId][0]) > 0 + else self.ious[imgId, catId][0] + ) + + if self.eval_prox: + in_prox = ( + self.ious[imgId, catId][1][:, gtind] + if len(self.ious[imgId, catId][1]) > 0 + else self.ious[imgId, catId][1] + ) + + T = len(p.iouThrs) + G = len(gt) + D = 
len(dt) + gtm = np.zeros((T, G)) + dtm = np.zeros((T, D)) + gtIg = np.array([g["_ignore"] for g in gt]) + dtIg = np.zeros((T, D)) + + if not len(ious) == 0: + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + + # information about best match so far (m=-1 -> unmatched) + iou = min([t, 1 - 1e-10]) + m = -1 + + for gind, g in enumerate(gt): + # in case of proximity evaluation, if not in proximity continue + if self.eval_prox and not in_prox[dind, gind]: + continue + + # if this gt already matched, continue + if gtm[tind, gind] > 0: + continue + + # if dt matched to reg gt, and on ignore gt, stop + if m > -1 and gtIg[m] == 0 and gtIg[gind] == 1: + break + + # continue to next gt unless better match made + if ious[dind, gind] < iou: + continue + + # if match successful and best so far, store appropriately + iou = ious[dind, gind] + m = gind + + # if match made store id of match for both dt and gt + if m == -1: + continue + + dtIg[tind, dind] = gtIg[m] + dtm[tind, dind] = gt[m]["id"] + gtm[tind, m] = d["id"] + + # set unmatched detections outside of area range to ignore + a = np.array( + [d[flag_range] < aRng[0] or d[flag_range] > aRng[1] for d in dt] + ).reshape((1, len(dt))) + + dtIg = np.logical_or(dtIg, np.logical_and(dtm == 0, np.repeat(a, T, 0))) + + # in case of proximity evaluation, ignore detections which are far from gt regions + if self.eval_prox and len(in_prox) > 0: + dt_far = in_prox.any(1) == 0 + dtIg = np.logical_or(dtIg, np.repeat(dt_far.reshape((1, len(dt))), T, 0)) + + # store results for given image and category + return { + "image_id": imgId, + "category_id": catId, + "aRng": aRng, + "maxDet": maxDet, + "dtIds": [d["id"] for d in dt], + "gtIds": [g["id"] for g in gt], + "dtMatches": dtm, + "gtMatches": gtm, + "dtScores": [d["score"] for d in dt], + "gtIgnore": gtIg, + "dtIgnore": dtIg, + } + + def summarize(self): + """ + Compute and display summary metrics for evaluation results. 
+ Note this function can *only* be applied on the default parameter setting + """ + + def _summarize(mode, ap=1, iouThr=None, areaRng="all", maxDets=100, log_str=""): + p = self.params + eval = self.eval + + if mode == "2D": + iStr = (" {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}") + + elif mode == "3D": + iStr = " {:<18} {} @[ IoU={:<9} | depth={:>6s} | maxDets={:>3d} ] = {:0.3f}" + + titleStr = "Average Precision" if ap == 1 else "Average Recall" + typeStr = "(AP)" if ap == 1 else "(AR)" + + iouStr = ( + "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1]) + if iouThr is None + else "{:0.2f}".format(iouThr) + ) + + aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + + if ap == 1: + + # dimension of precision: [TxRxKxAxM] + s = eval["precision"] + + # IoU + if iouThr is not None: + t = np.where(np.isclose(iouThr, p.iouThrs.astype(float)))[0] + s = s[t] + + s = s[:, :, :, aind, mind] + + else: + # dimension of recall: [TxKxAxM] + s = eval["recall"] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, aind, mind] + + if len(s[s > -1]) == 0: + mean_s = -1 + + else: + mean_s = np.mean(s[s > -1]) + + if log_str != "": + log_str += "\n" + + log_str += "mode={} ".format(mode) + \ + iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s) + + return mean_s, log_str + + def _summarizeDets(mode): + + params = self.params + + # the thresholds here, define the thresholds printed in `derive_omni_results` + thres = [0.5, 0.75, 0.95] if mode == "2D" else [0.15, 0.25, 0.50] + + stats = np.zeros((13,)) + stats[0], log_str = _summarize(mode, 1) + + stats[1], log_str = _summarize( + mode, 1, iouThr=thres[0], maxDets=params.maxDets[2], log_str=log_str + ) + + stats[2], log_str = _summarize( + mode, 1, iouThr=thres[1], maxDets=params.maxDets[2], log_str=log_str + ) + + stats[3], log_str = _summarize( + mode, 1, iouThr=thres[2], 
maxDets=params.maxDets[2], log_str=log_str + ) + + stats[4], log_str = _summarize( + mode, + 1, + areaRng=params.areaRngLbl[1], + maxDets=params.maxDets[2], + log_str=log_str, + ) + + stats[5], log_str = _summarize( + mode, + 1, + areaRng=params.areaRngLbl[2], + maxDets=params.maxDets[2], + log_str=log_str, + ) + + stats[6], log_str = _summarize( + mode, + 1, + areaRng=params.areaRngLbl[3], + maxDets=params.maxDets[2], + log_str=log_str, + ) + + stats[7], log_str = _summarize( + mode, 0, maxDets=params.maxDets[0], log_str=log_str + ) + + stats[8], log_str = _summarize( + mode, 0, maxDets=params.maxDets[1], log_str=log_str + ) + + stats[9], log_str = _summarize( + mode, 0, maxDets=params.maxDets[2], log_str=log_str + ) + + stats[10], log_str = _summarize( + mode, + 0, + areaRng=params.areaRngLbl[1], + maxDets=params.maxDets[2], + log_str=log_str, + ) + + stats[11], log_str = _summarize( + mode, + 0, + areaRng=params.areaRngLbl[2], + maxDets=params.maxDets[2], + log_str=log_str, + ) + + stats[12], log_str = _summarize( + mode, + 0, + areaRng=params.areaRngLbl[3], + maxDets=params.maxDets[2], + log_str=log_str, + ) + + return stats, log_str + + if not self.eval: + raise Exception("Please run accumulate() first") + + stats, log_str = _summarizeDets(self.mode) + self.stats = stats + + return log_str diff --git a/cubercnn/modeling/backbone/__init__.py b/cubercnn/modeling/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..552b7045775937e4c951690559deb177c1b983e3 --- /dev/null +++ b/cubercnn/modeling/backbone/__init__.py @@ -0,0 +1,5 @@ +from .densenet import * +from .mnasnet import * +from .resnet import * +from .shufflenet import * +from .dla import * \ No newline at end of file diff --git a/cubercnn/modeling/backbone/densenet.py b/cubercnn/modeling/backbone/densenet.py new file mode 100644 index 0000000000000000000000000000000000000000..b049851568fb5b26a7d9d68548105d4a0fb856b8 --- /dev/null +++ 
b/cubercnn/modeling/backbone/densenet.py @@ -0,0 +1,64 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +from torchvision import models +from detectron2.layers import ShapeSpec +from detectron2.modeling.backbone import Backbone +from detectron2.modeling.backbone.build import BACKBONE_REGISTRY +import torch.nn.functional as F + +from detectron2.modeling.backbone.fpn import FPN + +class DenseNetBackbone(Backbone): + def __init__(self, cfg, input_shape, pretrained=True): + super().__init__() + + base = models.densenet121(pretrained) + base = base.features + + self.base = base + + self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 1024, 'p6': 1024} + self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} + self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] + + def forward(self, x): + + outputs = {} + + db1 = self.base[0:5](x) + db2 = self.base[5:7](db1) + db3 = self.base[7:9](db2) + p5 = self.base[9:](db3) + p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) + outputs['p2'] = db1 + outputs['p3'] = db2 + outputs['p4'] = db3 + outputs['p5'] = p5 + outputs['p6'] = p6 + + return outputs + + +@BACKBONE_REGISTRY.register() +def build_densenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): + """ + Args: + cfg: a detectron2 CfgNode + + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. 
+ """ + + imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' + + bottom_up = DenseNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + fuse_type=cfg.MODEL.FPN.FUSE_TYPE + ) + return backbone \ No newline at end of file diff --git a/cubercnn/modeling/backbone/dla.py b/cubercnn/modeling/backbone/dla.py new file mode 100644 index 0000000000000000000000000000000000000000..42de7ac8e357ac7a26f5aa7c4f939688351dde41 --- /dev/null +++ b/cubercnn/modeling/backbone/dla.py @@ -0,0 +1,507 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +import os +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.utils.model_zoo as model_zoo +import torch.nn.functional as F +import detectron2.utils.comm as comm + +from detectron2.layers import ShapeSpec +from detectron2.modeling.backbone import Backbone +from detectron2.modeling.backbone.build import BACKBONE_REGISTRY +from detectron2.modeling.backbone.fpn import FPN + +BatchNorm = nn.BatchNorm2d + +""" +Adapted models from repositories + Deep Layer Aggregation CVPR 2018 + https://github.com/ucbdrive/dla + BSD-3 Licence https://github.com/ucbdrive/dla/blob/master/LICENSE + + Geometry Uncertainty Projection Network for Monocular 3D Object Detection, ICCV 2021 + https://github.com/SuperMHP/GUPNet/blob/main/code/lib/backbones/dla.py + MIT Licence https://github.com/SuperMHP/GUPNet/blob/main/LICENSE +""" + +def get_model_url(data='imagenet', name='dla34', hash='ba72cf86'): + return os.path.join('http://dl.yf.io/dla/models', data, '{}-{}.pth'.format(name, hash)) + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, + padding=1, bias=False) + + +class 
BasicBlock(nn.Module): + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn1 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, + stride=1, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = BatchNorm(planes) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = Bottleneck.expansion + bottle_planes = planes // expansion + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, + bias=False, dilation=dilation) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class BottleneckX(nn.Module): + expansion = 2 + cardinality = 32 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BottleneckX, self).__init__() + cardinality = BottleneckX.cardinality + # dim = int(math.floor(planes * 
(BottleneckV5.expansion / 64.0))) + # bottle_planes = dim * cardinality + bottle_planes = planes * cardinality // 32 + self.conv1 = nn.Conv2d(inplanes, bottle_planes, + kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d(bottle_planes, bottle_planes, kernel_size=3, + stride=stride, padding=dilation, bias=False, + dilation=dilation, groups=cardinality) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, + kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class Root(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(Root, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, + stride=1, bias=False, padding=(kernel_size - 1) // 2) + self.bn = BatchNorm(out_channels) + self.relu = nn.ReLU(inplace=True) + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(torch.cat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(nn.Module): + def __init__(self, levels, block, in_channels, out_channels, stride=1, + level_root=False, root_dim=0, root_kernel_size=1, + dilation=1, root_residual=False): + super(Tree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, + dilation=dilation) + self.tree2 = block(out_channels, out_channels, 1, + dilation=dilation) + else: + self.tree1 = Tree(levels - 1, block, in_channels, out_channels, + stride, 
root_dim=0, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + self.tree2 = Tree(levels - 1, block, out_channels, out_channels, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, root_residual=root_residual) + if levels == 1: + self.root = Root(root_dim, out_channels, root_kernel_size, + root_residual) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2d(in_channels, out_channels, + kernel_size=1, stride=1, bias=False), + BatchNorm(out_channels) + ) + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + residual = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Module): + def __init__(self, levels, channels, num_classes=1000, + block=BasicBlock, residual_root=False, return_levels=False, + pool_size=7, linear_root=False): + super(DLA, self).__init__() + self.channels = channels + self.return_levels = return_levels + self.num_classes = num_classes + self.base_layer = nn.Sequential( + nn.Conv2d(3, channels[0], kernel_size=7, stride=1, + padding=3, bias=False), + BatchNorm(channels[0]), + nn.ReLU(inplace=True)) + self.level0 = self._make_conv_level( + channels[0], channels[0], levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2) + self.level2 = Tree(levels[2], block, channels[1], channels[2], 2, + level_root=False, + root_residual=residual_root) + self.level3 = 
Tree(levels[3], block, channels[2], channels[3], 2, + level_root=True, root_residual=residual_root) + self.level4 = Tree(levels[4], block, channels[3], channels[4], 2, + level_root=True, root_residual=residual_root) + self.level5 = Tree(levels[5], block, channels[4], channels[5], 2, + level_root=True, root_residual=residual_root) + + self.avgpool = nn.AvgPool2d(pool_size) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_level(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + nn.MaxPool2d(stride, stride=stride), + nn.Conv2d(inplanes, planes, + kernel_size=1, stride=1, bias=False), + BatchNorm(planes), + ) + + layers = [] + layers.append(block(inplanes, planes, stride, downsample=downsample)) + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2d(inplanes, planes, kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, bias=False, dilation=dilation), + BatchNorm(planes), + nn.ReLU(inplace=True)]) + inplanes = planes + return nn.Sequential(*modules) + + + def load_pretrained_model(self, data='imagenet', name='dla34', hash='ba72cf86'): + + # load model only on main process + # to prevent redundant model caching + if comm.is_main_process(): + model_url = get_model_url(data, name, hash) + model_weights = model_zoo.load_url(model_url) + del model_weights['fc.weight'] + del model_weights['fc.bias'] + self.load_state_dict(model_weights) + + +def dla34(pretrained=False, tricks=False, **kwargs): # DLA-34 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 128, 256, 512], + 
block=BasicBlock, **kwargs) + if pretrained: + if tricks: + model.load_pretrained_model(data='imagenet', name='dla34+tricks', hash='24a49e58') + else: + model.load_pretrained_model(data='imagenet', name='dla34', hash='ba72cf86') + return model + + +def dla46_c(pretrained=False, **kwargs): # DLA-46-C + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 64, 128, 256], + block=Bottleneck, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla46_c', hash='2bfd52c3') + return model + + +def dla46x_c(pretrained=False, **kwargs): # DLA-X-46-C + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 2, 1], + [16, 32, 64, 64, 128, 256], + block=BottleneckX, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla46x_c', hash='d761bae7') + return model + + +def dla60x_c(pretrained=False, **kwargs): # DLA-X-60-C + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 64, 64, 128, 256], + block=BottleneckX, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla60x_c', hash='b870c45c') + return model + + +def dla60(pretrained=False, tricks=False, **kwargs): # DLA-60 + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 128, 256, 512, 1024], + block=Bottleneck, **kwargs) + if pretrained: + if tricks: + model.load_pretrained_model(data='imagenet', name='dla60+tricks', hash='14488826') + else: + model.load_pretrained_model(data='imagenet', name='dla60', hash='24839fc4') + + return model + + +def dla60x(pretrained=False, **kwargs): # DLA-X-60 + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 2, 3, 1], + [16, 32, 128, 256, 512, 1024], + block=BottleneckX, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla60x', hash='d15cacda') + return model + + +def dla102(pretrained=False, tricks=False, **kwargs): # DLA-102 + Bottleneck.expansion = 2 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + 
block=Bottleneck, residual_root=True, **kwargs) + if pretrained: + + if tricks: + model.load_pretrained_model(data='imagenet', name='dla102+tricks', hash='27a30eac') + else: + model.load_pretrained_model(data='imagenet', name='dla102', hash='d94d9790') + return model + + +def dla102x(pretrained=False, **kwargs): # DLA-X-102 + BottleneckX.expansion = 2 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=BottleneckX, residual_root=True, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla102x', hash='ad62be81') + return model + + +def dla102x2(pretrained=False, **kwargs): # DLA-X-102 64 + BottleneckX.cardinality = 64 + model = DLA([1, 1, 1, 3, 4, 1], [16, 32, 128, 256, 512, 1024], + block=BottleneckX, residual_root=True, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla102x2', hash='262837b6') + return model + + +def dla169(pretrained=False, **kwargs): # DLA-169 + Bottleneck.expansion = 2 + model = DLA([1, 1, 2, 3, 5, 1], [16, 32, 128, 256, 512, 1024], + block=Bottleneck, residual_root=True, **kwargs) + if pretrained: + model.load_pretrained_model(data='imagenet', name='dla169', hash='0914e092') + return model + +class DLABackbone(Backbone): + def __init__(self, cfg, input_shape, pretrained=True): + super().__init__() + + if cfg.MODEL.DLA.TYPE == 'dla34': + base = dla34(pretrained=pretrained, tricks=cfg.MODEL.DLA.TRICKS) + self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} + elif cfg.MODEL.DLA.TYPE == 'dla46_c': + base = dla46_c(pretrained=pretrained) + self._out_feature_channels = {'p2': 64, 'p3': 64, 'p4': 128, 'p5': 256, 'p6': 256} + elif cfg.MODEL.DLA.TYPE == 'dla46x_c': + base = dla46x_c(pretrained=pretrained) + self._out_feature_channels = {'p2': 64, 'p3': 64, 'p4': 128, 'p5': 256, 'p6': 256} + elif cfg.MODEL.DLA.TYPE == 'dla60x_c': + base = dla60x_c(pretrained=pretrained) + self._out_feature_channels = {'p2': 64, 'p3': 64, 'p4': 128, 'p5': 
256, 'p6': 256} + elif cfg.MODEL.DLA.TYPE == 'dla60': + base = dla60(pretrained=pretrained, tricks=cfg.MODEL.DLA.TRICKS) + self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024} + elif cfg.MODEL.DLA.TYPE == 'dla60x': + base = dla60x(pretrained=pretrained) + self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024} + elif cfg.MODEL.DLA.TYPE == 'dla102': + base = dla102(pretrained=pretrained, tricks=cfg.MODEL.DLA.TRICKS) + self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024} + elif cfg.MODEL.DLA.TYPE == 'dla102x': + base = dla102x(pretrained=pretrained) + self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024} + elif cfg.MODEL.DLA.TYPE == 'dla102x2': + base = dla102x2(pretrained=pretrained) + self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024} + elif cfg.MODEL.DLA.TYPE == 'dla169': + base = dla169(pretrained=pretrained) + self._out_feature_channels = {'p2': 128, 'p3': 256, 'p4': 512, 'p5': 1024, 'p6': 1024} + + self.base_layer = base.base_layer + self.level0 = base.level0 + self.level1 = base.level1 + self.level2 = base.level2 + self.level3 = base.level3 + self.level4 = base.level4 + self.level5 = base.level5 + + self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} + self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] + + def forward(self, x): + + outputs = {} + + base_layer = self.base_layer(x) + level0 = self.level0(base_layer) + level1 = self.level1(level0) + level2 = self.level2(level1) + level3 = self.level3(level2) + level4 = self.level4(level3) + level5 = self.level5(level4) + level6 = F.max_pool2d(level5, kernel_size=1, stride=2, padding=0) + + outputs['p2'] = level2 + outputs['p3'] = level3 + outputs['p4'] = level4 + outputs['p5'] = level5 + outputs['p6'] = level6 + + return outputs + +@BACKBONE_REGISTRY.register() +def build_dla_from_vision_fpn_backbone(cfg, input_shape: 
ShapeSpec, priors=None): + """ + Args: + cfg: a detectron2 CfgNode + + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. + """ + + imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' + + bottom_up = DLABackbone(cfg, input_shape, pretrained=imagenet_pretrain) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone \ No newline at end of file diff --git a/cubercnn/modeling/backbone/mnasnet.py b/cubercnn/modeling/backbone/mnasnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2f7b95ea300215b0d262cdee8f7efed72f33bb03 --- /dev/null +++ b/cubercnn/modeling/backbone/mnasnet.py @@ -0,0 +1,63 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +from torchvision import models +from detectron2.layers import ShapeSpec +from detectron2.modeling.backbone import Backbone +from detectron2.modeling.backbone.build import BACKBONE_REGISTRY +import torch.nn.functional as F + +from detectron2.modeling.backbone.fpn import FPN + +class MNASNetBackbone(Backbone): + def __init__(self, cfg, input_shape, pretrained=True): + super().__init__() + + base = models.mnasnet1_0(pretrained) + base = base.layers + + self.base = base + + self._out_feature_channels = {'p2': 24, 'p3': 40, 'p4': 96, 'p5': 320, 'p6': 320} + self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} + self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] + + def forward(self, x): + + outputs = {} + + p2 = self.base[0:9](x) + p3 = self.base[9](p2) + p4 = self.base[10:12](p3) + p5 = self.base[12:14](p4) + p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) + outputs['p2'] = p2 + outputs['p3'] = p3 + outputs['p4'] = p4 + outputs['p5'] = p5 + outputs['p6'] = p6 + + return outputs + 
+@BACKBONE_REGISTRY.register() +def build_mnasnet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): + """ + Args: + cfg: a detectron2 CfgNode + + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. + """ + + imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' + + bottom_up = MNASNetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone diff --git a/cubercnn/modeling/backbone/resnet.py b/cubercnn/modeling/backbone/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..7078605beb0361f38fac935c750d22e1dda4b716 --- /dev/null +++ b/cubercnn/modeling/backbone/resnet.py @@ -0,0 +1,96 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +from torchvision import models +from detectron2.layers import ShapeSpec +from detectron2.modeling.backbone import Backbone +from detectron2.modeling.backbone.fpn import LastLevelMaxPool +from detectron2.modeling.backbone.resnet import build_resnet_backbone +from detectron2.modeling.backbone.build import BACKBONE_REGISTRY +import torch.nn.functional as F + +from detectron2.modeling.backbone.fpn import FPN + +class ResNet(Backbone): + def __init__(self, cfg, input_shape, pretrained=True): + super().__init__() + + if cfg.MODEL.RESNETS.DEPTH == 18: + base = models.resnet18(pretrained) + self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} + elif cfg.MODEL.RESNETS.DEPTH == 34: + base = models.resnet34(pretrained) + self._out_feature_channels = {'p2': 64, 'p3': 128, 'p4': 256, 'p5': 512, 'p6': 512} + elif cfg.MODEL.RESNETS.DEPTH == 50: + base = models.resnet50(pretrained) + self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 
2048} + elif cfg.MODEL.RESNETS.DEPTH == 101: + base = models.resnet101(pretrained) + self._out_feature_channels = {'p2': 256, 'p3': 512, 'p4': 1024, 'p5': 2048, 'p6': 2048} + else: + raise ValueError('No configuration currently supporting depth of {}'.format(cfg.MODEL.RESNETS.DEPTH)) + + self.conv1 = base.conv1 + self.bn1 = base.bn1 + self.relu = base.relu + self.maxpool = base.maxpool + self.layer1 = base.layer1 + self.layer2 = base.layer2 + self.layer3 = base.layer3 + self.layer4 = base.layer4 + + self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} + self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] + + def forward(self, x): + + outputs = {} + + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + p2 = self.layer1(x) + p3 = self.layer2(p2) + p4 = self.layer3(p3) + p5 = self.layer4(p4) + p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) + + outputs['p2'] = p2 + outputs['p3'] = p3 + outputs['p4'] = p4 + outputs['p5'] = p5 + outputs['p6'] = p6 + + return outputs + + +@BACKBONE_REGISTRY.register() +def build_resnet_from_vision_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): + """ + Args: + cfg: a detectron2 CfgNode + + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. + """ + + imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' + + if cfg.MODEL.RESNETS.TORCHVISION: + bottom_up = ResNet(cfg, input_shape, pretrained=imagenet_pretrain) + + else: + # use the MSRA modeling logic to build the backbone. 
+ bottom_up = build_resnet_backbone(cfg, input_shape) + + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + top_block=LastLevelMaxPool(), + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone diff --git a/cubercnn/modeling/backbone/shufflenet.py b/cubercnn/modeling/backbone/shufflenet.py new file mode 100644 index 0000000000000000000000000000000000000000..8eb5f7c53dbf4a0b690edfd57cfd6944831cce8d --- /dev/null +++ b/cubercnn/modeling/backbone/shufflenet.py @@ -0,0 +1,69 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +from torchvision import models +from detectron2.layers import ShapeSpec +from detectron2.modeling.backbone import Backbone +from detectron2.modeling.backbone.build import BACKBONE_REGISTRY +import torch.nn.functional as F + +from detectron2.modeling.backbone.fpn import FPN + +class ShufflenetBackbone(Backbone): + def __init__(self, cfg, input_shape, pretrained=True): + super().__init__() + + base = models.shufflenet_v2_x1_0(pretrained) + self.conv1 = base.conv1 + self.maxpool = base.maxpool + self.stage2 = base.stage2 + self.stage3 = base.stage3 + self.stage4 = base.stage4 + self.conv5 = base.conv5 + + self._out_feature_channels = {'p2': 24, 'p3': 116, 'p4': 232, 'p5': 464, 'p6': 464} + self._out_feature_strides ={'p2': 4, 'p3': 8, 'p4': 16, 'p5': 32, 'p6': 64} + self._out_features = ['p2', 'p3', 'p4', 'p5', 'p6'] + + def forward(self, x): + + outputs = {} + + x = self.conv1(x) + p2 = self.maxpool(x) + p3 = self.stage2(p2) + p4 = self.stage3(p3) + p5 = self.stage4(p4) + p6 = F.max_pool2d(p5, kernel_size=1, stride=2, padding=0) + + outputs['p2'] = p2 + outputs['p3'] = p3 + outputs['p4'] = p4 + outputs['p5'] = p5 + outputs['p6'] = p6 + + return outputs + + +@BACKBONE_REGISTRY.register() +def build_shufflenet_fpn_backbone(cfg, input_shape: ShapeSpec, priors=None): + """ + Args: + 
cfg: a detectron2 CfgNode + + Returns: + backbone (Backbone): backbone module, must be a subclass of :class:`Backbone`. + """ + + imagenet_pretrain = cfg.MODEL.WEIGHTS_PRETRAIN + cfg.MODEL.WEIGHTS == '' + + bottom_up = ShufflenetBackbone(cfg, input_shape, pretrained=imagenet_pretrain) + in_features = cfg.MODEL.FPN.IN_FEATURES + out_channels = cfg.MODEL.FPN.OUT_CHANNELS + + backbone = FPN( + bottom_up=bottom_up, + in_features=in_features, + out_channels=out_channels, + norm=cfg.MODEL.FPN.NORM, + fuse_type=cfg.MODEL.FPN.FUSE_TYPE, + ) + return backbone diff --git a/cubercnn/modeling/meta_arch/__init__.py b/cubercnn/modeling/meta_arch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fcbc539d0a3f1444e399ce91720ce563c8c47c6b --- /dev/null +++ b/cubercnn/modeling/meta_arch/__init__.py @@ -0,0 +1 @@ +from .rcnn3d import * \ No newline at end of file diff --git a/cubercnn/modeling/meta_arch/rcnn3d.py b/cubercnn/modeling/meta_arch/rcnn3d.py new file mode 100644 index 0000000000000000000000000000000000000000..bc715ab3550f2feae577ee469ebd51448d8d5d28 --- /dev/null +++ b/cubercnn/modeling/meta_arch/rcnn3d.py @@ -0,0 +1,618 @@ +# Copyright (c) Meta Platforms, Inc. 
# and affiliates
import logging
from typing import Dict, List, Optional

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.data.detection_utils import convert_image_to_rgb
from detectron2.layers import ShapeSpec, batched_nms, move_device_like
from detectron2.modeling.backbone import Backbone, BACKBONE_REGISTRY
from detectron2.modeling.meta_arch import (
    META_ARCH_REGISTRY, GeneralizedRCNN
)
from detectron2.modeling.proposal_generator import build_proposal_generator
from detectron2.structures import Instances
from detectron2.structures.image_list import ImageList
from detectron2.utils.events import get_event_storage
from detectron2.utils.logger import _log_api_usage
from detectron2.utils.visualizer import Visualizer

# from cubercnn.data.generate_depth_maps import setup_depth_model
from cubercnn.modeling.roi_heads import build_roi_heads
from cubercnn import util, vis

logger = logging.getLogger(__name__)


@META_ARCH_REGISTRY.register()
class RCNN3D(GeneralizedRCNN):
    """Generalized R-CNN whose RoI heads additionally regress 3D cuboids."""

    @classmethod
    def from_config(cls, cfg, priors=None):
        # `priors` are category statistics forwarded to backbone/ROI heads.
        backbone = build_backbone(cfg, priors=priors)
        return {
            "backbone": backbone,
            "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()),
            "roi_heads": build_roi_heads(cfg, backbone.output_shape(), priors=priors),
            "input_format": cfg.INPUT.FORMAT,
            "vis_period": cfg.VIS_PERIOD,
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]):
        """Training forward pass; returns the combined RPN + RoI-head losses."""
        if not self.training:
            return self.inference(batched_inputs)

        images = self.preprocess_image(batched_inputs)

        # scaling factor for the sample relative to its original scale
        # e.g., how much has the image been upsampled by? or downsampled?
        im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]

        # The unmodified intrinsics for the image
        Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]

        if "instances" in batched_inputs[0]:
            gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
        else:
            gt_instances = None

        # the backbone is actually a FPN, where the DLA model is the bottom-up structure.
        # FPN: https://arxiv.org/abs/1612.03144v2
        # backbone and proposal generator only work on 2D images and annotations.
        features = self.backbone(images.tensor)
        proposals, proposal_losses = self.proposal_generator(images, features, gt_instances)

        instances, detector_losses = self.roi_heads(
            images, features, proposals,
            Ks, im_scales_ratio,
            gt_instances
        )

        if self.vis_period > 0:
            storage = get_event_storage()
            if storage.iter % self.vis_period == 0 and storage.iter > 0:
                self.visualize_training(batched_inputs, proposals, instances)

        losses = {}
        losses.update(detector_losses)
        losses.update(proposal_losses)
        return losses

    def inference(
        self,
        batched_inputs: List[Dict[str, torch.Tensor]],
        detected_instances: Optional[List[Instances]] = None,
        do_postprocess: bool = True,
    ):
        """Inference pass; supports optional oracle 2D boxes in the inputs."""
        assert not self.training

        images = self.preprocess_image(batched_inputs)

        # scaling factor for the sample relative to its original scale
        # e.g., how much has the image been upsampled by? or downsampled?
        im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)]

        # The unmodified intrinsics for the image
        Ks = [torch.FloatTensor(info['K']) for info in batched_inputs]

        features = self.backbone(images.tensor)

        # Pass oracle 2D boxes into the RoI heads.
        # FIX(review): original wrote `type(batched_inputs == list)` which is
        # `type(False)` (always truthy); isinstance() is what was intended.
        if isinstance(batched_inputs, list) and np.any(['oracle2D' in b for b in batched_inputs]):
            oracles = [b['oracle2D'] for b in batched_inputs]
            results, _ = self.roi_heads(images, features, oracles, Ks, im_scales_ratio, None)

        # normal inference
        else:
            proposals, _ = self.proposal_generator(images, features, None)
            results, _ = self.roi_heads(images, features, proposals, Ks, im_scales_ratio, None)

        if do_postprocess:
            assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess."
            return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes)
        else:
            return results

    def visualize_training(self, batched_inputs, proposals, instances):
        """
        A function used to visualize images and proposals. It shows ground truth
        bounding boxes on the original image and up to 20 top-scoring predicted
        object proposals on the original image. Users can implement different
        visualization functions for different models.
        Args:
            batched_inputs (list): a list that contains input to the model.
            proposals (list): a list that contains predicted proposals. Both
                batched_inputs and proposals should have the same length.
            instances (list): a list that contains predicted RoIhead instances. Both
                batched_inputs and proposals should have the same length.
        """
        storage = get_event_storage()

        # minimum number of boxes to try to visualize per image
        max_vis_prop = 20

        if not hasattr(self, 'thing_classes'):
            self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes
            self.num_classes = len(self.thing_classes)

        # make a dummy for 2d scenario
        only2d = instances is None
        if only2d:
            instances = [None] * len(batched_inputs)

        for input, prop, instances_i in zip(batched_inputs, proposals, instances):

            img = input["image"]
            img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format)
            # NOTE(review): channel pick [2, 1, 1] duplicates green into the
            # blue slot; [2, 1, 0] looks intended for BGR -- confirm.
            img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]])  # BGR
            img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]])  # BGR

            '''
            Visualize the 2D GT and proposal predictions
            '''
            v_gt = Visualizer(img, None)
            v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes)
            anno_img = v_gt.get_image()
            box_size = min(len(prop.proposal_boxes), max_vis_prop)
            v_pred = Visualizer(img, None)
            v_pred = v_pred.overlay_instances(
                boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy()
            )
            prop_img = v_pred.get_image()
            vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1)
            vis_img_rpn = vis_img_rpn.transpose(2, 0, 1)
            storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn)
            if only2d:
                break

            '''
            Visualize the 3D GT and predictions
            '''
            K = torch.tensor(input['K'], device=self.device)
            scale = input['height'] / img.shape[0]
            fx, sx = (val.item() / scale for val in K[0, [0, 2]])
            fy, sy = (val.item() / scale for val in K[1, [1, 2]])

            K_scaled = torch.tensor(
                [[1 / scale, 0, 0], [0, 1 / scale, 0], [0, 0, 1.0]],
                dtype=torch.float32, device=self.device
            ) @ K

            gts_per_image = input["instances"]

            gt_classes = gts_per_image.gt_classes

            # Filter out irrelevant groundtruth
            fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes)

            gt_classes = gt_classes[fg_selection_mask]
            gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes]
            gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask]  # 2D boxes
            gt_poses = gts_per_image.gt_poses[fg_selection_mask]  # GT poses

            # projected 2D center, depth, w, h, l, 3D center
            gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask]

            # this box may have been mirrored and scaled so
            # we need to recompute XYZ in 3D by backprojecting.
            gt_z = gt_boxes3D[:, 2]

            gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx) / fx
            gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy) / fy

            # put together the GT boxes
            gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T
            gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1)

            gt_colors = torch.tensor(
                [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))],
                device=self.device
            ) / 255.0

            gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors)

            # perform a simple NMS, which is not cls dependent.
            keep = batched_nms(
                instances_i.pred_boxes.tensor,
                instances_i.scores,
                torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device),
                self.roi_heads.box_predictor.test_nms_thresh
            )

            keep = keep[:max_vis_prop]
            num_to_visualize = len(keep)

            pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1)
            pred_pose = instances_i.pred_pose[keep]

            pred_colors = torch.tensor(
                [util.get_color(i) for i in range(num_to_visualize)],
                device=self.device
            ) / 255.0

            pred_boxes = instances_i.pred_boxes[keep]
            pred_scores = instances_i.scores[keep]
            pred_classes = instances_i.pred_classes[keep]
            pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)]
            pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors)

            # convert to lists
            pred_meshes = [pred_meshes.__getitem__(i).detach() for i in range(len(pred_meshes))]
            gt_meshes = [gt_meshes.__getitem__(i) for i in range(len(gt_meshes))]

            img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)
            img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85)

            # horizontal stack 3D GT and pred left/right
            vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1)
            vis_img_3d = vis_img_3d[:, :, [2, 1, 0]]  # RGB
            vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1)

            storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d)

            break  # only visualize one image in a batch


@META_ARCH_REGISTRY.register()
class RCNN3D_combined_features(nn.Module):
    """RCNN3D variant whose RoI heads also consume raw images, ground maps,
    depth maps, and (optionally) concatenated depth features."""

    @configurable
    def __init__(self, *, backbone, proposal_generator, roi_heads, input_format, vis_period, pixel_mean, pixel_std, depth_model, only_2d):
        super().__init__()
        self.backbone = backbone
        self.proposal_generator = proposal_generator
        self.roi_heads = roi_heads
        self.input_format = input_format
        self.vis_period = vis_period
        self.depth_model = depth_model  # optional frozen depth net; None disables fusion
        self.only_2d = only_2d

        # per-channel normalization constants, kept out of the state-dict sync
        self.register_buffer("pixel_mean", torch.tensor(pixel_mean).view(-1, 1, 1), False)
        self.register_buffer("pixel_std", torch.tensor(pixel_std).view(-1, 1, 1), False)
        assert (
            self.pixel_mean.shape == self.pixel_std.shape
        ), f"{self.pixel_mean} and {self.pixel_std} have different shapes!"
+ + @classmethod + def from_config(cls, cfg, priors=None): + backbone = build_backbone(cfg, priors=priors) + if False: # some leftover from experimenting with incorporating depth features + depth_model = 'zoedepth' + pretrained_resource = 'local::depth/checkpoints/depth_anything_metric_depth_indoor.pt' + d_model = setup_depth_model(depth_model, pretrained_resource) #NOTE maybe make the depth model be learnable as well + + shape_modified = {key:ShapeSpec(i.channels*2,stride=i.stride) for key, i in backbone.output_shape().items()} + else: + d_model = None + shape_modified = backbone.output_shape() + + return { + "backbone": backbone, + "proposal_generator": build_proposal_generator(cfg, backbone.output_shape()), + "roi_heads": build_roi_heads(cfg, shape_modified, priors=priors), + "input_format": cfg.INPUT.FORMAT, + "vis_period": cfg.VIS_PERIOD, + "pixel_mean": cfg.MODEL.PIXEL_MEAN, + "pixel_std": cfg.MODEL.PIXEL_STD, + "depth_model": d_model, + "only_2d": cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D == 0.0, + } + + + @property + def device(self): + return self.pixel_mean.device + + def _move_to_current_device(self, x): + return move_device_like(x, self.pixel_mean) + + + def preprocess_image(self, batched_inputs: List[Dict[str, torch.Tensor]], normalise=True, img_type="image", convert=False, NoOp=False, to_float=False): + """ + Normalize, pad and batch the input images. 
+ """ + images = [self._move_to_current_device(x[img_type]) for x in batched_inputs] + if normalise: + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + if convert: + # convert from BGR to RGB + images = [x[[2,1,0],:,:] for x in images] + if to_float: + images = [x.float()/255.0 for x in images] + if NoOp: + images = ImageList.from_tensors(images) + return images + images = ImageList.from_tensors( + images, + self.backbone.size_divisibility, + padding_constraints=self.backbone.padding_constraints, + ) + return images + + def _standardize(self, x:torch.Tensor, y:torch.Tensor): + '''standardise x to match the mean and std of y''' + ym = y.mean() + ys = y.std() + xm = x.mean() + xs = x.std() + return (x - xm) * (ys / xs) + ym + + def cat_depth_features(self, features, images_raw): + pred_o = self.depth_model(images_raw.tensor.float()/255.0) + # depth features corresponding to p2, p3, p4, p5 + + d_features = pred_o['depth_features'] + # img_features = features['p5'] + # we must scale the depth map to the same size as the conv feature, otherwise the scale will not correspond correctly in the roi pooling + for (layer, img_feature), d_feature in zip(features.items(), reversed(d_features)): + d_feature = F.interpolate(d_feature, size=img_feature.shape[-2:], mode='bilinear', align_corners=True) + d_feature = self._standardize(d_feature, img_feature) + features[layer] = torch.cat((img_feature, d_feature), dim=1) + return features + + def forward(self, batched_inputs: List[Dict[str, torch.Tensor]]): + + if not self.training: + return self.inference(batched_inputs) # segmentor is just none in inference because we dont need the loss + + images = self.preprocess_image(batched_inputs) + # NOTE: images_raw are scaled to be padded to the same size as the largest. + # This is necessary because the images are of different sizes, so to batch them they must each be the same size. 
+ images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True) + # if we want depth maps they are there + if not self.only_2d: + depth_maps = self.preprocess_image(batched_inputs, img_type="depth_map", normalise=False, NoOp=True) + + ground_maps_fail = [i['ground_map'] is None for i in batched_inputs] + ground_maps_fail_idx = [i for i, x in enumerate(ground_maps_fail) if x] + for idx in ground_maps_fail_idx: + batched_inputs[idx]['ground_map'] = torch.tensor([[1]]) # make a dummy to indicate a fail + ground_maps = self.preprocess_image(batched_inputs, img_type="ground_map", normalise=False, NoOp=True) + else: + ground_maps = None + depth_maps = None + + # scaling factor for the sample relative to its original scale + # e.g., how much has the image been upsampled by? or downsampled? + im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)] + + # The unmodified intrinsics for the image + Ks = [torch.FloatTensor(info['K']) for info in batched_inputs] + + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + + features = self.backbone(images.tensor) + proposals, proposal_losses = self.proposal_generator(images, features, gt_instances) + + if self.depth_model is not None: + features = self.cat_depth_features(features, images_raw) + + instances, detector_losses = self.roi_heads( + images, images_raw, ground_maps, depth_maps, features, proposals, + Ks, im_scales_ratio, + gt_instances + ) + + if self.vis_period > 0: + storage = get_event_storage() + if storage.iter % self.vis_period == 0 and storage.iter > 0: + self.visualize_training(batched_inputs, proposals, instances) + + losses = {} + losses.update(detector_losses) + losses.update(proposal_losses) + return losses + + def inference( + self, + batched_inputs: List[Dict[str, torch.Tensor]], + detected_instances: Optional[List[Instances]] = None, + do_postprocess: bool = True, 
+ ): + assert not self.training + + images = self.preprocess_image(batched_inputs) + images_raw = self.preprocess_image(batched_inputs, img_type='image', convert=True, normalise=False, NoOp=True) + # do we assume no access to ground maps in inference? + ground_maps = None + depth_maps = None + + # scaling factor for the sample relative to its original scale + # e.g., how much has the image been upsampled by? or downsampled? + im_scales_ratio = [info['height'] / im.shape[1] for (info, im) in zip(batched_inputs, images)] + + # The unmodified intrinsics for the image + Ks = [torch.FloatTensor(info['K']) for info in batched_inputs] + + features = self.backbone(images.tensor) + + # Pass oracle 2D boxes into the RoI heads + if type(batched_inputs == list) and np.any(['oracle2D' in b for b in batched_inputs]): + oracles = [b['oracle2D'] for b in batched_inputs] + results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, oracles, Ks, im_scales_ratio, None) + + # normal inference + else: + proposals, _ = self.proposal_generator(images, features, None) + if self.depth_model is not None: + features = self.cat_depth_features(features, images_raw) + # pred boxes are proposals + results, _ = self.roi_heads(images, images_raw, ground_maps, depth_maps, features, proposals, Ks, im_scales_ratio, None) + + if do_postprocess: + assert not torch.jit.is_scripting(), "Scripting is not supported for postprocess." + return GeneralizedRCNN._postprocess(results, batched_inputs, images.image_sizes) + else: + return results + + def visualize_training(self, batched_inputs, proposals, instances): + """ + A function used to visualize images and proposals. It shows ground truth + bounding boxes on the original image and up to 20 top-scoring predicted + object proposals on the original image. Users can implement different + visualization functions for different models. + Args: + batched_inputs (list): a list that contains input to the model. 
+ proposals (list): a list that contains predicted proposals. Both + batched_inputs and proposals should have the same length. + instances (list): a list that contains predicted RoIhead instances. Both + batched_inputs and proposals should have the same length. + """ + + storage = get_event_storage() + + # minimum number of boxes to try to visualize per image + max_vis_prop = 20 + + if not hasattr(self, 'thing_classes'): + self.thing_classes = MetadataCatalog.get('omni3d_model').thing_classes + self.num_classes = len(self.thing_classes) + only2d = instances is None + if only2d: + instances = [None]*len(batched_inputs) + for input, prop, instances_i in zip(batched_inputs, proposals, instances): + + img = input["image"] + img = convert_image_to_rgb(img.permute(1, 2, 0), self.input_format) + img_3DGT = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR + img_3DPR = np.ascontiguousarray(img.copy()[:, :, [2, 1, 1]]) # BGR + + ''' + Visualize the 2D GT and proposal predictions + ''' + v_gt = Visualizer(img, None) + v_gt = v_gt.overlay_instances(boxes=input["instances"].gt_boxes) + anno_img = v_gt.get_image() + box_size = min(len(prop.proposal_boxes), max_vis_prop) + v_pred = Visualizer(img, None) + v_pred = v_pred.overlay_instances( + boxes=prop.proposal_boxes[0:box_size].tensor.cpu().numpy() + ) + prop_img = v_pred.get_image() + vis_img_rpn = np.concatenate((anno_img, prop_img), axis=1) + vis_img_rpn = vis_img_rpn.transpose(2, 0, 1) + storage.put_image("Left: GT 2D bounding boxes; Right: Predicted 2D proposals", vis_img_rpn) + if only2d: + break + ''' + Visualize the 3D GT and predictions + ''' + K = torch.tensor(input['K'], device=self.device) + scale = input['height']/img.shape[0] + fx, sx = (val.item()/scale for val in K[0, [0, 2]]) + fy, sy = (val.item()/scale for val in K[1, [1, 2]]) + + K_scaled = torch.tensor( + [[1/scale, 0 , 0], [0, 1/scale, 0], [0, 0, 1.0]], + dtype=torch.float32, device=self.device + ) @ K + + gts_per_image = input["instances"] + + 
gt_classes = gts_per_image.gt_classes + + # Filter out irrelevant groundtruth + fg_selection_mask = (gt_classes != -1) & (gt_classes < self.num_classes) + + gt_classes = gt_classes[fg_selection_mask] + gt_class_names = [self.thing_classes[cls_idx] for cls_idx in gt_classes] + gt_boxes = gts_per_image.gt_boxes.tensor[fg_selection_mask] # 2D boxes + gt_poses = gts_per_image.gt_poses[fg_selection_mask] # GT poses + + # projected 2D center, depth, w, h, l, 3D center + gt_boxes3D = gts_per_image.gt_boxes3D[fg_selection_mask] + + # this box may have been mirrored and scaled so + # we need to recompute XYZ in 3D by backprojecting. + gt_z = gt_boxes3D[:, 2] + + gt_x3D = gt_z * (gt_boxes3D[:, 0] - sx)/fx + gt_y3D = gt_z * (gt_boxes3D[:, 1] - sy)/fy + + # put together the GT boxes + gt_center_3D = torch.stack((gt_x3D, gt_y3D, gt_z)).T + gt_boxes3D_XYZ_WHL = torch.cat((gt_center_3D, gt_boxes3D[:, 3:6]), dim=1) + + gt_colors = torch.tensor( + [util.get_color(i) for i in range(len(gt_boxes3D_XYZ_WHL))], + device=self.device + )/255.0 + + gt_meshes = util.mesh_cuboid(gt_boxes3D_XYZ_WHL, gt_poses, gt_colors) + + # perform a simple NMS, which is not cls dependent. 
+ keep = batched_nms( + instances_i.pred_boxes.tensor, + instances_i.scores, + torch.zeros(len(instances_i.scores), dtype=torch.long, device=instances_i.scores.device), + self.roi_heads.box_predictor.test_nms_thresh + ) + + keep = keep[:max_vis_prop] + num_to_visualize = len(keep) + + pred_xyzwhl = torch.cat((instances_i.pred_center_cam[keep], instances_i.pred_dimensions[keep]), dim=1) + pred_pose = instances_i.pred_pose[keep] + + pred_colors = torch.tensor( + [util.get_color(i) for i in range(num_to_visualize)], + device=self.device + )/255.0 + + pred_boxes = instances_i.pred_boxes[keep] + pred_scores = instances_i.scores[keep] + pred_classes = instances_i.pred_classes[keep] + pred_class_names = ['{} {:.2f}'.format(self.thing_classes[cls_idx], score) for cls_idx, score in zip(pred_classes, pred_scores)] + pred_meshes = util.mesh_cuboid(pred_xyzwhl, pred_pose, pred_colors) + + # convert to lists + pred_meshes = [pred_meshes.__getitem__(i).detach() for i in range(len(pred_meshes))] + gt_meshes = [gt_meshes.__getitem__(i) for i in range(len(gt_meshes))] + + img_3DPR = vis.draw_scene_view(img_3DPR, K_scaled.cpu().numpy(), pred_meshes, text=pred_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85) + img_3DGT = vis.draw_scene_view(img_3DGT, K_scaled.cpu().numpy(), gt_meshes, text=gt_class_names, mode='front', blend_weight=0.0, blend_weight_overlay=0.85) + + # horizontal stack 3D GT and pred left/right + vis_img_3d = np.concatenate((img_3DGT, img_3DPR), axis=1) + vis_img_3d = vis_img_3d[:, :, [2, 1, 0]] # RGB + vis_img_3d = vis_img_3d.astype(np.uint8).transpose(2, 0, 1) + + storage.put_image("Left: GT 3D cuboids; Right: Predicted 3D cuboids", vis_img_3d) + + break # only visualize one image in a batch + +def build_model(cfg, priors=None): + """ + Build the whole model architecture, defined by ``cfg.MODEL.META_ARCHITECTURE``. + Note that it does not load any weights from ``cfg``. 
+ """ + meta_arch = cfg.MODEL.META_ARCHITECTURE + model = META_ARCH_REGISTRY.get(meta_arch)(cfg, priors=priors) + model.to(torch.device(cfg.MODEL.DEVICE)) + _log_api_usage("modeling.meta_arch." + meta_arch) + return model + +def build_backbone(cfg, input_shape=None, priors=None): + """ + Build a backbone from `cfg.MODEL.BACKBONE.NAME`. + + Returns: + an instance of :class:`Backbone` + """ + if input_shape is None: + input_shape = ShapeSpec(channels=len(cfg.MODEL.PIXEL_MEAN)) + + backbone_name = cfg.MODEL.BACKBONE.NAME + backbone = BACKBONE_REGISTRY.get(backbone_name)(cfg, input_shape, priors) + assert isinstance(backbone, Backbone) + return backbone \ No newline at end of file diff --git a/cubercnn/modeling/proposal_generator/__init__.py b/cubercnn/modeling/proposal_generator/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1ad2b8e8960ac3fd1051eb0a59027a9fcd6dc69c --- /dev/null +++ b/cubercnn/modeling/proposal_generator/__init__.py @@ -0,0 +1 @@ +from .rpn import * diff --git a/cubercnn/modeling/proposal_generator/rpn.py b/cubercnn/modeling/proposal_generator/rpn.py new file mode 100644 index 0000000000000000000000000000000000000000..0d0ab29376c216691281b080111b7773f8532862 --- /dev/null +++ b/cubercnn/modeling/proposal_generator/rpn.py @@ -0,0 +1,354 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +from typing import Dict, List, Tuple +import torch +from typing import List, Tuple, Union +import torch.nn.functional as F +from detectron2.config import configurable +from detectron2.utils.events import get_event_storage +from detectron2.layers import ShapeSpec, cat +from detectron2.structures import Boxes, Instances, pairwise_iou, pairwise_ioa +from detectron2.utils.memory import retry_if_cuda_oom +from fvcore.nn import smooth_l1_loss +from detectron2.layers import cat +from detectron2.layers import nonzero_tuple + +from detectron2.modeling.box_regression import Box2BoxTransform, _dense_box_regression_loss +from detectron2.modeling.proposal_generator import RPN +from detectron2.modeling import PROPOSAL_GENERATOR_REGISTRY + +@PROPOSAL_GENERATOR_REGISTRY.register() +class RPNWithIgnore(RPN): + + @configurable + def __init__( + self, + *, + ignore_thresh: float = 0.5, + objectness_uncertainty: str = 'none', + **kwargs + ): + super().__init__(**kwargs) + self.ignore_thresh = ignore_thresh + self.objectness_uncertainty = objectness_uncertainty + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + ret = super().from_config(cfg, input_shape) + ret["ignore_thresh"] = cfg.MODEL.RPN.IGNORE_THRESHOLD + ret["objectness_uncertainty"] = cfg.MODEL.RPN.OBJECTNESS_UNCERTAINTY + return ret + + @torch.jit.unused + @torch.no_grad() + def label_and_sample_anchors(self, anchors: List[Boxes], gt_instances: List[Instances]) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: + + anchors = Boxes.cat(anchors) + + # separate valid and ignore gts + gt_boxes_ign = [x.gt_boxes[x.gt_classes < 0] for x in gt_instances] + gt_boxes = [x.gt_boxes[x.gt_classes >= 0] for x in gt_instances] + + del gt_instances + + gt_labels = [] + matched_gt_boxes = [] + + for gt_boxes_i, gt_boxes_ign_i in zip(gt_boxes, gt_boxes_ign): + """ + gt_boxes_i: ground-truth boxes for i-th image + gt_boxes_ign_i: ground-truth ignore boxes for i-th image + """ + + 
match_quality_matrix = retry_if_cuda_oom(pairwise_iou)(gt_boxes_i, anchors) + matched_idxs, gt_labels_i = retry_if_cuda_oom(self.anchor_matcher)(match_quality_matrix) + + # Matching is memory-expensive and may result in CPU tensors. But the result is small + gt_labels_i = gt_labels_i.to(device=gt_boxes_i.device) + + gt_arange = torch.arange(match_quality_matrix.shape[1]).to(matched_idxs.device) + matched_ious = match_quality_matrix[matched_idxs, gt_arange] + + best_ious_gt_vals, best_ious_gt_ind = match_quality_matrix.max(dim=1) + + del match_quality_matrix + + best_inds = torch.tensor(list(set(best_ious_gt_ind.tolist()) & set((gt_labels_i == 1).nonzero().squeeze(1).tolist()))) + + # A vector of labels (-1, 0, 1) for each anchor + # which denote (ignore, background, foreground) + gt_labels_i = self._subsample_labels(gt_labels_i, matched_ious=matched_ious) + + # overrride the best possible GT options, always selected for sampling. + # otherwise aggressive thresholds may produce HUGE amounts of low quality FG. 
+ if best_inds.numel() > 0: + gt_labels_i[best_inds] = 1.0 + + if len(gt_boxes_i) == 0: + # These values won't be used anyway since the anchor is labeled as background + matched_gt_boxes_i = torch.zeros_like(anchors.tensor) + else: + # TODO wasted indexing computation for ignored boxes + matched_gt_boxes_i = gt_boxes_i[matched_idxs].tensor + + if len(gt_boxes_ign_i) > 0: + + # compute the quality matrix, only on subset of background + background_inds = (gt_labels_i == 0).nonzero().squeeze() + + if background_inds.numel() > 1: + + match_quality_matrix_ign = retry_if_cuda_oom(pairwise_ioa)(gt_boxes_ign_i, anchors[background_inds]) + + # determine the boxes inside ignore regions with sufficient threshold + gt_labels_i[background_inds[match_quality_matrix_ign.max(0)[0] >= self.ignore_thresh]] = -1 + + del match_quality_matrix_ign + + gt_labels.append(gt_labels_i) # N,AHW + matched_gt_boxes.append(matched_gt_boxes_i) + + return gt_labels, matched_gt_boxes + + def _subsample_labels(self, label, matched_ious=None): + """ + Randomly sample a subset of positive and negative examples, and overwrite + the label vector to the ignore value (-1) for all elements that are not + included in the sample. + Args: + labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned. + """ + pos_idx, neg_idx = subsample_labels( + label, self.batch_size_per_image, self.positive_fraction, 0, matched_ious=matched_ious + ) + # Fill with the ignore label (-1), then set positive and negative labels + label.fill_(-1) + label.scatter_(0, pos_idx, 1) + label.scatter_(0, neg_idx, 0) + return label + + @torch.jit.unused + def losses( + self, + anchors: List[Boxes], + pred_objectness_logits: List[torch.Tensor], + gt_labels: List[torch.Tensor], + pred_anchor_deltas: List[torch.Tensor], + gt_boxes: List[torch.Tensor], + ) -> Dict[str, torch.Tensor]: + """ + Return the losses from a set of RPN predictions and their associated ground-truth. 
+ + Args: + anchors (list[Boxes or RotatedBoxes]): anchors for each feature map, each + has shape (Hi*Wi*A, B), where B is box dimension (4 or 5). + pred_objectness_logits (list[Tensor]): A list of L elements. + Element i is a tensor of shape (N, Hi*Wi*A) representing + the predicted objectness logits for all anchors. + gt_labels (list[Tensor]): Output of :meth:`label_and_sample_anchors`. + pred_anchor_deltas (list[Tensor]): A list of L elements. Element i is a tensor of shape + (N, Hi*Wi*A, 4 or 5) representing the predicted "deltas" used to transform anchors + to proposals. + gt_boxes (list[Tensor]): Output of :meth:`label_and_sample_anchors`. + + Returns: + dict[loss name -> loss value]: A dict mapping from loss name to loss value. + Loss names are: `loss_rpn_cls` for objectness classification and + `loss_rpn_loc` for proposal localization. + """ + num_images = len(gt_labels) + gt_labels = torch.stack(gt_labels) # (N, sum(Hi*Wi*Ai)) + + # Log the number of positive/negative anchors per-image that's used in training + pos_mask = gt_labels == 1 + num_pos_anchors = pos_mask.sum().item() + num_neg_anchors = (gt_labels == 0).sum().item() + storage = get_event_storage() + storage.put_scalar("rpn/num_pos_anchors", num_pos_anchors / num_images) + storage.put_scalar("rpn/num_neg_anchors", num_neg_anchors / num_images) + + if not self.objectness_uncertainty.lower() in ['none']: + localization_loss, objectness_loss = _dense_box_regression_loss_with_uncertainty( + anchors, + self.box2box_transform, + pred_anchor_deltas, + pred_objectness_logits, + gt_boxes, + pos_mask, + box_reg_loss_type=self.box_reg_loss_type, + smooth_l1_beta=self.smooth_l1_beta, + uncertainty_type=self.objectness_uncertainty, + ) + else: + localization_loss = _dense_box_regression_loss( + anchors, + self.box2box_transform, + pred_anchor_deltas, + gt_boxes, + pos_mask, + box_reg_loss_type=self.box_reg_loss_type, + smooth_l1_beta=self.smooth_l1_beta, + ) + + valid_mask = gt_labels >= 0 + objectness_loss = 
def _dense_box_regression_loss_with_uncertainty(
    anchors: List[Union[Boxes, torch.Tensor]],
    box2box_transform: Box2BoxTransform,
    pred_anchor_deltas: List[torch.Tensor],
    pred_objectness_logits: List[torch.Tensor],
    gt_boxes: List[torch.Tensor],
    fg_mask: torch.Tensor,
    box_reg_loss_type="smooth_l1",
    smooth_l1_beta=0.0,
    uncertainty_type='centerness',
):
    """
    Compute dense multi-level box regression and IoU-aware objectness losses.

    The objectness target of each foreground anchor is its IoU with the
    matched ground-truth box, and both the BCE objectness term and the
    regression term are weighted by that IoU, so well-localized anchors
    dominate training. Losses are accumulated over ``fg_mask``.

    Args:
        anchors: #lvl anchor boxes, each is (Hi*Wi*A, 4).
        box2box_transform: transform used to compute regression deltas.
        pred_anchor_deltas: #lvl predictions, each is (N, Hi*Wi*A, 4).
        pred_objectness_logits: #lvl predictions, each is (N, Hi*Wi*A).
        gt_boxes: N ground-truth tensors, each of shape (R, 4),
            R = sum(Hi * Wi * A).
        fg_mask: boolean foreground mask of shape (N, R) selecting the
            anchors that contribute to the losses.
        box_reg_loss_type (str): only "smooth_l1" is implemented here;
            any other value raises ValueError.
        smooth_l1_beta (float): beta for the smooth L1 loss; 0 means plain L1.
        uncertainty_type (str): kept for config compatibility; it does not
            switch behavior inside this function.

    Returns:
        (loss_box_reg, loss_box_conf): two scalar tensors, summed but NOT
        normalized — the caller applies its own normalizer.
    """
    if isinstance(anchors[0], Boxes):
        anchors = type(anchors[0]).cat(anchors).tensor  # (R, 4)
    else:
        anchors = cat(anchors)

    n = len(gt_boxes)

    # IoU between every foreground anchor and its matched GT box; detached so
    # it acts purely as a target / weight, not as a gradient path.
    boxes_fg = Boxes(anchors.unsqueeze(0).repeat([n, 1, 1])[fg_mask])
    gt_boxes_fg = Boxes(torch.stack(gt_boxes)[fg_mask].detach())
    objectness_targets_anchors = matched_pairwise_iou(boxes_fg, gt_boxes_fg).detach()

    objectness_logits = torch.cat(pred_objectness_logits, dim=1)

    # Numerically the same as (-(y*torch.log(p) + (1 - y)*torch.log(1 - p))).sum()
    loss_box_conf = F.binary_cross_entropy_with_logits(
        objectness_logits[fg_mask],
        objectness_targets_anchors,
        reduction='none'
    )
    # weight each anchor's BCE by its IoU target before summing
    loss_box_conf = (loss_box_conf * objectness_targets_anchors).sum()

    # keep track of how scores look for FG / BG.
    # ideally, FG slowly >>> BG scores as regression improves.
    storage = get_event_storage()
    storage.put_scalar("rpn/conf_pos_anchors", torch.sigmoid(objectness_logits[fg_mask]).mean().item())
    storage.put_scalar("rpn/conf_neg_anchors", torch.sigmoid(objectness_logits[~fg_mask]).mean().item())

    if box_reg_loss_type == "smooth_l1":
        gt_anchor_deltas = [box2box_transform.get_deltas(anchors, k) for k in gt_boxes]
        gt_anchor_deltas = torch.stack(gt_anchor_deltas)  # (N, R, 4)
        loss_box_reg = smooth_l1_loss(
            cat(pred_anchor_deltas, dim=1)[fg_mask],
            gt_anchor_deltas[fg_mask],
            beta=smooth_l1_beta,
            reduction="none",
        )
        # per-anchor L1 summed over the 4 box coords, IoU-weighted per anchor
        loss_box_reg = (loss_box_reg.sum(dim=1) * objectness_targets_anchors).sum()
    else:
        raise ValueError(f"Invalid dense box regression loss type '{box_reg_loss_type}'")

    return loss_box_reg, loss_box_conf
def subsample_labels(
    labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int, matched_ious=None, eps=1e-4
):
    """
    Randomly pick up to ``num_samples`` entries from ``labels``, balancing
    positives and negatives.

    At most ``int(num_samples * positive_fraction)`` positives are returned;
    the remaining budget is filled with negatives, and each side is further
    capped by availability. If ``matched_ious`` is given, sampling is
    IoU-weighted (multinomial) instead of uniform.

    Args:
        labels (Tensor): (N,) vector with values:
            * -1: ignore
            * ``bg_label``: background ("negative") class
            * otherwise: one or more foreground ("positive") classes
        num_samples (int): total sampling budget (labels with value >= 0).
        positive_fraction (float): maximum fraction of the budget spent on
            positives; shortfalls are filled with negatives.
        bg_label (int): label index of the background ("negative") class.
        matched_ious (Tensor | None): optional per-element IoU used as
            sampling weights.
        eps (float): small constant keeping multinomial weights positive.

    Returns:
        pos_idx, neg_idx (Tensor): 1D index vectors; their combined length is
        ``num_samples`` or fewer.
    """
    positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0]
    negative = nonzero_tuple(labels == bg_label)[0]

    # cap each side by both the budget split and availability
    num_pos = min(positive.numel(), int(num_samples * positive_fraction))
    num_neg = min(negative.numel(), num_samples - num_pos)

    def _draw(candidates, count):
        # IoU-weighted draw when weights are available, else a uniform one
        if count > 0 and matched_ious is not None:
            return torch.multinomial(matched_ious[candidates] + eps, count)
        return torch.randperm(candidates.numel(), device=candidates.device)[:count]

    pos_idx = positive[_draw(positive, num_pos)]
    neg_idx = negative[_draw(negative, num_neg)]
    return pos_idx, neg_idx

def matched_pairwise_iou(boxes1: Boxes, boxes2: Boxes) -> torch.Tensor:
    """
    Compute elementwise intersection over union (IoU) of two matched box
    lists of equal length.

    Similar to :func:`pairwise_iou`, but computes only the diagonal of the
    full IoU matrix (box i of ``boxes1`` vs box i of ``boxes2``).

    Args:
        boxes1 (Boxes): bounding boxes, sized [N, 4].
        boxes2 (Boxes): same length as ``boxes1``.

    Returns:
        Tensor: iou, sized [N].
    """
    # fixed: the two adjacent literals used to concatenate without a space
    # ("the samenumber of entries")
    assert len(boxes1) == len(
        boxes2
    ), "boxlists should have the same number of entries, got {}, {}".format(
        len(boxes1), len(boxes2)
    )
    area1 = boxes1.area()  # [N]
    area2 = boxes2.area()  # [N]
    box1, box2 = boxes1.tensor, boxes2.tensor
    # intersection rectangle: max of top-left corners, min of bottom-right
    lt = torch.max(box1[:, :2], box2[:, :2])  # [N,2]
    rb = torch.min(box1[:, 2:], box2[:, 2:])  # [N,2]
    wh = (rb - lt).clamp(min=0)  # [N,2]
    inter = wh[:, 0] * wh[:, 1]  # [N]
    iou = inter / (area1 + area2 - inter)  # [N]
    return iou
# Copyright (c) Meta Platforms, Inc. and affiliates
from detectron2.utils.registry import Registry
from typing import Dict
from detectron2.layers import ShapeSpec
from torch import nn
import torch
import numpy as np
import fvcore.nn.weight_init as weight_init

from pytorch3d.transforms.rotation_conversions import _copysign
from pytorch3d.transforms import (
    rotation_6d_to_matrix,
    euler_angles_to_matrix,
    quaternion_to_matrix
)

ROI_CUBE_HEAD_REGISTRY = Registry("ROI_CUBE_HEAD")

@ROI_CUBE_HEAD_REGISTRY.register()
class CubeHead(nn.Module):
    """
    Head that predicts per-class 3D cuboid parameters for each RoI:
    projected-center XY deltas, center depth Z (optionally binned into
    clusters), dimensions, a rotation matrix, and an optional confidence.
    """

    def __init__(self, cfg, input_shape: ShapeSpec):
        super().__init__()

        # -------------------------------------------
        # Settings
        # -------------------------------------------
        self.num_classes = cfg.MODEL.ROI_HEADS.NUM_CLASSES
        self.use_conf = cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE
        self.z_type = cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE
        self.pose_type = cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE
        self.cluster_bins = cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS
        self.shared_fc = cfg.MODEL.ROI_CUBE_HEAD.SHARED_FC

        # -------------------------------------------
        # Feature generator(s): one shared FC stack, or one independent
        # stack per output group (XY / dims / pose / Z / conf)
        # -------------------------------------------
        num_conv = cfg.MODEL.ROI_CUBE_HEAD.NUM_CONV
        conv_dim = cfg.MODEL.ROI_CUBE_HEAD.CONV_DIM
        num_fc = cfg.MODEL.ROI_CUBE_HEAD.NUM_FC
        fc_dim = cfg.MODEL.ROI_CUBE_HEAD.FC_DIM

        conv_dims = [conv_dim] * num_conv
        fc_dims = [fc_dim] * num_fc
        assert len(conv_dims) + len(fc_dims) > 0

        self._output_size = (input_shape.channels, input_shape.height, input_shape.width)

        if self.shared_fc:
            self.feature_generator = nn.Sequential()
        else:
            self.feature_generator_XY = nn.Sequential()
            self.feature_generator_dims = nn.Sequential()
            self.feature_generator_pose = nn.Sequential()
            self.feature_generator_Z = nn.Sequential()
            if self.use_conf:
                self.feature_generator_conf = nn.Sequential()

        # create the fully connected layers for the Cube Head
        for layer_idx, out_dim in enumerate(fc_dims):
            in_dim = int(np.prod(self._output_size))
            self._output_size = out_dim

            if self.shared_fc:
                self._append_fc(self.feature_generator, layer_idx, in_dim, out_dim)
            else:
                # creation order (dims, XY, pose, Z, conf) matches the
                # original code so parameter/RNG ordering is preserved
                self._append_fc(self.feature_generator_dims, layer_idx, in_dim, out_dim)
                self._append_fc(self.feature_generator_XY, layer_idx, in_dim, out_dim)
                self._append_fc(self.feature_generator_pose, layer_idx, in_dim, out_dim)
                self._append_fc(self.feature_generator_Z, layer_idx, in_dim, out_dim)
                if self.use_conf:
                    self._append_fc(self.feature_generator_conf, layer_idx, in_dim, out_dim)

        # -------------------------------------------
        # 3D outputs (one prediction set per class)
        # -------------------------------------------

        # Dimensions in meters (width, height, length)
        self.bbox_3D_dims = nn.Linear(self._output_size, self.num_classes * 3)
        nn.init.normal_(self.bbox_3D_dims.weight, std=0.001)
        nn.init.constant_(self.bbox_3D_dims.bias, 0)

        cluster_bins = self.cluster_bins if self.cluster_bins > 1 else 1

        # XY deltas of the projected 3D center
        self.bbox_3D_center_deltas = nn.Linear(self._output_size, self.num_classes * 2)
        nn.init.normal_(self.bbox_3D_center_deltas.weight, std=0.001)
        nn.init.constant_(self.bbox_3D_center_deltas.bias, 0)

        # Pose: output width depends on the rotation parameterization
        if self.pose_type == '6d':
            pose_dof = 6
        elif self.pose_type == 'quaternion':
            pose_dof = 4
        elif self.pose_type == 'euler':
            pose_dof = 3
        else:
            raise ValueError('Cuboid pose type {} is not recognized'.format(self.pose_type))

        self.bbox_3D_pose = nn.Linear(self._output_size, self.num_classes * pose_dof)
        nn.init.normal_(self.bbox_3D_pose.weight, std=0.001)
        nn.init.constant_(self.bbox_3D_pose.bias, 0)

        # Z (depth of the 3D center), one output per cluster bin
        self.bbox_3D_center_depth = nn.Linear(self._output_size, self.num_classes * cluster_bins)
        nn.init.normal_(self.bbox_3D_center_depth.weight, std=0.001)
        nn.init.constant_(self.bbox_3D_center_depth.bias, 1)  # NOTE Changed second input from 0 to 1

        # Optionally, box confidence
        if self.use_conf:
            self.bbox_3D_uncertainty = nn.Linear(self._output_size, self.num_classes * 1)
            nn.init.normal_(self.bbox_3D_uncertainty.weight, std=0.001)
            nn.init.constant_(self.bbox_3D_uncertainty.bias, 5)

    @staticmethod
    def _append_fc(stack, layer_idx, in_dim, out_dim):
        # one Linear + ReLU pair, Caffe2-Xavier initialized
        fc = nn.Linear(in_dim, out_dim)
        weight_init.c2_xavier_fill(fc)
        stack.add_module("fc{}".format(layer_idx + 1), fc)
        stack.add_module("fc_relu{}".format(layer_idx + 1), nn.ReLU())

    def forward(self, x):
        """
        Args:
            x: pooled RoI features; flattened by the first FC layer.

        Returns:
            (box_2d_deltas, box_z, box_dims, box_pose, box_uncert), all
            per-class; ``box_uncert`` is None when confidence is disabled.
        """
        n = x.shape[0]

        box_uncert = None

        if self.shared_fc:
            feats = self.feature_generator(x)
            box_2d_deltas = self.bbox_3D_center_deltas(feats)
            box_dims = self.bbox_3D_dims(feats)
            box_pose = self.bbox_3D_pose(feats)
            box_z = self.bbox_3D_center_depth(feats)
            if self.use_conf:
                # clip to keep uncertainties strictly positive
                box_uncert = self.bbox_3D_uncertainty(feats).clip(0.01)
        else:
            box_2d_deltas = self.bbox_3D_center_deltas(self.feature_generator_XY(x))
            box_dims = self.bbox_3D_dims(self.feature_generator_dims(x))
            box_pose = self.bbox_3D_pose(self.feature_generator_pose(x))
            box_z = self.bbox_3D_center_depth(self.feature_generator_Z(x))
            if self.use_conf:
                box_uncert = self.bbox_3D_uncertainty(self.feature_generator_conf(x)).clip(0.01)

        # convert the raw pose vector into a rotation matrix
        if self.pose_type == '6d':
            box_pose = rotation_6d_to_matrix(box_pose.view(-1, 6))
        elif self.pose_type == 'quaternion':
            quats = box_pose.view(-1, 4)
            # normalize; _copysign keeps the real part non-negative
            quats_scales = (quats * quats).sum(1)
            quats = quats / _copysign(torch.sqrt(quats_scales), quats[:, 0])[:, None]
            box_pose = quaternion_to_matrix(quats)
        elif self.pose_type == 'euler':
            box_pose = euler_angles_to_matrix(box_pose.view(-1, 3), 'XYZ')

        box_2d_deltas = box_2d_deltas.view(n, self.num_classes, 2)
        box_dims = box_dims.view(n, self.num_classes, 3)
        box_pose = box_pose.view(n, self.num_classes, 3, 3)

        if self.cluster_bins > 1:
            box_z = box_z.view(n, self.cluster_bins, self.num_classes, -1)
        else:
            box_z = box_z.view(n, self.num_classes, -1)

        return box_2d_deltas, box_z, box_dims, box_pose, box_uncert

def build_cube_head(cfg, input_shape: Dict[str, ShapeSpec]):
    """Build the cube head named by ``cfg.MODEL.ROI_CUBE_HEAD.NAME``."""
    name = cfg.MODEL.ROI_CUBE_HEAD.NAME
    return ROI_CUBE_HEAD_REGISTRY.get(name)(cfg, input_shape)
# Copyright (c) Meta Platforms, Inc. and affiliates
from re import L  # NOTE(review): accidental auto-import (re.LOCALE flag); unused — safe to remove
import torch
from torch.nn import functional as F
from typing import List, Tuple

from fvcore.nn import giou_loss, smooth_l1_loss
from detectron2.utils.events import get_event_storage
from detectron2.layers import cat, cross_entropy, nonzero_tuple, batched_nms
from detectron2.structures import Instances, Boxes
from detectron2.modeling.roi_heads.fast_rcnn import (
    FastRCNNOutputLayers, _log_classification_stats
)
from cubercnn.modeling.proposal_generator.rpn import matched_pairwise_iou

def fast_rcnn_inference(
    boxes: List[torch.Tensor],
    scores: List[torch.Tensor],
    image_shapes: List[Tuple[int, int]],
    score_thresh: float,
    nms_thresh: float,
    topk_per_image: int,
):
    """
    Run :func:`fast_rcnn_inference_single_image` on every image in the batch.

    Args:
        boxes (list[Tensor]): per-image predicted boxes; element i has shape
            (Ri, K * 4) for class-specific regression or (Ri, 4) for
            class-agnostic, matching :meth:`FastRCNNOutputLayers.predict_boxes`.
        scores (list[Tensor]): per-image class scores; element i has shape
            (Ri, K + 1), matching :meth:`FastRCNNOutputLayers.predict_probs`.
        image_shapes (list[tuple]): per-image (width, height).
        score_thresh (float): keep detections scoring above this threshold.
        nms_thresh (float): NMS IoU threshold in [0, 1].
        topk_per_image (int): number of top detections to keep; < 0 keeps all.

    Returns:
        instances (list[Instances]): one per image, with the topk most
            confident detections.
        kept_indices (list[Tensor]): per image, the indices in [0, Ri) of the
            inputs that were kept.
    """
    instances, kept_indices = [], []
    for scores_i, boxes_i, shape_i in zip(scores, boxes, image_shapes):
        inst, keep = fast_rcnn_inference_single_image(
            boxes_i, scores_i, shape_i, score_thresh, nms_thresh, topk_per_image
        )
        instances.append(inst)
        kept_indices.append(keep)
    return instances, kept_indices

def fast_rcnn_inference_single_image(
    boxes,
    scores,
    image_shape: Tuple[int, int],
    score_thresh: float,
    nms_thresh: float,
    topk_per_image: int,
):
    """
    Single-image inference: threshold on scores, then per-class NMS.

    Args:
        Same as :func:`fast_rcnn_inference`, but for one image.

    Returns:
        Same as :func:`fast_rcnn_inference`, but for one image.
    """
    # drop any detection with a non-finite box or score
    valid_mask = torch.isfinite(boxes).all(dim=1) & torch.isfinite(scores).all(dim=1)
    if not valid_mask.all():
        boxes = boxes[valid_mask]
        scores = scores[valid_mask]

    # last column is the background score
    scores = scores[:, :-1]
    num_bbox_reg_classes = boxes.shape[1] // 4

    # Convert to Boxes to use the `clip` function ...
    boxes = Boxes(boxes.reshape(-1, 4))
    boxes.clip(image_shape)
    boxes = boxes.tensor.view(-1, num_bbox_reg_classes, 4)  # R x C x 4

    # 1. Threshold on score first — makes the NMS step cheaper.
    filter_mask = scores > score_thresh  # R x K

    # R' x 2: column 0 = prediction index, column 1 = class index
    filter_inds = filter_mask.nonzero()
    if num_bbox_reg_classes == 1:
        boxes = boxes[filter_inds[:, 0], 0]
    else:
        boxes = boxes[filter_mask]

    # keep the full per-class score vector alongside the winning score
    scores_full = scores[filter_inds[:, 0]]
    scores = scores[filter_mask]

    # 2. Per-class NMS.
    keep = batched_nms(boxes, scores, filter_inds[:, 1], nms_thresh)
    if topk_per_image >= 0:
        keep = keep[:topk_per_image]

    boxes, scores, filter_inds, scores_full = boxes[keep], scores[keep], filter_inds[keep], scores_full[keep]

    result = Instances(image_shape)
    result.pred_boxes = Boxes(boxes)
    result.scores = scores
    result.scores_full = scores_full
    result.pred_classes = filter_inds[:, 1]
    return result, filter_inds[:, 0]
class FastRCNNOutputs(FastRCNNOutputLayers):
    """
    2D box output layers for Cube R-CNN.

    Extends detectron2's ``FastRCNNOutputLayers`` with an inference path that
    keeps the full per-class score vector, and with losses normalized over
    the total number of proposals.
    """

    def inference(self, predictions: Tuple[torch.Tensor, torch.Tensor], proposals: List[Instances]):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals matching the features used
                to compute predictions; the ``proposal_boxes`` field is expected.

        Returns:
            list[Instances]: same as :func:`fast_rcnn_inference`.
            list[Tensor]: same as :func:`fast_rcnn_inference`.
        """
        pred_boxes = self.predict_boxes(predictions, proposals)
        pred_scores = self.predict_probs(predictions, proposals)
        shapes = [p.image_size for p in proposals]
        return fast_rcnn_inference(
            pred_boxes,
            pred_scores,
            shapes,
            self.test_score_thresh,
            self.test_nms_thresh,
            self.test_topk_per_image,
        )

    def losses(self, predictions, proposals):
        """
        Args:
            predictions: return values of :meth:`forward()`.
            proposals (list[Instances]): proposals matching the features used
                to compute predictions; ``proposal_boxes``, ``gt_boxes`` and
                ``gt_classes`` are expected.

        Returns:
            Dict[str, Tensor]: dict of losses, scaled by ``self.loss_weight``.
        """
        scores, proposal_deltas = predictions

        # classification targets
        if len(proposals):
            gt_classes = cat([p.gt_classes for p in proposals], dim=0)
        else:
            gt_classes = torch.empty(0)

        # regression targets
        if len(proposals):
            proposal_boxes = cat([p.proposal_boxes.tensor for p in proposals], dim=0)  # Nx4
            assert not proposal_boxes.requires_grad, "Proposals should not require gradients!"
            # Proposals without "gt_boxes" are all-negative; their own
            # proposal_boxes serve as placeholders and never reach the
            # regression loss in self.box_reg_loss().
            gt_boxes = cat(
                [(p.gt_boxes if p.has("gt_boxes") else p.proposal_boxes).tensor for p in proposals],
                dim=0,
            )
        else:
            proposal_boxes = gt_boxes = torch.empty((0, 4), device=proposal_deltas.device)

        normalize_factor = max(gt_classes.numel(), 1.0)

        # standard Faster R-CNN losses
        _log_classification_stats(scores, gt_classes)
        loss_cls = cross_entropy(scores, gt_classes, reduction="mean")
        reg_losses = self.box_reg_loss(
            proposal_boxes, gt_boxes, proposal_deltas, gt_classes, reduction="none"
        )
        losses = {
            "BoxHead/loss_cls": loss_cls,
            "BoxHead/loss_box_reg": reg_losses.sum() / normalize_factor,
        }
        return {k: v * self.loss_weight.get(k, 1.0) for k, v in losses.items()}

    def box_reg_loss(self, proposal_boxes, gt_boxes, pred_deltas, gt_classes, reduction='mean'):
        """
        Args:
            All boxes are tensors with the same shape Rx(4 or 5);
            ``gt_classes`` is a long tensor of shape R (gt label per proposal).
            reduction (str): 'mean' normalizes by the total proposal count R;
                'none' returns the unreduced smooth-L1 values (smooth_l1 only).
        """
        box_dim = proposal_boxes.shape[1]  # 4 or 5

        # regression loss is only defined on foreground proposals
        fg_inds = nonzero_tuple((gt_classes >= 0) & (gt_classes < self.num_classes))[0]
        if pred_deltas.shape[1] == box_dim:  # cls-agnostic regression
            fg_pred_deltas = pred_deltas[fg_inds]
        else:
            fg_pred_deltas = pred_deltas.view(-1, self.num_classes, box_dim)[
                fg_inds, gt_classes[fg_inds]
            ]

        if reduction == 'mean':
            if self.box_reg_loss_type == "smooth_l1":
                target_deltas = self.box2box_transform.get_deltas(
                    proposal_boxes[fg_inds],
                    gt_boxes[fg_inds],
                )
                loss_box_reg = smooth_l1_loss(
                    fg_pred_deltas, target_deltas, self.smooth_l1_beta, reduction="sum"
                )
            elif self.box_reg_loss_type == "giou":
                fg_pred_boxes = self.box2box_transform.apply_deltas(
                    fg_pred_deltas, proposal_boxes[fg_inds]
                )
                loss_box_reg = giou_loss(fg_pred_boxes, gt_boxes[fg_inds], reduction="sum")
            else:
                raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

            # Normalize by the TOTAL number of regions R, not only foreground,
            # so that every foreground example carries equal influence no
            # matter how many foregrounds its minibatch happens to contain.
            return loss_box_reg / max(gt_classes.numel(), 1.0)  # return 0 if empty

        elif reduction == 'none':
            if self.box_reg_loss_type == "smooth_l1":
                target_deltas = self.box2box_transform.get_deltas(
                    proposal_boxes[fg_inds],
                    gt_boxes[fg_inds],
                )
                # unreduced: the caller applies its own weighting/normalizer
                return smooth_l1_loss(
                    fg_pred_deltas, target_deltas, self.smooth_l1_beta, reduction="none"
                )
            raise ValueError(f"Invalid bbox reg loss type '{self.box_reg_loss_type}'")

        else:
            raise ValueError(f"Invalid bbox reg reduction type '{reduction}'")
from ProposalNetwork.utils.conversions import cubes_to_box
from ProposalNetwork.utils.spaces import Cubes
from ProposalNetwork.utils.utils import iou_2d, convex_hull
from cubercnn.modeling.roi_heads.cube_head import build_cube_head
from cubercnn.modeling.proposal_generator.rpn import subsample_labels
from cubercnn.modeling.roi_heads.fast_rcnn import FastRCNNOutputs
from cubercnn import util

from torchvision.ops import generalized_box_iou_loss

from cubercnn.util.math_util import so3_relative_angle_batched

logger = logging.getLogger(__name__)

E_CONSTANT = 2.71828183
SQRT_2_CONSTANT = 1.41421356

# default set of weak-supervision loss terms (kept immutable at module level)
_DEFAULT_LOSS_FUNCTIONS = ('dims', 'pose_alignment', 'pose_ground', 'iou', 'segmentation', 'z', 'z_pseudo_gt_patch')

def build_roi_heads(cfg, input_shape=None, priors=None):
    """
    Build ROIHeads defined by `cfg.MODEL.ROI_HEADS.NAME`.
    """
    name = cfg.MODEL.ROI_HEADS.NAME
    return ROI_HEADS_REGISTRY.get(name)(cfg, input_shape, priors=priors)

@ROI_HEADS_REGISTRY.register()
class ROIHeads3DScore(StandardROIHeads):
    '''3D head for the weak cube rcnn model'''

    @configurable
    def __init__(
        self,
        *,
        ignore_thresh: float,
        cube_head: nn.Module,
        cube_pooler: nn.Module,
        loss_w_3d: float,
        loss_w_iou: float,
        loss_w_seg: float,
        loss_w_pose: float,
        loss_w_normal_vec: float,
        loss_w_z: float,
        loss_w_dims: float,
        loss_w_depth: float,
        use_confidence: float,
        inverse_z_weight: bool,
        z_type: str,
        pose_type: str,
        cluster_bins: int,
        priors=None,
        dims_priors_enabled=None,
        dims_priors_func=None,
        disentangled_loss=None,
        virtual_depth=None,
        virtual_focal=None,
        test_scale=None,
        allocentric_pose=None,
        chamfer_pose=None,
        scale_roi_boxes=None,
        loss_functions=None,
        segmentor,
        **kwargs,
    ):
        """
        Args (all keyword-only; normally supplied via ``from_config``):
            ignore_thresh: IoU threshold below which proposals are ignored.
            cube_head / cube_pooler: 3D cuboid head and its RoI pooler.
            loss_w_*: loss weights; a value <= 0 disables that term.
            use_confidence: weight of the confidence output (0 disables it).
            inverse_z_weight: whether to weight losses inversely by depth.
            z_type / pose_type / cluster_bins: cube-head output settings.
            priors: optional pre-computed per-category statistics.
            loss_functions: list of enabled loss-term names; defaults to
                ``_DEFAULT_LOSS_FUNCTIONS``. (Was a mutable list default —
                replaced with a None sentinel so calls never share state.)
            segmentor: SAM-style segmentation model, or None.
        """
        super().__init__(**kwargs)

        self.scale_roi_boxes = scale_roi_boxes
        self.segmentor = segmentor

        # rotation settings
        self.allocentric_pose = allocentric_pose
        self.chamfer_pose = chamfer_pose

        # virtual settings
        self.virtual_depth = virtual_depth
        self.virtual_focal = virtual_focal

        # loss weights, <=0 is off
        self.loss_w_3d = loss_w_3d
        self.loss_w_iou = loss_w_iou
        self.loss_w_seg = loss_w_seg
        self.loss_w_pose = loss_w_pose
        self.loss_w_normal_vec = loss_w_normal_vec
        self.loss_w_z = loss_w_z
        self.loss_w_dims = loss_w_dims
        self.loss_w_depth = loss_w_depth

        # loss functions (copy so the shared default is never mutated)
        if loss_functions is None:
            loss_functions = _DEFAULT_LOSS_FUNCTIONS
        self.loss_functions = list(loss_functions)

        # loss modes
        self.disentangled_loss = disentangled_loss
        self.inverse_z_weight = inverse_z_weight

        # misc
        self.test_scale = test_scale
        self.ignore_thresh = ignore_thresh

        # related to network outputs
        self.z_type = z_type
        self.pose_type = pose_type
        self.use_confidence = use_confidence

        # related to priors
        self.cluster_bins = cluster_bins
        self.dims_priors_enabled = dims_priors_enabled
        self.dims_priors_func = dims_priors_func

        self.cube_head = cube_head
        self.cube_pooler = cube_pooler

        # the dimensions could rely on pre-computed priors
        if self.dims_priors_enabled and priors is not None:
            self.priors_dims_per_cat = nn.Parameter(torch.FloatTensor(priors['priors_dims_per_cat']).unsqueeze(0))
        else:
            self.priors_dims_per_cat = nn.Parameter(torch.ones(1, self.num_classes, 2, 3))

        # Optionally, refactor priors and store them in the network params
        if self.cluster_bins > 1 and priors is not None:
            # the depth could have been clustered based on 2D scales
            priors_z_scales = torch.stack([torch.FloatTensor(prior[1]) for prior in priors['priors_bins']])
            self.priors_z_scales = nn.Parameter(priors_z_scales)
        else:
            self.priors_z_scales = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins))

        # the depth can be based on priors
        if self.z_type == 'clusters':

            assert self.cluster_bins > 1, 'To use z_type of priors, there must be more than 1 cluster bin'

            if priors is None:
                self.priors_z_stats = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins, 2).float())
            else:
                # per-(class, bin) depth statistics
                priors_z_stats = torch.cat([torch.FloatTensor(prior[2]).unsqueeze(0) for prior in priors['priors_bins']])
                self.priors_z_stats = nn.Parameter(priors_z_stats)


    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], priors=None):
        """Build the constructor kwargs from the config, adding cube-head parts."""
        ret = super().from_config(cfg, input_shape)

        # pass along priors
        ret["box_predictor"] = FastRCNNOutputs(cfg, ret['box_head'].output_shape)
        ret.update(cls._init_cube_head(cfg, input_shape))
        ret["priors"] = priors

        return ret
'loss_w_3d': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D, + 'loss_w_iou': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_IOU, + 'loss_w_seg': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_SEG, + 'loss_w_pose': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE, + 'loss_w_dims': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS, + 'loss_w_normal_vec': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_NORMAL_VEC, + 'loss_w_z': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z, + 'loss_w_depth': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DEPTH, + 'z_type': cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE, + 'pose_type': cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE, + 'dims_priors_enabled': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED, + 'dims_priors_func': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC, + 'disentangled_loss': cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS, + 'virtual_depth': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH, + 'virtual_focal': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL, + 'test_scale': cfg.INPUT.MIN_SIZE_TEST, + 'chamfer_pose': cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE, + 'allocentric_pose': cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE, + 'cluster_bins': cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS, + 'ignore_thresh': cfg.MODEL.RPN.IGNORE_THRESHOLD, + 'scale_roi_boxes': cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES, + 'loss_functions': cfg.loss_functions, + 'segmentor': segmentor, + } + + + def forward(self, images, images_raw, ground_maps, depth_maps, features, proposals, Ks, im_scales_ratio, targets): + + im_dims = [image.shape[1:] for image in images] + + del images + + if self.training: + proposals = self.label_and_sample_proposals(proposals, targets) + + losses = self._forward_box(features, proposals) + if self.loss_w_3d > 0: + tmp_list = [x.gt_boxes3D.tolist() for x in targets] + idx_list = [] + for i in range(len(tmp_list)): + for j in range(len(tmp_list[i])): + idx_list.append(tmp_list[i][j][0]) + + + first_occurrence_indices = {} + unique_counter = 0 + result_indices = [] + + for entry in idx_list: + if entry not in first_occurrence_indices: + first_occurrence_indices[entry] = unique_counter + unique_counter += 1 + 
def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances]):
    """
    Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`,
    the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument.

    Args:
        features (dict[str, Tensor]): mapping from feature map names to tensor.
            Same as in :meth:`ROIHeads.forward`.
        proposals (list[Instances]): the per-image object proposals with
            their matching ground truth.
            Each has fields "proposal_boxes", and "objectness_logits",
            "gt_classes", "gt_boxes".

    Returns:
        In training, a dict of losses.
        In inference, a list of `Instances`, the predicted instances.
    """
    features = [features[f] for f in self.box_in_features]
    box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
    box_features = self.box_head(box_features)
    predictions = self.box_predictor(box_features)
    # pooled features no longer needed; free memory before the loss
    del box_features

    if self.training:
        losses = self.box_predictor.losses(
            predictions, proposals,
        )
        # attach the (differentiable) per-class refined boxes so the cube
        # head can read `pred_boxes` during training — note: no no_grad here
        pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
            predictions, proposals
        )
        for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
            proposals_per_image.pred_boxes = Boxes(pred_boxes_per_image)

        # proposals is modified in-place below, so losses must be computed first.
        if self.train_on_pred_boxes:
            with torch.no_grad():
                pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
                    predictions, proposals
                )
                for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
                    proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image)
        return losses
    else:
        pred_instances, _ = self.box_predictor.inference(predictions, proposals, )
        return pred_instances
def l1_loss(self, vals, target):
    """Per-element L1 distance.

    ``smooth_l1_loss`` with ``beta=0.0`` degenerates to a pure L1 loss;
    ``reduction='none'`` keeps the per-element values for later weighting.
    """
    return F.smooth_l1_loss(vals, target, reduction='none', beta=0.0)

def chamfer_loss(self, vals, target):
    """Symmetric L1 chamfer distance between two corner sets.

    Args:
        vals, target: (B, 8, 3) cuboid corners.

    Returns:
        (B,) per-instance chamfer distance: for each corner, the L1 distance
        to its nearest corner in the other set, averaged, summed both ways.
    """
    B = vals.shape[0]
    xx = vals.view(B, 8, 1, 3)
    yy = target.view(B, 1, 8, 3)
    # (B, 8, 8) pairwise L1 distances between corners
    l1_dist = (xx - yy).abs().sum(-1)
    l1 = (l1_dist.min(1).values.mean(-1) + l1_dist.min(2).values.mean(-1))
    return l1

# optionally, scale proposals to zoom RoI in (<1.0) our out (>1.0)
def scale_proposals(self, proposal_boxes):
    """Scale each proposal box about its center by ``self.scale_roi_boxes``.

    Returns the input unchanged when ``scale_roi_boxes`` is disabled (<= 0).
    """
    if self.scale_roi_boxes > 0:

        proposal_boxes_scaled = []
        for boxes in proposal_boxes:
            centers = boxes.get_centers()
            widths = boxes.tensor[:, 2] - boxes.tensor[:, 0]
            # BUGFIX: heights must be computed from the y-coordinates
            # (indices 3 and 1); previously this duplicated the width
            # computation ([:, 2] - [:, 0]), producing square-ish boxes.
            heights = boxes.tensor[:, 3] - boxes.tensor[:, 1]
            x1 = centers[:, 0] - 0.5*widths*self.scale_roi_boxes
            x2 = centers[:, 0] + 0.5*widths*self.scale_roi_boxes
            y1 = centers[:, 1] - 0.5*heights*self.scale_roi_boxes
            y2 = centers[:, 1] + 0.5*heights*self.scale_roi_boxes
            boxes_scaled = Boxes(torch.stack([x1, y1, x2, y2], dim=1))
            proposal_boxes_scaled.append(boxes_scaled)
    else:
        proposal_boxes_scaled = proposal_boxes

    return proposal_boxes_scaled
def object_masks(self, images, instances):
    '''list of masks for each object in the image, predicted by the SAM segmentor.

    Args:
        images: (B, 3, H, W) raw image batch tensor.
        instances: per-image Instances carrying ``gt_boxes`` used as box prompts.

    Returns
    ------
    mask_per_image: List of torch.Tensor of shape (N_instance, 1, H, W)
    '''
    org_shape = images.shape[-2:]
    resize_transform = ResizeLongestSide(self.segmentor.image_encoder.img_size)
    batched_input = []
    # *1.0 casts to float for the resize transform
    images = resize_transform.apply_image_torch(images*1.0)  # .permute(2, 0, 1).contiguous()
    for image, instance in zip(images, instances):
        boxes = instance.gt_boxes.tensor
        transformed_boxes = resize_transform.apply_boxes_torch(boxes, org_shape)  # Bx4
        batched_input.append({'image': image, 'boxes': transformed_boxes, 'original_size': org_shape})

    seg_out = self.segmentor(batched_input, multimask_output=False)

    mask_per_image = [i['masks'] for i in seg_out]
    return mask_per_image

def dice_loss(self, y, y_hat):
    '''Soft dice loss between a binary target mask ``y`` and logits ``y_hat``.

    NOTE(original author): "i am extremely unconfident in the correctness of
    this implementation; taken from my implementation in the DLCV course".
    see also: https://gist.github.com/weiliu620/52d140b22685cf9552da4899e2160183
    '''
    # smoothing avoids division by zero on empty masks
    smooth = 1
    y_hat = F.sigmoid(y_hat)

    y_hat = y_hat.view(-1)
    y = y.view(-1)

    intersection = (y_hat * y).sum()
    dice = (2.*intersection + smooth)/(y_hat.sum() + y.sum() + smooth)
    return 1 - dice

def segment_loss(self, gt_mask, bube_corners, at_which_mask_idx, loss='focal'):
    """Compare rendered cuboid-hull masks against SAM masks.

    Args:
        gt_mask: list of (1, H, W) masks from :meth:`object_masks`.
        bube_corners: per-instance projected cuboid corners (8, 2).
        at_which_mask_idx: maps each instance to its mask in ``gt_mask``.
        loss: one of 'bce', 'dice', 'focal'.

    Returns:
        (N,) per-instance loss.
    """
    n = len(bube_corners)
    y_hat = []
    y = []
    for i in range(n):
        gt_mask_i = gt_mask[at_which_mask_idx[i]][0]
        bube_corners_i = bube_corners[i]
        # gt_mask[0] is only used for its spatial shape here (per the
        # original comment) — the hull is rasterised onto that canvas
        bube_mask = convex_hull(gt_mask[0].squeeze(), bube_corners_i)

        gt_mask_i = (gt_mask_i * 1.0).float()
        y.append(gt_mask_i)
        y_hat.append(bube_mask)

    y = torch.stack(y)
    y_hat = torch.stack(y_hat)

    if loss == 'bce':
        # BUGFIX: argument order is (input=logits, target); previously the
        # GT mask was passed as the logits and the prediction as the target.
        score = F.binary_cross_entropy_with_logits(y_hat, y, reduction='none').mean((1, 2))  # mean over h,w
    elif loss == 'dice':
        score = self.dice_loss(y, y_hat)
    elif loss == 'focal':
        # BUGFIX: sigmoid_focal_loss signature is (inputs, targets) — same
        # swap as above.
        score = sigmoid_focal_loss(y_hat, y, reduction='none').mean((1, 2))
    return score
def pose_loss(self, cube_pose: torch.Tensor, num_boxes_per_image: list[int]):
    '''
    Loss based on pose consistency within a single image
    generate all combinations of poses as one row of the combination matrix at the time
    this will give the equivalent to the lower triangle of the matrix

    Args:
        cube_pose: (N, 3, 3) predicted rotations for all boxes in the batch.
        num_boxes_per_image: split sizes mapping rows of ``cube_pose`` to images.

    Returns:
        A scalar (1,) tensor, or None when every image had a single box.
    '''
    loss_pose = torch.zeros(1, device=cube_pose.device)
    fail_count = 0
    for cube_pose_ in cube_pose.split(num_boxes_per_image):
        # normalise with the number of elements in the lower triangle to make the loss more fair between images with different number of boxes
        # we don't really care about the eps
        # we cannot use this when there is only one cube in an image, so skip it
        if len(cube_pose_) == 1:
            fail_count += 1
            continue
        # 1 - |cos(relative angle)|: zero when all poses agree (up to sign)
        loss_pose_t = 1-so3_relative_angle_batched(cube_pose_, eps=10000, cos_angle=True).abs()
        loss_pose += torch.mean(loss_pose_t)
    if fail_count == len(num_boxes_per_image): # ensure that loss is None if all images in batch only had 1 box
        return None
    # NOTE(review): scales by 1/(fail_count+1) rather than by the number of
    # *contributing* images — confirm this normalisation is intended.
    return loss_pose * 1/(fail_count+1)

def normal_vector_from_maps(self, ground_maps, depth_maps, Ks, use_nth=5):
    '''compute a normal vector corresponding to the ground from a point ground generated from a depth map

    Args:
        ground_maps: per-image ground segmentation maps; a (1,1) map means "missing".
        depth_maps: detectron2 ImageList of per-image depth maps.
        Ks: per-box (3, 3) intrinsics (indexed in lockstep with the maps).
        use_nth: subsampling stride for the point cloud.

    Returns:
        (B, 3) stacked ground-plane normals, oriented y-up.
    '''
    # ### point cloud
    dvc = depth_maps.device
    normal_vecs = []
    # i cannot really see any other options than to loop over the them because the images have different sizes
    for ground_map, depth_map, org_image_size, K in zip(ground_maps, depth_maps, depth_maps.image_sizes, Ks):
        # a (1,1) placeholder marks an unavailable ground map
        if ground_map.shape == (1,1): ground_map = None
        z = depth_map[::use_nth,::use_nth]
        # i don't know if it makes sense to use the image shape as the
        # this way it looks much more correct
        # https://github.com/DepthAnything/Depth-Anything-V2/blob/31dc97708961675ce6b3a8d8ffa729170a4aa273/metric_depth/depth_to_pointcloud.py#L100
        width, height = z.shape[1], z.shape[0]
        # NOTE(review): `//` floors the (float) focal lengths — presumably `/`
        # was intended; confirm before changing, results only shift slightly.
        focal_length_x, focal_length_y = K[0,0] // use_nth, K[1,1] // use_nth

        u, v = torch.meshgrid(torch.arange(width, device=dvc), torch.arange(height,device=dvc), indexing='xy')
        cx, cy = width / 2, height / 2 # principal point of camera
        # https://www.open3d.org/docs/0.7.0/python_api/open3d.geometry.create_point_cloud_from_depth_image.html
        # back-project each pixel to a 3D point
        x = (u - cx) * z / focal_length_x
        y = (v - cy) * z / focal_length_y
        if ground_map is not None:
            # select only the points in x,y,z that are part of the ground map
            ground = ground_map[::use_nth,::use_nth]
            zg = z[ground > 0]
            xg = x[ground > 0]
            yg = y[ground > 0]
        else:
            # the ground map also works to remove the padded 0's to the depth maps
            # so in the case the ground map is not available we must ensure to only select the valid part of the image
            mask = torch.ones(org_image_size, device=dvc)
            image_without_pad = mask[::use_nth,::use_nth]
            zg = z[image_without_pad > 0]
            xg = x[image_without_pad > 0]
            yg = y[image_without_pad > 0]

        # normalise the points
        points = torch.stack((xg, yg, zg), axis=-1)

        plane = Plane_torch()
        # best_eq is the ground plane as a,b,c,d in the equation ax + by + cz + d = 0
        # if this errors out, run the filter ground script first
        best_eq, best_inliers = plane.fit_parallel(points, thresh=0.05, maxIteration=1000)
        normal_vec = best_eq[:-1]

        x_up = torch.tensor([1.0, 0.0, 0.0], device=dvc)
        y_up = torch.tensor([0.0, 1.0, 0.0], device=dvc)
        z_up = torch.tensor([0.0, 0.0, 1.0], device=dvc)
        # make sure normal vector is consistent with y-up
        if (normal_vec @ z_up).abs() > (normal_vec @ y_up).abs():
            # this means the plane has been found as the back wall
            # to rectify this we can turn the vector 90 degrees around the local x-axis
            # note that this assumes that the walls are perpendicular to the floor
            normal_vec = normal_vec[torch.tensor([0,2,1], device=dvc)] * torch.tensor([1, 1, -1], device=dvc)
        if (normal_vec @ x_up).abs() > (normal_vec @ y_up).abs():
            # this means the plane has been found as the side wall
            # to rectify this we can turn the vector 90 degrees around the local y-axis
            # note that this assumes that the walls are perpendicular to the floor
            normal_vec = normal_vec[torch.tensor([2,0,1], device=dvc)] * torch.tensor([-1, 1, 1], device=dvc)
        # flip so the normal always points "up" (positive y component)
        if normal_vec @ y_up < 0:
            normal_vec *= -1
        normal_vecs.append(normal_vec)

    return torch.stack(normal_vecs)
def z_loss(self, gt_boxes: Boxes, cubes: Cubes, Ks, im_sizes, proj_boxes: Boxes):
    """Depth self-supervision: find the z-shift that best matches the GT 2D area.

    For each cube whose projected centre is near its GT box, the cube is
    slid along z in 0.1 m steps; the loss is the L1 distance between the
    predicted z and the z whose projected area best matches the GT area.
    """
    max_count = 50  # 50 steps of 0.1 meters
    num_preds = cubes.num_instances

    # Find losses
    scores = torch.zeros((num_preds), device=cubes.device)

    gt_area = gt_boxes.area()

    pred_center = proj_boxes.get_centers()
    pred_area = proj_boxes.area()
    gt_boxes_t = gt_boxes.tensor

    # BUGFIX: the original used a chained comparison of the form
    # (a <= b) <= c, which compares a *boolean* tensor against the box
    # coordinates and is effectively always true. Both bounds must be
    # tested explicitly and combined with `&`.
    is_within_gt_box = (gt_boxes_t[:, 0] - max_count <= pred_center[:, 0]) & \
                       (pred_center[:, 0] <= gt_boxes_t[:, 2] + max_count) & \
                       (gt_boxes_t[:, 1] - max_count <= pred_center[:, 1]) & \
                       (pred_center[:, 1] <= gt_boxes_t[:, 3] + max_count)
    values_tensor = torch.linspace(0.0, (max_count-1)/10, max_count, device=cubes.device)
    is_gt_smaller = gt_area < pred_area

    for i in range(num_preds):
        # Check if pred center is within gt box
        if is_within_gt_box[i]:
            cube_tensor = cubes[i].tensor
            # candidate cubes: the same cube repeated, shifted along z
            mod_cube_tensor = cube_tensor[0, 0].clone().unsqueeze(0).repeat((max_count, 1))

            # Check if too small or too big.
            if is_gt_smaller[i]:  # NOTE has disadvantage when box has different shape, CAN FAIL TODO Change to checking each corner instead
                mod_cube_tensor[:, 2] += values_tensor
            else:
                mod_cube_tensor[:, 2] -= values_tensor
            mod_cube = Cubes(mod_cube_tensor)
            mod_box = Boxes(cubes_to_box(mod_cube, Ks[i], im_sizes[i])[0].tensor)

            pred_areas = mod_box.area()
            # degenerate (zero-area) projections must never win the argmin
            mask_zero_area = (pred_areas == 0) * 10000000
            pred_areas = pred_areas + mask_zero_area
            idx = torch.argmin(self.l1_loss(gt_area[i].repeat(max_count), pred_areas))

            scores[i] = self.l1_loss(cubes[i].tensor[0, 0, 2], mod_cube_tensor[idx, 2])

        else:
            # If center is outside return something high?
            # NOTE(review): a constant has no gradient path to the network
            # despite requires_grad=True — confirm this is intended.
            scores[i] = torch.tensor(0.1 * max_count, requires_grad=True)

    return scores/2
def pseudo_gt_z_box_loss(self, depth_maps, proposal_boxes: tuple[torch.Tensor], pred_z):
    '''Compute the pseudo ground truth z loss based on the depth map
    for now, use the median value depth constrained of the proposal box as the ground truth depth

    Args:
        depth_maps: detectron2 Imagelist
        proposal_boxes: predicted 2d box. list[torch.Tensor of shape (N, 4)]
        pred_z: predicted z. torch.Tensor of shape (N, 1)
    Returns:
        z_loss: torch.Tensor of shape (N, 1)

    BUGFIX: the original appended the in-image boxes first and the
    out-of-image fallbacks afterwards, so whenever a box fell outside the
    image the gt_z ordering no longer matched pred_z. Boxes are now
    processed strictly in their original order.
    '''
    gt_z = []
    for depth_map, boxes_t in zip(depth_maps, proposal_boxes):
        h, w = depth_map.shape
        for box in boxes_t:
            # clamp the box to the image; a positive clipped area means the
            # box overlaps the image and the median-depth patch is usable
            x1 = box[0].clamp(0, w)
            y1 = box[1].clamp(0, h)
            x2 = box[2].clamp(0, w)
            y2 = box[3].clamp(0, h)
            if (x2 - x1) > 0 and (y2 - y1) > 0:
                # median depth of the (clipped) box patch
                gt_z.append(torch.median(depth_map[y1.long():y2.long(), x1.long():x2.long()]).unsqueeze(0))
            else:
                # box fully outside the image: fall back to the depth at the
                # (clamped) box centre, as in pseudo_gt_z_point_loss
                cx = ((box[0] + box[2]) * 0.5).clamp(10, w - 11)
                cy = ((box[1] + box[3]) * 0.5).clamp(10, h - 11)
                gt_z.append(depth_map[cy.long(), cx.long()].unsqueeze(0))

    gt_z_o = torch.cat(gt_z)
    l1loss = self.l1_loss(pred_z, gt_z_o)
    return l1loss
def dim_loss(self, priors: tuple[torch.Tensor], dimensions):
    '''
    Soft z-score penalty of predicted dimensions under per-category priors.

    Args:
        priors: (prior_mean, prior_std), each of shape (N, 3).
        dimensions: predicted (w, h, l), shape (N, 3).

    Returns:
        Three (N,) tensors (w, h, l scores), or (None, None, None) when any
        prior std contains NaN.

    NOTE(review): the original contained unreachable mask-indexing code
    after the early return (it only ran when the mask was all-True, where it
    was a no-op) — that dead code has been removed. It suggests per-row
    dropping may once have been intended instead of skipping the whole batch.
    '''
    [prior_mean, prior_std] = priors

    # a NaN std means the category has no usable prior -> skip the batch
    mask = ~torch.isnan(prior_std).any(dim=1)
    if not mask.all():
        return None, None, None

    # z-score ie how many std's we are from the mean
    dimensions_scores = (dimensions - prior_mean).abs()/prior_std

    # free one-std band: only deviations beyond 1 sigma are penalised
    dimensions_scores = torch.max(dimensions_scores - 1.0, torch.zeros_like(dimensions_scores, device=dimensions_scores.device))

    return dimensions_scores[:, 0], dimensions_scores[:, 1], dimensions_scores[:, 2]

def pseudo_gt_z_point_loss(self, depth_maps, pred_xy, pred_z, num_boxes_per_image):
    '''Compute the pseudo ground truth z loss based on the depth map
    for now, use the point in depth map corresponding to the center point of the pred box as the pseudo ground truth

    Args:
        depth_maps: detectron2 Imagelist
        pred_xy: predicted centre. torch.Tensor of shape (N, 2)
        pred_z: predicted z. torch.Tensor of shape (N, 1)
    Returns:
        z_loss: torch.Tensor of shape (N, 1)'''
    gt_z = []
    for depth_map, xy in zip(depth_maps, pred_xy.split(num_boxes_per_image)):
        h, w = depth_map.shape
        y, x = xy[:, 1], xy[:, 0]

        # clamp points outside the image (10 px margin from the border)
        x = torch.clamp(x, 10, w-11)
        y = torch.clamp(y, 10, h-11)
        gt_z.append(depth_map[y.long(), x.long()])

    gt_z_o = torch.cat(gt_z)
    l1loss = self.l1_loss(pred_z, gt_z_o)
    return l1loss
def depth_range_loss(self, gt_mask, at_which_mask_idx, depth_maps, cubes, gt_boxes, num_instances):
    """
    Apply seg_mask on depth image, take difference in min and max values as GT value. Take length as prediction value. Then l1-loss.

    Uses the 10th/90th depth quantiles inside the SAM mask (falling back to
    the GT 2D box when the mask is empty) as a pseudo depth extent, and the
    per-cube corner depth extent as the prediction.
    """
    gt_boxes_t = gt_boxes.tensor
    counter = 0
    gt_depths = []
    # depth extent of each predicted cube: max - min over its 8 corners
    corner_depths = cubes.get_all_corners()[:, 0, :, 2]
    # max function gives both vals and idx, so we take only the vals
    pred_depth = torch.max(corner_depths, dim=1)[0] - torch.min(corner_depths, dim=1)[0]

    for depth_map, cube in zip(depth_maps, cubes.split(num_instances, dim=0)):
        for j in range(cube.num_instances):
            segmentation_mask = gt_mask[at_which_mask_idx[counter]][0]
            # BUGFIX: do not overwrite `depth_map` — the original reassigned
            # it, so every subsequent instance in the image interpolated the
            # already-resampled map instead of the original depth map.
            depth_map_r = F.interpolate(depth_map.unsqueeze(0).unsqueeze(0), size=segmentation_mask.shape, mode='bilinear', align_corners=True).squeeze()
            depth_range = depth_map_r[segmentation_mask]
            # if segmentation fails, fall back to the bbox
            if depth_range.numel() == 0:
                depth_range = depth_map_r[gt_boxes_t[counter, 1].long():gt_boxes_t[counter, 3].long(), gt_boxes_t[counter, 0].long():gt_boxes_t[counter, 2].long()]
            # robust extent: inter-quantile range instead of raw max - min
            gt_depth = torch.quantile(depth_range, 0.9) - torch.quantile(depth_range, 0.1)  # torch.max(depth_range) - torch.min(depth_range)
            gt_depths.append(gt_depth)
            counter += 1

    gt_depths = torch.stack(gt_depths)
    scores = self.l1_loss(gt_depths, pred_depth)

    return scores
def normal_to_rotation(self, normal):
    '''Build batched rotation matrices whose third row is the given normal.

    https://gamedev.stackexchange.com/questions/22204/from-normal-to-rotation-matrix

    Args:
        normal: (B, 3) unit-ish normal vectors.

    Returns:
        (B, 3, 3) rotation matrices with rows [t0, t1, normal].
    '''
    B = normal.shape[0]
    x1 = torch.tensor([1.0, 0, 0], device=normal.device).repeat(B, 1)
    t0 = torch.cross(normal, x1, dim=1)
    # BUGFIX: the original tested `bmm(...).flatten().any() < 0.001`, which
    # compares a *boolean* to a float and triggered only when every row was
    # exactly zero. Fall back to the y-axis when any normal is (near-)
    # parallel to x. NOTE: like the original, the fallback applies to the
    # whole batch, not per row.
    sq_norms = torch.bmm(t0.view(B, 1, 3), t0.view(B, 3, 1)).flatten()
    if (sq_norms < 0.001).any():
        y1 = torch.tensor([0, 1.0, 0], device=normal.device).repeat(B, 1)
        t0 = torch.cross(normal, y1, dim=1)
    # BUGFIX: normalise each row independently (dim=1, keepdim); the
    # original divided by the *global* norm, which yields non-unit rows
    # (and hence non-orthonormal matrices) for batch sizes > 1.
    t0 = t0 / torch.norm(t0, dim=1, keepdim=True)
    t1t = torch.cross(normal, t0, dim=1)
    t1 = t1t / torch.norm(t1t, dim=1, keepdim=True)
    return torch.cat([t0, t1, normal], dim=1).reshape((B, 3, 3))  # .permute((0,2,1))
+ self.normalize_factor = max(sum([i.gt_classes.numel() for i in instances]), 1.0) + + # The loss is only defined on positive proposals + proposals, _ = select_foreground_proposals(instances, self.num_classes) + proposal_boxes = [x.proposal_boxes for x in proposals] + pred_boxes = [x.pred_boxes for x in proposals] + box_classes = (torch.cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)) + gt_boxes3D = torch.cat([p.gt_boxes3D for p in proposals], dim=0,) + gt_poses = torch.cat([p.gt_poses for p in proposals], dim=0,) + + assert len(gt_poses) == len(gt_boxes3D) == len(box_classes) + + at_which_mask_idx = [] + for entry in gt_boxes3D: + entry = entry[0].item() + at_which_mask_idx.append(first_occurrence_indices[entry]) + + # eval on all instances + else: + proposals = instances + pred_boxes = [x.pred_boxes for x in instances] + proposal_boxes = pred_boxes + box_classes = torch.cat([x.pred_classes for x in instances]) + + proposal_boxes_scaled = self.scale_proposals(proposal_boxes) + + # forward features + cube_features = self.cube_pooler(features, proposal_boxes_scaled).flatten(1) + + n = cube_features.shape[0] + + # nothing to do.. + if n == 0: + return instances if not self.training else (instances, {}) + + num_boxes_per_image = [len(i) for i in proposals] + + # scale the intrinsics according to the ratio the image has been scaled. + # this means the projections at the current scale are in sync. 
+ Ks_scaled_per_box = torch.cat([ + (Ks[i]/im_scales_ratio[i]).unsqueeze(0).repeat([num, 1, 1]) + for (i, num) in enumerate(num_boxes_per_image) + ]).to(cube_features.device) + Ks_scaled_per_box[:, -1, -1] = 1 + + focal_lengths_per_box = torch.cat([ + (Ks[i][1, 1]).unsqueeze(0).repeat([num]) + for (i, num) in enumerate(num_boxes_per_image) + ]).to(cube_features.device) + + im_ratios_per_box = torch.cat([ + torch.FloatTensor([im_scales_ratio[i]]).repeat(num) + for (i, num) in enumerate(num_boxes_per_image) + ]).to(cube_features.device) + + # scaling factor for Network resolution -> Original + im_scales_per_box = torch.cat([ + torch.FloatTensor([im_current_dims[i][0]]).repeat(num) + for (i, num) in enumerate(num_boxes_per_image) + ]).to(cube_features.device) + + im_scales_original_per_box = im_scales_per_box * im_ratios_per_box + + if self.virtual_depth: + + virtual_to_real = util.compute_virtual_scale_from_focal_spaces( + focal_lengths_per_box, im_scales_original_per_box, + self.virtual_focal, im_scales_per_box + ) + real_to_virtual = 1 / virtual_to_real + + else: + real_to_virtual = virtual_to_real = 1.0 + + # 2D boxes are needed to apply deltas + src_boxes = torch.cat([box_per_im.tensor for box_per_im in proposal_boxes], dim=0) + src_widths = src_boxes[:, 2] - src_boxes[:, 0] + src_heights = src_boxes[:, 3] - src_boxes[:, 1] + src_scales = (src_heights**2 + src_widths**2).sqrt() + src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths + src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights + + # For some methods, we need the predicted 2D box, + # e.g., the differentiable tensors from the 2D box head. 
+ pred_src_boxes = torch.cat([box_per_im.tensor for box_per_im in pred_boxes], dim=0) + pred_widths = pred_src_boxes[:, 2] - pred_src_boxes[:, 0] + pred_heights = pred_src_boxes[:, 3] - pred_src_boxes[:, 1] + pred_src_x = (pred_src_boxes[:, 2] + pred_src_boxes[:, 0]) * 0.5 + pred_src_y = (pred_src_boxes[:, 3] + pred_src_boxes[:, 1]) * 0.5 + + im_sizes = [] + im_idx = [] + for i,j in enumerate(num_boxes_per_image): + for _ in range(j): + im_sizes.append(list(im_current_dims[i])) + im_idx.append(i) + + # forward predictions + cube_2d_deltas, cube_z, cube_dims, cube_pose, cube_uncert = self.cube_head(cube_features) + + # simple indexing re-used commonly for selection purposes + fg_inds = torch.arange(n) + + # Z when clusters are used + if cube_z is not None and self.cluster_bins > 1: + + # compute closest bin assignments per batch per category (batch x n_category) + scales_diff = (self.priors_z_scales.detach().T.unsqueeze(0) - src_scales.unsqueeze(1).unsqueeze(2)).abs() + + # assign the correct scale prediction. + # (the others are not used / thrown away) + assignments = scales_diff.argmin(1) + + # select FG, category, and correct cluster + cube_z = cube_z[fg_inds, :, box_classes, :][fg_inds, assignments[fg_inds, box_classes]] + + elif cube_z is not None: + + # if z is available, collect the per-category predictions. + cube_z = cube_z[fg_inds, box_classes, :] + + cube_dims = cube_dims[fg_inds, box_classes, :] + cube_pose = cube_pose[fg_inds, box_classes, :, :] + + if self.use_confidence: + + # if uncertainty is available, collect the per-category predictions. + cube_uncert = cube_uncert[fg_inds, box_classes] + + cube_2d_deltas = cube_2d_deltas[fg_inds, box_classes, :] + + # apply our predicted deltas based on src boxes. 
+ cube_x = src_ctr_x + src_widths * cube_2d_deltas[:, 0] + cube_y = src_ctr_y + src_heights * cube_2d_deltas[:, 1] + + cube_xy = torch.cat((cube_x.unsqueeze(1), cube_y.unsqueeze(1)), dim=1) + + cube_dims_norm = cube_dims + + if self.dims_priors_enabled: + # gather prior dimensions + prior_dims = self.priors_dims_per_cat.detach().repeat([n, 1, 1, 1])[fg_inds, box_classes] + prior_dims_mean = prior_dims[:, 0, :] + prior_dims_std = prior_dims[:, 1, :] + + if self.dims_priors_func == 'sigmoid': + prior_dims_min = (prior_dims_mean - 3*prior_dims_std).clip(0.0) + prior_dims_max = (prior_dims_mean + 3*prior_dims_std) + cube_dims = util.scaled_sigmoid(cube_dims_norm, min=prior_dims_min, max=prior_dims_max) + elif self.dims_priors_func == 'exp': + cube_dims = torch.exp(cube_dims_norm.clip(max=5)) * prior_dims_mean + + else: + # no priors are used + cube_dims = torch.exp(cube_dims_norm.clip(max=5)) + + if self.allocentric_pose: + # To compare with GTs, we need the pose to be egocentric, not allocentric + cube_pose_allocentric = cube_pose + cube_pose = util.R_from_allocentric(Ks_scaled_per_box, cube_pose, u=cube_x.detach(), v=cube_y.detach()) + + cube_z = cube_z.squeeze() + + if self.z_type =='sigmoid': + cube_z_norm = torch.sigmoid(cube_z) + cube_z = cube_z_norm * 100 + + elif self.z_type == 'log': + cube_z_norm = cube_z + cube_z = torch.exp(cube_z) + + elif self.z_type == 'clusters': + # gather the mean depth, same operation as above, for a n x c result + z_means = self.priors_z_stats[:, :, 0].T.unsqueeze(0).repeat([n, 1, 1]) + z_means = torch.gather(z_means, 1, assignments.unsqueeze(1)).squeeze(1) + + # gather the std depth, same operation as above, for a n x c result + z_stds = self.priors_z_stats[:, :, 1].T.unsqueeze(0).repeat([n, 1, 1]) + z_stds = torch.gather(z_stds, 1, assignments.unsqueeze(1)).squeeze(1) + + # do not learn these, they are static + z_means = z_means.detach() + z_stds = z_stds.detach() + + z_means = z_means[fg_inds, box_classes] + z_stds = 
z_stds[fg_inds, box_classes] + + z_mins = (z_means - 3*z_stds).clip(0) + z_maxs = (z_means + 3*z_stds) + + cube_z_norm = cube_z + cube_z = util.scaled_sigmoid(cube_z, min=z_mins, max=z_maxs) + + if self.virtual_depth: + cube_z = (cube_z * virtual_to_real) + + if self.training: + prefix = 'Cube/' + storage = get_event_storage() + + # Pull off necessary GT information + gt_2d = gt_boxes3D[:, :2] + gt_z = gt_boxes3D[:, 2] + gt_dims = gt_boxes3D[:, 3:6] + + # this box may have been mirrored and scaled so + # we need to recompute XYZ in 3D by backprojecting. + gt_x3d = gt_z * (gt_2d[:, 0] - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] + gt_y3d = gt_z * (gt_2d[:, 1] - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] + gt_3d = torch.stack((gt_x3d, gt_y3d, gt_z)).T + + # put together the GT boxes + gt_cubes = Cubes(torch.cat((gt_3d, gt_dims, gt_poses.view(*gt_poses.shape[:-2], -1)), dim=1).unsqueeze(1)) + + # Get center in meters and create cubes + #cube_z = gt_boxes3D[:,2] + cube_x3d = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] + cube_y3d = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] + + cubes_tensor = torch.cat((cube_x3d.unsqueeze(1),cube_y3d.unsqueeze(1),cube_z.unsqueeze(1),cube_dims,cube_pose.reshape(n,9)),axis=1).unsqueeze(1) + cubes = Cubes(cubes_tensor) + + + # 3d iou + IoU3Ds = None + storage = get_event_storage() + # log 3d iou less frequently because it is slow + if storage.iter % 200 == 0: + gt_corners = gt_cubes.get_all_corners().squeeze(1) + proposal_corners = cubes.get_all_corners().squeeze(1) + try: + vol, iou = box3d_overlap(gt_corners.cpu(),proposal_corners.cpu()) + IoU3Ds = torch.diag(iou) + except ValueError: + IoU3Ds = torch.zeros(n, device=cubes.device) + + # Get bube corners + bube_corners = torch.zeros((n,8,2)) + for i in range(n): + bube_corner = cubes[i].get_bube_corners(Ks_scaled_per_box[i], im_sizes[i]) + x = torch.clamp(bube_corner[..., 0], 0, int(im_sizes[i][0]-1)) # 
clamp for segment loss, else CUDA error bc of accesing elements otside mask range + y = torch.clamp(bube_corner[..., 1], 0, int(im_sizes[i][1]-1)) + bube_corner = torch.stack((x, y), dim=-1) + bube_corners[i] = bube_corner + + # Project to 2D + proj_boxes = [] + for i in range(cubes.num_instances): + proj_boxes.append(cubes_to_box(cubes[i], Ks_scaled_per_box[i], im_sizes[i])[0].tensor[0]) + proj_boxes = Boxes(torch.stack(proj_boxes)) + + ### Loss + loss_iou = None + loss_pose = None + loss_seg = None + loss_z = None + loss_dims_w = None + loss_pseudo_gt_z = None + loss_ground_rot = None + loss_depth = None + + # 2D IoU + gt_boxes = [x.gt_boxes for x in proposals] + gt_boxes = Boxes(torch.cat([gt_boxes[i].tensor for i in range(len(gt_boxes))])) + + # 2D IoU + if 'iou' in self.loss_functions: + loss_iou = generalized_box_iou_loss(gt_boxes.tensor, proj_boxes.tensor, reduction='none').view(n, -1).mean(dim=1) + + # Pose + if 'pose_alignment' in self.loss_functions: + loss_pose = self.pose_loss(cube_pose, num_boxes_per_image) + if loss_pose is not None: + loss_pose = loss_pose.repeat(n) + + # normal vector to ground loss + if 'pose_ground' in self.loss_functions: + valid_ground_maps_conf = torch.tensor([0.1 if shape == (1,1) else 1.0 for shape in ground_maps.image_sizes],device=cube_pose.device) + num_boxes_per_image_tensor = torch.tensor(num_boxes_per_image,device=Ks_scaled_per_box.device) + normal_vectors = self.normal_vector_from_maps(ground_maps, depth_maps, Ks_scaled_per_box) + normal_vectors = normal_vectors.repeat_interleave(num_boxes_per_image_tensor, 0) + valid_ground_maps_conf = valid_ground_maps_conf.repeat_interleave(num_boxes_per_image_tensor, 0) + pred_normal = cube_pose[:, 1, :] + loss_ground_rot = 1-F.cosine_similarity(normal_vectors, pred_normal, dim=1).abs() + loss_ground_rot = loss_ground_rot * valid_ground_maps_conf + + if 'pose_ground2' in self.loss_functions: + valid_ground_maps_conf = torch.tensor([0.1 if shape == (1,1) else 1.0 for shape in 
ground_maps.image_sizes],device=cube_pose.device) + num_boxes_per_image_tensor = torch.tensor(num_boxes_per_image,device=Ks_scaled_per_box.device) + normal_vectors = self.normal_vector_from_maps(ground_maps, depth_maps, Ks_scaled_per_box) + normal_vectors = normal_vectors.repeat_interleave(num_boxes_per_image_tensor, 0) + valid_ground_maps_conf = valid_ground_maps_conf.repeat_interleave(num_boxes_per_image_tensor, 0) + ps_gt_rotation_matrix = self.normal_to_rotation(normal_vectors) + # might need to transpose the rotation matrices + pred_rotation_matrix = cube_pose + loss_ground_rot = 1 - so3_relative_angle(pred_rotation_matrix, ps_gt_rotation_matrix, cos_angle=True)#.abs() + loss_ground_rot = loss_ground_rot * valid_ground_maps_conf + + # pseudo ground truth z loss + if 'z_pseudo_gt_patch' in self.loss_functions: + loss_pseudo_gt_z = self.pseudo_gt_z_box_loss(depth_maps, proj_boxes.tensor.split(num_boxes_per_image), cube_z) + elif 'z_pseudo_gt_center' in self.loss_functions: + loss_pseudo_gt_z = self.pseudo_gt_z_point_loss(depth_maps, cube_xy, cube_z, num_boxes_per_image) + + # segment + if 'segmentation' in self.loss_functions: + loss_seg = self.segment_loss(masks_all_images, bube_corners, at_which_mask_idx) + + # Z + if 'z' in self.loss_functions: + loss_z = self.z_loss(gt_boxes, cubes, Ks_scaled_per_box, im_sizes, proj_boxes) + + # Dimensions + if 'dims' in self.loss_functions: + loss_dims_w, loss_dims_h, loss_dims_l = self.dim_loss((prior_dims_mean, prior_dims_std), cubes.dimensions.squeeze(1)) + + # Depth Range + if 'depth' in self.loss_functions: + loss_depth = self.depth_range_loss(masks_all_images, at_which_mask_idx, depth_maps, cubes, gt_boxes, num_boxes_per_image) + + total_3D_loss_for_reporting = 0 + if loss_iou is not None: + total_3D_loss_for_reporting += loss_iou*self.loss_w_iou + if loss_seg is not None: + total_3D_loss_for_reporting += loss_seg*self.loss_w_seg + if loss_pose is not None: + # this loss is a bit weird when adding, because it is a 
single number, which is broadcasted. instead of a number per instance + total_3D_loss_for_reporting += loss_pose*self.loss_w_pose + if loss_ground_rot is not None: + total_3D_loss_for_reporting += loss_ground_rot * self.loss_w_normal_vec * valid_ground_maps_conf + if loss_z is not None: + total_3D_loss_for_reporting += loss_z*self.loss_w_z + if loss_pseudo_gt_z is not None: + total_3D_loss_for_reporting += loss_pseudo_gt_z*self.loss_w_z + if loss_dims_w is not None: + total_3D_loss_for_reporting += loss_dims_w*self.loss_w_dims + total_3D_loss_for_reporting += loss_dims_h*self.loss_w_dims + total_3D_loss_for_reporting += loss_dims_l*self.loss_w_dims + if loss_depth is not None: + total_3D_loss_for_reporting += loss_depth*self.loss_w_depth + + # reporting does not need gradients + if not isinstance(total_3D_loss_for_reporting, int): + total_3D_loss_for_reporting = total_3D_loss_for_reporting.detach() + + # compute errors for tracking purposes + xy_error = (cube_xy - gt_2d).detach().abs() + z_error = (cube_z - gt_z).detach().abs() + dims_error = (cube_dims - gt_dims).detach().abs() + + storage.put_scalar(prefix + 'z_error', z_error.mean().item(), smoothing_hint=False) + storage.put_scalar(prefix + 'dims_error', dims_error.mean().item(), smoothing_hint=False) + storage.put_scalar(prefix + 'xy_error', xy_error.mean().item(), smoothing_hint=False) + storage.put_scalar(prefix + 'z_close', (z_error<0.20).float().mean().item(), smoothing_hint=False) + + IoU2D = iou_2d(gt_boxes, proj_boxes).detach() + IoU2D = torch.diag(IoU2D.view(n, n)) + + if IoU3Ds is not None: + storage.put_scalar(prefix + '3D IoU', IoU3Ds.detach().mean().item(), smoothing_hint=False) + storage.put_scalar(prefix + '2D IoU', IoU2D.mean().item(), smoothing_hint=False) + if not isinstance(total_3D_loss_for_reporting, int): + storage.put_scalar(prefix + 'total_3D_loss', self.loss_w_3d * self.safely_reduce_losses(total_3D_loss_for_reporting), smoothing_hint=False) + + if self.use_confidence > 0: + + uncert_sf 
= SQRT_2_CONSTANT * torch.exp(-cube_uncert) + if loss_iou is not None: + loss_iou *= uncert_sf + + if loss_seg is not None: + loss_seg *= uncert_sf + + if loss_pose is not None: + loss_pose *= uncert_sf + + if loss_ground_rot is not None: + loss_ground_rot *= uncert_sf + + if loss_z is not None: + loss_z *= uncert_sf + + if loss_pseudo_gt_z is not None: + loss_pseudo_gt_z *= uncert_sf + + if loss_dims_w is not None: + loss_dims_w *= uncert_sf + loss_dims_h *= uncert_sf + loss_dims_l *= uncert_sf + + if loss_depth is not None: + loss_depth *= uncert_sf + + losses.update({prefix + 'uncert': self.use_confidence*self.safely_reduce_losses(cube_uncert.clone())}) + storage.put_scalar(prefix + 'conf', torch.exp(-cube_uncert).mean().item(), smoothing_hint=False) + + if loss_iou is not None: + losses.update({ + prefix + 'loss_iou': self.safely_reduce_losses(loss_iou) * self.loss_w_iou * self.loss_w_3d, + }) + if loss_pose is not None: + losses.update({ + prefix + 'loss_pose': self.safely_reduce_losses(loss_pose) * self.loss_w_pose * self.loss_w_3d, + }) + if loss_ground_rot is not None: + losses.update({ + prefix + 'loss_normal_vec': self.safely_reduce_losses(loss_ground_rot) * self.loss_w_normal_vec * self.loss_w_3d, + }) + if loss_seg is not None: + losses.update({ + prefix + 'loss_seg': self.safely_reduce_losses(loss_seg) * self.loss_w_seg * self.loss_w_3d, + }) + if loss_z is not None: + losses.update({ + prefix + 'loss_z': self.safely_reduce_losses(loss_z) * self.loss_w_z * self.loss_w_3d, + }) + if loss_pseudo_gt_z is not None: + losses.update({ + prefix + 'loss_pseudo_gt_z': self.safely_reduce_losses(loss_pseudo_gt_z) * self.loss_w_z * self.loss_w_3d, + }) + if loss_dims_w is not None: + losses.update({ + prefix + 'loss_dims_w': self.safely_reduce_losses(loss_dims_w) * self.loss_w_dims * self.loss_w_3d, + }) + losses.update({ + prefix + 'loss_dims_h': self.safely_reduce_losses(loss_dims_h) * self.loss_w_dims * self.loss_w_3d, + }) + losses.update({ + prefix + 
'loss_dims_l': self.safely_reduce_losses(loss_dims_l) * self.loss_w_dims * self.loss_w_3d, + }) + if loss_depth is not None: + losses.update({ + prefix + 'loss_depth': self.safely_reduce_losses(loss_depth) * self.loss_w_depth * self.loss_w_3d, + }) + + ''' + Inference + ''' + if len(cube_z.shape) == 0: + cube_z = cube_z.unsqueeze(0) + + # inference + cube_x3d = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] + cube_y3d = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] + cube_3D = torch.cat((torch.stack((cube_x3d, cube_y3d, cube_z)).T, cube_dims, cube_xy*im_ratios_per_box.unsqueeze(1)), dim=1) + + if self.use_confidence: + cube_conf = torch.exp(-cube_uncert) + cube_3D = torch.cat((cube_3D, cube_conf.unsqueeze(1)), dim=1) + + # convert the predictions to intances per image + cube_3D = cube_3D.split(num_boxes_per_image) + cube_pose = cube_pose.split(num_boxes_per_image) + box_classes = box_classes.split(num_boxes_per_image) + + pred_instances = None + + pred_instances = instances if not self.training else \ + [Instances(image_size) for image_size in im_current_dims] + + for cube_3D_i, cube_pose_i, instances_i, K, im_dim, im_scale_ratio, box_classes_i, pred_boxes_i in \ + zip(cube_3D, cube_pose, pred_instances, Ks, im_current_dims, im_scales_ratio, box_classes, pred_boxes): + + # merge scores if they already exist + if hasattr(instances_i, 'scores'): + instances_i.scores = (instances_i.scores * cube_3D_i[:, -1])**(1/2) + + # assign scores if none are present + else: + instances_i.scores = cube_3D_i[:, -1] + + # assign box classes if none exist + if not hasattr(instances_i, 'pred_classes'): + instances_i.pred_classes = box_classes_i + + # assign predicted boxes if none exist + if not hasattr(instances_i, 'pred_boxes'): + instances_i.pred_boxes = pred_boxes_i + + instances_i.pred_bbox3D = util.get_cuboid_verts_faces(cube_3D_i[:, :6], cube_pose_i)[0] + instances_i.pred_center_cam = cube_3D_i[:, :3] + 
instances_i.pred_center_2D = cube_3D_i[:, 6:8] + instances_i.pred_dimensions = cube_3D_i[:, 3:6] + instances_i.pred_pose = cube_pose_i + + if self.training: + return pred_instances, losses + else: + return pred_instances + + def _sample_proposals( + self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor, matched_ious=None + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Based on the matching between N proposals and M groundtruth, + sample the proposals and set their classification labels. + Args: + matched_idxs (Tensor): a vector of length N, each is the best-matched + gt index in [0, M) for each proposal. + matched_labels (Tensor): a vector of length N, the matcher's label + (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal. + gt_classes (Tensor): a vector of length M. + Returns: + Tensor: a vector of indices of sampled proposals. Each is in [0, N). + Tensor: a vector of the same length, the classification label for + each sampled proposal. Each sample is labeled as either a category in + [0, num_classes) or the background (num_classes). 
+ """ + has_gt = gt_classes.numel() > 0 + # Get the corresponding GT for each proposal + if has_gt: + gt_classes = gt_classes[matched_idxs] + # Label unmatched proposals (0 label from matcher) as background (label=num_classes) + gt_classes[matched_labels == 0] = self.num_classes + # Label ignore proposals (-1 label) + gt_classes[matched_labels == -1] = -1 + else: + gt_classes = torch.zeros_like(matched_idxs) + self.num_classes + + sampled_fg_idxs, sampled_bg_idxs = subsample_labels( + gt_classes, self.batch_size_per_image, self.positive_fraction, self.num_classes, matched_ious=matched_ious + ) + + sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0) + return sampled_idxs, gt_classes[sampled_idxs] + + @torch.no_grad() + def label_and_sample_proposals(self, proposals: List[Instances], targets: List[Instances]) -> List[Instances]: + + #separate valid and ignore gts + targets_ign = [target[target.gt_classes < 0] for target in targets] + targets = [target[target.gt_classes >= 0] for target in targets] + + if self.proposal_append_gt: + proposals = add_ground_truth_to_proposals(targets, proposals) + + proposals_with_gt = [] + + num_fg_samples = [] + num_bg_samples = [] + + for proposals_per_image, targets_per_image, targets_ign_per_image in zip(proposals, targets, targets_ign): + + has_gt = len(targets_per_image) > 0 + + match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, proposals_per_image.proposal_boxes) + matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix) + + try: + if len(targets_ign_per_image) > 0: + + # compute the quality matrix, only on subset of background + background_inds = (matched_labels == 0).nonzero().squeeze() + + # determine the boxes inside ignore regions with sufficient threshold + if background_inds.numel() > 1: + match_quality_matrix_ign = pairwise_ioa(targets_ign_per_image.gt_boxes, proposals_per_image.proposal_boxes[background_inds]) + 
matched_labels[background_inds[match_quality_matrix_ign.max(0)[0] >= self.ignore_thresh]] = -1 + + del match_quality_matrix_ign + except: + pass + + gt_arange = torch.arange(match_quality_matrix.shape[1]).to(matched_idxs.device) + matched_ious = match_quality_matrix[matched_idxs, gt_arange] + sampled_idxs, gt_classes = self._sample_proposals(matched_idxs, matched_labels, targets_per_image.gt_classes, matched_ious=matched_ious) + + # Set target attributes of the sampled proposals: + proposals_per_image = proposals_per_image[sampled_idxs] + proposals_per_image.gt_classes = gt_classes + + if has_gt: + sampled_targets = matched_idxs[sampled_idxs] + # We index all the attributes of targets that start with "gt_" + # and have not been added to proposals yet (="gt_classes"). + # NOTE: here the indexing waste some compute, because heads + # like masks, keypoints, etc, will filter the proposals again, + # (by foreground/background, or number of keypoints in the image, etc) + # so we essentially index the data twice. 
+ for (trg_name, trg_value) in targets_per_image.get_fields().items(): + if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name): + proposals_per_image.set(trg_name, trg_value[sampled_targets]) + + + num_bg_samples.append((gt_classes == self.num_classes).sum().item()) + num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) + proposals_with_gt.append(proposals_per_image) + + # Log the number of fg/bg samples that are selected for training ROI heads + storage = get_event_storage() + storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples)) + storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples)) + + return proposals_with_gt + + + def safely_reduce_losses(self, loss): + + valid = (~(loss.isinf())) & (~(loss.isnan())) + + if valid.any(): + return loss[valid].mean() + else: + # no valid losses, simply zero out + return loss.mean()*0.0 + + + + + + + + + + + +@ROI_HEADS_REGISTRY.register() +class ROIHeads3D(StandardROIHeads): + + @configurable + def __init__( + self, + *, + ignore_thresh: float, + cube_head: nn.Module, + cube_pooler: nn.Module, + loss_w_3d: float, + loss_w_xy: float, + loss_w_z: float, + loss_w_dims: float, + loss_w_pose: float, + loss_w_joint: float, + use_confidence: float, + inverse_z_weight: bool, + z_type: str, + pose_type: str, + cluster_bins: int, + priors = None, + dims_priors_enabled = None, + dims_priors_func = None, + disentangled_loss=None, + virtual_depth=None, + virtual_focal=None, + test_scale=None, + allocentric_pose=None, + chamfer_pose=None, + scale_roi_boxes=None, + **kwargs, + ): + super().__init__(**kwargs) + + self.scale_roi_boxes = scale_roi_boxes + + # rotation settings + self.allocentric_pose = allocentric_pose + self.chamfer_pose = chamfer_pose + + # virtual settings + self.virtual_depth = virtual_depth + self.virtual_focal = virtual_focal + + # loss weights, <=0 is off + self.loss_w_3d = loss_w_3d + self.loss_w_xy = loss_w_xy + self.loss_w_z = loss_w_z + self.loss_w_dims = 
loss_w_dims + self.loss_w_pose = loss_w_pose + self.loss_w_joint = loss_w_joint + + # loss modes + self.disentangled_loss = disentangled_loss + self.inverse_z_weight = inverse_z_weight + + # misc + self.test_scale = test_scale + self.ignore_thresh = ignore_thresh + + # related to network outputs + self.z_type = z_type + self.pose_type = pose_type + self.use_confidence = use_confidence + + # related to priors + self.cluster_bins = cluster_bins + self.dims_priors_enabled = dims_priors_enabled + self.dims_priors_func = dims_priors_func + + # if there is no 3D loss, then we don't need any heads. + if loss_w_3d > 0: + + self.cube_head = cube_head + self.cube_pooler = cube_pooler + + # the dimensions could rely on pre-computed priors + if self.dims_priors_enabled and priors is not None: + self.priors_dims_per_cat = nn.Parameter(torch.FloatTensor(priors['priors_dims_per_cat']).unsqueeze(0)) + else: + self.priors_dims_per_cat = nn.Parameter(torch.ones(1, self.num_classes, 2, 3)) + + # Optionally, refactor priors and store them in the network params + if self.cluster_bins > 1 and priors is not None: + + # the depth could have been clustered based on 2D scales + priors_z_scales = torch.stack([torch.FloatTensor(prior[1]) for prior in priors['priors_bins']]) + self.priors_z_scales = nn.Parameter(priors_z_scales) + + else: + self.priors_z_scales = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins)) + + # the depth can be based on priors + if self.z_type == 'clusters': + + assert self.cluster_bins > 1, 'To use z_type of priors, there must be more than 1 cluster bin' + + if priors is None: + self.priors_z_stats = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins, 2).float()) + else: + + # stats + priors_z_stats = torch.cat([torch.FloatTensor(prior[2]).unsqueeze(0) for prior in priors['priors_bins']]) + self.priors_z_stats = nn.Parameter(priors_z_stats) + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], priors=None): + + ret = 
super().from_config(cfg, input_shape) + + # pass along priors + ret["box_predictor"] = FastRCNNOutputs(cfg, ret['box_head'].output_shape) + ret.update(cls._init_cube_head(cfg, input_shape)) + ret["priors"] = priors + + return ret + + @classmethod + def _init_cube_head(self, cfg, input_shape: Dict[str, ShapeSpec]): + + in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES + pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) + pooler_resolution = cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION + pooler_sampling_ratio = cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO + pooler_type = cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE + + cube_pooler = ROIPooler( + output_size=pooler_resolution, + scales=pooler_scales, + sampling_ratio=pooler_sampling_ratio, + pooler_type=pooler_type, + ) + + in_channels = [input_shape[f].channels for f in in_features][0] + shape = ShapeSpec( + channels=in_channels, width=pooler_resolution, height=pooler_resolution + ) + + cube_head = build_cube_head(cfg, shape) + + return { + 'cube_head': cube_head, + 'cube_pooler': cube_pooler, + 'use_confidence': cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE, + 'inverse_z_weight': cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT, + 'loss_w_3d': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D, + 'loss_w_xy': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_XY, + 'loss_w_z': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z, + 'loss_w_dims': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS, + 'loss_w_pose': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE, + 'loss_w_joint': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_JOINT, + 'z_type': cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE, + 'pose_type': cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE, + 'dims_priors_enabled': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED, + 'dims_priors_func': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC, + 'disentangled_loss': cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS, + 'virtual_depth': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH, + 'virtual_focal': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL, + 'test_scale': cfg.INPUT.MIN_SIZE_TEST, + 'chamfer_pose': 
cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE, + 'allocentric_pose': cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE, + 'cluster_bins': cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS, + 'ignore_thresh': cfg.MODEL.RPN.IGNORE_THRESHOLD, + 'scale_roi_boxes': cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES, + } + + + def forward(self, images, features, proposals, Ks, im_scales_ratio, targets=None): + + im_dims = [image.shape[1:] for image in images] + + del images + + if self.training: + proposals = self.label_and_sample_proposals(proposals, targets) + + del targets + + if self.training: + + losses = self._forward_box(features, proposals) + if self.loss_w_3d > 0: + instances_3d, losses_cube = self._forward_cube(features, proposals, Ks, im_dims, im_scales_ratio) + losses.update(losses_cube) + else: + instances_3d = None + + return instances_3d, losses + + else: + + # when oracle is available, by pass the box forward. + # simulate the predicted instances by creating a new + # instance for each passed in image. + if isinstance(proposals, list) and ~np.any([isinstance(p, Instances) for p in proposals]): + pred_instances = [] + for proposal, im_dim in zip(proposals, im_dims): + + pred_instances_i = Instances(im_dim) + pred_instances_i.pred_boxes = Boxes(proposal['gt_bbox2D']) + pred_instances_i.pred_classes = proposal['gt_classes'] + pred_instances_i.scores = torch.ones_like(proposal['gt_classes']).float() + pred_instances.append(pred_instances_i) + else: + pred_instances = self._forward_box(features, proposals) + + if self.loss_w_3d > 0: + pred_instances = self._forward_cube(features, pred_instances, Ks, im_dims, im_scales_ratio) + return pred_instances, {} + + + def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances]): + """ + Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`, + the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument. 
+ + Args: + features (dict[str, Tensor]): mapping from feature map names to tensor. + Same as in :meth:`ROIHeads.forward`. + proposals (list[Instances]): the per-image object proposals with + their matching ground truth. + Each has fields "proposal_boxes", and "objectness_logits", + "gt_classes", "gt_boxes". + + Returns: + In training, a dict of losses. + In inference, a list of `Instances`, the predicted instances. + """ + features = [features[f] for f in self.box_in_features] + box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) + box_features = self.box_head(box_features) + predictions = self.box_predictor(box_features) + del box_features + + if self.training: + losses = self.box_predictor.losses( + predictions, proposals, + ) + pred_boxes = self.box_predictor.predict_boxes_for_gt_classes( + predictions, proposals + ) + for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes): + proposals_per_image.pred_boxes = Boxes(pred_boxes_per_image) + + # proposals is modified in-place below, so losses must be computed first. 
+ if self.train_on_pred_boxes: + with torch.no_grad(): + pred_boxes = self.box_predictor.predict_boxes_for_gt_classes( + predictions, proposals + ) + for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes): + proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image) + return losses + else: + pred_instances, _ = self.box_predictor.inference(predictions, proposals, ) + return pred_instances + + def l1_loss(self, vals, target): + return F.smooth_l1_loss(vals, target, reduction='none', beta=0.0) + + def chamfer_loss(self, vals, target): + B = vals.shape[0] + xx = vals.view(B, 8, 1, 3) + yy = target.view(B, 1, 8, 3) + l1_dist = (xx - yy).abs().sum(-1) + l1 = (l1_dist.min(1).values.mean(-1) + l1_dist.min(2).values.mean(-1)) + return l1 + + # optionally, scale proposals to zoom RoI in (<1.0) our out (>1.0) + def scale_proposals(self, proposal_boxes): + if self.scale_roi_boxes > 0: + + proposal_boxes_scaled = [] + for boxes in proposal_boxes: + centers = boxes.get_centers() + widths = boxes.tensor[:, 2] - boxes.tensor[:, 0] + heights = boxes.tensor[:, 2] - boxes.tensor[:, 0] + x1 = centers[:, 0] - 0.5*widths*self.scale_roi_boxes + x2 = centers[:, 0] + 0.5*widths*self.scale_roi_boxes + y1 = centers[:, 1] - 0.5*heights*self.scale_roi_boxes + y2 = centers[:, 1] + 0.5*heights*self.scale_roi_boxes + boxes_scaled = Boxes(torch.stack([x1, y1, x2, y2], dim=1)) + proposal_boxes_scaled.append(boxes_scaled) + else: + proposal_boxes_scaled = proposal_boxes + + return proposal_boxes_scaled + + def _forward_cube(self, features, instances, Ks, im_current_dims, im_scales_ratio): + + features = [features[f] for f in self.in_features] + + # training on foreground + if self.training: + + losses = {} + + # add up the amount we should normalize the losses by. + # this follows the same logic as the BoxHead, where each FG proposal + # is able to contribute the same amount of supervision. 
Technically, + # this value doesn't change during training unless the batch size is dynamic. + self.normalize_factor = max(sum([i.gt_classes.numel() for i in instances]), 1.0) + + # The loss is only defined on positive proposals + proposals, _ = select_foreground_proposals(instances, self.num_classes) + proposal_boxes = [x.proposal_boxes for x in proposals] + pred_boxes = [x.pred_boxes for x in proposals] + + box_classes = (torch.cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)) + gt_boxes3D = torch.cat([p.gt_boxes3D for p in proposals], dim=0,) + gt_poses = torch.cat([p.gt_poses for p in proposals], dim=0,) + + assert len(gt_poses) == len(gt_boxes3D) == len(box_classes) + + # eval on all instances + else: + proposals = instances + pred_boxes = [x.pred_boxes for x in instances] + proposal_boxes = pred_boxes + box_classes = torch.cat([x.pred_classes for x in instances]) + + proposal_boxes_scaled = self.scale_proposals(proposal_boxes) + + # forward features + cube_features = self.cube_pooler(features, proposal_boxes_scaled).flatten(1) + + n = cube_features.shape[0] + + # nothing to do.. + if n == 0: + return instances if not self.training else (instances, {}) + + num_boxes_per_image = [len(i) for i in proposals] + + # scale the intrinsics according to the ratio the image has been scaled. + # this means the projections at the current scale are in sync. 
+ Ks_scaled_per_box = torch.cat([ + (Ks[i]/im_scales_ratio[i]).unsqueeze(0).repeat([num, 1, 1]) + for (i, num) in enumerate(num_boxes_per_image) + ]).to(cube_features.device) + Ks_scaled_per_box[:, -1, -1] = 1 + + focal_lengths_per_box = torch.cat([ + (Ks[i][1, 1]).unsqueeze(0).repeat([num]) + for (i, num) in enumerate(num_boxes_per_image) + ]).to(cube_features.device) + + im_ratios_per_box = torch.cat([ + torch.FloatTensor([im_scales_ratio[i]]).repeat(num) + for (i, num) in enumerate(num_boxes_per_image) + ]).to(cube_features.device) + + # scaling factor for Network resolution -> Original + im_scales_per_box = torch.cat([ + torch.FloatTensor([im_current_dims[i][0]]).repeat(num) + for (i, num) in enumerate(num_boxes_per_image) + ]).to(cube_features.device) + + im_scales_original_per_box = im_scales_per_box * im_ratios_per_box + + if self.virtual_depth: + + virtual_to_real = util.compute_virtual_scale_from_focal_spaces( + focal_lengths_per_box, im_scales_original_per_box, + self.virtual_focal, im_scales_per_box + ) + real_to_virtual = 1 / virtual_to_real + + else: + real_to_virtual = virtual_to_real = 1.0 + + # 2D boxes are needed to apply deltas + src_boxes = torch.cat([box_per_im.tensor for box_per_im in proposal_boxes], dim=0) + src_widths = src_boxes[:, 2] - src_boxes[:, 0] + src_heights = src_boxes[:, 3] - src_boxes[:, 1] + src_scales = (src_heights**2 + src_widths**2).sqrt() + src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths + src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights + + # For some methods, we need the predicted 2D box, + # e.g., the differentiable tensors from the 2D box head. 
+ pred_src_boxes = torch.cat([box_per_im.tensor for box_per_im in pred_boxes], dim=0) + pred_widths = pred_src_boxes[:, 2] - pred_src_boxes[:, 0] + pred_heights = pred_src_boxes[:, 3] - pred_src_boxes[:, 1] + pred_src_x = (pred_src_boxes[:, 2] + pred_src_boxes[:, 0]) * 0.5 + pred_src_y = (pred_src_boxes[:, 3] + pred_src_boxes[:, 1]) * 0.5 + + # forward predictions + cube_2d_deltas, cube_z, cube_dims, cube_pose, cube_uncert = self.cube_head(cube_features) + + # simple indexing re-used commonly for selection purposes + fg_inds = torch.arange(n) + + # Z when clusters are used + if cube_z is not None and self.cluster_bins > 1: + + # compute closest bin assignments per batch per category (batch x n_category) + scales_diff = (self.priors_z_scales.detach().T.unsqueeze(0) - src_scales.unsqueeze(1).unsqueeze(2)).abs() + + # assign the correct scale prediction. + # (the others are not used / thrown away) + assignments = scales_diff.argmin(1) + + # select FG, category, and correct cluster + cube_z = cube_z[fg_inds, :, box_classes, :][fg_inds, assignments[fg_inds, box_classes]] + + elif cube_z is not None: + + # if z is available, collect the per-category predictions. + cube_z = cube_z[fg_inds, box_classes, :] + + cube_dims = cube_dims[fg_inds, box_classes, :] + cube_pose = cube_pose[fg_inds, box_classes, :, :] + + if self.use_confidence: + + # if uncertainty is available, collect the per-category predictions. + cube_uncert = cube_uncert[fg_inds, box_classes] + + cube_2d_deltas = cube_2d_deltas[fg_inds, box_classes, :] + + # apply our predicted deltas based on src boxes. 
+ cube_x = src_ctr_x + src_widths * cube_2d_deltas[:, 0] + cube_y = src_ctr_y + src_heights * cube_2d_deltas[:, 1] + + cube_xy = torch.cat((cube_x.unsqueeze(1), cube_y.unsqueeze(1)), dim=1) + + cube_dims_norm = cube_dims + + if self.dims_priors_enabled: + + # gather prior dimensions + prior_dims = self.priors_dims_per_cat.detach().repeat([n, 1, 1, 1])[fg_inds, box_classes] + prior_dims_mean = prior_dims[:, 0, :] + prior_dims_std = prior_dims[:, 1, :] + + if self.dims_priors_func == 'sigmoid': + prior_dims_min = (prior_dims_mean - 3*prior_dims_std).clip(0.0) + prior_dims_max = (prior_dims_mean + 3*prior_dims_std) + cube_dims = util.scaled_sigmoid(cube_dims_norm, min=prior_dims_min, max=prior_dims_max) + elif self.dims_priors_func == 'exp': + cube_dims = torch.exp(cube_dims_norm.clip(max=5)) * prior_dims_mean + + else: + # no priors are used + cube_dims = torch.exp(cube_dims_norm.clip(max=5)) + + if self.allocentric_pose: + + # To compare with GTs, we need the pose to be egocentric, not allocentric + cube_pose_allocentric = cube_pose + cube_pose = util.R_from_allocentric(Ks_scaled_per_box, cube_pose, u=cube_x.detach(), v=cube_y.detach()) + + cube_z = cube_z.squeeze() + + if self.z_type =='sigmoid': + cube_z_norm = torch.sigmoid(cube_z) + cube_z = cube_z_norm * 100 + + elif self.z_type == 'log': + cube_z_norm = cube_z + cube_z = torch.exp(cube_z) + + elif self.z_type == 'clusters': + + # gather the mean depth, same operation as above, for a n x c result + z_means = self.priors_z_stats[:, :, 0].T.unsqueeze(0).repeat([n, 1, 1]) + z_means = torch.gather(z_means, 1, assignments.unsqueeze(1)).squeeze(1) + + # gather the std depth, same operation as above, for a n x c result + z_stds = self.priors_z_stats[:, :, 1].T.unsqueeze(0).repeat([n, 1, 1]) + z_stds = torch.gather(z_stds, 1, assignments.unsqueeze(1)).squeeze(1) + + # do not learn these, they are static + z_means = z_means.detach() + z_stds = z_stds.detach() + + z_means = z_means[fg_inds, box_classes] + z_stds = 
z_stds[fg_inds, box_classes] + + z_mins = (z_means - 3*z_stds).clip(0) + z_maxs = (z_means + 3*z_stds) + + cube_z_norm = cube_z + cube_z = util.scaled_sigmoid(cube_z, min=z_mins, max=z_maxs) + + if self.virtual_depth: + cube_z = (cube_z * virtual_to_real) + + if self.training: + + prefix = 'Cube/' + storage = get_event_storage() + + # Pull off necessary GT information + # let lowercase->2D and uppercase->3D + # [x, y, Z, W, H, L] + gt_2d = gt_boxes3D[:, :2] + gt_z = gt_boxes3D[:, 2] + gt_dims = gt_boxes3D[:, 3:6] + + # this box may have been mirrored and scaled so + # we need to recompute XYZ in 3D by backprojecting. + gt_x3d = gt_z * (gt_2d[:, 0] - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] + gt_y3d = gt_z * (gt_2d[:, 1] - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] + gt_3d = torch.stack((gt_x3d, gt_y3d, gt_z)).T + + # put together the GT boxes + gt_box3d = torch.cat((gt_3d, gt_dims), dim=1) + + # These are the corners which will be the target for all losses!! + gt_corners = util.get_cuboid_verts_faces(gt_box3d, gt_poses)[0] + + # project GT corners + gt_proj_boxes = torch.bmm(Ks_scaled_per_box, gt_corners.transpose(1,2)) + gt_proj_boxes /= gt_proj_boxes[:, -1, :].clone().unsqueeze(1) + + gt_proj_x1 = gt_proj_boxes[:, 0, :].min(1)[0] + gt_proj_y1 = gt_proj_boxes[:, 1, :].min(1)[0] + gt_proj_x2 = gt_proj_boxes[:, 0, :].max(1)[0] + gt_proj_y2 = gt_proj_boxes[:, 1, :].max(1)[0] + + gt_widths = gt_proj_x2 - gt_proj_x1 + gt_heights = gt_proj_y2 - gt_proj_y1 + gt_x = gt_proj_x1 + 0.5 * gt_widths + gt_y = gt_proj_y1 + 0.5 * gt_heights + + gt_proj_boxes = torch.stack((gt_proj_x1, gt_proj_y1, gt_proj_x2, gt_proj_y2), dim=1) + + if self.disentangled_loss: + ''' + Disentangled loss compares each varaible group to the + cuboid corners, which is generally more robust to hyperparams. 
+ ''' + + # compute disentangled Z corners + cube_dis_x3d_from_z = cube_z * (gt_2d[:, 0] - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] + cube_dis_y3d_from_z = cube_z * (gt_2d[:, 1] - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] + cube_dis_z = torch.cat((torch.stack((cube_dis_x3d_from_z, cube_dis_y3d_from_z, cube_z)).T, gt_dims), dim=1) + dis_z_corners = util.get_cuboid_verts_faces(cube_dis_z, gt_poses)[0] + + # compute disentangled XY corners + cube_dis_x3d = gt_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] + cube_dis_y3d = gt_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] + cube_dis_XY = torch.cat((torch.stack((cube_dis_x3d, cube_dis_y3d, gt_z)).T, gt_dims), dim=1) + dis_XY_corners = util.get_cuboid_verts_faces(cube_dis_XY, gt_poses)[0] + loss_xy = self.l1_loss(dis_XY_corners, gt_corners).contiguous().view(n, -1).mean(dim=1) + + # Pose + dis_pose_corners = util.get_cuboid_verts_faces(gt_box3d, cube_pose)[0] + + # Dims + dis_dims_corners = util.get_cuboid_verts_faces(torch.cat((gt_3d, cube_dims), dim=1), gt_poses)[0] + + # Loss dims + loss_dims = self.l1_loss(dis_dims_corners, gt_corners).contiguous().view(n, -1).mean(dim=1) + + # Loss z + loss_z = self.l1_loss(dis_z_corners, gt_corners).contiguous().view(n, -1).mean(dim=1) + + # Rotation uses chamfer or l1 like others + if self.chamfer_pose: + loss_pose = self.chamfer_loss(dis_pose_corners, gt_corners) + + else: + loss_pose = self.l1_loss(dis_pose_corners, gt_corners).contiguous().view(n, -1).mean(dim=1) + + # Non-disentangled training losses + else: + ''' + These loss functions are fairly arbitrarily designed. + Generally, they are in some normalized space but there + are many alternative implementations for most functions. 
+ ''' + + # XY + gt_deltas = (gt_2d.clone() - torch.cat((src_ctr_x.unsqueeze(1), src_ctr_y.unsqueeze(1)), dim=1)) \ + / torch.cat((src_widths.unsqueeze(1), src_heights.unsqueeze(1)), dim=1) + + loss_xy = self.l1_loss(cube_2d_deltas, gt_deltas).mean(1) + + # Dims + if self.dims_priors_enabled: + cube_dims_gt_normspace = torch.log(gt_dims/prior_dims) + loss_dims = self.l1_loss(cube_dims_norm, cube_dims_gt_normspace).mean(1) + + else: + loss_dims = self.l1_loss(cube_dims_norm, torch.log(gt_dims)).mean(1) + + # Pose + try: + if self.allocentric_pose: + gt_poses_allocentric = util.R_to_allocentric(Ks_scaled_per_box, gt_poses, u=cube_x.detach(), v=cube_y.detach()) + loss_pose = 1-so3_relative_angle(cube_pose_allocentric, gt_poses_allocentric, eps=0.1, cos_angle=True) + else: + loss_pose = 1-so3_relative_angle(cube_pose, gt_poses, eps=0.1, cos_angle=True) + + # Can fail with bad EPS values/instability + except: + loss_pose = None + + if self.z_type == 'direct': + loss_z = self.l1_loss(cube_z, gt_z) + + elif self.z_type == 'sigmoid': + loss_z = self.l1_loss(cube_z_norm, (gt_z * real_to_virtual / 100).clip(0, 1)) + + elif self.z_type == 'log': + loss_z = self.l1_loss(cube_z_norm, torch.log((gt_z * real_to_virtual).clip(0.01))) + + elif self.z_type == 'clusters': + loss_z = self.l1_loss(cube_z_norm, (((gt_z * real_to_virtual) - z_means)/(z_stds))) + + total_3D_loss_for_reporting = loss_dims*self.loss_w_dims + + if not loss_pose is None: + total_3D_loss_for_reporting += loss_pose*self.loss_w_pose + + if not cube_2d_deltas is None: + total_3D_loss_for_reporting += loss_xy*self.loss_w_xy + + if not loss_z is None: + total_3D_loss_for_reporting += loss_z*self.loss_w_z + + # reporting does not need gradients + total_3D_loss_for_reporting = total_3D_loss_for_reporting.detach() + + if self.loss_w_joint > 0: + ''' + If we are using joint [entangled] loss, then we also need to pair all + predictions together and compute a chamfer or l1 loss vs. cube corners. 
+ ''' + + cube_dis_x3d_from_z = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] + cube_dis_y3d_from_z = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] + cube_dis_z = torch.cat((torch.stack((cube_dis_x3d_from_z, cube_dis_y3d_from_z, cube_z)).T, cube_dims), dim=1) + dis_z_corners_joint = util.get_cuboid_verts_faces(cube_dis_z, cube_pose)[0] + + if self.chamfer_pose and self.disentangled_loss: + loss_joint = self.chamfer_loss(dis_z_corners_joint, gt_corners) + + else: + loss_joint = self.l1_loss(dis_z_corners_joint, gt_corners).contiguous().view(n, -1).mean(dim=1) + + valid_joint = loss_joint < np.inf + total_3D_loss_for_reporting += (loss_joint*self.loss_w_joint).detach() + + # compute errors for tracking purposes + z_error = (cube_z - gt_z).detach().abs() + dims_error = (cube_dims - gt_dims).detach().abs() + xy_error = (cube_xy - gt_2d).detach().abs() + + storage.put_scalar(prefix + 'z_error', z_error.mean().item(), smoothing_hint=False) + storage.put_scalar(prefix + 'dims_error', dims_error.mean().item(), smoothing_hint=False) + storage.put_scalar(prefix + 'xy_error', xy_error.mean().item(), smoothing_hint=False) + storage.put_scalar(prefix + 'z_close', (z_error<0.20).float().mean().item(), smoothing_hint=False) + + storage.put_scalar(prefix + 'total_3D_loss', self.loss_w_3d * self.safely_reduce_losses(total_3D_loss_for_reporting), smoothing_hint=False) + + if self.inverse_z_weight: + ''' + Weights all losses to prioritize close up boxes. 
+ ''' + + gt_z = gt_boxes3D[:, 2] + + inverse_z_w = 1/torch.log(gt_z.clip(E_CONSTANT)) + + loss_dims *= inverse_z_w + + # scale based on log, but clip at e + if not cube_2d_deltas is None: + loss_xy *= inverse_z_w + + if loss_z is not None: + loss_z *= inverse_z_w + + if loss_pose is not None: + loss_pose *= inverse_z_w + + if self.loss_w_joint > 0: + loss_joint *= inverse_z_w + + if self.use_confidence > 0: + + uncert_sf = SQRT_2_CONSTANT * torch.exp(-cube_uncert) + + loss_dims *= uncert_sf + + if not cube_2d_deltas is None: + loss_xy *= uncert_sf + + if not loss_z is None: + loss_z *= uncert_sf + + if loss_pose is not None: + loss_pose *= uncert_sf + + if self.loss_w_joint > 0: + loss_joint *= uncert_sf + + losses.update({prefix + 'uncert': self.use_confidence*self.safely_reduce_losses(cube_uncert.clone())}) + storage.put_scalar(prefix + 'conf', torch.exp(-cube_uncert).mean().item(), smoothing_hint=False) + + # store per batch loss stats temporarily + self.batch_losses = [batch_losses.mean().item() for batch_losses in total_3D_loss_for_reporting.split(num_boxes_per_image)] + + if self.loss_w_dims > 0: + losses.update({ + prefix + 'loss_dims': self.safely_reduce_losses(loss_dims) * self.loss_w_dims * self.loss_w_3d, + }) + + if not cube_2d_deltas is None: + losses.update({ + prefix + 'loss_xy': self.safely_reduce_losses(loss_xy) * self.loss_w_xy * self.loss_w_3d, + }) + + if not loss_z is None: + losses.update({ + prefix + 'loss_z': self.safely_reduce_losses(loss_z) * self.loss_w_z * self.loss_w_3d, + }) + + if loss_pose is not None: + + losses.update({ + prefix + 'loss_pose': self.safely_reduce_losses(loss_pose) * self.loss_w_pose * self.loss_w_3d, + }) + + if self.loss_w_joint > 0: + if valid_joint.any(): + losses.update({prefix + 'loss_joint': self.safely_reduce_losses(loss_joint[valid_joint]) * self.loss_w_joint * self.loss_w_3d}) + + + ''' + Inference + ''' + if len(cube_z.shape) == 0: + cube_z = cube_z.unsqueeze(0) + + # inference + cube_x3d = cube_z * 
(cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] + cube_y3d = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] + cube_3D = torch.cat((torch.stack((cube_x3d, cube_y3d, cube_z)).T, cube_dims, cube_xy*im_ratios_per_box.unsqueeze(1)), dim=1) + + if self.use_confidence: + cube_conf = torch.exp(-cube_uncert) + cube_3D = torch.cat((cube_3D, cube_conf.unsqueeze(1)), dim=1) + + # convert the predictions to intances per image + cube_3D = cube_3D.split(num_boxes_per_image) + cube_pose = cube_pose.split(num_boxes_per_image) + box_classes = box_classes.split(num_boxes_per_image) + + pred_instances = None + + pred_instances = instances if not self.training else \ + [Instances(image_size) for image_size in im_current_dims] + + for cube_3D_i, cube_pose_i, instances_i, K, im_dim, im_scale_ratio, box_classes_i, pred_boxes_i in \ + zip(cube_3D, cube_pose, pred_instances, Ks, im_current_dims, im_scales_ratio, box_classes, pred_boxes): + + # merge scores if they already exist + if hasattr(instances_i, 'scores'): + instances_i.scores = (instances_i.scores * cube_3D_i[:, -1])**(1/2) + + # assign scores if none are present + else: + instances_i.scores = cube_3D_i[:, -1] + + # assign box classes if none exist + if not hasattr(instances_i, 'pred_classes'): + instances_i.pred_classes = box_classes_i + + # assign predicted boxes if none exist + if not hasattr(instances_i, 'pred_boxes'): + instances_i.pred_boxes = pred_boxes_i + + instances_i.pred_bbox3D = util.get_cuboid_verts_faces(cube_3D_i[:, :6], cube_pose_i)[0] + instances_i.pred_center_cam = cube_3D_i[:, :3] + instances_i.pred_center_2D = cube_3D_i[:, 6:8] + instances_i.pred_dimensions = cube_3D_i[:, 3:6] + instances_i.pred_pose = cube_pose_i + + if self.training: + return pred_instances, losses + else: + return pred_instances + + def _sample_proposals( + self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor, matched_ious=None + ) -> Tuple[torch.Tensor, 
torch.Tensor]: + """ + Based on the matching between N proposals and M groundtruth, + sample the proposals and set their classification labels. + Args: + matched_idxs (Tensor): a vector of length N, each is the best-matched + gt index in [0, M) for each proposal. + matched_labels (Tensor): a vector of length N, the matcher's label + (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal. + gt_classes (Tensor): a vector of length M. + Returns: + Tensor: a vector of indices of sampled proposals. Each is in [0, N). + Tensor: a vector of the same length, the classification label for + each sampled proposal. Each sample is labeled as either a category in + [0, num_classes) or the background (num_classes). + """ + has_gt = gt_classes.numel() > 0 + # Get the corresponding GT for each proposal + if has_gt: + gt_classes = gt_classes[matched_idxs] + # Label unmatched proposals (0 label from matcher) as background (label=num_classes) + gt_classes[matched_labels == 0] = self.num_classes + # Label ignore proposals (-1 label) + gt_classes[matched_labels == -1] = -1 + else: + gt_classes = torch.zeros_like(matched_idxs) + self.num_classes + + sampled_fg_idxs, sampled_bg_idxs = subsample_labels( + gt_classes, self.batch_size_per_image, self.positive_fraction, self.num_classes, matched_ious=matched_ious + ) + + sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0) + return sampled_idxs, gt_classes[sampled_idxs] + + @torch.no_grad() + def label_and_sample_proposals(self, proposals: List[Instances], targets: List[Instances]) -> List[Instances]: + + #separate valid and ignore gts + targets_ign = [target[target.gt_classes < 0] for target in targets] + targets = [target[target.gt_classes >= 0] for target in targets] + + if self.proposal_append_gt: + proposals = add_ground_truth_to_proposals(targets, proposals) + + proposals_with_gt = [] + + num_fg_samples = [] + num_bg_samples = [] + + for proposals_per_image, targets_per_image, targets_ign_per_image in zip(proposals, 
targets, targets_ign): + + has_gt = len(targets_per_image) > 0 + + match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, proposals_per_image.proposal_boxes) + matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix) + + try: + if len(targets_ign_per_image) > 0: + + # compute the quality matrix, only on subset of background + background_inds = (matched_labels == 0).nonzero().squeeze() + + # determine the boxes inside ignore regions with sufficient threshold + if background_inds.numel() > 1: + match_quality_matrix_ign = pairwise_ioa(targets_ign_per_image.gt_boxes, proposals_per_image.proposal_boxes[background_inds]) + matched_labels[background_inds[match_quality_matrix_ign.max(0)[0] >= self.ignore_thresh]] = -1 + + del match_quality_matrix_ign + except: + pass + + gt_arange = torch.arange(match_quality_matrix.shape[1]).to(matched_idxs.device) + matched_ious = match_quality_matrix[matched_idxs, gt_arange] + sampled_idxs, gt_classes = self._sample_proposals(matched_idxs, matched_labels, targets_per_image.gt_classes, matched_ious=matched_ious) + + # Set target attributes of the sampled proposals: + proposals_per_image = proposals_per_image[sampled_idxs] + proposals_per_image.gt_classes = gt_classes + + if has_gt: + sampled_targets = matched_idxs[sampled_idxs] + # We index all the attributes of targets that start with "gt_" + # and have not been added to proposals yet (="gt_classes"). + # NOTE: here the indexing waste some compute, because heads + # like masks, keypoints, etc, will filter the proposals again, + # (by foreground/background, or number of keypoints in the image, etc) + # so we essentially index the data twice. 
+ for (trg_name, trg_value) in targets_per_image.get_fields().items(): + if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name): + proposals_per_image.set(trg_name, trg_value[sampled_targets]) + + + num_bg_samples.append((gt_classes == self.num_classes).sum().item()) + num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) + proposals_with_gt.append(proposals_per_image) + + # Log the number of fg/bg samples that are selected for training ROI heads + storage = get_event_storage() + storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples)) + storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples)) + + return proposals_with_gt + + + def safely_reduce_losses(self, loss): + + valid = (~(loss.isinf())) & (~(loss.isnan())) + + if valid.any(): + return loss[valid].mean() + else: + # no valid losses, simply zero out + return loss.mean()*0.0 + \ No newline at end of file diff --git a/cubercnn/solver/__init__.py b/cubercnn/solver/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..03ddcbd9bee9a0e196ed61f3637b1aacfd018c9e --- /dev/null +++ b/cubercnn/solver/__init__.py @@ -0,0 +1,2 @@ +from .build import * +from .checkpoint import * \ No newline at end of file diff --git a/cubercnn/solver/build.py b/cubercnn/solver/build.py new file mode 100644 index 0000000000000000000000000000000000000000..fc5e582caac3f514a359d29957f7f0c9b637a576 --- /dev/null +++ b/cubercnn/solver/build.py @@ -0,0 +1,76 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates
+import torch
+from typing import Any, Dict, List, Set
+from detectron2.solver.build import maybe_add_gradient_clipping
+
+def build_optimizer(cfg, model):
+    """
+    Build the training optimizer selected by cfg.SOLVER.TYPE
+    ('sgd', 'adam', 'adam+amsgrad', 'adamw', 'adamw+amsgrad'),
+    with per-parameter lr / weight-decay overrides for norm layers,
+    biases, and the 3D-prior buffers.  Raises ValueError for an
+    unknown cfg.SOLVER.TYPE.  Gradient clipping is attached per cfg.
+    """
+    # Normalization layers whose parameters take cfg.SOLVER.WEIGHT_DECAY_NORM.
+    norm_module_types = (
+        torch.nn.BatchNorm1d,
+        torch.nn.BatchNorm2d,
+        torch.nn.BatchNorm3d,
+        torch.nn.SyncBatchNorm,
+        torch.nn.GroupNorm,
+        torch.nn.InstanceNorm1d,
+        torch.nn.InstanceNorm2d,
+        torch.nn.InstanceNorm3d,
+        torch.nn.LayerNorm,
+        torch.nn.LocalResponseNorm,
+    )
+    params: List[Dict[str, Any]] = []
+    memo: Set[torch.nn.parameter.Parameter] = set()
+    for module in model.modules():
+        for key, value in module.named_parameters(recurse=False):
+            if not value.requires_grad:
+                continue
+            # Avoid duplicating parameters (shared modules are visited twice)
+            if value in memo:
+                continue
+            memo.add(value)
+
+            lr = cfg.SOLVER.BASE_LR
+            weight_decay = cfg.SOLVER.WEIGHT_DECAY
+
+            if isinstance(module, norm_module_types) and (cfg.SOLVER.WEIGHT_DECAY_NORM is not None):
+                weight_decay = cfg.SOLVER.WEIGHT_DECAY_NORM
+
+            elif key == "bias":
+                if (cfg.SOLVER.BIAS_LR_FACTOR is not None):
+                    lr = cfg.SOLVER.BASE_LR * cfg.SOLVER.BIAS_LR_FACTOR
+                if (cfg.SOLVER.WEIGHT_DECAY_BIAS is not None):
+                    weight_decay = cfg.SOLVER.WEIGHT_DECAY_BIAS
+
+            # these params do not need weight decay at all
+            # TODO parameterize these in configs instead.
+            if key in ['priors_dims_per_cat', 'priors_z_scales', 'priors_z_stats']:
+                weight_decay = 0.0
+
+            params += [{"params": [value], "lr": lr, "weight_decay": weight_decay}]
+
+    if cfg.SOLVER.TYPE == 'sgd':
+        optimizer = torch.optim.SGD(
+            params,
+            cfg.SOLVER.BASE_LR,
+            momentum=cfg.SOLVER.MOMENTUM,
+            nesterov=cfg.SOLVER.NESTEROV,
+            weight_decay=cfg.SOLVER.WEIGHT_DECAY
+        )
+    elif cfg.SOLVER.TYPE == 'adam':
+        # NOTE(review): eps=1e-02 is far above the Adam default (1e-08) —
+        # presumably deliberate for stability here; confirm before changing.
+        optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, eps=1e-02)
+    elif cfg.SOLVER.TYPE == 'adam+amsgrad':
+        optimizer = torch.optim.Adam(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02)
+    elif cfg.SOLVER.TYPE == 'adamw':
+        optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, eps=1e-02)
+    elif cfg.SOLVER.TYPE == 'adamw+amsgrad':
+        optimizer = torch.optim.AdamW(params, cfg.SOLVER.BASE_LR, amsgrad=True, eps=1e-02)
+    else:
+        raise ValueError('{} is not supported as an optimizer.'.format(cfg.SOLVER.TYPE))
+
+    optimizer = maybe_add_gradient_clipping(cfg, optimizer)
+    return optimizer
+
+def freeze_bn(network):
+    """
+    Put every BatchNorm2d in `network` into eval mode and stop
+    running-stats updates.  Affine weights remain trainable.
+    """
+    for _, module in network.named_modules():
+        if isinstance(module, torch.nn.BatchNorm2d):
+            module.eval()
+            module.track_running_stats = False
diff --git a/cubercnn/solver/checkpoint.py b/cubercnn/solver/checkpoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..2298d67c18fd9f8e51f2d6c0522f7eb393db6fe7
--- /dev/null
+++ b/cubercnn/solver/checkpoint.py
@@ -0,0 +1,28 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates
+from detectron2.checkpoint import PeriodicCheckpointer
+from typing import Any
+
+# PeriodicCheckpointer variant that overwrites a single "<prefix>_recent"
+# checkpoint each period (instead of accumulating one file per period),
+# and writes "<prefix>_final" at the last iteration.
+class PeriodicCheckpointerOnlyOne(PeriodicCheckpointer):
+    def step(self, iteration: int, **kwargs: Any) -> None:
+        """
+        Perform the appropriate action at the given iteration.
+
+        Args:
+            iteration (int): the current iteration, ranged in [0, max_iter-1].
+            kwargs (Any): extra data to save, same as in
+                :meth:`Checkpointer.save`.
+        """
+        iteration = int(iteration)
+        additional_state = {"iteration": iteration}
+        additional_state.update(kwargs)
+
+        if (iteration + 1) % self.period == 0:
+
+            # simply save a single recent model
+            self.checkpointer.save(
+                "{}_recent".format(self.file_prefix), **additional_state
+            )
+
+        if self.max_iter is not None:
+            if iteration >= self.max_iter - 1:
+                self.checkpointer.save(f"{self.file_prefix}_final", **additional_state)
\ No newline at end of file
diff --git a/cubercnn/util/__init__.py b/cubercnn/util/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac2c1d0bb07fa1bbe06849c9579567dc629395c6
--- /dev/null
+++ b/cubercnn/util/__init__.py
@@ -0,0 +1,3 @@
+from .util import *
+from .model_zoo import *
+from .math_util import *
\ No newline at end of file
diff --git a/cubercnn/util/math_util.py b/cubercnn/util/math_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f172d1167e55cfb5db5b79615488516caf52792
--- /dev/null
+++ b/cubercnn/util/math_util.py
@@ -0,0 +1,1237 @@
+# Copyright (c) Meta Platforms, Inc.
and affiliates
+import math
+import numpy as np
+import pandas as pd
+from typing import Tuple, List
+from pytorch3d.renderer.lighting import PointLights
+from pytorch3d.renderer.mesh.renderer import MeshRenderer
+from pytorch3d.renderer.mesh.shader import SoftPhongShader
+from pytorch3d.transforms.math import acos_linear_extrapolation
+import torch
+from pytorch3d.structures import Meshes
+from detectron2.structures import BoxMode
+from pytorch3d.renderer import TexturesVertex
+from pytorch3d.structures.meshes import (
+    Meshes,
+)
+
+from pytorch3d.renderer import (
+    PerspectiveCameras,
+    RasterizationSettings,
+    MeshRasterizer
+)
+
+from pytorch3d.renderer import (
+    PerspectiveCameras,
+    SoftSilhouetteShader,
+    RasterizationSettings,
+    MeshRasterizer
+)
+from detectron2.data import (
+    MetadataCatalog,
+)
+from pytorch3d.transforms import axis_angle_to_matrix
+from pytorch3d.renderer import MeshRenderer as MR
+
+# 8 corners of an axis-aligned unit cube centered at the origin;
+# presumably ordered to match the v0..v7 layout used by
+# get_cuboid_verts_faces below — TODO confirm against callers.
+UNIT_CUBE = np.array([
+    [-0.5, -0.5, -0.5],
+    [ 0.5, -0.5, -0.5],
+    [ 0.5,  0.5, -0.5],
+    [-0.5,  0.5, -0.5],
+    [-0.5, -0.5,  0.5],
+    [ 0.5, -0.5,  0.5],
+    [ 0.5,  0.5,  0.5],
+    [-0.5,  0.5,  0.5]
+])
+
+def upto_2Pi(val):
+    """Wrap the angle ``val`` (radians) into the range [0, 2*pi)."""
+
+    out = val
+
+    # constrain between [0, 2pi)
+    while out >= 2*math.pi: out -= math.pi * 2
+    while out < 0: out += math.pi * 2
+
+    return out
+
+def upto_Pi(val):
+    """Wrap the angle ``val`` (radians) into the range [0, pi)."""
+
+    out = val
+
+    # constrain between [0, pi)
+    while out >= math.pi: out -= math.pi
+    while out < 0: out += math.pi
+
+    return out
+
+# Calculates rotation matrix to euler angles
+# The result is the same as MATLAB except the order
+# of the euler angles ( x and z are swapped ).
+# adopted from https://www.learnopencv.com/rotation-matrix-to-euler-angles/
+def mat2euler(R):
+    """Convert a 3x3 rotation matrix ``R`` to euler angles np.array([x, y, z])."""
+
+    sy = math.sqrt(R[0, 0] * R[0, 0] + R[1, 0] * R[1, 0])
+
+    #singular = sy < 1e-6
+
+    x = math.atan2(R[2, 1], R[2, 2])
+    y = math.atan2(-R[2, 0], sy)
+    z = math.atan2(R[1, 0], R[0, 0])
+
+    return np.array([x, y, z])
+
+# Calculates Rotation Matrix given euler angles.
+# adopted from https://www.learnopencv.com/rotation-matrix-to-euler-angles/ +def euler2mat(euler): + + R_x = np.array([[1, 0, 0], + [0, math.cos(euler[0]), -math.sin(euler[0])], + [0, math.sin(euler[0]), math.cos(euler[0])] + ]) + + R_y = np.array([[math.cos(euler[1]), 0, math.sin(euler[1])], + [0, 1, 0], + [-math.sin(euler[1]), 0, math.cos(euler[1])] + ]) + + R_z = np.array([[math.cos(euler[2]), -math.sin(euler[2]), 0], + [math.sin(euler[2]), math.cos(euler[2]), 0], + [0, 0, 1] + ]) + + R = np.dot(R_z, np.dot(R_y, R_x)) + + return R + +def euler2mat_torch(euler): + R_x = torch.stack([ + torch.tensor([[1, 0, 0], + [0, torch.cos(angle), -torch.sin(angle)], + [0, torch.sin(angle), torch.cos(angle)]]) + for angle in euler[:, 0] + ]) + + R_y = torch.stack([ + torch.tensor([[torch.cos(angle), 0, torch.sin(angle)], + [0, 1, 0], + [-torch.sin(angle), 0, torch.cos(angle)]]) + for angle in euler[:, 1] + ]) + + R_z = torch.stack([ + torch.tensor([[torch.cos(angle), -torch.sin(angle), 0], + [torch.sin(angle), torch.cos(angle), 0], + [0, 0, 1]]) + for angle in euler[:, 2] + ]) + + R = torch.matmul(R_z, torch.matmul(R_y, R_x)) + # (n x 3 x 3 out tensor) + return R + + +def to_float_tensor(input): + + data_type = type(input) + + if data_type != torch.Tensor: + input = torch.tensor(input) + + return input.float() + +def get_cuboid_verts_faces(box3d=None, R=None): + """ + Computes vertices and faces from a 3D cuboid representation. 
+ Args: + bbox3d (flexible): [[X Y Z W H L]] + R (flexible): [np.array(3x3)] + Returns: + verts: the 3D vertices of the cuboid in camera space + faces: the vertex indices per face + """ + if box3d is None: + box3d = [0, 0, 0, 1, 1, 1] + + # make sure types are correct + box3d = to_float_tensor(box3d) + + if R is not None: + R = to_float_tensor(R) + + squeeze = len(box3d.shape) == 1 + + if squeeze: + box3d = box3d.unsqueeze(0) + if R is not None: + R = R.unsqueeze(0) + + n = len(box3d) + + x3d = box3d[:, 0].unsqueeze(1) + y3d = box3d[:, 1].unsqueeze(1) + z3d = box3d[:, 2].unsqueeze(1) + w3d = box3d[:, 3].unsqueeze(1) + h3d = box3d[:, 4].unsqueeze(1) + l3d = box3d[:, 5].unsqueeze(1) + + ''' + v4_____________________v5 + /| /| + / | / | + / | / | + /___|_________________/ | + v0| | |v1 | + | | | | + | | | | + | | | | + | |_________________|___| + | / v7 | /v6 + | / | / + | / | / + |/_____________________|/ + v3 v2 + ''' + + verts = to_float_tensor(torch.zeros([n, 3, 8], device=box3d.device)) + + # setup X + verts[:, 0, [0, 3, 4, 7]] = -l3d / 2 + verts[:, 0, [1, 2, 5, 6]] = l3d / 2 + + # setup Y + verts[:, 1, [0, 1, 4, 5]] = -h3d / 2 + verts[:, 1, [2, 3, 6, 7]] = h3d / 2 + + # setup Z + verts[:, 2, [0, 1, 2, 3]] = -w3d / 2 + verts[:, 2, [4, 5, 6, 7]] = w3d / 2 + + if R is not None: + + # rotate + verts = R @ verts + + # translate + verts[:, 0, :] += x3d + verts[:, 1, :] += y3d + verts[:, 2, :] += z3d + + verts = verts.transpose(1, 2) + + faces = torch.tensor([ + [0, 1, 2], # front TR + [2, 3, 0], # front BL + + [1, 5, 6], # right TR + [6, 2, 1], # right BL + + [4, 0, 3], # left TR + [3, 7, 4], # left BL + + [5, 4, 7], # back TR + [7, 6, 5], # back BL + + [4, 5, 1], # top TR + [1, 0, 4], # top BL + + [3, 2, 6], # bottom TR + [6, 7, 3], # bottom BL + ]).float().unsqueeze(0).repeat([n, 1, 1]) + + if squeeze: + verts = verts.squeeze() + faces = faces.squeeze() + + return verts, faces.to(verts.device) + +def get_cuboid_verts(K, box3d, R=None, view_R=None, view_T=None): + + 
# make sure types are correct + K = to_float_tensor(K) + box3d = to_float_tensor(box3d) + + if R is not None: + R = to_float_tensor(R) + + squeeze = len(box3d.shape) == 1 + + if squeeze: + box3d = box3d.unsqueeze(0) + if R is not None: + R = R.unsqueeze(0) + + n = len(box3d) + + if len(K.shape) == 2: + K = K.unsqueeze(0).repeat([n, 1, 1]) + + corners_3d, _ = get_cuboid_verts_faces(box3d, R) + if view_T is not None: + corners_3d -= view_T.view(1, 1, 3) + if view_R is not None: + corners_3d = (view_R @ corners_3d[0].T).T.unsqueeze(0) + if view_T is not None: + corners_3d[:, :, -1] += view_T.view(1, 1, 3)[:, :, -1]*1.25 + + # project to 2D + corners_2d = K @ corners_3d.transpose(1, 2) + corners_2d[:, :2, :] = corners_2d[:, :2, :] / corners_2d[:, 2, :].unsqueeze(1) + corners_2d = corners_2d.transpose(1, 2) + + if squeeze: + corners_3d = corners_3d.squeeze() + corners_2d = corners_2d.squeeze() + + return corners_2d, corners_3d + + +def approx_eval_resolution(h, w, scale_min=0, scale_max=1e10): + """ + Approximates the resolution an image with h x w resolution would + run through a model at which constrains the scale to a min and max. + Args: + h (int): input resolution height + w (int): input resolution width + scale_min (int): minimum scale allowed to resize too + scale_max (int): maximum scale allowed to resize too + Returns: + h (int): output resolution height + w (int): output resolution width + sf (float): scaling factor that was applied + which can convert from original --> network resolution. + """ + orig_h = h + + # first resize to min + sf = scale_min / min(h, w) + h *= sf + w *= sf + + # next resize to max + sf = min(scale_max / max(h, w), 1.0) + h *= sf + w *= sf + + return h, w, h/orig_h + + +def compute_priors(cfg, datasets, max_cluster_rounds=1000, min_points_for_std=5, n_bins=None): + """ + Computes priors via simple averaging or a custom K-Means clustering. 
+ """ + + annIds = datasets.getAnnIds() + anns = datasets.loadAnns(annIds) + + data_raw = [] + + category_names = MetadataCatalog.get('omni3d_model').thing_classes + + virtual_depth = cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH + virtual_focal = cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL + test_scale_min = cfg.INPUT.MIN_SIZE_TEST + test_scale_max = cfg.INPUT.MAX_SIZE_TEST + + ''' + Accumulate the annotations while discarding the 2D center information + (hence, keeping only the 2D and 3D scale information, and properties.) + ''' + + for ann_idx, ann in enumerate(anns): + + category_name = ann['category_name'].lower() + + ignore = ann['ignore'] + dataset_id = ann['dataset_id'] + image_id = ann['image_id'] + + fy = datasets.imgs[image_id]['K'][1][1] + im_h = datasets.imgs[image_id]['height'] + im_w = datasets.imgs[image_id]['width'] + f = 2 * fy / im_h + + if cfg.DATASETS.MODAL_2D_BOXES and 'bbox2D_tight' in ann and ann['bbox2D_tight'][0] != -1: + x, y, w, h = BoxMode.convert(ann['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + elif cfg.DATASETS.TRUNC_2D_BOXES and 'bbox2D_trunc' in ann and not np.all([val==-1 for val in ann['bbox2D_trunc']]): + x, y, w, h = BoxMode.convert(ann['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + elif 'bbox2D_proj' in ann: + x, y, w, h = BoxMode.convert(ann['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + else: + continue + + x3d, y3d, z3d = ann['center_cam'] + w3d, h3d, l3d = ann['dimensions'] + + test_h, test_w, sf = approx_eval_resolution(im_h, im_w, test_scale_min, test_scale_max) + + # scale everything to test resolution + h *= sf + w *= sf + + if virtual_depth: + virtual_to_real = compute_virtual_scale_from_focal_spaces(fy, im_h, virtual_focal, test_h) + real_to_virtual = 1/virtual_to_real + z3d *= real_to_virtual + + scale = np.sqrt(h**2 + w**2) + + if (not ignore) and category_name in category_names: + data_raw.append([category_name, w, h, x3d, y3d, z3d, w3d, h3d, l3d, w3d*h3d*l3d, dataset_id, image_id, fy, f, scale]) + 
+ # TODO pandas is fairly inefficient to rely on for large scale. + df_raw = pd.DataFrame(data_raw, columns=[ + 'name', + 'w', 'h', 'x3d', 'y3d', 'z3d', + 'w3d', 'h3d', 'l3d', 'volume', + 'dataset', 'image', + 'fy', 'f', 'scale' + ]) + # ^ the elements ending in w/h/l3d are the actual sizes, while the x/y/z3d are the camera perspective sizes. + + priors_bins = [] + priors_dims_per_cat = [] + priors_z3d_per_cat = [] + priors_y3d_per_cat = [] + + # compute priors for z and y globally + priors_z3d = [df_raw.z3d.mean(), df_raw.z3d.std()] + priors_y3d = [df_raw.y3d.mean(), df_raw.y3d.std()] + + if n_bins is None: + n_bins = cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS + + # Each prior is pre-computed per category + for cat in category_names: + + df_cat = df_raw[df_raw.name == cat] + + ''' + First compute static variable statistics + ''' + + scales = torch.FloatTensor(np.array(df_cat.scale)) + n = len(scales) + + if n > 0: + priors_dims_per_cat.append([[df_cat.w3d.mean(), df_cat.h3d.mean(), df_cat.l3d.mean()], [df_cat.w3d.std(), df_cat.h3d.std(), df_cat.l3d.std()]]) + priors_z3d_per_cat.append([df_cat.z3d.mean(), df_cat.z3d.std()]) + priors_y3d_per_cat.append([df_cat.y3d.mean(), df_cat.y3d.std()]) + + else: + # dummy data. 
+ priors_dims_per_cat.append([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) + priors_z3d_per_cat.append([50, 50]) + priors_y3d_per_cat.append([1, 10]) + + ''' + Next compute Z cluster statistics based on y and area + ''' + + def compute_cluster_scale_mean(scales, assignments, n_bins, match_quality): + + cluster_scales = [] + + for bin in range(n_bins): + + in_cluster = assignments==bin + + if in_cluster.sum() < min_points_for_std: + in_cluster[match_quality[:, bin].topk(min_points_for_std)[1]] = True + + scale = scales[in_cluster].mean() + cluster_scales.append(scale.item()) + + return torch.FloatTensor(cluster_scales) + + if n_bins > 1: + + if n < min_points_for_std: + + print('Warning {} category has only {} valid samples...'.format(cat, n)) + + # dummy data since category doesn't have available samples. + max_scale = cfg.MODEL.ANCHOR_GENERATOR.SIZES[-1][-1] + min_scale = cfg.MODEL.ANCHOR_GENERATOR.SIZES[0][0] + base = (max_scale / min_scale) ** (1 / (n_bins - 1)) + cluster_scales = np.array([min_scale * (base ** i) for i in range(0, n_bins)]) + + # default values are unused anyways in training. but range linearly + # from 100 to 1 and ascend with 2D scale. + bin_priors_z = [[b, 15] for b in np.arange(100, 1, -(100-1)/n_bins)] + priors_bins.append((cat, cluster_scales.tolist(), bin_priors_z)) + assert len(bin_priors_z) == n_bins, 'Broken default bin scaling.' 
+ else: + + max_scale = scales.max() + min_scale = scales.min() + base = (max_scale / min_scale) ** (1 / (n_bins - 1)) + cluster_scales = torch.FloatTensor([min_scale * (base ** i) for i in range(0, n_bins)]) + + best_score = -np.inf + + for round in range(max_cluster_rounds): + + # quality scores for gts and clusters (n x n_bins) + match_quality = -(cluster_scales.unsqueeze(0) - scales.unsqueeze(1)).abs() + + # assign to best clusters + scores, assignments_round = match_quality.max(1) + round_score = scores.mean().item() + + if np.round(round_score, 5) > best_score: + best_score = round_score + assignments = assignments_round + + # make new clusters + cluster_scales = compute_cluster_scale_mean(scales, assignments, n_bins, match_quality) + + else: + break + + bin_priors_z = [] + + for bin in range(n_bins): + + in_cluster = assignments == bin + + # not enough in the cluster to compute reliable stats? + # fill it with the topk others + if in_cluster.sum() < min_points_for_std: + in_cluster[match_quality[:, bin].topk(min_points_for_std)[1]] = True + + # move to numpy for indexing pandas + in_cluster = in_cluster.numpy() + + z3d_mean = df_cat.z3d[in_cluster].mean() + z3d_std = df_cat.z3d[in_cluster].std() + + bin_priors_z.append([z3d_mean, z3d_std]) + + priors_bins.append((cat, cluster_scales.numpy().tolist(), bin_priors_z)) + + priors = { + 'priors_dims_per_cat': priors_dims_per_cat, + 'priors_z3d_per_cat': priors_z3d_per_cat, + 'priors_y3d_per_cat': priors_y3d_per_cat, + 'priors_bins': priors_bins, + 'priors_y3d': priors_y3d, + 'priors_z3d': priors_z3d, + } + + return priors + +def compute_priors_custom(cfg, datasets, max_cluster_rounds=1000, min_points_for_std=5): + """ + simplification of the standard compute_priors function + + Computes priors via simple averaging + """ + + annIds = datasets.getAnnIds() + anns = datasets.loadAnns(annIds) + + data_raw = [] + + category_names = MetadataCatalog.get('omni3d_model').thing_classes + + virtual_depth = 
cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH + virtual_focal = cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL + test_scale_min = cfg.INPUT.MIN_SIZE_TEST + test_scale_max = cfg.INPUT.MAX_SIZE_TEST + + ''' + Accumulate the annotations while discarding the 2D center information + (hence, keeping only the 2D and 3D scale information, and properties.) + ''' + + for ann_idx, ann in enumerate(anns): + + category_name = ann['category_name'].lower() + + ignore = ann['ignore'] + dataset_id = ann['dataset_id'] + image_id = ann['image_id'] + + fy = datasets.imgs[image_id]['K'][1][1] + im_h = datasets.imgs[image_id]['height'] + im_w = datasets.imgs[image_id]['width'] + f = 2 * fy / im_h + + if cfg.DATASETS.MODAL_2D_BOXES and 'bbox2D_tight' in ann and ann['bbox2D_tight'][0] != -1: + x, y, w, h = BoxMode.convert(ann['bbox2D_tight'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + elif cfg.DATASETS.TRUNC_2D_BOXES and 'bbox2D_trunc' in ann and not np.all([val==-1 for val in ann['bbox2D_trunc']]): + x, y, w, h = BoxMode.convert(ann['bbox2D_trunc'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + elif 'bbox2D_proj' in ann: + x, y, w, h = BoxMode.convert(ann['bbox2D_proj'], BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + + else: + continue + + x3d, y3d, z3d = ann['center_cam'] + w3d, h3d, l3d = ann['dimensions'] + + test_h, test_w, sf = approx_eval_resolution(im_h, im_w, test_scale_min, test_scale_max) + + # scale everything to test resolution + h *= sf + w *= sf + + if virtual_depth: + virtual_to_real = compute_virtual_scale_from_focal_spaces(fy, im_h, virtual_focal, test_h) + real_to_virtual = 1/virtual_to_real + z3d *= real_to_virtual + + scale = np.sqrt(h**2 + w**2) + + if (not ignore) and category_name in category_names: + data_raw.append([category_name, w, h, x3d, y3d, z3d, w3d, h3d, l3d, w3d*h3d*l3d, dataset_id, image_id, fy, f, scale]) + + # TODO pandas is fairly inefficient to rely on for large scale. 
+ df_raw = pd.DataFrame(data_raw, columns=[ + 'name', + 'w', 'h', 'x3d', 'y3d', 'z3d', + 'w3d', 'h3d', 'l3d', 'volume', + 'dataset', 'image', + 'fy', 'f', 'scale' + ]) + # ^ the elements ending in w/h/l3d are the actual sizes, while the x/y/z3d are the camera perspective sizes. + + priors_bins = [] + priors_dims_per_cat = [] + priors_z3d_per_cat = [] + priors_y3d_per_cat = [] + + # compute priors for z and y globally + priors_z3d = [df_raw.z3d.mean(), df_raw.z3d.std()] + priors_y3d = [df_raw.y3d.mean(), df_raw.y3d.std()] + + + # Each prior is pre-computed per category + for cat in category_names: + + df_cat = df_raw[df_raw.name == cat] + + ''' + First compute static variable statistics + ''' + + scales = torch.FloatTensor(np.array(df_cat.scale)) + n = len(scales) + + if None: + priors_dims_per_cat.append([[df_cat.w3d.mean(), df_cat.h3d.mean(), df_cat.l3d.mean()], [df_cat.w3d.std(), df_cat.h3d.std(), df_cat.l3d.std()]]) + priors_z3d_per_cat.append([df_cat.z3d.mean(), df_cat.z3d.std()]) + priors_y3d_per_cat.append([df_cat.y3d.mean(), df_cat.y3d.std()]) + + else: + # dummy data. + priors_dims_per_cat.append([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]) + priors_z3d_per_cat.append([0, 0]) + priors_y3d_per_cat.append([0, 0]) + + + priors = { + 'priors_dims_per_cat': priors_dims_per_cat, + 'priors_z3d_per_cat': priors_z3d_per_cat, + 'priors_y3d_per_cat': priors_y3d_per_cat, + 'priors_bins': priors_bins, + 'priors_y3d': priors_y3d, + 'priors_z3d': priors_z3d, + } + + return priors + +def convert_3d_box_to_2d(K, box3d, R=None, clipw=0, cliph=0, XYWH=True, min_z=0.20): + """ + Converts a 3D box to a 2D box via projection. + Args: + K (np.array): intrinsics matrix 3x3 + bbox3d (flexible): [[X Y Z W H L]] + R (flexible): [np.array(3x3)] + clipw (int): clip invalid X to the image bounds. Image width is usually used here. + cliph (int): clip invalid Y to the image bounds. Image height is usually used here. + XYWH (bool): returns in XYWH if true, otherwise XYXY format. 
+ min_z: the threshold for how close a vertex is allowed to be before being + considered as invalid for projection purposes. + Returns: + box2d (flexible): the 2D box results. + behind_camera (bool): whether the projection has any points behind the camera plane. + fully_behind (bool): all points are behind the camera plane. + """ + + # bounds used for vertices behind image plane + topL_bound = torch.tensor([[0, 0, 0]]).float() + topR_bound = torch.tensor([[clipw-1, 0, 0]]).float() + botL_bound = torch.tensor([[0, cliph-1, 0]]).float() + botR_bound = torch.tensor([[clipw-1, cliph-1, 0]]).float() + + # make sure types are correct + K = to_float_tensor(K) + box3d = to_float_tensor(box3d) + + if R is not None: + R = to_float_tensor(R) + + squeeze = len(box3d.shape) == 1 + + if squeeze: + box3d = box3d.unsqueeze(0) + if R is not None: + R = R.unsqueeze(0) + + n = len(box3d) + verts2d, verts3d = get_cuboid_verts(K, box3d, R) + + # any boxes behind camera plane? + verts_behind = verts2d[:, :, 2] <= min_z + behind_camera = verts_behind.any(1) + + verts_signs = torch.sign(verts3d) + + # check for any boxes projected behind image plane corners + topL = verts_behind & (verts_signs[:, :, 0] < 0) & (verts_signs[:, :, 1] < 0) + topR = verts_behind & (verts_signs[:, :, 0] > 0) & (verts_signs[:, :, 1] < 0) + botL = verts_behind & (verts_signs[:, :, 0] < 0) & (verts_signs[:, :, 1] > 0) + botR = verts_behind & (verts_signs[:, :, 0] > 0) & (verts_signs[:, :, 1] > 0) + + # clip values to be in bounds for invalid points + verts2d[topL] = topL_bound + verts2d[topR] = topR_bound + verts2d[botL] = botL_bound + verts2d[botR] = botR_bound + + x, xi = verts2d[:, :, 0].min(1) + y, yi = verts2d[:, :, 1].min(1) + x2, x2i = verts2d[:, :, 0].max(1) + y2, y2i = verts2d[:, :, 1].max(1) + + fully_behind = verts_behind.all(1) + + width = x2 - x + height = y2 - y + + if XYWH: + box2d = torch.cat((x.unsqueeze(1), y.unsqueeze(1), width.unsqueeze(1), height.unsqueeze(1)), dim=1) + else: + box2d = 
torch.cat((x.unsqueeze(1), y.unsqueeze(1), x2.unsqueeze(1), y2.unsqueeze(1)), dim=1) + + if squeeze: + box2d = box2d.squeeze() + behind_camera = behind_camera.squeeze() + fully_behind = fully_behind.squeeze() + + return box2d, behind_camera, fully_behind + + +# +def compute_virtual_scale_from_focal_spaces(f, H, f0, H0): + """ + Computes the scaling factor of depth from f0, H0 to f, H + Args: + f (float): the desired [virtual] focal length (px) + H (float): the desired [virtual] height (px) + f0 (float): the initial [real] focal length (px) + H0 (float): the initial [real] height (px) + Returns: + the scaling factor float to convert form (f0, H0) --> (f, H) + """ + return (H0 * f) / (f0 * H) + + +def R_to_allocentric(K, R, u=None, v=None): + """ + Convert a rotation matrix or series of rotation matrices to allocentric + representation given a 2D location (u, v) in pixels. + When u or v are not available, we fall back on the principal point of K. + """ + if type(K) == torch.Tensor: + fx = K[:, 0, 0] + fy = K[:, 1, 1] + sx = K[:, 0, 2] + sy = K[:, 1, 2] + + n = len(K) + + oray = torch.stack(((u - sx)/fx, (v - sy)/fy, torch.ones_like(u))).T + oray = oray / torch.linalg.norm(oray, dim=1).unsqueeze(1) + angle = torch.acos(oray[:, -1]) + + axis = torch.zeros_like(oray) + axis[:, 0] = axis[:, 0] - oray[:, 1] + axis[:, 1] = axis[:, 1] + oray[:, 0] + norms = torch.linalg.norm(axis, dim=1) + + valid_angle = angle > 0 + + M = axis_angle_to_matrix(angle.unsqueeze(1)*axis/norms.unsqueeze(1)) + + R_view = R.clone() + R_view[valid_angle] = torch.bmm(M[valid_angle].transpose(2, 1), R[valid_angle]) + + else: + fx = K[0][0] + fy = K[1][1] + sx = K[0][2] + sy = K[1][2] + + if u is None: + u = sx + + if v is None: + v = sy + + oray = np.array([(u - sx)/fx, (v - sy)/fy, 1]) + oray = oray / np.linalg.norm(oray) + cray = np.array([0, 0, 1]) + angle = math.acos(cray.dot(oray)) + if angle != 0: + axis = np.cross(cray, oray) + axis_torch = 
torch.from_numpy(angle*axis/np.linalg.norm(axis)).float() + R_view = np.dot(axis_angle_to_matrix(axis_torch).numpy().T, R) + else: + R_view = R + + return R_view + + +def R_from_allocentric(K, R_view, u=None, v=None): + """ + Convert a rotation matrix or series of rotation matrices to egocentric + representation given a 2D location (u, v) in pixels. + When u or v are not available, we fall back on the principal point of K. + """ + if type(K) == torch.Tensor: + fx = K[:, 0, 0] + fy = K[:, 1, 1] + sx = K[:, 0, 2] + sy = K[:, 1, 2] + + n = len(K) + + oray = torch.stack(((u - sx)/fx, (v - sy)/fy, torch.ones_like(u))).T + oray = oray / torch.linalg.norm(oray, dim=1).unsqueeze(1) + angle = torch.acos(oray[:, -1]) + + axis = torch.zeros_like(oray) + axis[:, 0] = axis[:, 0] - oray[:, 1] + axis[:, 1] = axis[:, 1] + oray[:, 0] + norms = torch.linalg.norm(axis, dim=1) + + valid_angle = angle > 0 + + M = axis_angle_to_matrix(angle.unsqueeze(1)*axis/norms.unsqueeze(1)) + + R = R_view.clone() + R[valid_angle] = torch.bmm(M[valid_angle], R_view[valid_angle]) + + else: + fx = K[0][0] + fy = K[1][1] + sx = K[0][2] + sy = K[1][2] + + if u is None: + u = sx + + if v is None: + v = sy + + oray = np.array([(u - sx)/fx, (v - sy)/fy, 1]) + oray = oray / np.linalg.norm(oray) + cray = np.array([0, 0, 1]) + angle = math.acos(cray.dot(oray)) + if angle != 0: + #axis = np.cross(cray, oray) + axis = np.array([-oray[1], oray[0], 0]) + axis_torch = torch.from_numpy(angle*axis/np.linalg.norm(axis)).float() + R = np.dot(axis_angle_to_matrix(axis_torch).numpy(), R_view) + else: + R = R_view + + return R + +def render_depth_map(K, box3d, pose, width, height, device=None): + + cameras = get_camera(K, width, height) + renderer = get_basic_renderer(cameras, width, height) + + mesh = mesh_cuboid(box3d, pose) + + if device is not None: + cameras = cameras.to(device) + renderer = renderer.to(device) + mesh = mesh.to(device) + + im_rendered, fragment = renderer(mesh) + silhouettes = im_rendered[:, :, :, 
-1] > 0
+
+ zbuf = fragment.zbuf[:, :, :, 0]
+ zbuf[zbuf==-1] = math.inf
+ depth_map, depth_map_inds = zbuf.min(dim=0)
+
+ return silhouettes, depth_map, depth_map_inds
+
+def estimate_visibility(K, box3d, pose, width, height, device=None):
+
+ silhouettes, depth_map, depth_map_inds = render_depth_map(K, box3d, pose, width, height, device=device)
+
+ n = silhouettes.shape[0]
+
+ visibilies = []
+
+ for annidx in range(n):
+
+ area = silhouettes[annidx].sum()
+ visible = (depth_map_inds[silhouettes[annidx]] == annidx).sum()
+
+ visibilies.append((visible / area).item())
+
+ return visibilies
+
+def estimate_truncation(K, box3d, R, imW, imH):
+
+ box2d, out_of_bounds, fully_behind = convert_3d_box_to_2d(K, box3d, R, imW, imH)
+
+ if fully_behind:
+ return 1.0
+
+ box2d = box2d.detach().cpu().numpy().tolist()
+ box2d_XYXY = BoxMode.convert(box2d, BoxMode.XYWH_ABS, BoxMode.XYXY_ABS)
+ image_box = np.array([0, 0, imW-1, imH-1])
+
+ truncation = 1 - iou(np.array(box2d_XYXY)[np.newaxis], image_box[np.newaxis], ign_area_b=True)
+
+ return truncation.item()
+
+
+def mesh_cuboid(box3d=None, R=None, color=None):
+
+ verts, faces = get_cuboid_verts_faces(box3d, R)
+
+ if verts.ndim == 2:
+ verts = to_float_tensor(verts).unsqueeze(0)
+ faces = to_float_tensor(faces).unsqueeze(0)
+
+ ninstances = len(verts)
+
+ if (isinstance(color, Tuple) or isinstance(color, List)) and len(color) == 3:
+ color = torch.tensor(color).view(1, 1, 3).expand(ninstances, 8, 3).float()
+
+ # pass in a tensor of colors per box
+ # (guard against color=None, which the Meshes call below explicitly supports;
+ # `None.ndim` previously raised AttributeError here)
+ elif color is not None and color.ndim == 2:
+ color = to_float_tensor(color).unsqueeze(1).expand(ninstances, 8, 3).float()
+
+ device = verts.device
+
+ mesh = Meshes(verts=verts, faces=faces, textures=None if color is None else TexturesVertex(verts_features=color).to(device))
+
+ return mesh
+
+def get_camera(K, width, height, switch_hands=True, R=None, T=None):
+
+ K = to_float_tensor(K)
+
+ if switch_hands:
+ K = K @ torch.tensor([
+ [-1, 0, 0],
+ [0, -1, 0],
+ [0, 0, 1]
+ ]).float()
+
+ fx = 
K[0, 0] + fy = K[1, 1] + px = K[0, 2] + py = K[1, 2] + + if R is None: + camera = PerspectiveCameras( + focal_length=((fx, fy),), principal_point=((px, py),), + image_size=((height, width),), in_ndc=False + ) + else: + camera = PerspectiveCameras( + focal_length=((fx, fy),), principal_point=((px, py),), + image_size=((height, width),), in_ndc=False, R=R, T=T + ) + + return camera + + +def get_basic_renderer(cameras, width, height, use_color=False): + + raster_settings = RasterizationSettings( + image_size=(height, width), + blur_radius=0 if use_color else np.log(1. / 1e-4 - 1.) * 1e-4, + faces_per_pixel=1, + perspective_correct=False, + ) + + if use_color: + # SoftPhongShader, HardPhongShader, HardFlatShader, SoftGouraudShader + lights = PointLights(location=[[0.0, 0.0, 0.0]]) + shader = SoftPhongShader(cameras=cameras, lights=lights) + else: + shader = SoftSilhouetteShader() + + renderer = MeshRenderer( + rasterizer=MeshRasterizer( + cameras=cameras, + raster_settings=raster_settings, + ), + shader=shader + ) + + return renderer + +class MeshRenderer(MR): + def __init__(self, rasterizer, shader): + super().__init__(rasterizer, shader) + + def forward(self, meshes_world, **kwargs) -> torch.Tensor: + fragments = self.rasterizer(meshes_world, **kwargs) + images = self.shader(fragments, meshes_world, **kwargs) + + return images, fragments + +def iou(box_a, box_b, mode='cross', ign_area_b=False): + """ + Computes the amount of Intersection over Union (IoU) between two different sets of boxes. + Args: + box_a (array or tensor): Mx4 boxes, defined by [x1, y1, x2, y2] + box_a (array or tensor): Nx4 boxes, defined by [x1, y1, x2, y2] + mode (str): either 'cross' or 'list', where cross will check all combinations of box_a and + box_b hence MxN array, and list expects the same size list M == N, hence returns Mx1 array. + ign_area_b (bool): if true then we ignore area of b. 
e.g., checking % box a is inside b + """ + + data_type = type(box_a) + + # this mode computes the IoU in the sense of cross. + # i.e., box_a = M x 4, box_b = N x 4 then the output is M x N + if mode == 'cross': + + inter = intersect(box_a, box_b, mode=mode) + area_a = ((box_a[:, 2] - box_a[:, 0]) * + (box_a[:, 3] - box_a[:, 1])) + area_b = ((box_b[:, 2] - box_b[:, 0]) * + (box_b[:, 3] - box_b[:, 1])) + + # torch.Tensor + if data_type == torch.Tensor: + union = area_a.unsqueeze(0) + if not ign_area_b: + union = union + area_b.unsqueeze(1) - inter + + return (inter / union).permute(1, 0) + + # np.ndarray + elif data_type == np.ndarray: + union = np.expand_dims(area_a, 0) + if not ign_area_b: + union = union + np.expand_dims(area_b, 1) - inter + return (inter / union).T + + # unknown type + else: + raise ValueError('unknown data type {}'.format(data_type)) + + + # this mode compares every box in box_a with target in box_b + # i.e., box_a = M x 4 and box_b = M x 4 then output is M x 1 + elif mode == 'list': + + inter = intersect(box_a, box_b, mode=mode) + area_a = (box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1]) + area_b = (box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1]) + union = area_a + area_b - inter + + return inter / union + + else: + raise ValueError('unknown mode {}'.format(mode)) + + +def intersect(box_a, box_b, mode='cross'): + """ + Computes the amount of intersect between two different sets of boxes. + Args: + box_a (nparray): Mx4 boxes, defined by [x1, y1, x2, y2] + box_a (nparray): Nx4 boxes, defined by [x1, y1, x2, y2] + mode (str): either 'cross' or 'list', where cross will check all combinations of box_a and + box_b hence MxN array, and list expects the same size list M == N, hence returns Mx1 array. + data_type (type): either torch.Tensor or np.ndarray, we automatically determine otherwise + """ + + # determine type + data_type = type(box_a) + + # this mode computes the intersect in the sense of cross. 
+ # i.e., box_a = M x 4, box_b = N x 4 then the output is M x N
+ if mode == 'cross':
+
+ # np.ndarray
+ if data_type == np.ndarray:
+ max_xy = np.minimum(box_a[:, 2:4], np.expand_dims(box_b[:, 2:4], axis=1))
+ min_xy = np.maximum(box_a[:, 0:2], np.expand_dims(box_b[:, 0:2], axis=1))
+ inter = np.clip((max_xy - min_xy), a_min=0, a_max=None)
+
+ elif data_type == torch.Tensor:
+ max_xy = torch.min(box_a[:, 2:4], box_b[:, 2:4].unsqueeze(1))
+ min_xy = torch.max(box_a[:, 0:2], box_b[:, 0:2].unsqueeze(1))
+ inter = torch.clamp((max_xy - min_xy), 0)
+
+ # unknown type
+ else:
+ raise ValueError('type {} is not implemented'.format(data_type))
+
+ return inter[:, :, 0] * inter[:, :, 1]
+
+ # this mode computes the intersect in the sense of list_a vs. list_b.
+ # i.e., box_a = M x 4, box_b = M x 4 then the output is Mx1
+ elif mode == 'list':
+
+ # torch.Tensor
+ if data_type == torch.Tensor:
+ max_xy = torch.min(box_a[:, 2:], box_b[:, 2:])
+ min_xy = torch.max(box_a[:, :2], box_b[:, :2])
+ inter = torch.clamp((max_xy - min_xy), 0)
+
+ # np.ndarray
+ elif data_type == np.ndarray:
+ # elementwise minimum/maximum — np.min/np.max interpret the second positional
+ # argument as `axis`, so the previous np.min/np.max calls raised a TypeError;
+ # this mirrors the two-tensor elementwise form used in the torch branch above.
+ max_xy = np.minimum(box_a[:, 2:], box_b[:, 2:])
+ min_xy = np.maximum(box_a[:, :2], box_b[:, :2])
+ inter = np.clip((max_xy - min_xy), a_min=0, a_max=None)
+
+ # unknown type
+ else:
+ raise ValueError('unknown data type {}'.format(data_type))
+
+ return inter[:, 0] * inter[:, 1]
+
+ else:
+ raise ValueError('unknown mode {}'.format(mode))
+
+
+def scaled_sigmoid(vals, min=0.0, max=1.0):
+ """
+ Simple helper function for a scaled sigmoid.
+ The output is bounded by (min, max)
+ Args:
+ vals (Tensor): input logits to scale
+ min (Tensor or float): the minimum value to scale to.
+ max (Tensor or float): the maximum value to scale to. 
+ """ + return min + (max-min)*torch.sigmoid(vals) + + +def so3_relative_angle_batched( + R: torch.Tensor, + cos_angle: bool = False, + cos_bound: float = 1e-4, + eps: float = 1e-4, +) -> torch.Tensor: + """ + Calculates the relative angle (in radians) between pairs of + rotation matrices `R1` and `R2` with `angle = acos(0.5 * (Trace(R1 R2^T)-1))` + + .. note:: + This corresponds to a geodesic distance on the 3D manifold of rotation + matrices. + + Args: + R1: Batch of rotation matrices of shape `(minibatch, 3, 3)`. + R2: Batch of rotation matrices of shape `(minibatch, 3, 3)`. + cos_angle: If==True return cosine of the relative angle rather than + the angle itself. This can avoid the unstable calculation of `acos`. + cos_bound: Clamps the cosine of the relative rotation angle to + [-1 + cos_bound, 1 - cos_bound] to avoid non-finite outputs/gradients + of the `acos` call. Note that the non-finite outputs/gradients + are returned when the angle is requested (i.e. `cos_angle==False`) + and the rotation angle is close to 0 or π. + eps: Tolerance for the valid trace check of the relative rotation matrix + in `so3_rotation_angle`. + Returns: + Corresponding rotation angles of shape `(minibatch,)`. + If `cos_angle==True`, returns the cosine of the angles. + + Raises: + ValueError if `R1` or `R2` is of incorrect shape. + ValueError if `R1` or `R2` has an unexpected trace. 
+ """ + N = R.shape[0] + n_pairs = N * (N - 1) // 2 + Rleft = torch.zeros((n_pairs, 3, 3)) + Rright = torch.zeros((n_pairs, 3, 3)) + global_idx = 0 + for i in range(1, N): + for j in range(i): + p1 = R[i] + p2 = R[j] + Rleft[global_idx] = p1 + Rright[global_idx] = p2 + global_idx += 1 + # gather up the pairs + R12 = torch.matmul(Rleft, Rright.permute(0, 2, 1)) + + return so3_rotation_angle(R12, cos_angle=cos_angle, cos_bound=cos_bound, eps=eps) + + +def so3_rotation_angle( + R: torch.Tensor, + eps: float = 1e-4, + cos_angle: bool = False, + cos_bound: float = 1e-4, +) -> torch.Tensor: + """ + Calculates angles (in radians) of a batch of rotation matrices `R` with + `angle = acos(0.5 * (Trace(R)-1))`. The trace of the + input matrices is checked to be in the valid range `[-1-eps,3+eps]`. + The `eps` argument is a small constant that allows for small errors + caused by limited machine precision. + + Args: + R: Batch of rotation matrices of shape `(minibatch, 3, 3)`. + eps: Tolerance for the valid trace check. + cos_angle: If==True return cosine of the rotation angles rather than + the angle itself. This can avoid the unstable + calculation of `acos`. + cos_bound: Clamps the cosine of the rotation angle to + [-1 + cos_bound, 1 - cos_bound] to avoid non-finite outputs/gradients + of the `acos` call. Note that the non-finite outputs/gradients + are returned when the angle is requested (i.e. `cos_angle==False`) + and the rotation angle is close to 0 or π. + + Returns: + Corresponding rotation angles of shape `(minibatch,)`. + If `cos_angle==True`, returns the cosine of the angles. + + Raises: + ValueError if `R` is of incorrect shape. + ValueError if `R` has an unexpected trace. 
+ """ + + N, dim1, dim2 = R.shape + if dim1 != 3 or dim2 != 3: + raise ValueError("Input has to be a batch of 3x3 Tensors.") + + rot_trace = R[:, 0, 0] + R[:, 1, 1] + R[:, 2, 2] + + if ((rot_trace < -1.0 - eps) + (rot_trace > 3.0 + eps)).any(): + raise ValueError("A matrix has trace outside valid range [-1-eps,3+eps].") + + # phi ... rotation angle + phi_cos = (rot_trace - 1.0) * 0.5 + + if cos_angle: + return phi_cos + else: + if cos_bound > 0.0: + bound = 1.0 - cos_bound + return acos_linear_extrapolation(phi_cos, (-bound, bound)) + else: + return torch.acos(phi_cos) \ No newline at end of file diff --git a/cubercnn/util/model_zoo.py b/cubercnn/util/model_zoo.py new file mode 100644 index 0000000000000000000000000000000000000000..ecd5854f214e638e70337d6e4595705188c07b5e --- /dev/null +++ b/cubercnn/util/model_zoo.py @@ -0,0 +1,25 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +from detectron2.utils.file_io import PathHandler, PathManager + +__all__ = ["CubeRCNNHandler"] + +class CubeRCNNHandler(PathHandler): + """ + Resolves CubeRCNN's model zoo files. + """ + + PREFIX = "cubercnn://" + CUBERCNN_PREFIX = "https://dl.fbaipublicfiles.com/cubercnn/" + + def _get_supported_prefixes(self): + return [self.PREFIX] + + def _get_local_path(self, path): + name = path[len(self.PREFIX) :] + return PathManager.get_local_path(self.CUBERCNN_PREFIX + name) + + def _open(self, path, mode="r", **kwargs): + return PathManager.open(self._get_local_path(path), mode, **kwargs) + + +PathManager.register_handler(CubeRCNNHandler()) \ No newline at end of file diff --git a/cubercnn/util/util.py b/cubercnn/util/util.py new file mode 100644 index 0000000000000000000000000000000000000000..35863058ba945d8ca107d3efba1877d5c0adaf67 --- /dev/null +++ b/cubercnn/util/util.py @@ -0,0 +1,303 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +import json +import pickle +import cv2 +from time import time +import numpy as np +import os +import shutil +import scipy.io +from PIL import Image +from glob import glob +from difflib import SequenceMatcher +import matplotlib.colors as mplc + +def file_parts(file_path): + + base_path, tail = os.path.split(file_path) + name, ext = os.path.splitext(tail) + + return base_path, name, ext + +def save_json(path, data): + + with open(path, 'w') as fp: + json.dump(data, fp) + +def load_json(path): + + with open(path, 'r') as fp: + data = json.load(fp) + + return data + +def load_mat(path): + + data = scipy.io.loadmat(path, struct_as_record=False, squeeze_me=True) + + return data + +def pickle_write(file_path, obj): + + with open(file_path, 'wb') as file: + pickle.dump(obj, file) + + +def pickle_read(file_path, latin=False, iso8859=False, bytes=False): + + + with open(file_path, 'rb') as file: + if bytes: + obj = pickle.load(file, encoding='bytes') + elif latin: + obj = pickle.load(file, encoding='latin1') + elif iso8859: + obj = pickle.load(file, encoding='iso-8859-1') + + # default encoding + else: + obj = pickle.load(file) + + + return obj + +def imread(path): + return cv2.imread(path) + +# much faster than reading the entire image, just to get the width, height +def imreadstats(path): + + im = Image.open(path) + width, height = im.size + + return width, height + +def imwrite(im, path): + cv2.imwrite(path, im) + +def compute_eta(start_time, idx, total): + """ + Computes estimated time left for an iterative function to finish. + Args: + start_time (int): the time the function started at (e.g from time()) + idx (int): the index the function is currently on, or has completed. + total (int): the total amount that needs to pass for completion. + Returns: + time_str (str): convenient string to display the time remaining + in seconds, minutes or hours depending on magnitude. + dt (float): the average change in seconds per iteration. 
+ """ + + # cannot be less than 1 + idx = max(idx, 1) + + dt = (time() - start_time)/idx + timeleft = np.max([dt * (total - idx), 0]) + if timeleft > 3600: time_str = '{:.1f}h'.format(timeleft / 3600); + elif timeleft > 60: time_str = '{:.1f}m'.format(timeleft / 60); + else: time_str = '{:.1f}s'.format(timeleft); + + return time_str, dt + +def list_files(base_dir, file_pattern): + """ + Returns a list of files given a directory and pattern + The results are sorted alphabetically + Example: + files = list_files('path/to/images/', '*.jpg') + """ + return sorted(glob(os.path.join(base_dir) + file_pattern)) + +def list_subdirectories(path, include_files=False): + + # this lists everything. + if include_files: + return sorted(glob(os.path.join(path, '*'))) + + # only subdirectories. + else: + return [fpath for fpath in glob(os.path.join(path, '*')) if os.path.isdir(fpath)] + +def mkdir_if_missing(directory, delete_if_exist=False): + + if delete_if_exist and os.path.exists(directory): shutil.rmtree(directory) + + # check if not exist, then make + if not os.path.exists(directory): + os.makedirs(directory) + +# All coco categories, together with their nice-looking visualization colors +# It's from https://github.com/cocodataset/panopticapi/blob/master/panoptic_coco_categories.json +COCO_CATEGORIES = [ + {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, + {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, + {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, + {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, + {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, 
+ {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, + {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, + {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, + {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, + {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, + {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, + {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, + {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, + {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, + {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, + {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, + {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, + {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, + {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, + {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, + {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, + {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, + {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, + {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, + {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, + {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, + {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, + {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, + {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, + {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, + {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, + {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, + {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": 
"surfboard"}, + {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, + {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, + {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, + {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, + {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, + {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, + {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, + {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, + {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, + {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, + {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, + {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, + {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, + {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, + {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, + {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, + {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, + {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, + {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, + {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, + {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, + {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, + {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, + {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, + {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, + {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, + {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, + {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, + 
{"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, + {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, + {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, + {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, + {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, + {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, + {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, + {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, + {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, + {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, + {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, + {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, + {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, + {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, + {"color": [255, 255, 128], "isthing": 0, "id": 92, "name": "banner"}, + {"color": [147, 211, 203], "isthing": 0, "id": 93, "name": "blanket"}, + {"color": [150, 100, 100], "isthing": 0, "id": 95, "name": "bridge"}, + {"color": [168, 171, 172], "isthing": 0, "id": 100, "name": "cardboard"}, + {"color": [146, 112, 198], "isthing": 0, "id": 107, "name": "counter"}, + {"color": [210, 170, 100], "isthing": 0, "id": 109, "name": "curtain"}, + {"color": [92, 136, 89], "isthing": 0, "id": 112, "name": "door-stuff"}, + {"color": [218, 88, 184], "isthing": 0, "id": 118, "name": "floor-wood"}, + {"color": [241, 129, 0], "isthing": 0, "id": 119, "name": "flower"}, + {"color": [217, 17, 255], "isthing": 0, "id": 122, "name": "fruit"}, + {"color": [124, 74, 181], "isthing": 0, "id": 125, "name": "gravel"}, + {"color": [70, 70, 70], "isthing": 0, "id": 128, "name": "house"}, + {"color": [255, 228, 255], "isthing": 0, "id": 130, "name": "light"}, + {"color": [154, 208, 0], "isthing": 0, 
"id": 133, "name": "mirror-stuff"}, + {"color": [193, 0, 92], "isthing": 0, "id": 138, "name": "net"}, + {"color": [76, 91, 113], "isthing": 0, "id": 141, "name": "pillow"}, + {"color": [255, 180, 195], "isthing": 0, "id": 144, "name": "platform"}, + {"color": [106, 154, 176], "isthing": 0, "id": 145, "name": "playingfield"}, + {"color": [230, 150, 140], "isthing": 0, "id": 147, "name": "railroad"}, + {"color": [60, 143, 255], "isthing": 0, "id": 148, "name": "river"}, + {"color": [128, 64, 128], "isthing": 0, "id": 149, "name": "road"}, + {"color": [92, 82, 55], "isthing": 0, "id": 151, "name": "roof"}, + {"color": [254, 212, 124], "isthing": 0, "id": 154, "name": "sand"}, + {"color": [73, 77, 174], "isthing": 0, "id": 155, "name": "sea"}, + {"color": [255, 160, 98], "isthing": 0, "id": 156, "name": "shelf"}, + {"color": [255, 255, 255], "isthing": 0, "id": 159, "name": "snow"}, + {"color": [104, 84, 109], "isthing": 0, "id": 161, "name": "stairs"}, + {"color": [169, 164, 131], "isthing": 0, "id": 166, "name": "tent"}, + {"color": [225, 199, 255], "isthing": 0, "id": 168, "name": "towel"}, + {"color": [137, 54, 74], "isthing": 0, "id": 171, "name": "wall-brick"}, + {"color": [135, 158, 223], "isthing": 0, "id": 175, "name": "wall-stone"}, + {"color": [7, 246, 231], "isthing": 0, "id": 176, "name": "wall-tile"}, + {"color": [107, 255, 200], "isthing": 0, "id": 177, "name": "wall-wood"}, + {"color": [58, 41, 149], "isthing": 0, "id": 178, "name": "water-other"}, + {"color": [183, 121, 142], "isthing": 0, "id": 180, "name": "window-blind"}, + {"color": [255, 73, 97], "isthing": 0, "id": 181, "name": "window-other"}, + {"color": [107, 142, 35], "isthing": 0, "id": 184, "name": "tree-merged"}, + {"color": [190, 153, 153], "isthing": 0, "id": 185, "name": "fence-merged"}, + {"color": [146, 139, 141], "isthing": 0, "id": 186, "name": "ceiling-merged"}, + {"color": [70, 130, 180], "isthing": 0, "id": 187, "name": "sky-other-merged"}, + {"color": [134, 199, 156], 
"isthing": 0, "id": 188, "name": "cabinet-merged"}, + {"color": [209, 226, 140], "isthing": 0, "id": 189, "name": "table-merged"}, + {"color": [96, 36, 108], "isthing": 0, "id": 190, "name": "floor-other-merged"}, + {"color": [96, 96, 96], "isthing": 0, "id": 191, "name": "pavement-merged"}, + {"color": [64, 170, 64], "isthing": 0, "id": 192, "name": "mountain-merged"}, + {"color": [152, 251, 152], "isthing": 0, "id": 193, "name": "grass-merged"}, + {"color": [208, 229, 228], "isthing": 0, "id": 194, "name": "dirt-merged"}, + {"color": [206, 186, 171], "isthing": 0, "id": 195, "name": "paper-merged"}, + {"color": [152, 161, 64], "isthing": 0, "id": 196, "name": "food-other-merged"}, + {"color": [116, 112, 0], "isthing": 0, "id": 197, "name": "building-other-merged"}, + {"color": [0, 114, 143], "isthing": 0, "id": 198, "name": "rock-merged"}, + {"color": [102, 102, 156], "isthing": 0, "id": 199, "name": "wall-other-merged"}, + {"color": [250, 141, 255], "isthing": 0, "id": 200, "name": "rug-merged"},] + + +_colors = [cat['color'] for cat in COCO_CATEGORIES] + +def _jitter(color): + """ + Randomly modifies given color to produce a slightly different color than the color given. + Args: + color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color + picked. The values in the list are in the [0.0, 1.0] range. + Returns: + jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the + color after being jittered. The values in the list are in the [0.0, 1.0] range. 
+ """ + color = [c/255.0 for c in color] + color = mplc.to_rgb(color) + vec = np.random.rand(3) + + # better to do it in another color space + vec = vec / np.linalg.norm(vec) * 0.5 + res = np.clip(vec + color, 0, 1) + return [c*255.0 for c in res] + + +def get_color(ind=None, hex=False): + + if ind is None: + ind = np.random.randint(len(_colors)) + + color = _jitter(_colors[ind % len(_colors)]) + + if hex: + return '#%02x%02x%02x' % (color[0], color[1], color[2]) + + else: + return color + +def string_similarity(text1, text2): + return SequenceMatcher(None, text1, text2).ratio() \ No newline at end of file diff --git a/cubercnn/vis/__init__.py b/cubercnn/vis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d9f4c5b475ae560d2d479e4a94fa14eefd23f753 --- /dev/null +++ b/cubercnn/vis/__init__.py @@ -0,0 +1 @@ +from .vis import * \ No newline at end of file diff --git a/cubercnn/vis/logperf.py b/cubercnn/vis/logperf.py new file mode 100644 index 0000000000000000000000000000000000000000..4c59b1f971dcad808dae5f241558bf27fd03a212 --- /dev/null +++ b/cubercnn/vis/logperf.py @@ -0,0 +1,117 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates +from termcolor import colored +import itertools +from tabulate import tabulate +import logging + +logger = logging.getLogger(__name__) + +def print_ap_category_histogram(dataset, results): + """ + Prints AP performance for each category. 
+ Args: + results: dictionary; each entry contains information for a dataset + """ + num_classes = len(results) + N_COLS = 9 + data = list( + itertools.chain( + *[ + [ + cat, + out["AP2D"], + out["AP3D"], + ] + for cat, out in results.items() + ] + ) + ) + data.extend([None] * (N_COLS - (len(data) % N_COLS))) + data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) + table = tabulate( + data, + headers=["category", "AP2D", "AP3D"] * (N_COLS // 2), + tablefmt="pipe", + numalign="left", + stralign="center", + ) + logger.info( + "Performance for each of {} categories on {}:\n".format(num_classes, dataset) + + colored(table, "cyan") + ) + + +def print_ap_analysis_histogram(results): + """ + Prints AP performance for various IoU thresholds and (near, medium, far) objects. + Args: + results: dictionary. Each entry in results contains outputs for a dataset + """ + metric_names = ["AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F"] + N_COLS = 10 + data = [] + for name, metrics in results.items(): + data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"], metrics["AP3D@15"], metrics["AP3D@25"], metrics["AP3D@50"], metrics["AP3D-N"], metrics["AP3D-M"], metrics["AP3D-F"]] + data.append(data_item) + table = tabulate( + data, + headers=["Dataset", "#iters", "AP2D", "AP3D", "AP3D@15", "AP3D@25", "AP3D@50", "AP3D-N", "AP3D-M", "AP3D-F"], + tablefmt="grid", + numalign="left", + stralign="center", + ) + logger.info( + "Per-dataset performance analysis on test set:\n" + + colored(table, "cyan") + ) + + +def print_ap_dataset_histogram(results): + """ + Prints AP performance for each dataset. + Args: + results: list of dicts. 
Each entry in results contains outputs for a dataset + """ + metric_names = ["AP2D", "AP3D"] + N_COLS = 4 + data = [] + for name, metrics in results.items(): + data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"]] + data.append(data_item) + table = tabulate( + data, + headers=["Dataset", "#iters", "AP2D", "AP3D"], + tablefmt="grid", + numalign="left", + stralign="center", + ) + logger.info( + "Per-dataset performance on test set:\n" + + colored(table, "cyan") + ) + + +def print_ap_omni_histogram(results): + """ + Prints AP performance for Omni3D dataset. + Args: + results: list of dicts. Each entry in results contains outputs for a dataset + """ + metric_names = ["AP2D", "AP3D"] + N_COLS = 4 + data = [] + for name, metrics in results.items(): + data_item = [name, metrics["iters"], metrics["AP2D"], metrics["AP3D"]] + data.append(data_item) + table = tabulate( + data, + headers=["Dataset", "#iters", "AP2D", "AP3D"], + tablefmt="grid", + numalign="left", + stralign="center", + ) + logger.info("Omni3D performance on test set. The numbers below should be used to compare to others approaches on Omni3D, such as Cube R-CNN") + logger.info( + "Performance on Omni3D:\n" + + colored(table, "magenta") + ) diff --git a/cubercnn/vis/vis.py b/cubercnn/vis/vis.py new file mode 100644 index 0000000000000000000000000000000000000000..1042d91ce646c9012bfab0408dfbec22f07348dd --- /dev/null +++ b/cubercnn/vis/vis.py @@ -0,0 +1,750 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +import cv2 +import numpy as np +import matplotlib.pyplot as plt +import os +import math +import torch +from copy import deepcopy +from pytorch3d.structures.meshes import join_meshes_as_scene +from pytorch3d.transforms.so3 import ( + so3_relative_angle, +) +from matplotlib.path import Path + +from cubercnn import util + +def interp_color(dist, bounds=[0, 1], color_lo=(0,0, 250), color_hi=(0, 250, 250)): + + percent = (dist - bounds[0]) / (bounds[1] - bounds[0]) + b = color_lo[0] * (1 - percent) + color_hi[0] * percent + g = color_lo[1] * (1 - percent) + color_hi[1] * percent + r = color_lo[2] * (1 - percent) + color_hi[2] * percent + + return (b, g, r) + +def draw_bev(canvas_bev, z3d, l3d, w3d, x3d, ry3d, color=(0, 200, 200), scale=1, thickness=2): + + w = l3d * scale + l = w3d * scale + x = x3d * scale + z = z3d * scale + r = ry3d*-1 + + corners1 = np.array([ + [-w / 2, -l / 2, 1], + [+w / 2, -l / 2, 1], + [+w / 2, +l / 2, 1], + [-w / 2, +l / 2, 1] + ]) + + ry = np.array([ + [+math.cos(r), -math.sin(r), 0], + [+math.sin(r), math.cos(r), 0], + [0, 0, 1], + ]) + + corners2 = ry.dot(corners1.T).T + + corners2[:, 0] += w/2 + x + canvas_bev.shape[1] / 2 + corners2[:, 1] += l/2 + z + + draw_line(canvas_bev, corners2[0], corners2[1], color=color, thickness=thickness) + draw_line(canvas_bev, corners2[1], corners2[2], color=color, thickness=thickness) + draw_line(canvas_bev, corners2[2], corners2[3], color=color, thickness=thickness) + draw_line(canvas_bev, corners2[3], corners2[0], color=color, thickness=thickness) + + +def draw_line(im, v0, v1, color=(0, 200, 200), thickness=1): + cv2.line(im, (int(v0[0]), int(v0[1])), (int(v1[0]), int(v1[1])), color, thickness) + + +def create_colorbar(height, width, color_lo=(0,0, 250), color_hi=(0, 250, 250)): + + im = np.zeros([height, width, 3]) + + for h in range(0, height): + + color = interp_color(h + 0.5, [0, height], color_hi, color_lo) + im[h, :, 0] = (color[0]) + im[h, :, 1] = (color[1]) + im[h, :, 2] = 
(color[2])

    return im.astype(np.uint8)


def visualize_from_instances(detections, dataset, dataset_name, min_size_test, output_folder, category_names_official, iteration='',visualize_every=50):
    """
    Compares per-image detections against the dataset's ground-truth
    annotations (matching by 2D IoU >= 0.5 within the same category),
    accumulates center/depth/dimension/rotation errors over all matches,
    and writes a visualization JPG for every `visualize_every`-th image
    into `<output_folder>/vis`.  Returns a one-line summary string of the
    mean errors.
    """
    vis_folder = os.path.join(output_folder, 'vis')

    util.mkdir_if_missing(vis_folder)

    log_str = ''

    # error accumulators over all matched detections, across all images
    xy_errors = []
    z_errors = []
    w3d_errors = []
    h3d_errors = []
    l3d_errors = []
    dim_errors = []
    ry_errors = []

    # score threshold for drawing, derived from the number of categories
    n_cats = len(category_names_official)
    thres = np.sqrt(1/n_cats)

    for imind, im_obj in enumerate(detections):

        write_sample = ((imind % visualize_every) == 0)

        annos = dataset._dataset[imind]['annotations']
        gt_boxes_2d = np.array([anno['bbox'] for anno in annos])

        if len(gt_boxes_2d)==0:
            continue

        # convert GT boxes from (x, y, w, h) to (x1, y1, x2, y2)
        gt_boxes_2d[:, 2] += gt_boxes_2d[:, 0]
        gt_boxes_2d[:, 3] += gt_boxes_2d[:, 1]

        gt_boxes_cat = np.array([anno['category_id'] for anno in annos])

        if write_sample:
            data_obj = dataset[imind]
            assert(data_obj['image_id'] == im_obj['image_id'])
            im = util.imread(data_obj['file_name'])

        K = np.array(im_obj['K'])
        K_inv = np.linalg.inv(K)

        # NOTE(review): sf is computed but never used below -- confirm intent
        sf = im_obj['height'] / min_size_test

        for instance in im_obj['instances']:

            cat = category_names_official[instance['category_id']]
            score = instance['score']
            x1, y1, w, h = instance['bbox']
            x2 = x1 + w
            y2 = y1 + h

            alpha, h3d, w3d, l3d, x3d, y3d, z3d, ry3d = (-1,)*8

            w3d, h3d, l3d = instance['dimensions']

            # unproject
            cen_2d = np.array(instance['center_2D'] + [1])
            z3d = instance['center_cam'][2]

            # get rotation (y-axis only)
            ry3d = np.array(instance['pose'])

            # candidate GT boxes of the same category
            valid_gt_inds = np.flatnonzero(instance['category_id'] == gt_boxes_cat)

            if len(valid_gt_inds) > 0:
                quality_matrix = util.iou(np.array([[x1, y1, x2, y2]]), gt_boxes_2d[valid_gt_inds])
                nearest_gt = quality_matrix.argmax(axis=1)[0]
                nearest_gt_iou = quality_matrix.max(axis=1)[0]
                valid_match = nearest_gt_iou >= 0.5
            else:
                valid_match = False

            if valid_match:
                gt_x1, gt_y1, gt_w, gt_h = annos[valid_gt_inds[nearest_gt]]['bbox']
                gt_x3d, gt_y3d, gt_z3d = annos[valid_gt_inds[nearest_gt]]['center_cam']
                gt_w3d, gt_h3d, gt_l3d = annos[valid_gt_inds[nearest_gt]]['dimensions']
                # project the GT 3D center to the image plane
                gt_cen_2d = K @ np.array([gt_x3d, gt_y3d, gt_z3d])
                gt_cen_2d /= gt_cen_2d[2]
                gt_pose = annos[valid_gt_inds[nearest_gt]]['pose']
                gt_ry3d = np.array(gt_pose)

            if valid_match:

                # compute errors
                xy_errors.append(np.sqrt(((cen_2d[:2] - gt_cen_2d[:2])**2).sum()))
                z_errors.append(np.abs(z3d - gt_z3d))
                w3d_errors.append(np.abs(w3d - gt_w3d))
                h3d_errors.append(np.abs(h3d - gt_h3d))
                l3d_errors.append(np.abs(l3d - gt_l3d))
                dim_errors.append(np.sqrt((w3d - gt_w3d)**2 + (h3d - gt_h3d)**2 + (l3d - gt_l3d)**2))

                try:
                    ry_errors.append(so3_relative_angle(torch.from_numpy(ry3d).unsqueeze(0), torch.from_numpy(gt_ry3d).unsqueeze(0), cos_bound=1).item())
                except:
                    # NOTE(review): bare except silently drops rotation
                    # comparisons that fail (e.g. degenerate matrices)
                    pass

            # unproject point to 3D
            x3d, y3d, z3d = (K_inv @ (z3d*cen_2d))

            # let us visualize the detections now
            if write_sample and score > thres:
                color = util.get_color(instance['category_id'])
                draw_3d_box(im, K, [x3d, y3d, z3d, w3d, h3d, l3d], ry3d, color=color, thickness=int(np.round(3*im.shape[0]/500)), draw_back=False)
                draw_text(im, '{}, z={:.1f}, s={:.2f}'.format(cat, z3d, score), [x1, y1, w, h], scale=0.50*im.shape[0]/500, bg_color=color)

        if write_sample:
            util.imwrite(im, os.path.join(vis_folder, '{:06d}.jpg'.format(imind)))

    # safety in case all rotation matrices failed.
    if len(ry_errors) == 0:
        ry_errors = [1000, 1000]

    log_str += dataset_name + 'iter={}, xy({:.2f}), z({:.2f}), whl({:.2f}, {:.2f}, {:.2f}), ry({:.2f})\n'.format(
        iteration,
        np.mean(xy_errors), np.mean(z_errors),
        np.mean(w3d_errors), np.mean(h3d_errors), np.mean(l3d_errors),
        np.mean(ry_errors),
    )

    return log_str


def imshow(im, fig_num=None):
    """Show a (BGR) image with matplotlib, tiling grayscale input to 3 channels."""
    if fig_num is not None: plt.figure(fig_num)

    if len(im.shape) == 2:
        im = np.tile(im, [3, 1, 1]).transpose([1, 2, 0])

    plt.imshow(cv2.cvtColor(im.astype(np.uint8), cv2.COLOR_RGB2BGR))
    plt.show()


def draw_scene_view(im, K, meshes, text=None, scale=1000, R=None, T=None, zoom_factor=1.0, mode='front_and_novel', blend_weight=0.80, blend_weight_overlay=1.0, ground_bounds=None, canvas=None, zplane=0.05, colors=None):
    """
    Draws a scene from multiple different modes.
    Args:
        im (array): the image to draw onto
        K (array): the 3x3 matrix for projection to camera to screen
        meshes ([Mesh]): a list of meshes to draw into the scene
        text ([str]): optional strings to draw per mesh
        scale (int): the size of the square novel view canvas (pixels)
        R (array): a single 3x3 matrix defining the novel view
        T (array): a 3x vector defining the position of the novel view
        zoom_factor (float): an optional amount to zoom out (>1) or in (<1)
        mode (str): supports ['2D_only', 'front', 'novel', 'front_and_novel'] where
            front implies the front-facing camera view and novel is based on R,T
        blend_weight (float): blend factor for box edges over the RGB
        blend_weight_overlay (float): blends the RGB image with the rendered meshes
        ground_bounds (tuple): max_y3d, x3d_start, x3d_end, z3d_start, z3d_end for the Ground floor or
            None to let the renderer to estimate the ground bounds in the novel view itself.
        canvas (array): if the canvas doesn't change it can be faster to re-use it. Optional.
        zplane (float): a plane of depth to solve intersection when
            vertex points project behind the camera plane.
+ """ + if R is None: + R = util.euler2mat([np.pi/3, 0, 0]) + + if mode == '2D_only': + + im_drawn_rgb = deepcopy(im) + + # go in order of reverse depth + for mesh_idx in reversed(np.argsort([mesh.verts_padded().cpu().mean(1)[0, 1] for mesh in meshes])): + mesh = meshes[mesh_idx] + + verts3D = mesh.verts_padded()[0].numpy() + verts2D = (K @ verts3D.T) / verts3D[:, -1] + + color = [min(255, c*255*1.25) for c in mesh.textures.verts_features_padded()[0,0].tolist()] + + x1 = verts2D[0, :].min() + y1 = verts2D[1, :].min() + x2 = verts2D[0, :].max() + y2 = verts2D[1, :].max() + + draw_2d_box(im_drawn_rgb, [x1, y1, x2-x1, y2-y1], color=color, thickness=max(2, int(np.round(3*im_drawn_rgb.shape[0]/1250)))) + + if text is not None: + draw_text(im_drawn_rgb, '{}'.format(text[mesh_idx]), [x1, y1], scale=0.50*im_drawn_rgb.shape[0]/500, bg_color=color) + + return im_drawn_rgb + + else: + meshes_scene = join_meshes_as_scene(meshes) + if torch.cuda.is_available(): + meshes_scene = meshes_scene.cuda() + device = meshes_scene.device + meshes_scene.textures = meshes_scene.textures.to(device) + + cameras = util.get_camera(K, im.shape[1], im.shape[0]).to(device) + renderer = util.get_basic_renderer(cameras, im.shape[1], im.shape[0], use_color=True).to(device) + + + if mode in ['front_and_novel', 'front']: + ''' + Render full scene from image view + ''' + + im_drawn_rgb = deepcopy(im) + + # save memory if not blending the render + if blend_weight > 0: + rendered_img, _ = renderer(meshes_scene) + sil_mask = rendered_img[0, :, :, 3].cpu().numpy() > 0.1 + rendered_img = (rendered_img[0, :, :, :3].cpu().numpy() * 255).astype(np.uint8) + im_drawn_rgb[sil_mask] = rendered_img[sil_mask] * blend_weight + im_drawn_rgb[sil_mask] * (1 - blend_weight) + + ''' + Draw edges for image view + ''' + + # go in order of reverse depth + for mesh_idx in reversed(np.argsort([mesh.verts_padded().cpu().mean(1)[0, 1] for mesh in meshes])): + mesh = meshes[mesh_idx] + + verts3D = 
mesh.verts_padded()[0].cpu().numpy() + verts2D = (K @ verts3D.T) / verts3D[:, -1] + + if colors is not None: + color = np.minimum(colors[mesh_idx][:-1] * 255 * 1.25, np.ones_like(colors[mesh_idx][:-1])*255).tolist() + else: + color = [min(255, c*255*1.25) for c in mesh.textures.verts_features_padded()[0,0].tolist()] + + draw_3d_box_from_verts( + im_drawn_rgb, K, verts3D, color=color, + thickness=max(2, int(np.round(3*im_drawn_rgb.shape[0]/1250))), + draw_back=False, draw_top=False, zplane=zplane + ) + + x1 = verts2D[0, :].min() #min(verts2D[0, (verts2D[0, :] > 0) & (verts2D[0, :] < im_drawn_rgb.shape[1])]) + y1 = verts2D[1, :].min() #min(verts2D[1, (verts2D[1, :] > 0) & (verts2D[1, :] < im_drawn_rgb.shape[0])]) + + if text is not None: + draw_text(im_drawn_rgb, '{}'.format(text[mesh_idx]), [x1, y1], scale=0.50*im_drawn_rgb.shape[0]/500, bg_color=color) + + if blend_weight_overlay < 1.0 and blend_weight_overlay > 0.0: + im_drawn_rgb = im_drawn_rgb * blend_weight_overlay + deepcopy(im) * (1 - blend_weight_overlay) + + if mode == 'front': + return im_drawn_rgb + + elif mode in ['front_and_novel', 'novel']: + + ''' + Render from a new view + ''' + + has_canvas_already = canvas is not None + if not has_canvas_already: + canvas = np.ones((scale, scale, 3)) + + view_R = torch.from_numpy(R).float().to(device) + + if T is None: + center = (meshes_scene.verts_padded().min(1).values + meshes_scene.verts_padded().max(1).values).unsqueeze(0)/2 + else: + center = torch.from_numpy(T).float().to(device).view(1, 1, 3) + + verts_rotated = meshes_scene.verts_padded().clone() + verts_rotated -= center + verts_rotated = (view_R @ verts_rotated[0].T).T.unsqueeze(0) + + K_novelview = deepcopy(K) + K_novelview[0, -1] *= scale / im.shape[1] + K_novelview[1, -1] *= scale / im.shape[0] + + cameras = util.get_camera(K_novelview, scale, scale).to(device) + renderer = util.get_basic_renderer(cameras, scale, scale, use_color=True).to(device) + + margin = 0.01 + + if T is None: + max_trials = 
10000 + zoom_factor = 100.0 + zoom_factor_in = zoom_factor + + while max_trials: + zoom_factor_in = zoom_factor_in*0.95 + verts = verts_rotated.clone() + verts[:, :, -1] += center[:, :, -1]*zoom_factor_in + verts_np = verts.cpu().numpy() + + proj = ((K_novelview @ verts_np[0].T) / verts_np[:, :, -1]) + + # some vertices are extremely close or negative... + # this implies we have zoomed in too much + if (verts[0, :, -1] < 0.25).any(): + break + + # left or above image + elif (proj[:2, :] < scale*margin).any(): + break + + # right or below borders + elif (proj[:2, :] > scale*(1 - margin)).any(): + break + + # everything is in view. + zoom_factor = zoom_factor_in + max_trials -= 1 + + zoom_out_bias = center[:, :, -1].item() + else: + zoom_out_bias = 1.0 + + verts_rotated[:, :, -1] += zoom_out_bias*zoom_factor + meshes_novel_view = meshes_scene.clone().update_padded(verts_rotated) + + rendered_img, _ = renderer(meshes_novel_view) + im_novel_view = (rendered_img[0, :, :, :3].cpu().numpy() * 255).astype(np.uint8) + sil_mask = rendered_img[0, :, :, 3].cpu().numpy() > 0.1 + + center_np = center.cpu().numpy() + view_R_np = view_R.cpu().numpy() + + if not has_canvas_already: + if ground_bounds is None: + + min_x3d, _, min_z3d = meshes_scene.verts_padded().min(1).values[0, :].tolist() + max_x3d, max_y3d, max_z3d = meshes_scene.verts_padded().max(1).values[0, :].tolist() + + # go for grid projection, but with extremely bad guess at bounds + x3d_start = np.round(min_x3d - (max_x3d - min_x3d)*50) + x3d_end = np.round(max_x3d + (max_x3d - min_x3d)*50) + z3d_start = np.round(min_z3d - (max_z3d - min_z3d)*50) + z3d_end = np.round(max_z3d + (max_z3d - min_z3d)*50) + + grid_xs = np.arange(x3d_start, x3d_end) + grid_zs = np.arange(z3d_start, z3d_end) + + xs_mesh, zs_mesh = np.meshgrid(grid_xs, grid_zs) + ys_mesh = np.ones_like(xs_mesh)*max_y3d + + point_mesh = np.concatenate((xs_mesh[:, :, np.newaxis], ys_mesh[:, :, np.newaxis], zs_mesh[:, :, np.newaxis]), axis=2) + point_mesh_orig = 
deepcopy(point_mesh) + + mesh_shape = point_mesh.shape + point_mesh = view_R_np @ (point_mesh - center_np).transpose(2, 0, 1).reshape(3, -1) + point_mesh[-1] += zoom_out_bias*zoom_factor + point_mesh[-1, :] = point_mesh[-1, :].clip(0.25) + point_mesh_2D = (K_novelview @ point_mesh) / point_mesh[-1] + point_mesh_2D[-1] = point_mesh[-1] + + point_mesh = point_mesh.reshape(3, mesh_shape[0], mesh_shape[1]).transpose(1, 2, 0) + point_mesh_2D = point_mesh_2D.reshape(3, mesh_shape[0], mesh_shape[1]).transpose(1, 2, 0) + + maskx = (point_mesh_2D[:, :, 0].T >= -50) & (point_mesh_2D[:, :, 0].T < scale+50) & (point_mesh_2D[:, :, 2].T > 0) + maskz = (point_mesh_2D[:, :, 1].T >= -50) & (point_mesh_2D[:, :, 1].T < scale+50) & (point_mesh_2D[:, :, 2].T > 0) + + # invalid scene? + if (not maskz.any()) or (not maskx.any()): + return im, im, canvas + + # go for grid projection again!! but with sensible bounds + x3d_start = np.round(point_mesh[:, :, 0].T[maskx].min() - 10) + x3d_end = np.round(point_mesh[:, :, 0].T[maskx].max() + 10) + z3d_start = np.round(point_mesh_orig[:, :, 2].T[maskz].min() - 10) + z3d_end = np.round(point_mesh_orig[:, :, 2].T[maskz].max() + 10) + + else: + max_y3d, x3d_start, x3d_end, z3d_start, z3d_end = ground_bounds + + grid_xs = np.arange(x3d_start, x3d_end) + grid_zs = np.arange(z3d_start, z3d_end) + + xs_mesh, zs_mesh = np.meshgrid(grid_xs, grid_zs) + ys_mesh = np.ones_like(xs_mesh)*max_y3d + + point_mesh = np.concatenate((xs_mesh[:, :, np.newaxis], ys_mesh[:, :, np.newaxis], zs_mesh[:, :, np.newaxis]), axis=2) + + mesh_shape = point_mesh.shape + point_mesh = view_R_np @ (point_mesh - center_np).transpose(2, 0, 1).reshape(3, -1) + point_mesh[-1] += zoom_out_bias*zoom_factor + point_mesh[-1, :] = point_mesh[-1, :].clip(0.25) + point_mesh_2D = (K_novelview @ point_mesh) / point_mesh[-1] + point_mesh_2D[-1] = point_mesh[-1] + + point_mesh = point_mesh.reshape(3, mesh_shape[0], mesh_shape[1]).transpose(1, 2, 0) + point_mesh_2D = point_mesh_2D.reshape(3, 
mesh_shape[0], mesh_shape[1]).transpose(1, 2, 0) + + bg_color = (225,)*3 + line_color = (175,)*3 + canvas[:, :, 0] = bg_color[0] + canvas[:, :, 1] = bg_color[1] + canvas[:, :, 2] = bg_color[2] + lines_to_draw = set() + + for grid_row_idx in range(1, len(grid_zs)): + + pre_z = grid_zs[grid_row_idx-1] + cur_z = grid_zs[grid_row_idx] + + for grid_col_idx in range(1, len(grid_xs)): + pre_x = grid_xs[grid_col_idx-1] + cur_x = grid_xs[grid_col_idx] + + p1 = point_mesh_2D[grid_row_idx-1, grid_col_idx-1] + valid1 = p1[-1] > 0 + p2 = point_mesh_2D[grid_row_idx-1, grid_col_idx] + valid2 = p2[-1] > 0 + if valid1 and valid2: + line = (tuple(p1[:2].astype(int).tolist()), tuple(p2[:2].astype(int).tolist())) + lines_to_draw.add(line) + + # draw vertical line from the previous row + p1 = point_mesh_2D[grid_row_idx-1, grid_col_idx-1] + valid1 = p1[-1] > 0 + p2 = point_mesh_2D[grid_row_idx, grid_col_idx-1] + valid2 = p2[-1] > 0 + if valid1 and valid2: + line = (tuple(p1[:2].astype(int).tolist()), tuple(p2[:2].astype(int).tolist())) + lines_to_draw.add(line) + + for line in lines_to_draw: + draw_line(canvas, line[0], line[1], color=line_color, thickness=max(1, int(np.round(3*scale/1250)))) + + im_novel_view[~sil_mask] = canvas[~sil_mask] + + ''' + Draw edges for novel view + ''' + + # apply novel view to meshes + meshes_novel = [] + + for mesh in meshes: + + mesh_novel = mesh.clone().to(device) + + verts_rotated = mesh_novel.verts_padded() + verts_rotated -= center + verts_rotated = (view_R @ verts_rotated[0].T).T.unsqueeze(0) + verts_rotated[:, :, -1] += zoom_out_bias*zoom_factor + mesh_novel = mesh_novel.update_padded(verts_rotated) + + meshes_novel.append(mesh_novel) + + # go in order of reverse depth + for mesh_idx in reversed(np.argsort([mesh.verts_padded().cpu().mean(1)[0, 1] for mesh in meshes_novel])): + mesh = meshes_novel[mesh_idx] + + verts3D = mesh.verts_padded()[0].cpu().numpy() + verts2D = (K_novelview @ verts3D.T) / verts3D[:, -1] + + if colors is not None: + color = 
np.minimum(colors[mesh_idx][:-1] * 255 * 1.25, np.ones_like(colors[mesh_idx][:-1])*255).tolist() # colors[mesh_idx][:-1] * 255 * 1.25 + else: + color = [min(255, c*255*1.25) for c in mesh.textures.verts_features_padded()[0,0].tolist()] + + draw_3d_box_from_verts( + im_novel_view, K_novelview, verts3D, color=color, + thickness=max(2, int(np.round(3*im_novel_view.shape[0]/1250))), + draw_back=False, draw_top=False, zplane=zplane + ) + + x1 = verts2D[0, :].min() + y1 = verts2D[1, :].min() + + if text is not None: + draw_text(im_novel_view, '{}'.format(text[mesh_idx]), [x1, y1], scale=0.50*im_novel_view.shape[0]/500, bg_color=color) + + if mode == 'front_and_novel': + return im_drawn_rgb, im_novel_view, canvas + else: + return im_novel_view, canvas + + else: + raise ValueError('No visualization written for {}'.format(mode)) + +def get_polygon_grid(im, poly_verts): + + nx = im.shape[1] + ny = im.shape[0] + + x, y = np.meshgrid(np.arange(nx), np.arange(ny)) + x, y = x.flatten(), y.flatten() + + points = np.vstack((x, y)).T + + path = Path(poly_verts) + grid = path.contains_points(points) + grid = grid.reshape((ny, nx)) + + return grid + +def draw_circle(im, pos, radius=5, thickness=1, color=(250, 100, 100), fill=True): + + if fill: thickness = -1 + + cv2.circle(im, (int(pos[0]), int(pos[1])), radius, color=color, thickness=thickness) + +def draw_transparent_polygon(im, verts, blend=0.5, color=(0, 255, 255)): + + mask = get_polygon_grid(im, verts[:4, :]) + + im[mask, 0] = im[mask, 0] * blend + (1 - blend) * color[0] + im[mask, 1] = im[mask, 1] * blend + (1 - blend) * color[1] + im[mask, 2] = im[mask, 2] * blend + (1 - blend) * color[2] + + +def draw_3d_box_from_verts(im, K, verts3d, color=(0, 200, 200), thickness=1, draw_back=False, draw_top=False, zplane=0.05, eps=1e-4): + """ + Draws a scene from multiple different modes. 
+ Args: + im (array): the image to draw onto + K (array): the 3x3 matrix for projection to camera to screen + verts3d (array): the 8x3 matrix of vertices in camera space + color (tuple): color in RGB scaled [0, 255) + thickness (float): the line thickness for opencv lines + draw_back (bool): whether a backface should be highlighted + draw_top (bool): whether the top face should be highlighted + zplane (float): a plane of depth to solve intersection when + vertex points project behind the camera plane. + """ + + if isinstance(K, torch.Tensor): + K = K.detach().cpu().numpy() + + if isinstance(verts3d, torch.Tensor): + verts3d = verts3d.detach().cpu().numpy() + + # reorder + bb3d_lines_verts = [[0, 1], [1, 2], [2, 3], [3, 0], [1, 5], [5, 6], [6, 2], [4, 5], [4, 7], [6, 7], [0, 4], [3, 7]] + + # define back and top vetice planes + back_idxs = [4, 0, 3, 7] + top_idxs = [4, 0, 1, 5] + + for (i, j) in bb3d_lines_verts: + v0 = verts3d[i] + v1 = verts3d[j] + + z0, z1 = v0[-1], v1[-1] + + if (z0 >= zplane or z1 >= zplane): + + # computer intersection of v0, v1 and zplane + s = (zplane - z0) / max((z1 - z0), eps) + new_v = v0 + s * (v1 - v0) + + if (z0 < zplane) and (z1 >= zplane): + # i0 vertex is behind the plane + v0 = new_v + elif (z0 >= zplane) and (z1 < zplane): + # i1 vertex is behind the plane + v1 = new_v + + v0_proj = (K @ v0)/max(v0[-1], eps) + v1_proj = (K @ v1)/max(v1[-1], eps) + + # project vertices + cv2.line(im, + (int(v0_proj[0]), int(v0_proj[1])), + (int(v1_proj[0]), int(v1_proj[1])), + color, thickness + ) + + # dont draw the planes if a vertex is out of bounds + draw_back &= np.all(verts3d[back_idxs, -1] >= zplane) + draw_top &= np.all(verts3d[top_idxs, -1] >= zplane) + + if draw_back or draw_top: + + # project to image + verts2d = (K @ verts3d.T).T + verts2d /= verts2d[:, -1][:, np.newaxis] + + if type(verts2d) == torch.Tensor: + verts2d = verts2d.detach().cpu().numpy() + + if draw_back: + draw_transparent_polygon(im, verts2d[back_idxs, :2], blend=0.5, 
color=color) + + if draw_top: + draw_transparent_polygon(im, verts2d[top_idxs, :2], blend=0.5, color=color) + + +def draw_3d_box(im, K, box3d, R, color=(0, 200, 200), thickness=1, draw_back=False, draw_top=False, view_R=None, view_T=None): + + verts2d, verts3d = util.get_cuboid_verts(K, box3d, R, view_R=view_R, view_T=view_T) + draw_3d_box_from_verts(im, K, verts3d, color=color, thickness=thickness, draw_back=draw_back, draw_top=draw_top) + +def draw_text(im, text, pos, scale=0.4, color='auto', font=cv2.FONT_HERSHEY_SIMPLEX, bg_color=(0, 255, 255), + blend=0.33, lineType=1): + + text = str(text) + pos = [int(pos[0]), int(pos[1])] + + if color == 'auto': + + if bg_color is not None: + color = (0, 0, 0) if ((bg_color[0] + bg_color[1] + bg_color[2])/3) > 127.5 else (255, 255, 255) + else: + color = (0, 0, 0) + + if bg_color is not None: + + text_size, _ = cv2.getTextSize(text, font, scale, lineType) + x_s = int(np.clip(pos[0], a_min=0, a_max=im.shape[1])) + x_e = int(np.clip(x_s + text_size[0] - 1 + 4, a_min=0, a_max=im.shape[1])) + y_s = int(np.clip(pos[1] - text_size[1] - 2, a_min=0, a_max=im.shape[0])) + y_e = int(np.clip(pos[1] + 1 - 2, a_min=0, a_max=im.shape[0])) + + im[y_s:y_e + 1, x_s:x_e + 1, 0] = im[y_s:y_e + 1, x_s:x_e + 1, 0]*blend + bg_color[0] * (1 - blend) + im[y_s:y_e + 1, x_s:x_e + 1, 1] = im[y_s:y_e + 1, x_s:x_e + 1, 1]*blend + bg_color[1] * (1 - blend) + im[y_s:y_e + 1, x_s:x_e + 1, 2] = im[y_s:y_e + 1, x_s:x_e + 1, 2]*blend + bg_color[2] * (1 - blend) + + pos[0] = int(np.clip(pos[0] + 2, a_min=0, a_max=im.shape[1])) + pos[1] = int(np.clip(pos[1] - 2, a_min=0, a_max=im.shape[0])) + + cv2.putText(im, text, tuple(pos), font, scale, color, lineType) + + +def draw_transparent_square(im, pos, alpha=1, radius=5, color=(250, 100, 100)): + + l = pos[1] - radius + r = pos[1] + radius + + t = pos[0] - radius + b = pos[0] + radius + + if (np.array([l, r, t, b]) >= 0).any(): + l = np.clip(np.floor(l), 0, im.shape[0]).astype(int) + r = np.clip(np.floor(r), 0, 
im.shape[0]).astype(int) + + t = np.clip(np.floor(t), 0, im.shape[1]).astype(int) + b = np.clip(np.floor(b), 0, im.shape[1]).astype(int) + + # blend + im[l:r + 1, t:b + 1, 0] = im[l:r + 1, t:b + 1, 0] * alpha + color[0] * (1 - alpha) + im[l:r + 1, t:b + 1, 1] = im[l:r + 1, t:b + 1, 1] * alpha + color[1] * (1 - alpha) + im[l:r + 1, t:b + 1, 2] = im[l:r + 1, t:b + 1, 2] * alpha + color[2] * (1 - alpha) + + +def draw_2d_box(im, box, color=(0, 200, 200), thickness=1): + + x = box[0] + y = box[1] + w = box[2] + h = box[3] + x2 = (x + w) - 1 + y2 = (y + h) - 1 + + cv2.rectangle(im, (int(x), int(y)), (int(x2), int(y2)), color, thickness) + + +def imhstack(im1, im2): + + sf = im1.shape[0] / im2.shape[0] + + if sf > 1: + im2 = cv2.resize(im2, (int(im2.shape[1] / sf), im1.shape[0])) + elif sf < 1: + im1 = cv2.resize(im1, (int(im1.shape[1] / sf), im2.shape[0])) + + + im_concat = np.hstack((im1, im2)) + + return im_concat + + +def imvstack(im1, im2): + + sf = im1.shape[1] / im2.shape[1] + + if sf > 1: + im2 = cv2.resize(im2, (int(im2.shape[0] / sf), im1.shape[1])) + elif sf < 1: + im1 = cv2.resize(im1, (int(im1.shape[0] / sf), im2.shape[1])) + + im_concat = np.vstack((im1, im2)) + + return im_concat \ No newline at end of file diff --git a/datasetminify.py b/datasetminify.py new file mode 100644 index 0000000000000000000000000000000000000000..98afebe920ec013a36746e4bb113e6ff1e314b95 --- /dev/null +++ b/datasetminify.py @@ -0,0 +1,71 @@ +import json +import random +random.seed(0) + +def minify_dataset(path, num_images=10): + with open(path, 'r') as f: + data = json.load(f) + + new_file = {} + new_file['info'] = data['info'] + idx = random.sample(range(len(data['images'])), num_images) + new_file['images'] = [data['images'][i] for i in idx] + new_file['categories'] = data['categories'] + # grab only annotation for the image ids + new_file['annotations'] = [ann for ann in data['annotations'] if ann['image_id'] in [img['id'] for img in new_file['images']]] + + with 
import json
import random
random.seed(0)

def minify_dataset(path, num_images=10):
    """Sample `num_images` random images (with their annotations) from the
    Omni3D-style json at `path` and write them to `<path>_mini.json`."""
    with open(path, 'r') as f:
        data = json.load(f)

    new_file = {}
    new_file['info'] = data['info']
    idx = random.sample(range(len(data['images'])), num_images)
    new_file['images'] = [data['images'][i] for i in idx]
    new_file['categories'] = data['categories']
    # grab only annotations for the kept image ids (set lookup instead of
    # rebuilding the id list per annotation)
    kept_ids = {img['id'] for img in new_file['images']}
    new_file['annotations'] = [ann for ann in data['annotations'] if ann['image_id'] in kept_ids]

    with open(path.replace('.json', '_mini.json'), 'w') as f:
        json.dump(new_file, f)

cats = set({'bicycle', 'books', 'bottle', 'chair', 'cup', 'laptop', 'shoes', 'towel', 'blinds', 'window', 'lamp', 'shelves', 'mirror', 'sink', 'cabinet', 'bathtub', 'door', 'toilet', 'desk', 'box', 'bookcase', 'picture', 'table', 'counter', 'bed', 'night stand', 'pillow', 'sofa', 'television', 'floor mat', 'curtain', 'clothes', 'stationery', 'refrigerator', 'bin', 'stove', 'oven', 'machine'})
n_images = 103
# minify_dataset('datasets/Omni3D/SUNRGBD_test.json', n_images*2)
# minify_dataset('datasets/Omni3D/SUNRGBD_train.json', n_images)
# minify_dataset('datasets/Omni3D/SUNRGBD_val.json', n_images)

# guard the script-level calls so importing this module has no side effects
if __name__ == '__main__':
    minify_dataset('datasets/Omni3D/KITTI_test.json', n_images*2)
    minify_dataset('datasets/Omni3D/KITTI_train.json', n_images)
    minify_dataset('datasets/Omni3D/KITTI_val.json', n_images)

def minify_dataset_cats(path, cats):
    '''
    Make a mini dataset that, across its sampled images, covers all the
    specified category names.  Writes the result to `<path>_mini.json`.

    BUGFIX: the original overwrote new_file['images'] / ['annotations'] on
    every loop iteration, so only the *last* sampled image survived and the
    output did not actually cover the categories.  It also mutated the
    caller's `cats` set, omitted the 'categories' key, and could loop
    forever when some category never occurs in the dataset.
    '''
    with open(path, 'r') as f:
        data = json.load(f)

    remaining = set(cats)      # work on a copy; don't mutate the caller's set
    chosen_ids = set()
    images = []
    annotations = []
    i = 0
    while len(remaining) > 0:
        img = data['images'][random.randrange(len(data['images']))]
        if img['id'] not in chosen_ids:
            chosen_ids.add(img['id'])
            anns = [ann for ann in data['annotations'] if ann['image_id'] == img['id']]
            images.append(img)
            annotations.extend(anns)
            remaining -= {ann['category_name'] for ann in anns}
            i += 1
        # every image sampled but categories still missing -> stop
        if len(chosen_ids) == len(data['images']):
            break
    print('num_ ', i)

    new_file = {
        'info': data['info'],
        'images': images,
        'categories': data['categories'],
        'annotations': annotations,
    }
    with open(path.replace('.json', '_mini.json'), 'w') as f:
        json.dump(new_file, f)


# minify_dataset_cats('datasets/Omni3D/SUNRGBD_test.json', cats)
# minify_dataset_cats('datasets/Omni3D/SUNRGBD_train.json', cats)
# minify_dataset_cats('datasets/Omni3D/SUNRGBD_val.json', cats)

def minify_dataset_idx(path, idx):
    with open(path, 'r') as f:
        data = json.load(f)

    new_file = {}
    new_file['info'] = data['info']
# find only image with idx + new_file['images'] = [i for i in data['images'] if i['id'] == idx] + new_file['categories'] = data['categories'] + # grab only annotation for the image ids + new_file['annotations'] = [ann for ann in data['annotations'] if ann['image_id'] in [img['id'] for img in new_file['images']]] + + with open(path.replace('.json', f'_mini_{idx}.json'), 'w') as f: + json.dump(new_file, f) + +# minify_dataset_idx('datasets/Omni3D/SUNRGBD_test.json', 168509) \ No newline at end of file diff --git a/demo.sh b/demo.sh new file mode 100644 index 0000000000000000000000000000000000000000..1e462f1384af491f953d20135bad80bf77475dce --- /dev/null +++ b/demo.sh @@ -0,0 +1,6 @@ +python demo/demo.py \ +--config-file cubercnn://omni3d/cubercnn_DLA34_FPN.yaml \ +--input-folder "datasets/coco_examples" \ +--threshold 0.25 --display \ +MODEL.WEIGHTS cubercnn://omni3d/cubercnn_DLA34_FPN.pth \ +OUTPUT_DIR output/demo diff --git a/demo/demo.py b/demo/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..a39a94ebd8181d2eceed7f7533f40af84aa262d3 --- /dev/null +++ b/demo/demo.py @@ -0,0 +1,221 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +import logging +import os +import argparse +import sys +import numpy as np +from collections import OrderedDict +import torch + +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.engine import default_argument_parser, default_setup, launch +from detectron2.data import transforms as T + + +logger = logging.getLogger("detectron2") + +sys.dont_write_bytecode = True +sys.path.append(os.getcwd()) +np.set_printoptions(suppress=True) + +from cubercnn.config import get_cfg_defaults +from cubercnn.modeling.proposal_generator import RPNWithIgnore +from cubercnn.modeling.roi_heads import ROIHeads3D +from cubercnn.modeling.meta_arch import RCNN3D, build_model +from cubercnn.modeling.backbone import build_dla_from_vision_fpn_backbone +from cubercnn import util, vis + +def do_test(args, cfg, model): + + list_of_ims = util.list_files(os.path.join(args.input_folder, ''), '*') + + model.eval() + + focal_length = args.focal_length + principal_point = args.principal_point + thres = args.threshold + + output_dir = cfg.OUTPUT_DIR + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + augmentations = T.AugmentationList([T.ResizeShortestEdge(min_size, max_size, "choice")]) + + util.mkdir_if_missing(output_dir) + + category_path = os.path.join(util.file_parts(args.config_file)[0], 'category_meta.json') + + # store locally if needed + if category_path.startswith(util.CubeRCNNHandler.PREFIX): + category_path = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, category_path) + + metadata = util.load_json(category_path) + cats = metadata['thing_classes'] + + for path in list_of_ims: + + im_name = util.file_parts(path)[1] + im = util.imread(path) + + if im is None: + continue + + image_shape = im.shape[:2] # h, w + + h, w = image_shape + + if focal_length == 0: + focal_length_ndc = 4.0 + focal_length = focal_length_ndc * h / 2 + + if len(principal_point) == 0: + px, py = w/2, h/2 + else: 
+ px, py = principal_point + + K = np.array([ + [focal_length, 0.0, px], + [0.0, focal_length, py], + [0.0, 0.0, 1.0] + ]) + is_ground = os.path.exists(f'datasets/ground_maps/{im_name}.jpg.npz') + if is_ground: + ground_map = np.load(f'datasets/ground_maps/{im_name}.jpg.npz')['mask'] + depth_map = np.load(f'datasets/depth_maps/{im_name}.jpg.npz')['depth'] + + aug_input = T.AugInput(im) + tfms = augmentations(aug_input) + image = aug_input.image + if is_ground: + ground_map = tfms.apply_image(ground_map*1.0) + ground_map = torch.as_tensor(ground_map) + else: + ground_map = None + depth_map = tfms.apply_image(depth_map) + + # batched = [{ + # 'image': torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))).cuda(), + # 'height': image_shape[0], 'width': image_shape[1], 'K': K + # }] + # first you must run the scripts to get the ground and depth map for the images + batched = [{ + 'image': torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))), + 'depth_map': torch.as_tensor(depth_map), + 'ground_map': ground_map, + 'height': image_shape[0], 'width': image_shape[1], 'K': K + }] + dets = model(batched)[0]['instances'] + + n_det = len(dets) + + meshes = [] + meshes_text = [] + + if n_det > 0: + for idx, (corners3D, center_cam, center_2D, dimensions, pose, score, cat_idx) in enumerate(zip( + dets.pred_bbox3D, dets.pred_center_cam, dets.pred_center_2D, dets.pred_dimensions, + dets.pred_pose, dets.scores, dets.pred_classes + )): + + # skip + if score < thres: + continue + + cat = cats[cat_idx] + + bbox3D = center_cam.tolist() + dimensions.tolist() + meshes_text.append('{} {:.2f}'.format(cat, score)) + color = [c/255.0 for c in util.get_color(idx)] + box_mesh = util.mesh_cuboid(bbox3D, pose.tolist(), color=color) + meshes.append(box_mesh) + + print('File: {} with {} dets'.format(im_name, len(meshes))) + + if len(meshes) > 0: + im_drawn_rgb, im_topdown, _ = vis.draw_scene_view(im, K, meshes, text=meshes_text, scale=im.shape[0], blend_weight=0.5, 
blend_weight_overlay=0.85) + + if args.display: + im_concat = np.concatenate((im_drawn_rgb, im_topdown), axis=1) + vis.imshow(im_concat) + + util.imwrite(im_drawn_rgb, os.path.join(output_dir, im_name+'_boxes.jpg')) + util.imwrite(im_topdown, os.path.join(output_dir, im_name+'_novel.jpg')) + else: + util.imwrite(im, os.path.join(output_dir, im_name+'_boxes.jpg')) + +def setup(args): + """ + Create configs and perform basic setups. + """ + cfg = get_cfg() + get_cfg_defaults(cfg) + + config_file = args.config_file + + # store locally if needed + if config_file.startswith(util.CubeRCNNHandler.PREFIX): + config_file = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, config_file) + + cfg.merge_from_file(config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + return cfg + +def main(args): + cfg = setup(args) + model = build_model(cfg) + + logger.info("Model:\n{}".format(model)) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=True + ) + + with torch.no_grad(): + do_test(args, cfg, model) + +if __name__ == "__main__": + + parser = argparse.ArgumentParser( + epilog=None, formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument("--config-file", default="", metavar="FILE", help="path to config file") + parser.add_argument('--input-folder', type=str, help='list of image folders to process', required=True) + parser.add_argument("--focal-length", type=float, default=0, help="focal length for image inputs (in px)") + parser.add_argument("--principal-point", type=float, default=[], nargs=2, help="principal point for image inputs (in px)") + parser.add_argument("--threshold", type=float, default=0.25, help="threshold on score for visualizing") + parser.add_argument("--display", default=False, action="store_true", help="Whether to show the images in matplotlib",) + + parser.add_argument("--eval-only", default=True, action="store_true", help="perform evaluation 
only") + parser.add_argument("--num-gpus", type=int, default=1, help="number of gpus *per machine*") + parser.add_argument("--num-machines", type=int, default=1, help="total number of machines") + parser.add_argument( + "--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)" + ) + port = 2 ** 15 + 2 ** 14 + hash(os.getuid() if sys.platform != "win32" else 1) % 2 ** 14 + parser.add_argument( + "--dist-url", + default="tcp://127.0.0.1:{}".format(port), + help="initialization URL for pytorch distributed backend. See " + "https://pytorch.org/docs/stable/distributed.html for details.", + ) + parser.add_argument( + "opts", + help="Modify config options by adding 'KEY VALUE' pairs at the end of the command. " + "See config references at " + "https://detectron2.readthedocs.io/modules/config.html#config-references", + default=None, + nargs=argparse.REMAINDER, + ) + + args = parser.parse_args() + + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) \ No newline at end of file diff --git a/demo/download_demo_COCO_images.sh b/demo/download_demo_COCO_images.sh new file mode 100644 index 0000000000000000000000000000000000000000..24970c52fb1cd3ab14c5c4f47bed3bc5f76d44da --- /dev/null +++ b/demo/download_demo_COCO_images.sh @@ -0,0 +1,13 @@ +#Copyright (c) Meta Platforms, Inc. 
and affiliates + +mkdir -p datasets/coco_examples +cd datasets/coco_examples +wget https://farm1.staticflickr.com/19/3045175664_6e42bd43f3_z.jpg +wget https://farm1.staticflickr.com/19/6140190660_c220e6e1ea_z.jpg +wget https://farm1.staticflickr.com/19/5375406975_0a72911ae7_z.jpg +wget https://farm1.staticflickr.com/19/4634546881_8203dd8f94_z.jpg +wget https://farm1.staticflickr.com/19/4586421859_517c65c02b_z.jpg +wget https://farm1.staticflickr.com/19/4198075011_06332047e2_z.jpg +wget https://farm1.staticflickr.com/19/3480322600_bc542ae19b_z.jpg +wget https://farm1.staticflickr.com/19/3164116912_41b30edbf7_z.jpg +cd ../../ diff --git a/download_models.sh b/download_models.sh new file mode 100644 index 0000000000000000000000000000000000000000..e7767d51968aaf47335f26799c421881db512528 --- /dev/null +++ b/download_models.sh @@ -0,0 +1,22 @@ + +cd GroundingDINO/ +pip install -e . +mkdir weights +cd weights +wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth +cd .. + +cd sam-hq +wget https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_b.pth -O sam_hq_vit_b.pth +cd .. + +cd depth +./download_models.sh +cd .. 
+ +mkdir output +mkdir output/Baseline_sgd +mkdir output/weak_cube_r-cnn +# wget https://huggingface.co/AndreasLH/Weak-Cube-R-CNN/blob/main/Baseline_sgd/model_final.pth -O output/Baseline_sgd/model_final.pth + +wget https://huggingface.co/AndreasLH/Weak-Cube-R-CNN/resolve/main/weak%20cube%20r-cnn/model_final.pth?download=true -O output/weak_cube_r-cnn/model_final.pth diff --git a/pre-requirements.txt b/pre-requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..5eec8cf8d145b3c3eeeba9e98a74119f009fad97 --- /dev/null +++ b/pre-requirements.txt @@ -0,0 +1,14 @@ +ninja +iopath +fvcore + +--extra-index-url https://download.pytorch.org/whl/cpu +torch==2.4.1 +torchvision==0.19.1 +torchaudio==2.4.1 + +gradio +matplotlib +numpy <= 1.26.1 +opencv-python +pyransac3d \ No newline at end of file diff --git a/priors.py b/priors.py new file mode 100644 index 0000000000000000000000000000000000000000..37ea5cb458e56204a95993b297d3ef85c8577c12 --- /dev/null +++ b/priors.py @@ -0,0 +1,102 @@ +import os +import logging + +from detectron2.config.config import get_cfg +from detectron2.data.catalog import MetadataCatalog +from detectron2.utils.logger import setup_logger +import pandas as pd +from cubercnn import data, util, vis +from cubercnn.config.config import get_cfg_defaults +from cubercnn.data.datasets import simple_register + +logger = logging.getLogger("cubercnn") + + +def get_config_and_filter_settings(config_file='configs/Base_Omni3D.yaml'): + # we must load the config file to get the filter settings + cfg = get_cfg() + get_cfg_defaults(cfg) + cfg.merge_from_file(config_file) + # must setup logger to get info about filtered out annotations + setup_logger(output=cfg.OUTPUT_DIR, name="cubercnn") + filter_settings = data.get_filter_settings_from_cfg(cfg) + return cfg, filter_settings + +def priors_of_objects(dataset): + + cfg, filter_settings = get_config_and_filter_settings() + + dataset_names = ['SUNRGBD_train','SUNRGBD_val', 'SUNRGBD_test'] + for 
dataset_name in dataset_names: + simple_register(dataset_name, filter_settings, filter_empty=True) + + dataset_paths = ['datasets/Omni3D/'+dataset_name+'.json' for dataset_name in dataset_names] + + datasets = data.Omni3D(dataset_paths, filter_settings=filter_settings) + + # determine the meta data given the datasets used. + data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings) + + thing_classes = MetadataCatalog.get('omni3d_model').thing_classes + dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id + + ''' + It may be useful to keep track of which categories are annotated/known + for each dataset in use, in case a method wants to use this information. + ''' + + infos = datasets.dataset['info'] + + if type(infos) == dict: + infos = [datasets.dataset['info']] + + dataset_id_to_unknown_cats = {} + possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1)) + + dataset_id_to_src = {} + + dset_classes = [] + + for info in infos: + dataset_id = info['id'] + known_category_training_ids = set() + + if not dataset_id in dataset_id_to_src: + dataset_id_to_src[dataset_id] = info['source'] + + for id in info['known_category_ids']: + if id in dataset_id_to_contiguous_id: + known_category_training_ids.add(dataset_id_to_contiguous_id[id]) + + # determine and store the unknown categories. + unknown_categories = possible_categories - known_category_training_ids + dataset_id_to_unknown_cats[dataset_id] = unknown_categories + + # log the per-dataset categories + avail_cats = [thing_classes[i] for i in (possible_categories & known_category_training_ids)] + logger.info('Available categories for {}'.format(info['name'])) + logger.info(avail_cats) + + dset_classes.append(avail_cats) + + # set difference between the available categories for each dataset. 
+ dset1 = set(dset_classes[0]) + dset2 = set(dset_classes[1]) + logger.info(f'Categories in {dataset_names[0]} missing from {dataset_names[1]}:') + logger.info(dset1 - dset2) + + # compute priors given the training data. + # interested in priors['priors_dims_per_cat'], some of them have [1,1,1], as they are invalid categories, e.g. car and bus which are not present SUNRGBD + # priors are w / h / d, std for (w / h / d) + priors = util.compute_priors(cfg, datasets) + priors_bins = util.compute_priors(cfg, datasets,n_bins=5) + # TODO: can we emulate the behaviour of this function + # without ever having access to 3D annotations? + priors2 = pd.read_csv('datasets/typical sizes of 3d items.csv') + print(priors) + pass + + +if __name__ == "__main__": + priors_of_objects('SUNRGBD') + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0deab8b7864df02600813961c4d2e9b16e42810 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,8 @@ + +git+https://github.com/facebookresearch/pytorch3d.git@stable +git+https://github.com/facebookresearch/detectron2.git + +gradio +matplotlib +numpy +opencv-python \ No newline at end of file diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tools/train_net.py b/tools/train_net.py new file mode 100644 index 0000000000000000000000000000000000000000..f5ab819a8e2a70b636fc61940db69af6166f08be --- /dev/null +++ b/tools/train_net.py @@ -0,0 +1,536 @@ +# Copyright (c) Meta Platforms, Inc. 
and affiliates +import logging +import os +import sys +import warnings +warnings.filterwarnings("ignore", message="Overwriting tiny_vit_21m_512 in registry") +warnings.filterwarnings("ignore", message="Overwriting tiny_vit_21m_384 in registry") +warnings.filterwarnings("ignore", message="Overwriting tiny_vit_21m_224 in registry") +warnings.filterwarnings("ignore", message="Overwriting tiny_vit_11m_224 in registry") +warnings.filterwarnings("ignore", message="Overwriting tiny_vit_5m_224 in registry") + +import numpy as np +import copy +from collections import OrderedDict +import pandas as pd +import torch +import datetime +from torch.nn.parallel import DistributedDataParallel +import torch.distributed as dist +import detectron2.utils.comm as comm +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.engine import ( + default_argument_parser, + default_setup, + default_writers, + launch +) +from detectron2.solver import build_lr_scheduler +from detectron2.utils.events import EventStorage +from detectron2.utils.logger import setup_logger +import wandb + +logger = logging.getLogger("cubercnn") + + + +from cubercnn.solver import build_optimizer, freeze_bn, PeriodicCheckpointerOnlyOne +from cubercnn.config import get_cfg_defaults +from cubercnn.data import ( + load_omni3d_json, + DatasetMapper3D, + build_detection_train_loader, + build_detection_test_loader, + get_omni3d_categories, + simple_register +) +from cubercnn.evaluation import ( + Omni3DEvaluator, Omni3Deval, + Omni3DEvaluationHelper, + inference_on_dataset +) +from cubercnn.modeling.proposal_generator import RPNWithIgnore +from cubercnn.modeling.roi_heads import ROIHeads3D +from cubercnn.modeling.meta_arch import RCNN3D, build_model +from cubercnn.modeling.backbone import build_dla_from_vision_fpn_backbone +from cubercnn import util, vis, data +import cubercnn.vis.logperf as utils_logperf + + 
+MAX_TRAINING_ATTEMPTS = 10 + + +def do_test(cfg, model, iteration='final', storage=None): + + filter_settings = data.get_filter_settings_from_cfg(cfg) + filter_settings['visibility_thres'] = cfg.TEST.VISIBILITY_THRES + filter_settings['truncation_thres'] = cfg.TEST.TRUNCATION_THRES + filter_settings['min_height_thres'] = 0.0625 + filter_settings['max_depth'] = 1e8 + + dataset_names_test = cfg.DATASETS.TEST + only_2d = cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D == 0.0 + output_folder = os.path.join(cfg.OUTPUT_DIR, "inference", 'iter_{}'.format(iteration)) + logger.info('Output folder: %s', output_folder) + + eval_helper = Omni3DEvaluationHelper( + dataset_names_test, + filter_settings, + output_folder, + iter_label=iteration, + only_2d=only_2d, + ) + + for dataset_name in dataset_names_test: + """ + Cycle through each dataset and test them individually. + This loop keeps track of each per-image evaluation result, + so that it doesn't need to be re-computed for the collective. + """ + + ''' + Distributed Cube R-CNN inference + ''' + data_loader = build_detection_test_loader(cfg, dataset_name,batch_size=cfg.SOLVER.IMS_PER_BATCH, num_workers=2) + results_json = inference_on_dataset(model, data_loader) + + if comm.is_main_process(): + + ''' + Individual dataset evaluation + ''' + eval_helper.add_predictions(dataset_name, results_json) + eval_helper.save_predictions(dataset_name) + eval_helper.evaluate(dataset_name) + + ''' + Optionally, visualize some instances + ''' + instances = torch.load(os.path.join(output_folder, dataset_name, 'instances_predictions.pth'), weights_only=False) + log_str = vis.visualize_from_instances( + instances, data_loader.dataset, dataset_name, + cfg.INPUT.MIN_SIZE_TEST, os.path.join(output_folder, dataset_name), + MetadataCatalog.get('omni3d_model').thing_classes, iteration, visualize_every=1 + ) + logger.info(log_str) + + if comm.is_main_process(): + + ''' + Summarize each Omni3D Evaluation metric + ''' + eval_helper.summarize_all() + + +def 
do_train(cfg, model, dataset_id_to_unknown_cats, dataset_id_to_src, resume=False): + + max_iter = cfg.SOLVER.MAX_ITER + do_eval = cfg.TEST.EVAL_PERIOD > 0 + + model.train() + + optimizer = build_optimizer(cfg, model) + scheduler = build_lr_scheduler(cfg, optimizer) + + # bookkeeping + checkpointer = DetectionCheckpointer(model, cfg.OUTPUT_DIR, optimizer=optimizer, scheduler=scheduler) + periodic_checkpointer = PeriodicCheckpointerOnlyOne(checkpointer, cfg.SOLVER.CHECKPOINT_PERIOD, max_iter=max_iter) + writers = default_writers(cfg.OUTPUT_DIR, max_iter) if comm.is_main_process() else [] + + # create the dataloader + data_mapper = DatasetMapper3D(cfg, is_train=True) + data_loader = build_detection_train_loader(cfg, mapper=data_mapper, dataset_id_to_src=dataset_id_to_src, num_workers=2) + + # give the mapper access to dataset_ids + data_mapper.dataset_id_to_unknown_cats = dataset_id_to_unknown_cats + + if cfg.MODEL.WEIGHTS_PRETRAIN != '': + + # load ONLY the model, no checkpointables. + checkpointer.load(cfg.MODEL.WEIGHTS_PRETRAIN, checkpointables=[]) + + # determine the starting iteration, if resuming + start_iter = (checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume).get("iteration", -1) + 1) + iteration = start_iter + + logger.info("Starting training from iteration {}".format(start_iter)) + + if not cfg.MODEL.USE_BN: + freeze_bn(model) + + world_size = comm.get_world_size() + + # if the loss diverges for more than the below TOLERANCE + # as a percent of the iterations, the training will stop. + # This is only enabled if "STABILIZE" is on, which + # prevents a single example from exploding the training. + iterations_success = 0 + iterations_explode = 0 + + # when loss > recent_loss * TOLERANCE, then it could be a + # diverging/failing model, which we should skip all updates for. 
+ TOLERANCE = 4.0 + + GAMMA = 0.02 # rolling average weight gain + recent_loss = None # stores the most recent loss magnitude + + data_iter = iter(data_loader) + + # model.parameters() is surprisingly expensive at 150ms, so cache it + named_params = list(model.named_parameters()) + + with EventStorage(start_iter) as storage: + + while True: + + data = next(data_iter) + storage.iter = iteration + + # forward + loss_dict = model(data) + losses = sum(loss_dict.values()) + + # reduce + loss_dict_reduced = {k: v.item() for k, v in allreduce_dict(loss_dict).items()} + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + + # sync up + comm.synchronize() + + if recent_loss is None: + + # init recent loss fairly high + recent_loss = losses_reduced*2.0 + + # Is stabilization enabled, and loss high or NaN? + diverging_model = cfg.MODEL.STABILIZE > 0 and \ + (losses_reduced > recent_loss*TOLERANCE or \ + not (np.isfinite(losses_reduced)) or np.isnan(losses_reduced)) + + if diverging_model: + # clip and warn the user. + losses = losses.clip(0, 1) + logger.warning('Skipping gradient update due to higher than normal loss {:.2f} vs. rolling mean {:.2f}, Dict-> {}'.format( + losses_reduced, recent_loss, loss_dict_reduced + )) + else: + # compute rolling average of loss + recent_loss = recent_loss * (1-GAMMA) + losses_reduced*GAMMA + + if comm.is_main_process(): + # send loss scalars to tensorboard. + storage.put_scalars(total_loss=losses_reduced, **loss_dict_reduced) + epoch = iteration // cfg.SOLVER.IMS_PER_BATCH + + # backward and step + optimizer.zero_grad() + losses.backward() + + # if the loss is not too high, + # we still want to check gradients. 
+ if not diverging_model: + + if cfg.MODEL.STABILIZE > 0: + + for name, param in named_params: + + if param.grad is not None: + diverging_model = torch.isnan(param.grad).any() or torch.isinf(param.grad).any() + + if diverging_model: + logger.warning('Skipping gradient update due to inf/nan detection, loss is {}'.format(loss_dict_reduced)) + break + + # convert exploded to a float, then allreduce it, + # if any process gradients have exploded then we skip together. + if cfg.MODEL.DEVICE == 'cuda': + diverging_model = torch.tensor(float(diverging_model)).cuda() + else: + diverging_model = torch.tensor(float(diverging_model)) + + if world_size > 1: + dist.all_reduce(diverging_model) + + # sync up + comm.synchronize() + + if diverging_model > 0: + optimizer.zero_grad() + iterations_explode += 1 + + else: + optimizer.step() + storage.put_scalar("lr", optimizer.param_groups[0]["lr"], smoothing_hint=False) + iterations_success += 1 + + total_iterations = iterations_success + iterations_explode + + # Only retry if we have trained sufficiently long relative + # to the latest checkpoint, which we would otherwise revert back to. + retry = (iterations_explode / total_iterations) >= cfg.MODEL.STABILIZE \ + and (total_iterations > cfg.SOLVER.CHECKPOINT_PERIOD*1/2) + + # Important for dist training. Convert to a float, then allreduce it, + # if any process gradients have exploded then we must skip together. + if cfg.MODEL.DEVICE == 'cuda': + retry = torch.tensor(float(retry)).cuda() + else: + retry = torch.tensor(float(retry)) + + if world_size > 1: + dist.all_reduce(retry) + + # sync up + comm.synchronize() + + # any processes need to retry + if retry > 0: + + # instead of failing, try to resume the iteration instead. + logger.warning('!! Restarting training at {} iters. Exploding loss {:d}% of iters !!'.format( + iteration, int(100*(iterations_explode / (iterations_success + iterations_explode))) + )) + + # send these to garbage, for ideally a cleaner restart. 
+ del data_mapper + del data_loader + del optimizer + del checkpointer + del periodic_checkpointer + return False + + scheduler.step() + + # Evaluate only when the loss is not diverging. + if not (diverging_model > 0) and \ + (do_eval and ((iteration + 1) % cfg.TEST.EVAL_PERIOD) == 0 and iteration != (max_iter - 1)): + + logger.info('Starting test for iteration {}'.format(iteration+1)) + do_test(cfg, model, iteration=iteration+1, storage=storage) + comm.synchronize() + + if not cfg.MODEL.USE_BN: + freeze_bn(model) + + # Flush events + if iteration - start_iter > 5 and ((iteration + 1) % 20 == 0 or iteration == max_iter - 1): + for writer in writers: + writer.write() + + # Do not bother checkpointing if there is potential for a diverging model. + if not (diverging_model > 0) and \ + (iterations_explode / total_iterations) < 0.5*cfg.MODEL.STABILIZE: + periodic_checkpointer.step(iteration) + + iteration += 1 + + if iteration >= max_iter: + break + + # success + return True + +def setup(args): + """ + Create configs and perform basic setups. 
+ """ + cfg = get_cfg() + get_cfg_defaults(cfg) + + config_file = args.config_file + + # store locally if needed + if config_file.startswith(util.CubeRCNNHandler.PREFIX): + config_file = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, config_file) + + cfg.merge_from_file(config_file) + cfg.merge_from_list(args.opts) + device = "cuda" if torch.cuda.is_available() else "cpu" + cfg.MODEL.DEVICE = device + cfg.SEED = 12 + cfg.freeze() + default_setup(cfg, args) + + setup_logger(output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="cubercnn") + + filter_settings = data.get_filter_settings_from_cfg(cfg) + + for dataset_name in cfg.DATASETS.TRAIN: + simple_register(dataset_name, filter_settings, filter_empty=True) + + dataset_names_test = cfg.DATASETS.TEST + + for dataset_name in dataset_names_test: + if not(dataset_name in cfg.DATASETS.TRAIN): + simple_register(dataset_name, filter_settings, filter_empty=True) + + return cfg + + +def main(args): + + cfg = setup(args) + + if cfg.log: + idx = cfg.OUTPUT_DIR.find('/') + name = f'{cfg.OUTPUT_DIR[idx+1:]} {datetime.datetime.now():%Y-%m-%d %H:%M:%S%z}' + wandb.init(project="cube", sync_tensorboard=True, name=name, config=cfg) + + logger.info('Preprocessing Training Datasets') + + filter_settings = data.get_filter_settings_from_cfg(cfg) + + priors = None + + if args.eval_only: + category_path = os.path.join(util.file_parts(args.config_file)[0], 'category_meta.json') + + # store locally if needed + if category_path.startswith(util.CubeRCNNHandler.PREFIX): + category_path = util.CubeRCNNHandler._get_local_path(util.CubeRCNNHandler, category_path) + + metadata = util.load_json(category_path) + + # register the categories + thing_classes = metadata['thing_classes'] + id_map = {int(key):val for key, val in metadata['thing_dataset_id_to_contiguous_id'].items()} + MetadataCatalog.get('omni3d_model').thing_classes = thing_classes + MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id = id_map + + 
else: + + # setup and join the data. + dataset_paths = [os.path.join('datasets', 'Omni3D', name + '.json') for name in cfg.DATASETS.TRAIN] + datasets = data.Omni3D(dataset_paths, filter_settings=filter_settings) + + # determine the meta data given the datasets used. + data.register_and_store_model_metadata(datasets, cfg.OUTPUT_DIR, filter_settings) + + thing_classes = MetadataCatalog.get('omni3d_model').thing_classes + dataset_id_to_contiguous_id = MetadataCatalog.get('omni3d_model').thing_dataset_id_to_contiguous_id + + ''' + It may be useful to keep track of which categories are annotated/known + for each dataset in use, in case a method wants to use this information. + ''' + + infos = datasets.dataset['info'] + + if type(infos) == dict: + infos = [datasets.dataset['info']] + + dataset_id_to_unknown_cats = {} + possible_categories = set(i for i in range(cfg.MODEL.ROI_HEADS.NUM_CLASSES + 1)) + + dataset_id_to_src = {} + + for info in infos: + dataset_id = info['id'] + known_category_training_ids = set() + + if not dataset_id in dataset_id_to_src: + dataset_id_to_src[dataset_id] = info['source'] + + for id in info['known_category_ids']: + if id in dataset_id_to_contiguous_id: + known_category_training_ids.add(dataset_id_to_contiguous_id[id]) + + # determine and store the unknown categories. + unknown_categories = possible_categories - known_category_training_ids + dataset_id_to_unknown_cats[dataset_id] = unknown_categories + + # log the per-dataset categories + logger.info('Available categories for {}'.format(info['name'])) + logger.info([thing_classes[i] for i in (possible_categories & known_category_training_ids)]) + + # compute priors given the training data. + priors = util.compute_priors(cfg, datasets) + + ''' + The training loops can attempt to train for N times. + This catches a divergence or other failure modes. + ''' + + remaining_attempts = MAX_TRAINING_ATTEMPTS + while remaining_attempts > 0: + + # build the training model. 
+ model = build_model(cfg, priors=priors) + + if remaining_attempts == MAX_TRAINING_ATTEMPTS: + # log the first attempt's settings. + # logger.info("Model:\n{}".format(model)) + pass + + if args.eval_only: + # skip straight to eval mode + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + return do_test(cfg, model) + + # setup distributed training. + distributed = comm.get_world_size() > 1 + if distributed: + model = DistributedDataParallel( + model, device_ids=[comm.get_local_rank()], + broadcast_buffers=False, find_unused_parameters=True + ) + + # train full model, potentially with resume. + if do_train(cfg, model, dataset_id_to_unknown_cats, dataset_id_to_src, resume=args.resume): + break + else: + + # allow restart when a model fails to train. + remaining_attempts -= 1 + del model + + if remaining_attempts == 0: + # Exit if the model could not finish without diverging. + raise ValueError('Training failed') + + return do_test(cfg, model) + +def allreduce_dict(input_dict, average=True): + """ + Reduce the values in the dictionary from all processes so that process with rank + 0 has the reduced results. + Args: + input_dict (dict): inputs to be reduced. All the values must be scalar CUDA Tensor. + average (bool): whether to do average or sum + Returns: + a dict with the same keys as input_dict, after reduction. 
+ """ + world_size = comm.get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + # only main process gets accumulated, so only divide by + # world_size in this case + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + ) \ No newline at end of file