diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..fc4c07276075879524bda37a4ca3e76c5aea9529 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +*.bmp filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b10d85b2e054b8ffb4d9dbe32cbd4ac6e71c7b35 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.zip +*.pyc \ No newline at end of file 
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..83f431e8feeb7e80d571f39c9f6c1b96857b5f85 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +This Code of Conduct also applies outside the project spaces when there is a +reasonable belief that an individual's behavior may have a negative impact on +the project or its community. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at <opensource-conduct@fb.com>. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..7141f8d55f5d491525cf73b4958ff560f65e7a1a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,32 @@ +# Contributing to OVSeg +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Meta's open source projects. + +Complete your CLA here: <https://code.facebook.com/cla> + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + + +## License +By contributing to OVSeg, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. 
diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md new file mode 100644 index 0000000000000000000000000000000000000000..847ddda04c47cf234c3593da2504184011e165fc --- /dev/null +++ b/GETTING_STARTED.md @@ -0,0 +1,99 @@ +## Getting started with OVSeg + + +### Try demo + +We release our largest model (Swin-Base + CLIP-ViT-L/14) [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) (md5: 526080). + +- Test on sample image + ```bash + python demo.py --config-file configs/ovseg_swinB_vitL_demo.yaml --class-names 'Oculus' 'Ukulele' --input ./resources/demo_samples/sample_03.jpeg --output ./pred --opts MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth + ``` + +### Evaluation with pre-trained weights + +We release our largest model (Swin-Base + CLIP-ViT-L/14) [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) (md5: 526080). + +- Test on ADE20K-150 and ADE-847 + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\) + ``` + +- Test on PascalContext-59 and PascalContext-459 + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT 0.6 DATASETS.TEST \(\"pascal_context_59_sem_seg_val\",\"pascal_context_459_sem_seg_val\",\) + ``` + +- Test on PascalVOC-20 + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT 0.45 DATASETS.TEST \(\"pascalvoc20_sem_seg_val\",\) + ``` + +#### Performance benchmark + +| method | backbone | training dataset | A-847 | PC-459 | A-150 | PC-59 | 
PAS-20 | +|------------------------------------|----------|------------------|:-----:|:------:|:-----:|:-----:|:------:| +| Open-vocabulary generalist models. | | | | | | | | +| SPNet | R-101 | PASCAL-15 | - | - | - | 24.3 | 18.3 | +| ZS3Net | R-101 | PASCAL-15 | - | - | - | 19.4 | 38.3 | +| LSeg | R-101 | PASCAL-15 | - | - | - | - | 47.4 | +| LSeg+ | R-101 | COCO Panoptic | 2.5 | 5.2 | 13.0 | 36.0 | 59.0 | +| SimBaseline | R-101c | COCO-Stuff-156 | - | - | 15.3 | - | 74.5 | +| ZegFormer | R-50 | COCO-Stuff-156 | - | - | 16.4 | - | 80.7 | +| OpenSeg | R-101 | COCO Panoptic | 4.0 | 6.5 | 15.3 | 36.9 | 60.0 | +| OVSeg (Ours) | R-101c | COCO-Stuff-171 | 7.1 | 11.0 | 24.8 | 53.3 | 92.6 | +| LSeg+ | Eff-B7 | COCO Panoptic | 3.8 | 7.8 | 18.0 | 46.5 | - | +| OpenSeg | Eff-B7 | COCO Panoptic | 6.3 | 9.0 | 21.1 | 42.1 | - | +| OVSeg (Ours) | Swin-B | COCO-Stuff-171 | 9.0 | 12.4 | 29.6 | 55.7 | 94.5 | +| Supervised specialist models. | | | | | | | | +| FCN | FCN-8s | Same as test | - | - | 29.4 | 37.8 | - | +| Deeplab | R-101 | Same as test | - | - | - | 45.7 | 77.7 | +| SelfTrain | Eff-L2 | Same as test | - | - | - | - | 90.0 | + +#### Ablation study + +- Mask prompt tuning can bring significant improvement without changing CLIP weights (Table 3 in [paper](https://arxiv.org/pdf/2210.04150.pdf)) + +Download the checkpoint with mpt only [ovseg_swinbase_vitL14_mpt_only.pt](https://drive.google.com/file/d/1LJGWFjHw76OGDNy9r9KQIaACfIm9KMhQ/view?usp=sharing) (md5: 2dd495). 
+ + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_mpt_only.pt DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\) + ``` + +- Mask prompt tuning can improve over fully finetuned model (Table 3 in [paper](https://arxiv.org/pdf/2210.04150.pdf)) + +With the same [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) checkpoint, set `MASK_PROMPT_FWD` as `False` + + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD False MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\) + ``` + +- The effects of class prediction ensemble (Table 6 in [paper](https://arxiv.org/pdf/2210.04150.pdf)) + +With the same [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) checkpoint, set `CLIP_ENSEMBLE` as `False`. 
+ + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE False MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\) + ``` + +### Training Segmentation model + + Our model is trained on COCO-Stuff + +- Training baseline w/ original CLIP + ``` + python train_net.py --num-gpu 8 --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD False + ``` + +To reproduce our final results, you may want to use our mask-adapted CLIP + +- Training ovseg w/ mask-adapted CLIP + ``` + python train_net.py --num-gpu 8 --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME #PATH_TO_MASKADAPTED_CLIP + ``` + +CAUTION: The final results are sensitive to the ensemble (appendix A.5 in [paper](https://arxiv.org/pdf/2210.04150.pdf)). Thus, you may want to use the ```tools/search_thr_ensemble_w.sh``` to find the best ensemble hyper-parameters. + +### Fine-tuning CLIP with collected mask-category pairs + +We are still working on this part, stay tuned! \ No newline at end of file diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000000000000000000000000000000000000..59ee72f5a078de9cf7a4d66aea7e6099b7345f02 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,50 @@ +## Installation + +### Requirements +- Linux with Python ≥ 3.8 +- PyTorch ≥ 1.8 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. + Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that the PyTorch version matches the one required by Detectron2. +- PyTorch3d: follow [Pytorch3d installation instructions](https://github.com/facebookresearch/pytorch3d/blob/main/INSTALL.md). +- Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 
+- Segment Anything Model: follow [SAM](https://github.com/facebookresearch/segment-anything). + +### Usage + +Install required packages. + +```bash +conda create --name ovseg python=3.8 +conda activate ovseg +conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge +conda install -c fvcore -c iopath -c conda-forge fvcore iopath +conda install pytorch3d -c pytorch3d +pip install -r requirements.txt +``` + +You need to download `detectron2==0.6` following [instructions](https://detectron2.readthedocs.io/en/latest/tutorials/install.html) + +```bash +python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html +``` + +If you cannot successfully install `pycocotools`, try this from [here](https://github.com/cocodataset/cocoapi/issues/351): +```bash +conda install -c conda-forge pycocotools +``` + +Install the SAM with: +```bash +pip install git+https://github.com/facebookresearch/segment-anything.git +``` +To fully support the SAM, install these packages: +```bash +pip install opencv-python pycocotools matplotlib onnxruntime onnx +``` + +Furthermore, install the modified clip package. + +```bash +cd third_party/CLIP +python -m pip install -Ue . +``` \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..50f2e656c8e006d68fce3c9ddd02d9069072214a --- /dev/null +++ b/LICENSE @@ -0,0 +1,399 @@ +Attribution-NonCommercial 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. 
Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. 
Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. 
Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + j. 
Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + l. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. 
The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. 
To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. 
If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. + + a. 
The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. 
Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8f2b6472fa2e15646df3bc9821f8bde16f788fe6 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +--- +title: Semantic Segment AnyRGBD +emoji: ⚡ +colorFrom: yellow +colorTo: green +sdk: gradio +sdk_version: 3.27.0 +app_file: app.py +pinned: false +license: mit +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/UI/sailvos3d/ex1/inputs/depth_000160.npy b/UI/sailvos3d/ex1/inputs/depth_000160.npy new file mode 100644 index 0000000000000000000000000000000000000000..8ab775b91010849a4a98e2d0e4a595cf1cec76df --- /dev/null +++ b/UI/sailvos3d/ex1/inputs/depth_000160.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96d4969b8b33250785d1996b1536bb9026536f420391c68255e326990138598e +size 4096128 diff --git a/UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz b/UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz new file mode 100644 index 0000000000000000000000000000000000000000..08b4ae3bb99971542db3661cf049c0d993d48710 --- /dev/null +++ b/UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:5afc2fdf1faa9b7b5d7808bb703c82aa5ccbb3154e2f62b3cc4989a2dcc92fe5 +size 1234 diff --git a/UI/sailvos3d/ex1/inputs/rgb_000160.bmp b/UI/sailvos3d/ex1/inputs/rgb_000160.bmp new file mode 100644 index 0000000000000000000000000000000000000000..03a72ef7da9ae9186efa66119cb287bfef4185e5 --- /dev/null +++ b/UI/sailvos3d/ex1/inputs/rgb_000160.bmp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c461e0c0cf6049bd9984ccaedb8b8fb07a1df06462931d38fdcd952bb38805c +size 3072054 diff --git a/UI/sailvos3d/ex2/inputs/depth_000540.npy b/UI/sailvos3d/ex2/inputs/depth_000540.npy new file mode 100644 index 0000000000000000000000000000000000000000..f952ea79b409c521ead8a8fea61f64a271079c9d --- /dev/null +++ b/UI/sailvos3d/ex2/inputs/depth_000540.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24fcabda3f7fd17856c4105279f2842b631cc18579d273b87dd8f2cb39e7df6 +size 4096128 diff --git a/UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz b/UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz new file mode 100644 index 0000000000000000000000000000000000000000..ed8f4184e0f67faaa1f448e8046d2f7f280b4244 --- /dev/null +++ b/UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb66f05ce4cdb6d6410bd3e34b70eeb07724810e70786249c30de0f50404fd64 +size 1234 diff --git a/UI/sailvos3d/ex2/inputs/rgb_000540.bmp b/UI/sailvos3d/ex2/inputs/rgb_000540.bmp new file mode 100644 index 0000000000000000000000000000000000000000..115b5ca1897d7d3d0b80fc411b0834fe106ee4bb --- /dev/null +++ b/UI/sailvos3d/ex2/inputs/rgb_000540.bmp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aa08869030d51751983bdab733f4f26342dc239abedb3195d3f4771d93701cf +size 3072054 diff --git a/UI/scannetv2/examples/scene0000_00/color/1660.jpg b/UI/scannetv2/examples/scene0000_00/color/1660.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dbbd168d04d7a13533a81b2d051ff8888cb8400a 
Binary files /dev/null and b/UI/scannetv2/examples/scene0000_00/color/1660.jpg differ diff --git a/UI/scannetv2/examples/scene0000_00/color/5560.jpg b/UI/scannetv2/examples/scene0000_00/color/5560.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cf72818630d852e9dba470b2fe2f8a517530a549 Binary files /dev/null and b/UI/scannetv2/examples/scene0000_00/color/5560.jpg differ diff --git a/UI/scannetv2/examples/scene0000_00/depth/1660.png b/UI/scannetv2/examples/scene0000_00/depth/1660.png new file mode 100644 index 0000000000000000000000000000000000000000..701312992acf73df575741d6ac237ab6d0d531db Binary files /dev/null and b/UI/scannetv2/examples/scene0000_00/depth/1660.png differ diff --git a/UI/scannetv2/examples/scene0000_00/depth/5560.png b/UI/scannetv2/examples/scene0000_00/depth/5560.png new file mode 100644 index 0000000000000000000000000000000000000000..cfc72a3d8d0f707be0f5b57a52044e767f4ba4a9 Binary files /dev/null and b/UI/scannetv2/examples/scene0000_00/depth/5560.png differ diff --git a/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_color.txt b/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_color.txt new file mode 100644 index 0000000000000000000000000000000000000000..50a318656a6eb9597e65cd0811bbd671e26b9c24 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_color.txt @@ -0,0 +1,4 @@ +1.000000 0.000000 0.000000 0.000000 +0.000000 1.000000 0.000000 0.000000 +0.000000 0.000000 1.000000 0.000000 +0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_depth.txt b/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_depth.txt new file mode 100644 index 0000000000000000000000000000000000000000..50a318656a6eb9597e65cd0811bbd671e26b9c24 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_depth.txt @@ -0,0 +1,4 @@ +1.000000 0.000000 0.000000 0.000000 +0.000000 1.000000 0.000000 0.000000 +0.000000 0.000000 1.000000 0.000000 
+0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_color.txt b/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_color.txt new file mode 100644 index 0000000000000000000000000000000000000000..b59f1c74414f34ea206c923e54863debd7b135c1 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_color.txt @@ -0,0 +1,4 @@ +1169.621094 0.000000 646.295044 0.000000 +0.000000 1167.105103 489.927032 0.000000 +0.000000 0.000000 1.000000 0.000000 +0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_depth.txt b/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_depth.txt new file mode 100644 index 0000000000000000000000000000000000000000..757719fe85cf22cb752a9f10f49ac100428e400c --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_depth.txt @@ -0,0 +1,4 @@ +577.590698 0.000000 318.905426 0.000000 +0.000000 578.729797 242.683609 0.000000 +0.000000 0.000000 1.000000 0.000000 +0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/pose/1660.txt b/UI/scannetv2/examples/scene0000_00/pose/1660.txt new file mode 100644 index 0000000000000000000000000000000000000000..df62592207edaee68d2a93452d80de21e535f347 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/pose/1660.txt @@ -0,0 +1,4 @@ +0.470083 -0.286393 0.834866 4.877258 +-0.882320 -0.127731 0.452986 4.841086 +-0.023094 -0.949560 -0.312735 1.390592 +0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/pose/5560.txt b/UI/scannetv2/examples/scene0000_00/pose/5560.txt new file mode 100644 index 0000000000000000000000000000000000000000..02ba667129dff485eebbc333b3ea5aae7030a410 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/pose/5560.txt @@ -0,0 +1,4 @@ +-0.994579 -0.050921 0.090665 2.842624 +-0.101826 0.300126 -0.948449 3.131151 +0.021085 -0.952539 -0.303684 1.467106 +0.000000 0.000000 0.000000 1.000000 diff 
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

"""Gradio demo for zero-shot semantic segmentation of RGBD inputs.

Two dataset tabs are exposed (SAIL-VOS 3D and ScanNet v2).  Each tab takes an
RGB image, a depth map, and -- for SAIL-VOS -- camera projection matrices,
runs open-vocabulary segmentation plus SAM, and returns 2D overlay videos, a
rendered depth map, and 3D point-cloud fly-through videos.  All intermediate
artifacts are written under ``outputs/``.
"""

import os

# Pin torch to the CUDA 11.3 wheels expected by the detectron2 / pytorch3d
# source builds installed below.
os.system(
    "pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 "
    "torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html"
)

try:
    import detectron2  # noqa: F401  (probe only: is the stack installed?)
except ImportError:
    # Fresh environment (e.g. a newly created HF Space): install the source
    # dependencies once.  Subsequent imports below will then succeed.
    os.system("pip install git+https://github.com/Jun-CEN/CLIP.git")
    os.system("pip install git+https://github.com/facebookresearch/detectron2.git")
    os.system("pip install git+https://github.com/facebookresearch/pytorch3d.git")
    os.system("pip install git+https://github.com/facebookresearch/segment-anything.git")

import argparse
import glob
import multiprocessing as mp
import time

import cv2
import numpy as np
import tqdm
import gradio as gr

from tools.util import *  # provides two_image_to_gif, among others

from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
from open_vocab_seg import add_ovseg_config
from open_vocab_seg.utils import VisualizationDemo, VisualizationDemoIndoor

# constants
WINDOW_NAME = "Open vocabulary segmentation"


def setup_cfg(args):
    """Build a frozen detectron2 config from ``args.config_file`` + ``args.opts``."""
    cfg = get_cfg()
    # for poly lr schedule
    add_deeplab_config(cfg)
    add_ovseg_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    return cfg


def get_parser():
    """Return the CLI argument parser shared by both demo pipelines."""
    parser = argparse.ArgumentParser(
        description="Detectron2 demo for open vocabulary segmentation"
    )
    parser.add_argument(
        "--config-file",
        default="configs/ovseg_swinB_vitL_demo.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "--input",
        default=["/mnt/lustre/jkyang/PSG4D/sailvos3d/downloads/sailvos3d/trevor_1_int/images/000160.bmp"],
        nargs="+",
        help="A list of space separated input images; "
        "or a single glob pattern such as 'directory/*.jpg'",
    )
    parser.add_argument(
        "--class-names",
        default=["person", "car", "motorcycle", "truck", "bird", "dog", "handbag", "suitcase", "bottle", "cup", "bowl", "chair", "potted plant", "bed", "dining table", "tv", "laptop", "cell phone", "bag", "bin", "box", "door", "road barrier", "stick", "lamp", "floor", "wall"],
        nargs="+",
        help="A list of user-defined class_names",
    )
    parser.add_argument(
        "--output",
        default="./pred",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )
    parser.add_argument(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=["MODEL.WEIGHTS", "ovseg_swinbase_vitL14_ft_mpt.pth"],
        nargs=argparse.REMAINDER,
    )
    return parser


# Parsed once at import; the greet_* callbacks mutate .input / .class_names
# per request (single-worker Gradio app, so this shared state is acceptable).
args = get_parser().parse_args()


def greet_sailvos3d(rgb_input, depth_map_input, rage_matrices_input, class_candidates):
    """Run the SAIL-VOS 3D pipeline on one RGB/depth/projection triple.

    Args:
        rgb_input: filepath to the RGB image (from a gr.Image filepath input).
        depth_map_input: uploaded depth .npy file (gr.File object with .name).
        rage_matrices_input: uploaded .npz projection matrices (gr.File).
        class_candidates: comma-separated class names, e.g. "person, car".

    Returns:
        Tuple of five output paths/images wired to the Gradio result panel:
        (RGB 2D video, RGB 3D video, rendered depth map, depth 2D video,
        depth 3D video).
    """
    args.input = [rgb_input]
    args.class_names = class_candidates.split(', ')
    depth_map_path = depth_map_input.name
    rage_matrices_path = rage_matrices_input.name
    print(args.input, args.class_names, depth_map_path, rage_matrices_path)
    mp.set_start_method("spawn", force=True)
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemo(cfg)
    class_names = args.class_names
    if not args.input:
        raise NotImplementedError
    if len(args.input) == 1:
        # Expand a single glob pattern (or ~) into concrete paths.
        args.input = glob.glob(os.path.expanduser(args.input[0]))
        assert args.input, "The input path(s) was not found"
    os.makedirs('outputs', exist_ok=True)
    for path in tqdm.tqdm(args.input, disable=not args.output):
        start_time = time.time()
        (predictions, visualized_output_rgb, visualized_output_depth,
         visualized_output_rgb_sam, visualized_output_depth_sam) = demo.run_on_image_sam(
            path, class_names, depth_map_path, rage_matrices_path)
        logger.info(
            "{}: {} in {:.2f}s".format(
                path,
                "detected {} instances".format(len(predictions["instances"]))
                if "instances" in predictions
                else "finished",
                time.time() - start_time,
            )
        )

        if args.output:
            # Results are always written to fixed paths under outputs/;
            # the videos below are assembled from them.
            visualized_output_rgb.save('outputs/RGB_Semantic_SAM.png')
            visualized_output_depth.save('outputs/Depth_Semantic_SAM.png')
            visualized_output_rgb_sam.save('outputs/RGB_Semantic_SAM_Mask.png')
            visualized_output_depth_sam.save('outputs/Depth_Semantic_SAM_Mask.png')
            # Back-project each 2D visualization into a colored point cloud.
            rgb_3d_sam = demo.get_xyzrgb('outputs/RGB_Semantic_SAM.png', depth_map_path, rage_matrices_path)
            depth_3d_sam = demo.get_xyzrgb('outputs/Depth_Semantic_SAM.png', depth_map_path, rage_matrices_path)
            rgb_3d_sam_mask = demo.get_xyzrgb('outputs/RGB_Semantic_SAM_Mask.png', depth_map_path, rage_matrices_path)
            depth_3d_sam_mask = demo.get_xyzrgb('outputs/Depth_Semantic_SAM_Mask.png', depth_map_path, rage_matrices_path)
            np.savez('outputs/xyzrgb.npz',
                     rgb_3d_sam=rgb_3d_sam,
                     depth_3d_sam=depth_3d_sam,
                     rgb_3d_sam_mask=rgb_3d_sam_mask,
                     depth_3d_sam_mask=depth_3d_sam_mask)
            demo.render_3d_video('outputs/xyzrgb.npz', depth_map_path)
        else:
            # Local (non-Gradio) debugging path: show the RGB overlay.
            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
            cv2.imshow(WINDOW_NAME, visualized_output_rgb.get_image()[:, :, ::-1])
            if cv2.waitKey(0) == 27:
                break  # esc to quit

    Depth_Semantic_SAM_Mask = read_image('outputs/Depth_Semantic_SAM_Mask.png')
    RGB_Semantic_SAM_Mask = read_image('outputs/RGB_Semantic_SAM_Mask.png')
    Depth_Semantic_SAM = read_image('outputs/Depth_Semantic_SAM.png')
    RGB_Semantic_SAM = read_image('outputs/RGB_Semantic_SAM.png')
    two_image_to_gif(Depth_Semantic_SAM_Mask, Depth_Semantic_SAM, 'Depth_Semantic_SAM_2D')
    two_image_to_gif(RGB_Semantic_SAM_Mask, RGB_Semantic_SAM, 'RGB_Semantic_SAM_2D')
    Depth_Semantic_SAM_2D = 'outputs/Depth_Semantic_SAM_2D.mp4'
    RGB_Semantic_SAM_2D = 'outputs/RGB_Semantic_SAM_2D.mp4'
    Depth_map = read_image('outputs/Depth_rendered.png')
    Depth_Semantic_SAM_Mask_gif = 'outputs/Depth_3D_All.mp4'
    RGB_Semantic_SAM_Mask_gif = 'outputs/RGB_3D_All.mp4'
    return RGB_Semantic_SAM_2D, RGB_Semantic_SAM_Mask_gif, Depth_map, Depth_Semantic_SAM_2D, Depth_Semantic_SAM_Mask_gif


def greet_scannet(rgb_input, depth_map_input, class_candidates):
    """Run the ScanNet v2 pipeline on one RGB/depth pair.

    Args:
        rgb_input: filepath to the RGB image.
        depth_map_input: uploaded depth .png file (gr.File object with .name).
        class_candidates: comma-separated class names.

    Returns:
        Same five-tuple as :func:`greet_sailvos3d`.
    """
    depth_map_path = depth_map_input.name
    class_names = class_candidates.split(', ')
    print(rgb_input, depth_map_path, class_names)
    mp.set_start_method("spawn", force=True)
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemoIndoor(cfg)
    start_time = time.time()
    predictions, output2D, output3D = demo.run_on_pcd_ui(rgb_input, depth_map_path, class_names)
    logger.info("finished in {:.2f}s".format(time.time() - start_time))

    os.makedirs('outputs', exist_ok=True)
    output2D['sem_seg_on_rgb'].save('outputs/RGB_Semantic_SAM.png')
    output2D['sem_seg_on_depth'].save('outputs/Depth_Semantic_SAM.png')
    output2D['sam_seg_on_rgb'].save('outputs/RGB_Semantic_SAM_Mask.png')
    output2D['sam_seg_on_depth'].save('outputs/Depth_Semantic_SAM_Mask.png')
    rgb_3d_sem = output3D['rgb_3d_sem']
    depth_3d_sem = output3D['depth_3d_sem']
    rgb_3d_sam = output3D['rgb_3d_sam']
    depth_3d_sam = output3D['depth_3d_sam']

    # NOTE(review): the archive keys deliberately map the *semantic* clouds
    # onto the *_sam names (and SAM clouds onto *_mask) because
    # render_3d_video expects those key names -- confirm against renderer.
    np.savez('outputs/xyzrgb.npz',
             rgb_3d_sam=rgb_3d_sem,
             depth_3d_sam=depth_3d_sem,
             rgb_3d_sam_mask=rgb_3d_sam,
             depth_3d_sam_mask=depth_3d_sam)
    demo.render_3d_video('outputs/xyzrgb.npz')

    Depth_Semantic_SAM_Mask = read_image('outputs/Depth_Semantic_SAM_Mask.png')
    RGB_Semantic_SAM_Mask = read_image('outputs/RGB_Semantic_SAM_Mask.png')
    Depth_Semantic_SAM = read_image('outputs/Depth_Semantic_SAM.png')
    RGB_Semantic_SAM = read_image('outputs/RGB_Semantic_SAM.png')
    two_image_to_gif(Depth_Semantic_SAM_Mask, Depth_Semantic_SAM, 'Depth_Semantic_SAM_2D')
    two_image_to_gif(RGB_Semantic_SAM_Mask, RGB_Semantic_SAM, 'RGB_Semantic_SAM_2D')
    Depth_Semantic_SAM_2D = 'outputs/Depth_Semantic_SAM_2D.mp4'
    RGB_Semantic_SAM_2D = 'outputs/RGB_Semantic_SAM_2D.mp4'
    Depth_map = read_image('outputs/Depth_rendered.png')
    Depth_Semantic_SAM_Mask_gif = 'outputs/Depth_3D_All.mp4'
    RGB_Semantic_SAM_Mask_gif = 'outputs/RGB_3D_All.mp4'
    return RGB_Semantic_SAM_2D, RGB_Semantic_SAM_Mask_gif, Depth_map, Depth_Semantic_SAM_2D, Depth_Semantic_SAM_Mask_gif


SAILVOS_CLASSES = (
    'person, car, motorcycle, truck, bird, dog, handbag, suitcase, bottle, '
    'cup, bowl, chair, potted plant, bed, dining table, tv, laptop, '
    'cell phone, bag, bin, box, door, road barrier, stick, lamp, floor, wall'
)
SCANNET_CLASSES = (
    'wall, floor, cabinet, bed, chair, sofa, table, door, window, bookshelf, '
    'picture, counter, desk, curtain, refrigerator, shower curtain, toilet, '
    'sink, bathtub, other furniture'
)

with gr.Blocks(analytics_enabled=False) as segrgbd_iface:
    # NOTE(review): the original header HTML was garbled in the dump; this is
    # a minimal reconstruction (title + Github link) -- confirm against repo.
    gr.Markdown(
        "<div align='center'><h1>Segment Any RGBD</h1>"
        "<a href='https://github.com/Jun-CEN/SegmentAnyRGBD'>Github</a></div>"
    )
    gr.Markdown(
        "Note that you need a GPU for this project. You may duplicate the "
        "space and upgrade to GPU in settings for better performance and "
        "faster inference without waiting in the queue."
    )

    with gr.Tab(label="Dataset: Sailvos3D"):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        Input_RGB_Component = gr.Image(label='RGB_Input', type='filepath').style(width=320, height=200)
                        Depth_Map_Output_Component = gr.Image(label="Vis_Depth_Map").style(width=320, height=200)
                    with gr.Row():
                        Depth_Map_Input_Component = gr.File(label='input_Depth_map')
                        Component_2D_to_3D_Projection_Parameters = gr.File(label='2D_to_3D_Projection_Parameters')
                    with gr.Row():
                        Class_Candidates_Component = gr.Text(label='Class_Candidates')
                    vc_end_btn = gr.Button("Send")
                with gr.Tab(label='Result'):
                    with gr.Row():
                        RGB_Semantic_SAM_Mask_Component = gr.Video(label="RGB_Semantic_SAM_Mask").style(width=320, height=200)
                        RGB_Semantic_SAM_Mask_3D_Component = gr.Video(label="Video_3D_RGB_Semantic_SAM_Mask").style(width=320, height=200)
                    with gr.Row():
                        Depth_Semantic_SAM_Mask_Component = gr.Video(label="Depth_Semantic_SAM_Mask").style(width=320, height=200)
                        Depth_Semantic_SAM_Mask_3D_Component = gr.Video(label="Video_3D_Depth_Semantic_SAM_Mask").style(width=320, height=200)
            with gr.Row():
                gr.Markdown(
                    "It takes around 2 to 5 minutes to get the final results. "
                    "The framework initialization, SAM segmentation, zero-shot "
                    "semantic segmentation and 3D results rendering take long time."
                )
            gr.Examples(
                examples=[
                    [
                        'UI/sailvos3d/ex1/inputs/rgb_000160.bmp',
                        'UI/sailvos3d/ex1/inputs/depth_000160.npy',
                        'UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz',
                        SAILVOS_CLASSES,
                    ],
                    [
                        'UI/sailvos3d/ex2/inputs/rgb_000540.bmp',
                        'UI/sailvos3d/ex2/inputs/depth_000540.npy',
                        'UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz',
                        SAILVOS_CLASSES,
                    ],
                ],
                inputs=[Input_RGB_Component, Depth_Map_Input_Component, Component_2D_to_3D_Projection_Parameters, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet_sailvos3d,
            )
            vc_end_btn.click(
                inputs=[Input_RGB_Component, Depth_Map_Input_Component, Component_2D_to_3D_Projection_Parameters, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet_sailvos3d,
            )

    with gr.Tab(label="Dataset: Scannet"):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        Input_RGB_Component = gr.Image(label='RGB_Input', type='filepath').style(width=320, height=200)
                        Depth_Map_Output_Component = gr.Image(label="Vis_Depth_Map").style(width=320, height=200)
                    with gr.Row():
                        Depth_Map_Input_Component = gr.File(label="Input_Depth_Map")
                        Class_Candidates_Component = gr.Text(label='Class_Candidates')
                    vc_end_btn = gr.Button("Send")
                with gr.Tab(label='Result'):
                    with gr.Row():
                        RGB_Semantic_SAM_Mask_Component = gr.Video(label="RGB_Semantic_SAM_Mask").style(width=320, height=200)
                        RGB_Semantic_SAM_Mask_3D_Component = gr.Video(label="Video_3D_RGB_Semantic_SAM_Mask").style(width=320, height=200)
                    with gr.Row():
                        Depth_Semantic_SAM_Mask_Component = gr.Video(label="Depth_Semantic_SAM_Mask").style(width=320, height=200)
                        Depth_Semantic_SAM_Mask_3D_Component = gr.Video(label="Video_3D_Depth_Semantic_SAM_Mask").style(width=320, height=200)
            with gr.Row():
                gr.Markdown(
                    "It takes around 2 to 5 minutes to get the final results. "
                    "The framework initialization, SAM segmentation, zero-shot "
                    "semantic segmentation and 3D results rendering take long time."
                )
            gr.Examples(
                examples=[
                    [
                        'UI/scannetv2/examples/scene0000_00/color/1660.jpg',
                        'UI/scannetv2/examples/scene0000_00/depth/1660.png',
                        SCANNET_CLASSES,
                    ],
                    [
                        'UI/scannetv2/examples/scene0000_00/color/5560.jpg',
                        'UI/scannetv2/examples/scene0000_00/depth/5560.png',
                        SCANNET_CLASSES,
                    ],
                ],
                inputs=[Input_RGB_Component, Depth_Map_Input_Component, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet_scannet,
            )
            vc_end_btn.click(
                inputs=[Input_RGB_Component, Depth_Map_Input_Component, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet_scannet,
            )

demo = segrgbd_iface
demo.launch()
120000 + WARMUP_FACTOR: 1e-6 + WARMUP_ITERS: 1500 + LR_SCHEDULER_NAME: "WarmupPolyLR" + WEIGHT_DECAY: 0.01 + WEIGHT_DECAY_NORM: 0.0 + WEIGHT_DECAY_EMBED: 0.0 + BACKBONE_MULTIPLIER: 1.0 + TEST_IMS_PER_BATCH: 1 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.01 + NORM_TYPE: 2.0 +INPUT: + MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] + MIN_SIZE_TRAIN_SAMPLING: "choice" + MIN_SIZE_TEST: 640 + MAX_SIZE_TRAIN: 2560 + MAX_SIZE_TEST: 2560 + CROP: + ENABLED: True + TYPE: "absolute" + SIZE: (640, 640) + SINGLE_CATEGORY_MAX_AREA: 1.0 + COLOR_AUG_SSD: True + SIZE_DIVISIBILITY: 640 # used in dataset mapper + FORMAT: "RGB" +TEST: + EVAL_PERIOD: 5000 + AUG: + ENABLED: False + MIN_SIZES: [256, 384, 512, 640, 768, 896] + MAX_SIZE: 3584 + FLIP: True +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: True + NUM_WORKERS: 4 +VERSION: 2 \ No newline at end of file diff --git a/configs/ovseg_swinB_vitL_demo.yaml b/configs/ovseg_swinB_vitL_demo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20a3be9304b898ebf7cc2535d43833900b6233f2 --- /dev/null +++ b/configs/ovseg_swinB_vitL_demo.yaml @@ -0,0 +1,99 @@ +MODEL: + META_ARCHITECTURE: "OVSegDEMO" + BACKBONE: + FREEZE_AT: 0 + NAME: "D2SwinTransformer" + SWIN: + EMBED_DIM: 128 + DEPTHS: [2, 2, 18, 2] + NUM_HEADS: [4, 8, 16, 32] + WINDOW_SIZE: 12 + APE: False + DROP_PATH_RATE: 0.3 + PATCH_NORM: True + PRETRAIN_IMG_SIZE: 384 + WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + SEM_SEG_HEAD: + NAME: "OpenVocabMaskFormerHead" + IN_FEATURES: ["res2", "res3", "res4", "res5"] + IGNORE_VALUE: 255 + NUM_CLASSES: 171 # number of categories in training set + EMBEDDING_DIM: 768 + EMBED_LAYERS: 2 + COMMON_STRIDE: 4 # not used, hard-coded + LOSS_WEIGHT: 1.0 + CONVS_DIM: 256 + MASK_DIM: 256 + NORM: "GN" + MASK_FORMER: + TRANSFORMER_IN_FEATURE: "res5" + DEEP_SUPERVISION: True + NO_OBJECT_WEIGHT: 
0.1 + DICE_WEIGHT: 1.0 + MASK_WEIGHT: 20.0 + HIDDEN_DIM: 256 + NUM_OBJECT_QUERIES: 100 + NHEADS: 8 + DROPOUT: 0.1 + DIM_FEEDFORWARD: 2048 + ENC_LAYERS: 0 + DEC_LAYERS: 6 + PRE_NORM: False + CLIP_ADAPTER: + TEXT_TEMPLATES: "vild" + CLIP_MODEL_NAME: "ViT-L/14" + MASK_FILL: "mean" + MASK_EXPAND_RATIO: 1.0 + MASK_THR: 0.1 # choose the foreground objects + MASK_MATTING: False # use soft background, default not used + MASK_PROMPT_DEPTH: 3 + MASK_PROMPT_FWD: True # use mask prompt during forward + REGION_RESIZED: True # resize to the input of clip, e.g., 224 + CLIP_ENSEMBLE: True # use ensemble of two classification branches + CLIP_ENSEMBLE_WEIGHT: 0.0 +DATASETS: + TRAIN: ("coco_2017_train_stuff_sem_seg",) + TEST: ("ade20k_sem_seg_val",) +SOLVER: + IMS_PER_BATCH: 32 + BASE_LR: 0.00006 + MAX_ITER: 120000 + WARMUP_FACTOR: 1e-6 + WARMUP_ITERS: 1500 + WEIGHT_DECAY: 0.01 + WEIGHT_DECAY_NORM: 0.0 + WEIGHT_DECAY_EMBED: 0.0 + BACKBONE_MULTIPLIER: 1.0 + TEST_IMS_PER_BATCH: 1 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.01 + NORM_TYPE: 2.0 +INPUT: + MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] + MIN_SIZE_TRAIN_SAMPLING: "choice" + MIN_SIZE_TEST: 640 + MAX_SIZE_TRAIN: 2560 + MAX_SIZE_TEST: 2560 + CROP: + ENABLED: True + TYPE: "absolute" + SIZE: (640, 640) + SINGLE_CATEGORY_MAX_AREA: 1.0 + COLOR_AUG_SSD: True + SIZE_DIVISIBILITY: 640 # used in dataset mapper + FORMAT: "RGB" +TEST: + EVAL_PERIOD: 5000 + AUG: + ENABLED: False + MIN_SIZES: [256, 384, 512, 640, 768, 896] + MAX_SIZE: 3584 + FLIP: True +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: True + NUM_WORKERS: 4 +VERSION: 2 \ No newline at end of file diff --git a/datasets/DATASETS.md b/datasets/DATASETS.md new file mode 100644 index 0000000000000000000000000000000000000000..30d30ba314c9842098c5c38d0a47ce780283d9d9 --- /dev/null +++ b/datasets/DATASETS.md @@ -0,0 +1,122 @@ +## Prepare Datasets for OVSeg + +This doc is a modification/extension of 
[MaskFormer](https://github.com/facebookresearch/MaskFormer/blob/main/datasets/README.md) following [Detectron2 fromat](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html). + +A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) +for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc). +This document explains how to setup the builtin datasets so they can be used by the above APIs. +[Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`, +and how to add new datasets to them. + +OVSeg has builtin support for a few datasets. +The datasets are assumed to exist in a directory specified by the environment variable +`DETECTRON2_DATASETS`. +Under this directory, detectron2 will look for datasets in the structure described below, if needed. +``` +$DETECTRON2_DATASETS/ + coco/ # COCOStuff-171 + ADEChallengeData2016/ # ADE20K-150 + ADE20K_2021_17_01/ # ADE20K-847 + VOCdevkit/ + VOC2012/ # PASCALVOC-20 + VOC2010/ # PASCALContext-59, PASCALContext-459 +``` + +You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. +If left unset, the default is `./datasets` relative to your current working directory. + +Without specific notifications, our model is trained on COCOStuff-171 and evlauted on ADE20K-150, ADE20K-847, PASCALVOC-20, PASCALContext-59 and PASCALContext-459. 
+ +| dataset | split | # images | # categories | +|:--------------:|:---------:|:--------:|:------------:| +| COCO Stuff | train2017 | 118K | 171 | +| ADE20K | val | 2K | 150/847 | +| Pascal VOC | val | 1.5K | 20 | +| Pascal Context | val | 5K | 59/459 | + + +### Expected dataset structure for [COCO Stuff](https://github.com/nightrome/cocostuff): +``` +coco/ + train2017/ # http://images.cocodataset.org/zips/train2017.zip + annotations/ # http://images.cocodataset.org/annotations/annotations_trainval2017.zip + stuffthingmaps/ + stuffthingmaps_trainval2017.zip # http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip + train2017/ + # below are generated + stuffthingmaps_detectron2/ + train2017/ +``` + +The directory `stuffthingmaps_detectron2` is generated by running `python datasets/prepare_coco_stuff_sem_seg.py`. + + + +### Expected dataset structure for [ADE20k Scene Parsing (ADE20K-150)](http://sceneparsing.csail.mit.edu/): +``` +ADEChallengeData2016/ + annotations/ + images/ + objectInfo150.txt + # below are generated + annotations_detectron2/ +``` +The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`. + + +### Expected dataset structure for [ADE20k-Full (ADE20K-847)](https://github.com/CSAILVision/ADE20K#download): +``` +ADE20K_2021_17_01/ + images/ + index_ade20k.pkl + objects.txt + # below are generated + images_detectron2/ + annotations_detectron2/ +``` +The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/prepare_ade20k_full_sem_seg.py`. 
+ +### Expected dataset structure for [Pascal VOC 2012 (PASCALVOC-20)](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/#devkit): +``` +VOCdevkit/VOC2012/ + Annotations/ + ImageSets/ + JPEGImages/ + SegmentationClass/ + SegmentationObject/ + SegmentationClassAug/ # https://github.com/kazuto1011/deeplab-pytorch/blob/master/data/datasets/voc12/README.md + # below are generated + images_detectron2/ + annotations_detectron2/ +``` + +It starts with a tar file `VOCtrainval_11-May-2012.tar`. + +We use SBD augmented training data as `SegmentationClassAug` following [Deeplab](https://github.com/kazuto1011/deeplab-pytorch/blob/master/data/datasets/voc12/README.md). + +The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/prepare_voc_sem_seg.py`. + + +### Expected dataset structure for [Pascal Context](https://www.cs.stanford.edu/~roozbeh/pascal-context/): + +``` +VOCdevkit/VOC2010/ + Annotations/ + ImageSets/ + JPEGImages/ + SegmentationClass/ + SegmentationObject/ + # below are from https://www.cs.stanford.edu/~roozbeh/pascal-context/trainval.tar.gz + trainval/ + labels.txt + 59_labels.txt # https://www.cs.stanford.edu/~roozbeh/pascal-context/59_labels.txt + pascalcontext_val.txt # https://drive.google.com/file/d/1BCbiOKtLvozjVnlTJX51koIveUZHCcUh/view?usp=sharing + # below are generated + annotations_detectron2/ + pc459_val + pc59_val +``` +It starts with a tar file `VOCtrainval_03-May-2010.tar`. You may want to download the 5K validation set [here](https://drive.google.com/file/d/1BCbiOKtLvozjVnlTJX51koIveUZHCcUh/view?usp=sharing). + +The directory `annotations_detectron2` is generated by running `python datasets/prepare_pascal_context.py`. 
+ diff --git a/datasets/prepare_ade20k_full_sem_seg.py b/datasets/prepare_ade20k_full_sem_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..4a55e039549ff0aaf928a4dddee7a94ea8d0f6bf --- /dev/null +++ b/datasets/prepare_ade20k_full_sem_seg.py @@ -0,0 +1,1011 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import os +import pickle as pkl +from pathlib import Path + +import cv2 +import numpy as np +import tqdm +from PIL import Image + +ADE20K_SEM_SEG_FULL_CATEGORIES = [ + {"name": "wall", "id": 2978, "trainId": 0}, + {"name": "building, edifice", "id": 312, "trainId": 1}, + {"name": "sky", "id": 2420, "trainId": 2}, + {"name": "tree", "id": 2855, "trainId": 3}, + {"name": "road, route", "id": 2131, "trainId": 4}, + {"name": "floor, flooring", "id": 976, "trainId": 5}, + {"name": "ceiling", "id": 447, "trainId": 6}, + {"name": "bed", "id": 165, "trainId": 7}, + {"name": "sidewalk, pavement", "id": 2377, "trainId": 8}, + {"name": "earth, ground", "id": 838, "trainId": 9}, + {"name": "cabinet", "id": 350, "trainId": 10}, + {"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11}, + {"name": "grass", "id": 1125, "trainId": 12}, + {"name": "windowpane, window", "id": 3055, "trainId": 13}, + {"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14}, + {"name": "mountain, mount", "id": 1610, "trainId": 15}, + {"name": "plant, flora, plant life", "id": 1910, "trainId": 16}, + {"name": "table", "id": 2684, "trainId": 17}, + {"name": "chair", "id": 471, "trainId": 18}, + {"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19}, + {"name": "door", "id": 774, "trainId": 20}, + {"name": "sofa, couch, lounge", "id": 2473, "trainId": 21}, + {"name": "sea", "id": 2264, "trainId": 22}, + {"name": "painting, picture", "id": 1735, "trainId": 23}, + {"name": "water", "id": 2994, "trainId": 24}, + {"name": "mirror", "id": 
1564, "trainId": 25}, + {"name": "house", "id": 1276, "trainId": 26}, + {"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27}, + {"name": "shelf", "id": 2329, "trainId": 28}, + {"name": "armchair", "id": 57, "trainId": 29}, + {"name": "fence, fencing", "id": 907, "trainId": 30}, + {"name": "field", "id": 913, "trainId": 31}, + {"name": "lamp", "id": 1395, "trainId": 32}, + {"name": "rock, stone", "id": 2138, "trainId": 33}, + {"name": "seat", "id": 2272, "trainId": 34}, + {"name": "river", "id": 2128, "trainId": 35}, + {"name": "desk", "id": 724, "trainId": 36}, + {"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37}, + {"name": "railing, rail", "id": 2053, "trainId": 38}, + {"name": "signboard, sign", "id": 2380, "trainId": 39}, + {"name": "cushion", "id": 689, "trainId": 40}, + {"name": "path", "id": 1788, "trainId": 41}, + {"name": "work surface", "id": 3087, "trainId": 42}, + {"name": "stairs, steps", "id": 2530, "trainId": 43}, + {"name": "column, pillar", "id": 581, "trainId": 44}, + {"name": "sink", "id": 2388, "trainId": 45}, + {"name": "wardrobe, closet, press", "id": 2985, "trainId": 46}, + {"name": "snow", "id": 2454, "trainId": 47}, + {"name": "refrigerator, icebox", "id": 2096, "trainId": 48}, + {"name": "base, pedestal, stand", "id": 137, "trainId": 49}, + {"name": "bridge, span", "id": 294, "trainId": 50}, + {"name": "blind, screen", "id": 212, "trainId": 51}, + {"name": "runway", "id": 2185, "trainId": 52}, + {"name": "cliff, drop, drop-off", "id": 524, "trainId": 53}, + {"name": "sand", "id": 2212, "trainId": 54}, + {"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55}, + {"name": "pillow", "id": 1869, "trainId": 56}, + {"name": "screen door, screen", "id": 2251, "trainId": 57}, + {"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58}, + {"name": "skyscraper", "id": 2423, "trainId": 59}, + {"name": "grandstand, covered stand", "id": 1121, "trainId": 60}, + {"name": 
"box", "id": 266, "trainId": 61}, + {"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62}, + {"name": "palm, palm tree", "id": 1744, "trainId": 63}, + {"name": "double door", "id": 783, "trainId": 64}, + {"name": "coffee table, cocktail table", "id": 571, "trainId": 65}, + {"name": "counter", "id": 627, "trainId": 66}, + {"name": "countertop", "id": 629, "trainId": 67}, + {"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68}, + {"name": "kitchen island", "id": 1374, "trainId": 69}, + {"name": "boat", "id": 223, "trainId": 70}, + {"name": "waterfall, falls", "id": 3016, "trainId": 71}, + { + "name": "stove, kitchen stove, range, kitchen range, cooking stove", + "id": 2598, + "trainId": 72, + }, + {"name": "flower", "id": 978, "trainId": 73}, + {"name": "bookcase", "id": 239, "trainId": 74}, + {"name": "controls", "id": 608, "trainId": 75}, + {"name": "book", "id": 236, "trainId": 76}, + {"name": "stairway, staircase", "id": 2531, "trainId": 77}, + {"name": "streetlight, street lamp", "id": 2616, "trainId": 78}, + { + "name": "computer, computing machine, computing device, data processor, electronic computer, information processing system", + "id": 591, + "trainId": 79, + }, + { + "name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle", + "id": 327, + "trainId": 80, + }, + {"name": "swivel chair", "id": 2679, "trainId": 81}, + {"name": "light, light source", "id": 1451, "trainId": 82}, + {"name": "bench", "id": 181, "trainId": 83}, + {"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84}, + {"name": "towel", "id": 2821, "trainId": 85}, + {"name": "fountain", "id": 1023, "trainId": 86}, + {"name": "embankment", "id": 855, "trainId": 87}, + { + "name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box", + "id": 2733, + "trainId": 88, + }, + {"name": "van", "id": 2928, "trainId": 89}, + 
{"name": "hill", "id": 1240, "trainId": 90}, + {"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91}, + {"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92}, + {"name": "truck, motortruck", "id": 2880, "trainId": 93}, + {"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94}, + {"name": "pole", "id": 1936, "trainId": 95}, + {"name": "tower", "id": 2828, "trainId": 96}, + {"name": "court", "id": 631, "trainId": 97}, + {"name": "ball", "id": 103, "trainId": 98}, + { + "name": "aircraft carrier, carrier, flattop, attack aircraft carrier", + "id": 3144, + "trainId": 99, + }, + {"name": "buffet, counter, sideboard", "id": 308, "trainId": 100}, + {"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101}, + {"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102}, + {"name": "minibike, motorbike", "id": 1563, "trainId": 103}, + {"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104}, + {"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105}, + {"name": "step, stair", "id": 2569, "trainId": 106}, + {"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107}, + {"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108}, + {"name": "doorframe, doorcase", "id": 778, "trainId": 109}, + {"name": "sconce", "id": 2243, "trainId": 110}, + {"name": "pond", "id": 1941, "trainId": 111}, + {"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112}, + {"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113}, + {"name": "bag", "id": 95, "trainId": 114}, + {"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115}, + {"name": "gazebo", "id": 1087, "trainId": 116}, + {"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117}, + {"name": "land, ground, soil", "id": 1401, "trainId": 118}, + {"name": "board, plank", "id": 220, "trainId": 119}, + 
{"name": "arcade machine", "id": 47, "trainId": 120}, + {"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121}, + {"name": "bar", "id": 123, "trainId": 122}, + {"name": "stall, stand, sales booth", "id": 2537, "trainId": 123}, + {"name": "playground", "id": 1927, "trainId": 124}, + {"name": "ship", "id": 2337, "trainId": 125}, + {"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126}, + { + "name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", + "id": 64, + "trainId": 127, + }, + {"name": "bottle", "id": 249, "trainId": 128}, + {"name": "cradle", "id": 642, "trainId": 129}, + {"name": "pot, flowerpot", "id": 1981, "trainId": 130}, + { + "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", + "id": 609, + "trainId": 131, + }, + {"name": "train, railroad train", "id": 2840, "trainId": 132}, + {"name": "stool", "id": 2586, "trainId": 133}, + {"name": "lake", "id": 1393, "trainId": 134}, + {"name": "tank, storage tank", "id": 2704, "trainId": 135}, + {"name": "ice, water ice", "id": 1304, "trainId": 136}, + {"name": "basket, handbasket", "id": 146, "trainId": 137}, + {"name": "manhole", "id": 1494, "trainId": 138}, + {"name": "tent, collapsible shelter", "id": 2739, "trainId": 139}, + {"name": "canopy", "id": 389, "trainId": 140}, + {"name": "microwave, microwave oven", "id": 1551, "trainId": 141}, + {"name": "barrel, cask", "id": 131, "trainId": 142}, + {"name": "dirt track", "id": 738, "trainId": 143}, + {"name": "beam", "id": 161, "trainId": 144}, + {"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145}, + {"name": "plate", "id": 1919, "trainId": 146}, + {"name": "screen, crt screen", "id": 3109, "trainId": 147}, + {"name": "ruins", "id": 2179, "trainId": 148}, + {"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149}, + {"name": "blanket, cover", "id": 206, "trainId": 150}, + {"name": 
"plaything, toy", "id": 1930, "trainId": 151}, + {"name": "food, solid food", "id": 1002, "trainId": 152}, + {"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153}, + {"name": "oven", "id": 1708, "trainId": 154}, + {"name": "stage", "id": 2526, "trainId": 155}, + {"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156}, + {"name": "umbrella", "id": 2901, "trainId": 157}, + {"name": "sculpture", "id": 2262, "trainId": 158}, + {"name": "aqueduct", "id": 44, "trainId": 159}, + {"name": "container", "id": 597, "trainId": 160}, + {"name": "scaffolding, staging", "id": 2235, "trainId": 161}, + {"name": "hood, exhaust hood", "id": 1260, "trainId": 162}, + {"name": "curb, curbing, kerb", "id": 682, "trainId": 163}, + {"name": "roller coaster", "id": 2151, "trainId": 164}, + {"name": "horse, equus caballus", "id": 3107, "trainId": 165}, + {"name": "catwalk", "id": 432, "trainId": 166}, + {"name": "glass, drinking glass", "id": 1098, "trainId": 167}, + {"name": "vase", "id": 2932, "trainId": 168}, + {"name": "central reservation", "id": 461, "trainId": 169}, + {"name": "carousel", "id": 410, "trainId": 170}, + {"name": "radiator", "id": 2046, "trainId": 171}, + {"name": "closet", "id": 533, "trainId": 172}, + {"name": "machine", "id": 1481, "trainId": 173}, + {"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174}, + {"name": "fan", "id": 894, "trainId": 175}, + {"name": "inflatable bounce game", "id": 1322, "trainId": 176}, + {"name": "pitch", "id": 1891, "trainId": 177}, + {"name": "paper", "id": 1756, "trainId": 178}, + {"name": "arcade, colonnade", "id": 49, "trainId": 179}, + {"name": "hot tub", "id": 1272, "trainId": 180}, + {"name": "helicopter", "id": 1229, "trainId": 181}, + {"name": "tray", "id": 2850, "trainId": 182}, + {"name": "partition, divider", "id": 1784, "trainId": 183}, + {"name": "vineyard", "id": 2962, "trainId": 184}, + {"name": "bowl", "id": 259, "trainId": 185}, + {"name": "bullring", "id": 
319, "trainId": 186}, + {"name": "flag", "id": 954, "trainId": 187}, + {"name": "pot", "id": 1974, "trainId": 188}, + {"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189}, + {"name": "shower", "id": 2356, "trainId": 190}, + {"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191}, + {"name": "bulletin board, notice board", "id": 318, "trainId": 192}, + {"name": "confessional booth", "id": 592, "trainId": 193}, + {"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194}, + {"name": "forest", "id": 1017, "trainId": 195}, + {"name": "elevator door", "id": 851, "trainId": 196}, + {"name": "laptop, laptop computer", "id": 1407, "trainId": 197}, + {"name": "instrument panel", "id": 1332, "trainId": 198}, + {"name": "bucket, pail", "id": 303, "trainId": 199}, + {"name": "tapestry, tapis", "id": 2714, "trainId": 200}, + {"name": "platform", "id": 1924, "trainId": 201}, + {"name": "jacket", "id": 1346, "trainId": 202}, + {"name": "gate", "id": 1081, "trainId": 203}, + {"name": "monitor, monitoring device", "id": 1583, "trainId": 204}, + { + "name": "telephone booth, phone booth, call box, telephone box, telephone kiosk", + "id": 2727, + "trainId": 205, + }, + {"name": "spotlight, spot", "id": 2509, "trainId": 206}, + {"name": "ring", "id": 2123, "trainId": 207}, + {"name": "control panel", "id": 602, "trainId": 208}, + {"name": "blackboard, chalkboard", "id": 202, "trainId": 209}, + {"name": "air conditioner, air conditioning", "id": 10, "trainId": 210}, + {"name": "chest", "id": 490, "trainId": 211}, + {"name": "clock", "id": 530, "trainId": 212}, + {"name": "sand dune", "id": 2213, "trainId": 213}, + {"name": "pipe, pipage, piping", "id": 1884, "trainId": 214}, + {"name": "vault", "id": 2934, "trainId": 215}, + {"name": "table football", "id": 2687, "trainId": 216}, + {"name": "cannon", "id": 387, "trainId": 217}, + {"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218}, + 
{"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219}, + {"name": "statue", "id": 2547, "trainId": 220}, + { + "name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", + "id": 1474, + "trainId": 221, + }, + {"name": "exhibitor", "id": 877, "trainId": 222}, + {"name": "ladder", "id": 1391, "trainId": 223}, + {"name": "carport", "id": 414, "trainId": 224}, + {"name": "dam", "id": 698, "trainId": 225}, + {"name": "pulpit", "id": 2019, "trainId": 226}, + {"name": "skylight, fanlight", "id": 2422, "trainId": 227}, + {"name": "water tower", "id": 3010, "trainId": 228}, + {"name": "grill, grille, grillwork", "id": 1139, "trainId": 229}, + {"name": "display board", "id": 753, "trainId": 230}, + {"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231}, + {"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232}, + {"name": "ice rink", "id": 1301, "trainId": 233}, + {"name": "fruit", "id": 1033, "trainId": 234}, + {"name": "patio", "id": 1789, "trainId": 235}, + {"name": "vending machine", "id": 2939, "trainId": 236}, + {"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237}, + {"name": "net", "id": 1652, "trainId": 238}, + { + "name": "backpack, back pack, knapsack, packsack, rucksack, haversack", + "id": 90, + "trainId": 239, + }, + {"name": "jar", "id": 1349, "trainId": 240}, + {"name": "track", "id": 2830, "trainId": 241}, + {"name": "magazine", "id": 1485, "trainId": 242}, + {"name": "shutter", "id": 2370, "trainId": 243}, + {"name": "roof", "id": 2155, "trainId": 244}, + {"name": "banner, streamer", "id": 118, "trainId": 245}, + {"name": "landfill", "id": 1402, "trainId": 246}, + {"name": "post", "id": 1957, "trainId": 247}, + {"name": "altarpiece, reredos", "id": 3130, "trainId": 248}, + {"name": "hat, chapeau, lid", "id": 1197, "trainId": 249}, + {"name": "arch, archway", "id": 52, "trainId": 250}, + {"name": "table game", "id": 2688, "trainId": 251}, + {"name": "bag, handbag, pocketbook, 
purse", "id": 96, "trainId": 252}, + {"name": "document, written document, papers", "id": 762, "trainId": 253}, + {"name": "dome", "id": 772, "trainId": 254}, + {"name": "pier", "id": 1857, "trainId": 255}, + {"name": "shanties", "id": 2315, "trainId": 256}, + {"name": "forecourt", "id": 1016, "trainId": 257}, + {"name": "crane", "id": 643, "trainId": 258}, + {"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259}, + {"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260}, + {"name": "drawing", "id": 791, "trainId": 261}, + {"name": "cabin", "id": 349, "trainId": 262}, + { + "name": "ad, advertisement, advertizement, advertising, advertizing, advert", + "id": 6, + "trainId": 263, + }, + {"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264}, + {"name": "monument", "id": 1587, "trainId": 265}, + {"name": "henhouse", "id": 1233, "trainId": 266}, + {"name": "cockpit", "id": 559, "trainId": 267}, + {"name": "heater, warmer", "id": 1223, "trainId": 268}, + {"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269}, + {"name": "pool", "id": 1943, "trainId": 270}, + {"name": "elevator, lift", "id": 853, "trainId": 271}, + {"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272}, + {"name": "labyrinth", "id": 1390, "trainId": 273}, + {"name": "text, textual matter", "id": 2748, "trainId": 274}, + {"name": "printer", "id": 2007, "trainId": 275}, + {"name": "mezzanine, first balcony", "id": 1546, "trainId": 276}, + {"name": "mattress", "id": 1513, "trainId": 277}, + {"name": "straw", "id": 2600, "trainId": 278}, + {"name": "stalls", "id": 2538, "trainId": 279}, + {"name": "patio, terrace", "id": 1790, "trainId": 280}, + {"name": "billboard, hoarding", "id": 194, "trainId": 281}, + {"name": "bus stop", "id": 326, "trainId": 282}, + {"name": "trouser, pant", "id": 2877, "trainId": 283}, + {"name": "console table, console", "id": 594, "trainId": 284}, + {"name": "rack", "id": 2036, 
"trainId": 285}, + {"name": "notebook", "id": 1662, "trainId": 286}, + {"name": "shrine", "id": 2366, "trainId": 287}, + {"name": "pantry", "id": 1754, "trainId": 288}, + {"name": "cart", "id": 418, "trainId": 289}, + {"name": "steam shovel", "id": 2553, "trainId": 290}, + {"name": "porch", "id": 1951, "trainId": 291}, + {"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292}, + {"name": "figurine, statuette", "id": 918, "trainId": 293}, + {"name": "recycling bin", "id": 2086, "trainId": 294}, + {"name": "folding screen", "id": 997, "trainId": 295}, + {"name": "telescope", "id": 2731, "trainId": 296}, + {"name": "deck chair, beach chair", "id": 704, "trainId": 297}, + {"name": "kennel", "id": 1365, "trainId": 298}, + {"name": "coffee maker", "id": 569, "trainId": 299}, + {"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300}, + {"name": "fish", "id": 948, "trainId": 301}, + {"name": "easel", "id": 839, "trainId": 302}, + {"name": "artificial golf green", "id": 63, "trainId": 303}, + {"name": "iceberg", "id": 1305, "trainId": 304}, + {"name": "candlestick, candle holder", "id": 378, "trainId": 305}, + {"name": "shower stall, shower bath", "id": 2362, "trainId": 306}, + {"name": "television stand", "id": 2734, "trainId": 307}, + { + "name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle", + "id": 2982, + "trainId": 308, + }, + {"name": "skeleton", "id": 2398, "trainId": 309}, + {"name": "grand piano, grand", "id": 1119, "trainId": 310}, + {"name": "candy, confect", "id": 382, "trainId": 311}, + {"name": "grille door", "id": 1141, "trainId": 312}, + {"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313}, + {"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314}, + {"name": "shoe", "id": 2341, "trainId": 315}, + {"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316}, + {"name": "shanty", "id": 2316, "trainId": 317}, + {"name": "structure", "id": 2626, 
"trainId": 318}, + {"name": "rocking chair, rocker", "id": 3104, "trainId": 319}, + {"name": "bird", "id": 198, "trainId": 320}, + {"name": "place mat", "id": 1896, "trainId": 321}, + {"name": "tomb", "id": 2800, "trainId": 322}, + {"name": "big top", "id": 190, "trainId": 323}, + {"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324}, + {"name": "lockers", "id": 1463, "trainId": 325}, + {"name": "cage", "id": 357, "trainId": 326}, + {"name": "finger", "id": 929, "trainId": 327}, + {"name": "bleachers", "id": 209, "trainId": 328}, + {"name": "ferris wheel", "id": 912, "trainId": 329}, + {"name": "hairdresser chair", "id": 1164, "trainId": 330}, + {"name": "mat", "id": 1509, "trainId": 331}, + {"name": "stands", "id": 2539, "trainId": 332}, + {"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333}, + {"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334}, + {"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335}, + {"name": "dummy", "id": 818, "trainId": 336}, + {"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337}, + {"name": "sand trap", "id": 2217, "trainId": 338}, + {"name": "shop, store", "id": 2347, "trainId": 339}, + {"name": "table cloth", "id": 2686, "trainId": 340}, + {"name": "service station", "id": 2300, "trainId": 341}, + {"name": "coffin", "id": 572, "trainId": 342}, + {"name": "drawer", "id": 789, "trainId": 343}, + {"name": "cages", "id": 358, "trainId": 344}, + {"name": "slot machine, coin machine", "id": 2443, "trainId": 345}, + {"name": "balcony", "id": 101, "trainId": 346}, + {"name": "volleyball court", "id": 2969, "trainId": 347}, + {"name": "table tennis", "id": 2692, "trainId": 348}, + {"name": "control table", "id": 606, "trainId": 349}, + {"name": "shirt", "id": 2339, "trainId": 350}, + {"name": "merchandise, ware, product", "id": 1533, "trainId": 351}, + {"name": "railway", "id": 2060, "trainId": 352}, + 
{"name": "parterre", "id": 1782, "trainId": 353}, + {"name": "chimney", "id": 495, "trainId": 354}, + {"name": "can, tin, tin can", "id": 371, "trainId": 355}, + {"name": "tanks", "id": 2707, "trainId": 356}, + {"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357}, + {"name": "alga, algae", "id": 3156, "trainId": 358}, + {"name": "system", "id": 2683, "trainId": 359}, + {"name": "map", "id": 1499, "trainId": 360}, + {"name": "greenhouse", "id": 1135, "trainId": 361}, + {"name": "mug", "id": 1619, "trainId": 362}, + {"name": "barbecue", "id": 125, "trainId": 363}, + {"name": "trailer", "id": 2838, "trainId": 364}, + {"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365}, + {"name": "organ", "id": 1695, "trainId": 366}, + {"name": "dishrag, dishcloth", "id": 746, "trainId": 367}, + {"name": "island", "id": 1343, "trainId": 368}, + {"name": "keyboard", "id": 1370, "trainId": 369}, + {"name": "trench", "id": 2858, "trainId": 370}, + {"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371}, + {"name": "steering wheel, wheel", "id": 2565, "trainId": 372}, + {"name": "pitcher, ewer", "id": 1892, "trainId": 373}, + {"name": "goal", "id": 1103, "trainId": 374}, + {"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375}, + {"name": "beds", "id": 170, "trainId": 376}, + {"name": "wood", "id": 3073, "trainId": 377}, + {"name": "file cabinet", "id": 922, "trainId": 378}, + {"name": "newspaper, paper", "id": 1655, "trainId": 379}, + {"name": "motorboat", "id": 1602, "trainId": 380}, + {"name": "rope", "id": 2160, "trainId": 381}, + {"name": "guitar", "id": 1151, "trainId": 382}, + {"name": "rubble", "id": 2176, "trainId": 383}, + {"name": "scarf", "id": 2239, "trainId": 384}, + {"name": "barrels", "id": 132, "trainId": 385}, + {"name": "cap", "id": 394, "trainId": 386}, + {"name": "leaves", "id": 1424, "trainId": 387}, + {"name": "control tower", "id": 607, "trainId": 388}, + {"name": "dashboard", "id": 
700, "trainId": 389}, + {"name": "bandstand", "id": 116, "trainId": 390}, + {"name": "lectern", "id": 1425, "trainId": 391}, + {"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392}, + {"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393}, + {"name": "shower room", "id": 2360, "trainId": 394}, + {"name": "smoke", "id": 2449, "trainId": 395}, + {"name": "faucet, spigot", "id": 897, "trainId": 396}, + {"name": "bulldozer", "id": 317, "trainId": 397}, + {"name": "saucepan", "id": 2228, "trainId": 398}, + {"name": "shops", "id": 2351, "trainId": 399}, + {"name": "meter", "id": 1543, "trainId": 400}, + {"name": "crevasse", "id": 656, "trainId": 401}, + {"name": "gear", "id": 1088, "trainId": 402}, + {"name": "candelabrum, candelabra", "id": 373, "trainId": 403}, + {"name": "sofa bed", "id": 2472, "trainId": 404}, + {"name": "tunnel", "id": 2892, "trainId": 405}, + {"name": "pallet", "id": 1740, "trainId": 406}, + {"name": "wire, conducting wire", "id": 3067, "trainId": 407}, + {"name": "kettle, boiler", "id": 1367, "trainId": 408}, + {"name": "bidet", "id": 188, "trainId": 409}, + { + "name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher", + "id": 79, + "trainId": 410, + }, + {"name": "music stand", "id": 1633, "trainId": 411}, + {"name": "pipe, tube", "id": 1885, "trainId": 412}, + {"name": "cup", "id": 677, "trainId": 413}, + {"name": "parking meter", "id": 1779, "trainId": 414}, + {"name": "ice hockey rink", "id": 1297, "trainId": 415}, + {"name": "shelter", "id": 2334, "trainId": 416}, + {"name": "weeds", "id": 3027, "trainId": 417}, + {"name": "temple", "id": 2735, "trainId": 418}, + {"name": "patty, cake", "id": 1791, "trainId": 419}, + {"name": "ski slope", "id": 2405, "trainId": 420}, + {"name": "panel", "id": 1748, "trainId": 421}, + {"name": "wallet", "id": 2983, "trainId": 422}, + {"name": "wheel", "id": 3035, "trainId": 423}, + {"name": "towel rack, towel 
horse", "id": 2824, "trainId": 424}, + {"name": "roundabout", "id": 2168, "trainId": 425}, + {"name": "canister, cannister, tin", "id": 385, "trainId": 426}, + {"name": "rod", "id": 2148, "trainId": 427}, + {"name": "soap dispenser", "id": 2465, "trainId": 428}, + {"name": "bell", "id": 175, "trainId": 429}, + {"name": "canvas", "id": 390, "trainId": 430}, + {"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431}, + {"name": "teacup", "id": 2722, "trainId": 432}, + {"name": "trellis", "id": 2857, "trainId": 433}, + {"name": "workbench", "id": 3088, "trainId": 434}, + {"name": "valley, vale", "id": 2926, "trainId": 435}, + {"name": "toaster", "id": 2782, "trainId": 436}, + {"name": "knife", "id": 1378, "trainId": 437}, + {"name": "podium", "id": 1934, "trainId": 438}, + {"name": "ramp", "id": 2072, "trainId": 439}, + {"name": "tumble dryer", "id": 2889, "trainId": 440}, + {"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441}, + {"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442}, + {"name": "lab bench", "id": 1383, "trainId": 443}, + {"name": "equipment", "id": 867, "trainId": 444}, + {"name": "rocky formation", "id": 2145, "trainId": 445}, + {"name": "plastic", "id": 1915, "trainId": 446}, + {"name": "calendar", "id": 361, "trainId": 447}, + {"name": "caravan", "id": 402, "trainId": 448}, + {"name": "check-in-desk", "id": 482, "trainId": 449}, + {"name": "ticket counter", "id": 2761, "trainId": 450}, + {"name": "brush", "id": 300, "trainId": 451}, + {"name": "mill", "id": 1554, "trainId": 452}, + {"name": "covered bridge", "id": 636, "trainId": 453}, + {"name": "bowling alley", "id": 260, "trainId": 454}, + {"name": "hanger", "id": 1186, "trainId": 455}, + {"name": "excavator", "id": 871, "trainId": 456}, + {"name": "trestle", "id": 2859, "trainId": 457}, + {"name": "revolving door", "id": 2103, "trainId": 458}, + {"name": "blast furnace", "id": 208, "trainId": 459}, + {"name": "scale, weighing machine", "id": 
2236, "trainId": 460}, + {"name": "projector", "id": 2012, "trainId": 461}, + {"name": "soap", "id": 2462, "trainId": 462}, + {"name": "locker", "id": 1462, "trainId": 463}, + {"name": "tractor", "id": 2832, "trainId": 464}, + {"name": "stretcher", "id": 2617, "trainId": 465}, + {"name": "frame", "id": 1024, "trainId": 466}, + {"name": "grating", "id": 1129, "trainId": 467}, + {"name": "alembic", "id": 18, "trainId": 468}, + {"name": "candle, taper, wax light", "id": 376, "trainId": 469}, + {"name": "barrier", "id": 134, "trainId": 470}, + {"name": "cardboard", "id": 407, "trainId": 471}, + {"name": "cave", "id": 434, "trainId": 472}, + {"name": "puddle", "id": 2017, "trainId": 473}, + {"name": "tarp", "id": 2717, "trainId": 474}, + {"name": "price tag", "id": 2005, "trainId": 475}, + {"name": "watchtower", "id": 2993, "trainId": 476}, + {"name": "meters", "id": 1545, "trainId": 477}, + { + "name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb", + "id": 1445, + "trainId": 478, + }, + {"name": "tracks", "id": 2831, "trainId": 479}, + {"name": "hair dryer", "id": 1161, "trainId": 480}, + {"name": "skirt", "id": 2411, "trainId": 481}, + {"name": "viaduct", "id": 2949, "trainId": 482}, + {"name": "paper towel", "id": 1769, "trainId": 483}, + {"name": "coat", "id": 552, "trainId": 484}, + {"name": "sheet", "id": 2327, "trainId": 485}, + {"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486}, + {"name": "water wheel", "id": 3013, "trainId": 487}, + {"name": "pottery, clayware", "id": 1986, "trainId": 488}, + {"name": "magazine rack", "id": 1486, "trainId": 489}, + {"name": "teapot", "id": 2723, "trainId": 490}, + {"name": "microphone, mike", "id": 1549, "trainId": 491}, + {"name": "support", "id": 2649, "trainId": 492}, + {"name": "forklift", "id": 1020, "trainId": 493}, + {"name": "canyon", "id": 392, "trainId": 494}, + {"name": "cash register, register", "id": 422, "trainId": 495}, + {"name": "leaf, 
leafage, foliage", "id": 1419, "trainId": 496}, + {"name": "remote control, remote", "id": 2099, "trainId": 497}, + {"name": "soap dish", "id": 2464, "trainId": 498}, + {"name": "windshield, windscreen", "id": 3058, "trainId": 499}, + {"name": "cat", "id": 430, "trainId": 500}, + {"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501}, + {"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502}, + {"name": "videos", "id": 2955, "trainId": 503}, + {"name": "shovel", "id": 2355, "trainId": 504}, + {"name": "eaves", "id": 840, "trainId": 505}, + {"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506}, + {"name": "shipyard", "id": 2338, "trainId": 507}, + {"name": "hen, biddy", "id": 1232, "trainId": 508}, + {"name": "traffic cone", "id": 2834, "trainId": 509}, + {"name": "washing machines", "id": 2991, "trainId": 510}, + {"name": "truck crane", "id": 2879, "trainId": 511}, + {"name": "cds", "id": 444, "trainId": 512}, + {"name": "niche", "id": 1657, "trainId": 513}, + {"name": "scoreboard", "id": 2246, "trainId": 514}, + {"name": "briefcase", "id": 296, "trainId": 515}, + {"name": "boot", "id": 245, "trainId": 516}, + {"name": "sweater, jumper", "id": 2661, "trainId": 517}, + {"name": "hay", "id": 1202, "trainId": 518}, + {"name": "pack", "id": 1714, "trainId": 519}, + {"name": "bottle rack", "id": 251, "trainId": 520}, + {"name": "glacier", "id": 1095, "trainId": 521}, + {"name": "pergola", "id": 1828, "trainId": 522}, + {"name": "building materials", "id": 311, "trainId": 523}, + {"name": "television camera", "id": 2732, "trainId": 524}, + {"name": "first floor", "id": 947, "trainId": 525}, + {"name": "rifle", "id": 2115, "trainId": 526}, + {"name": "tennis table", "id": 2738, "trainId": 527}, + {"name": "stadium", "id": 2525, "trainId": 528}, + {"name": "safety belt", "id": 2194, "trainId": 529}, + {"name": "cover", "id": 634, "trainId": 530}, + {"name": "dish rack", "id": 740, "trainId": 531}, + {"name": 
"synthesizer", "id": 2682, "trainId": 532}, + {"name": "pumpkin", "id": 2020, "trainId": 533}, + {"name": "gutter", "id": 1156, "trainId": 534}, + {"name": "fruit stand", "id": 1036, "trainId": 535}, + {"name": "ice floe, floe", "id": 1295, "trainId": 536}, + {"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537}, + {"name": "wheelchair", "id": 3037, "trainId": 538}, + {"name": "mousepad, mouse mat", "id": 1614, "trainId": 539}, + {"name": "diploma", "id": 736, "trainId": 540}, + {"name": "fairground ride", "id": 893, "trainId": 541}, + {"name": "radio", "id": 2047, "trainId": 542}, + {"name": "hotplate", "id": 1274, "trainId": 543}, + {"name": "junk", "id": 1361, "trainId": 544}, + {"name": "wheelbarrow", "id": 3036, "trainId": 545}, + {"name": "stream", "id": 2606, "trainId": 546}, + {"name": "toll plaza", "id": 2797, "trainId": 547}, + {"name": "punching bag", "id": 2022, "trainId": 548}, + {"name": "trough", "id": 2876, "trainId": 549}, + {"name": "throne", "id": 2758, "trainId": 550}, + {"name": "chair desk", "id": 472, "trainId": 551}, + {"name": "weighbridge", "id": 3028, "trainId": 552}, + {"name": "extractor fan", "id": 882, "trainId": 553}, + {"name": "hanging clothes", "id": 1189, "trainId": 554}, + {"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555}, + {"name": "alarm clock, alarm", "id": 3122, "trainId": 556}, + {"name": "ski lift", "id": 2401, "trainId": 557}, + {"name": "chain", "id": 468, "trainId": 558}, + {"name": "garage", "id": 1061, "trainId": 559}, + {"name": "mechanical shovel", "id": 1523, "trainId": 560}, + {"name": "wine rack", "id": 3059, "trainId": 561}, + {"name": "tramway", "id": 2843, "trainId": 562}, + {"name": "treadmill", "id": 2853, "trainId": 563}, + {"name": "menu", "id": 1529, "trainId": 564}, + {"name": "block", "id": 214, "trainId": 565}, + {"name": "well", "id": 3032, "trainId": 566}, + {"name": "witness stand", "id": 3071, "trainId": 567}, + {"name": "branch", "id": 277, "trainId": 
568}, + {"name": "duck", "id": 813, "trainId": 569}, + {"name": "casserole", "id": 426, "trainId": 570}, + {"name": "frying pan", "id": 1039, "trainId": 571}, + {"name": "desk organizer", "id": 727, "trainId": 572}, + {"name": "mast", "id": 1508, "trainId": 573}, + {"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574}, + {"name": "service elevator", "id": 2299, "trainId": 575}, + {"name": "dollhouse", "id": 768, "trainId": 576}, + {"name": "hammock", "id": 1172, "trainId": 577}, + {"name": "clothes hanging", "id": 537, "trainId": 578}, + {"name": "photocopier", "id": 1847, "trainId": 579}, + {"name": "notepad", "id": 1664, "trainId": 580}, + {"name": "golf cart", "id": 1110, "trainId": 581}, + {"name": "footpath", "id": 1014, "trainId": 582}, + {"name": "cross", "id": 662, "trainId": 583}, + {"name": "baptismal font", "id": 121, "trainId": 584}, + {"name": "boiler", "id": 227, "trainId": 585}, + {"name": "skip", "id": 2410, "trainId": 586}, + {"name": "rotisserie", "id": 2165, "trainId": 587}, + {"name": "tables", "id": 2696, "trainId": 588}, + {"name": "water mill", "id": 3005, "trainId": 589}, + {"name": "helmet", "id": 1231, "trainId": 590}, + {"name": "cover curtain", "id": 635, "trainId": 591}, + {"name": "brick", "id": 292, "trainId": 592}, + {"name": "table runner", "id": 2690, "trainId": 593}, + {"name": "ashtray", "id": 65, "trainId": 594}, + {"name": "street box", "id": 2607, "trainId": 595}, + {"name": "stick", "id": 2574, "trainId": 596}, + {"name": "hangers", "id": 1188, "trainId": 597}, + {"name": "cells", "id": 456, "trainId": 598}, + {"name": "urinal", "id": 2913, "trainId": 599}, + {"name": "centerpiece", "id": 459, "trainId": 600}, + {"name": "portable fridge", "id": 1955, "trainId": 601}, + {"name": "dvds", "id": 827, "trainId": 602}, + {"name": "golf club", "id": 1111, "trainId": 603}, + {"name": "skirting board", "id": 2412, "trainId": 604}, + {"name": "water cooler", "id": 2997, "trainId": 605}, + {"name": "clipboard", 
"id": 528, "trainId": 606}, + {"name": "camera, photographic camera", "id": 366, "trainId": 607}, + {"name": "pigeonhole", "id": 1863, "trainId": 608}, + {"name": "chips", "id": 500, "trainId": 609}, + {"name": "food processor", "id": 1001, "trainId": 610}, + {"name": "post box", "id": 1958, "trainId": 611}, + {"name": "lid", "id": 1441, "trainId": 612}, + {"name": "drum", "id": 809, "trainId": 613}, + {"name": "blender", "id": 210, "trainId": 614}, + {"name": "cave entrance", "id": 435, "trainId": 615}, + {"name": "dental chair", "id": 718, "trainId": 616}, + {"name": "obelisk", "id": 1674, "trainId": 617}, + {"name": "canoe", "id": 388, "trainId": 618}, + {"name": "mobile", "id": 1572, "trainId": 619}, + {"name": "monitors", "id": 1584, "trainId": 620}, + {"name": "pool ball", "id": 1944, "trainId": 621}, + {"name": "cue rack", "id": 674, "trainId": 622}, + {"name": "baggage carts", "id": 99, "trainId": 623}, + {"name": "shore", "id": 2352, "trainId": 624}, + {"name": "fork", "id": 1019, "trainId": 625}, + {"name": "paper filer", "id": 1763, "trainId": 626}, + {"name": "bicycle rack", "id": 185, "trainId": 627}, + {"name": "coat rack", "id": 554, "trainId": 628}, + {"name": "garland", "id": 1066, "trainId": 629}, + {"name": "sports bag", "id": 2508, "trainId": 630}, + {"name": "fish tank", "id": 951, "trainId": 631}, + {"name": "towel dispenser", "id": 2822, "trainId": 632}, + {"name": "carriage", "id": 415, "trainId": 633}, + {"name": "brochure", "id": 297, "trainId": 634}, + {"name": "plaque", "id": 1914, "trainId": 635}, + {"name": "stringer", "id": 2619, "trainId": 636}, + {"name": "iron", "id": 1338, "trainId": 637}, + {"name": "spoon", "id": 2505, "trainId": 638}, + {"name": "flag pole", "id": 955, "trainId": 639}, + {"name": "toilet brush", "id": 2786, "trainId": 640}, + {"name": "book stand", "id": 238, "trainId": 641}, + {"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642}, + {"name": "ticket office", "id": 2763, "trainId": 643}, 
+ {"name": "broom", "id": 299, "trainId": 644}, + {"name": "dvd", "id": 822, "trainId": 645}, + {"name": "ice bucket", "id": 1288, "trainId": 646}, + {"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647}, + {"name": "tureen", "id": 2894, "trainId": 648}, + {"name": "folders", "id": 992, "trainId": 649}, + {"name": "chess", "id": 489, "trainId": 650}, + {"name": "root", "id": 2157, "trainId": 651}, + {"name": "sewing machine", "id": 2309, "trainId": 652}, + {"name": "model", "id": 1576, "trainId": 653}, + {"name": "pen", "id": 1810, "trainId": 654}, + {"name": "violin", "id": 2964, "trainId": 655}, + {"name": "sweatshirt", "id": 2662, "trainId": 656}, + {"name": "recycling materials", "id": 2087, "trainId": 657}, + {"name": "mitten", "id": 1569, "trainId": 658}, + {"name": "chopping board, cutting board", "id": 503, "trainId": 659}, + {"name": "mask", "id": 1505, "trainId": 660}, + {"name": "log", "id": 1468, "trainId": 661}, + {"name": "mouse, computer mouse", "id": 1613, "trainId": 662}, + {"name": "grill", "id": 1138, "trainId": 663}, + {"name": "hole", "id": 1256, "trainId": 664}, + {"name": "target", "id": 2715, "trainId": 665}, + {"name": "trash bag", "id": 2846, "trainId": 666}, + {"name": "chalk", "id": 477, "trainId": 667}, + {"name": "sticks", "id": 2576, "trainId": 668}, + {"name": "balloon", "id": 108, "trainId": 669}, + {"name": "score", "id": 2245, "trainId": 670}, + {"name": "hair spray", "id": 1162, "trainId": 671}, + {"name": "roll", "id": 2149, "trainId": 672}, + {"name": "runner", "id": 2183, "trainId": 673}, + {"name": "engine", "id": 858, "trainId": 674}, + {"name": "inflatable glove", "id": 1324, "trainId": 675}, + {"name": "games", "id": 1055, "trainId": 676}, + {"name": "pallets", "id": 1741, "trainId": 677}, + {"name": "baskets", "id": 149, "trainId": 678}, + {"name": "coop", "id": 615, "trainId": 679}, + {"name": "dvd player", "id": 825, "trainId": 680}, + {"name": "rocking horse", "id": 2143, "trainId": 681}, + {"name": 
"buckets", "id": 304, "trainId": 682}, + {"name": "bread rolls", "id": 283, "trainId": 683}, + {"name": "shawl", "id": 2322, "trainId": 684}, + {"name": "watering can", "id": 3017, "trainId": 685}, + {"name": "spotlights", "id": 2510, "trainId": 686}, + {"name": "post-it", "id": 1960, "trainId": 687}, + {"name": "bowls", "id": 265, "trainId": 688}, + {"name": "security camera", "id": 2282, "trainId": 689}, + {"name": "runner cloth", "id": 2184, "trainId": 690}, + {"name": "lock", "id": 1461, "trainId": 691}, + {"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692}, + {"name": "side", "id": 2372, "trainId": 693}, + {"name": "roulette", "id": 2166, "trainId": 694}, + {"name": "bone", "id": 232, "trainId": 695}, + {"name": "cutlery", "id": 693, "trainId": 696}, + {"name": "pool balls", "id": 1945, "trainId": 697}, + {"name": "wheels", "id": 3039, "trainId": 698}, + {"name": "spice rack", "id": 2494, "trainId": 699}, + {"name": "plant pots", "id": 1908, "trainId": 700}, + {"name": "towel ring", "id": 2827, "trainId": 701}, + {"name": "bread box", "id": 280, "trainId": 702}, + {"name": "video", "id": 2950, "trainId": 703}, + {"name": "funfair", "id": 1044, "trainId": 704}, + {"name": "breads", "id": 288, "trainId": 705}, + {"name": "tripod", "id": 2863, "trainId": 706}, + {"name": "ironing board", "id": 1342, "trainId": 707}, + {"name": "skimmer", "id": 2409, "trainId": 708}, + {"name": "hollow", "id": 1258, "trainId": 709}, + {"name": "scratching post", "id": 2249, "trainId": 710}, + {"name": "tricycle", "id": 2862, "trainId": 711}, + {"name": "file box", "id": 920, "trainId": 712}, + {"name": "mountain pass", "id": 1607, "trainId": 713}, + {"name": "tombstones", "id": 2802, "trainId": 714}, + {"name": "cooker", "id": 610, "trainId": 715}, + {"name": "card game, cards", "id": 3129, "trainId": 716}, + {"name": "golf bag", "id": 1108, "trainId": 717}, + {"name": "towel paper", "id": 2823, "trainId": 718}, + {"name": "chaise lounge", "id": 476, 
"trainId": 719}, + {"name": "sun", "id": 2641, "trainId": 720}, + {"name": "toilet paper holder", "id": 2788, "trainId": 721}, + {"name": "rake", "id": 2070, "trainId": 722}, + {"name": "key", "id": 1368, "trainId": 723}, + {"name": "umbrella stand", "id": 2903, "trainId": 724}, + {"name": "dartboard", "id": 699, "trainId": 725}, + {"name": "transformer", "id": 2844, "trainId": 726}, + {"name": "fireplace utensils", "id": 942, "trainId": 727}, + {"name": "sweatshirts", "id": 2663, "trainId": 728}, + { + "name": "cellular telephone, cellular phone, cellphone, cell, mobile phone", + "id": 457, + "trainId": 729, + }, + {"name": "tallboy", "id": 2701, "trainId": 730}, + {"name": "stapler", "id": 2540, "trainId": 731}, + {"name": "sauna", "id": 2231, "trainId": 732}, + {"name": "test tube", "id": 2746, "trainId": 733}, + {"name": "palette", "id": 1738, "trainId": 734}, + {"name": "shopping carts", "id": 2350, "trainId": 735}, + {"name": "tools", "id": 2808, "trainId": 736}, + {"name": "push button, push, button", "id": 2025, "trainId": 737}, + {"name": "star", "id": 2541, "trainId": 738}, + {"name": "roof rack", "id": 2156, "trainId": 739}, + {"name": "barbed wire", "id": 126, "trainId": 740}, + {"name": "spray", "id": 2512, "trainId": 741}, + {"name": "ear", "id": 831, "trainId": 742}, + {"name": "sponge", "id": 2503, "trainId": 743}, + {"name": "racket", "id": 2039, "trainId": 744}, + {"name": "tins", "id": 2774, "trainId": 745}, + {"name": "eyeglasses", "id": 886, "trainId": 746}, + {"name": "file", "id": 919, "trainId": 747}, + {"name": "scarfs", "id": 2240, "trainId": 748}, + {"name": "sugar bowl", "id": 2636, "trainId": 749}, + {"name": "flip flop", "id": 963, "trainId": 750}, + {"name": "headstones", "id": 1218, "trainId": 751}, + {"name": "laptop bag", "id": 1406, "trainId": 752}, + {"name": "leash", "id": 1420, "trainId": 753}, + {"name": "climbing frame", "id": 526, "trainId": 754}, + {"name": "suit hanger", "id": 2639, "trainId": 755}, + {"name": "floor 
spotlight", "id": 975, "trainId": 756}, + {"name": "plate rack", "id": 1921, "trainId": 757}, + {"name": "sewer", "id": 2305, "trainId": 758}, + {"name": "hard drive", "id": 1193, "trainId": 759}, + {"name": "sprinkler", "id": 2517, "trainId": 760}, + {"name": "tools box", "id": 2809, "trainId": 761}, + {"name": "necklace", "id": 1647, "trainId": 762}, + {"name": "bulbs", "id": 314, "trainId": 763}, + {"name": "steel industry", "id": 2560, "trainId": 764}, + {"name": "club", "id": 545, "trainId": 765}, + {"name": "jack", "id": 1345, "trainId": 766}, + {"name": "door bars", "id": 775, "trainId": 767}, + { + "name": "control panel, instrument panel, control board, board, panel", + "id": 603, + "trainId": 768, + }, + {"name": "hairbrush", "id": 1163, "trainId": 769}, + {"name": "napkin holder", "id": 1641, "trainId": 770}, + {"name": "office", "id": 1678, "trainId": 771}, + {"name": "smoke detector", "id": 2450, "trainId": 772}, + {"name": "utensils", "id": 2915, "trainId": 773}, + {"name": "apron", "id": 42, "trainId": 774}, + {"name": "scissors", "id": 2242, "trainId": 775}, + {"name": "terminal", "id": 2741, "trainId": 776}, + {"name": "grinder", "id": 1143, "trainId": 777}, + {"name": "entry phone", "id": 862, "trainId": 778}, + {"name": "newspaper stand", "id": 1654, "trainId": 779}, + {"name": "pepper shaker", "id": 1826, "trainId": 780}, + {"name": "onions", "id": 1689, "trainId": 781}, + { + "name": "central processing unit, cpu, c p u , central processor, processor, mainframe", + "id": 3124, + "trainId": 782, + }, + {"name": "tape", "id": 2710, "trainId": 783}, + {"name": "bat", "id": 152, "trainId": 784}, + {"name": "coaster", "id": 549, "trainId": 785}, + {"name": "calculator", "id": 360, "trainId": 786}, + {"name": "potatoes", "id": 1982, "trainId": 787}, + {"name": "luggage rack", "id": 1478, "trainId": 788}, + {"name": "salt", "id": 2203, "trainId": 789}, + {"name": "street number", "id": 2612, "trainId": 790}, + {"name": "viewpoint", "id": 2956, 
"trainId": 791}, + {"name": "sword", "id": 2681, "trainId": 792}, + {"name": "cd", "id": 437, "trainId": 793}, + {"name": "rowing machine", "id": 2171, "trainId": 794}, + {"name": "plug", "id": 1933, "trainId": 795}, + {"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796}, + {"name": "pepper", "id": 1824, "trainId": 797}, + {"name": "tongs", "id": 2803, "trainId": 798}, + {"name": "bonfire", "id": 234, "trainId": 799}, + {"name": "dog dish", "id": 764, "trainId": 800}, + {"name": "belt", "id": 177, "trainId": 801}, + {"name": "dumbbells", "id": 817, "trainId": 802}, + {"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803}, + {"name": "hook", "id": 1262, "trainId": 804}, + {"name": "envelopes", "id": 864, "trainId": 805}, + {"name": "shower faucet", "id": 2359, "trainId": 806}, + {"name": "watch", "id": 2992, "trainId": 807}, + {"name": "padlock", "id": 1725, "trainId": 808}, + {"name": "swimming pool ladder", "id": 2667, "trainId": 809}, + {"name": "spanners", "id": 2484, "trainId": 810}, + {"name": "gravy boat", "id": 1133, "trainId": 811}, + {"name": "notice board", "id": 1667, "trainId": 812}, + {"name": "trash bags", "id": 2847, "trainId": 813}, + {"name": "fire alarm", "id": 932, "trainId": 814}, + {"name": "ladle", "id": 1392, "trainId": 815}, + {"name": "stethoscope", "id": 2573, "trainId": 816}, + {"name": "rocket", "id": 2140, "trainId": 817}, + {"name": "funnel", "id": 1046, "trainId": 818}, + {"name": "bowling pins", "id": 264, "trainId": 819}, + {"name": "valve", "id": 2927, "trainId": 820}, + {"name": "thermometer", "id": 2752, "trainId": 821}, + {"name": "cups", "id": 679, "trainId": 822}, + {"name": "spice jar", "id": 2493, "trainId": 823}, + {"name": "night light", "id": 1658, "trainId": 824}, + {"name": "soaps", "id": 2466, "trainId": 825}, + {"name": "games table", "id": 1057, "trainId": 826}, + {"name": "slotted spoon", "id": 2444, "trainId": 827}, + {"name": "reel", "id": 2093, "trainId": 828}, + {"name": 
def loadAde20K(file):
    """Load one ADE20K-full image/segmentation pair.

    Parameters
    ----------
    file : str
        Path to the ``*.jpg`` image; the segmentation is expected to live
        next to it as ``*_seg.png``.

    Returns
    -------
    dict
        ``img_name`` (the input path), ``segm_name`` (the seg PNG path) and
        ``class_mask``: the object class id per pixel, decoded from the RGB
        label image as ``(R / 10) * 256 + G``.
    """
    fileseg = file.replace(".jpg", "_seg.png")
    # NOTE: variable renamed from `io`, which shadowed the stdlib module name.
    with Image.open(fileseg) as seg_img:
        seg = np.array(seg_img)

    # ADE20K encodes the object class in the R and G channels.
    R = seg[:, :, 0]
    G = seg[:, :, 1]
    ObjectClassMasks = (R / 10).astype(np.int32) * 256 + (G.astype(np.int32))

    return {"img_name": file, "segm_name": fileseg, "class_mask": ObjectClassMasks}


if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    index_file = dataset_dir / "ADE20K_2021_17_01" / "index_ade20k.pkl"
    print('Caution: we only generate the validation set!')
    with open(index_file, "rb") as f:
        index_ade20k = pkl.load(f)

    # Map raw dataset ids -> contiguous train ids.
    id_map = {cat["id"]: cat["trainId"] for cat in ADE20K_SEM_SEG_FULL_CATEGORIES}

    # Make output dirs.
    for name in ["training", "validation"]:
        image_dir = dataset_dir / "ADE20K_2021_17_01" / "images_detectron2" / name
        image_dir.mkdir(parents=True, exist_ok=True)
        annotation_dir = dataset_dir / "ADE20K_2021_17_01" / "annotations_detectron2" / name
        annotation_dir.mkdir(parents=True, exist_ok=True)

    # Process image and gt (the unused enumerate index was dropped).
    for folder_name, file_name in tqdm.tqdm(
        zip(index_ade20k["folder"], index_ade20k["filename"]),
        total=len(index_ade20k["filename"]),
    ):
        split = "validation" if file_name.split("_")[1] == "val" else "training"
        if split == 'training':
            # FIXME: If you want to generate training set, delete this condition
            continue
        info = loadAde20K(str(dataset_dir / folder_name / file_name))

        # Resize image and label so the short side is at most 512 px.
        img = np.asarray(Image.open(info["img_name"]))
        lab = np.asarray(info["class_mask"])

        h, w = img.shape[0], img.shape[1]
        max_size = 512
        resize = True
        if w >= h > max_size:
            h_new, w_new = max_size, round(w / float(h) * max_size)
        elif h >= w > max_size:
            h_new, w_new = round(h / float(w) * max_size), max_size
        else:
            resize = False

        if resize:
            img = cv2.resize(img, (w_new, h_new), interpolation=cv2.INTER_LINEAR)
            lab = cv2.resize(lab, (w_new, h_new), interpolation=cv2.INTER_NEAREST)

        assert img.dtype == np.uint8
        assert lab.dtype == np.int32

        # Apply label conversion and save into uint16 images; unmapped ids
        # stay at the uint16 ignore value 65535.
        output = np.zeros_like(lab, dtype=np.uint16) + 65535
        for obj_id in np.unique(lab):
            if obj_id in id_map:
                output[lab == obj_id] = id_map[obj_id]

        output_img = dataset_dir / "ADE20K_2021_17_01" / "images_detectron2" / split / file_name
        output_lab = (
            dataset_dir
            / "ADE20K_2021_17_01"
            / "annotations_detectron2"
            / split
            / file_name.replace(".jpg", ".tif")
        )
        Image.fromarray(img).save(output_img)

        assert output.dtype == np.uint16
        Image.fromarray(output).save(output_lab)
def convert(input, output, index=None):
    """Convert an ADE20K annotation PNG to detectron2's train-id format.

    The stored labels use 0 for "ignore" and 1..N for classes; subtracting 1
    (with uint8 wrap-around) turns them into 255 (ignore) and 0..N-1.

    Parameters
    ----------
    input, output : path-like
        Source annotation PNG and destination path.
    index : sequence of int, optional
        When given, label value ``index[k]`` is remapped to ``k``; values not
        present in ``index`` become 255 (ignore).
    """
    img = np.asarray(Image.open(input))
    assert img.dtype == np.uint8
    img = img - 1  # 0 (ignore) becomes 255. others are shifted by 1
    if index is not None:
        mapping = {i: k for k, i in enumerate(index)}
        # BUGFIX: `np.float` was removed from NumPy (>=1.24), so the old
        # `img.astype(np.float)` raises AttributeError.  Labels are integers,
        # so look them up as int64 instead.
        img = np.vectorize(lambda x: mapping[x] if x in mapping else 255)(
            img.astype(np.int64)
        ).astype(np.uint8)
    Image.fromarray(img).save(output)


if __name__ == "__main__":
    dataset_dir = (
        Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
    )
    print('Caution: we only generate the validation set!')
    for name in ["validation"]:
        annotation_dir = dataset_dir / "annotations" / name
        output_dir = dataset_dir / "annotations_detectron2" / name
        output_dir.mkdir(parents=True, exist_ok=True)
        for file in tqdm.tqdm(list(annotation_dir.iterdir())):
            output_file = output_dir / file.name
            convert(file, output_file)
# COCO-Stuff-164k class ids are not contiguous: 11 of the 182 raw ids are
# unused.  Map the remaining ids to contiguous train ids 0..170 and keep the
# ignore label 255 fixed.  (This reproduces the original hand-written table.)
_UNUSED_CLS_IDS = {11, 25, 28, 29, 44, 65, 67, 68, 70, 82, 90}

full_clsID_to_trID = {
    clsID: trID
    for trID, clsID in enumerate(
        i for i in range(182) if i not in _UNUSED_CLS_IDS
    )
}
full_clsID_to_trID[255] = 255


def convert_to_trainID(
    maskpath, out_mask_dir, is_train, clsID_to_trID=full_clsID_to_trID, suffix=""
):
    """Rewrite one stuffthingmaps PNG with train ids instead of class ids.

    Pixels whose class id is not in ``clsID_to_trID`` become 255 (ignore).
    Masks that end up fully ignored are skipped (nothing is written).
    """
    mask = np.array(Image.open(maskpath))
    mask_copy = np.full_like(mask, 255, dtype=np.uint8)
    for clsID, trID in clsID_to_trID.items():
        mask_copy[mask == clsID] = trID
    seg_filename = (
        osp.join(out_mask_dir, "train2017" + suffix, osp.basename(maskpath))
        if is_train
        else osp.join(out_mask_dir, "val2017" + suffix, osp.basename(maskpath))
    )
    # Hoisted: compute the unique values once instead of twice.
    unique_vals = np.unique(mask_copy)
    if len(unique_vals) == 1 and unique_vals[0] == 255:
        return
    Image.fromarray(mask_copy).save(seg_filename, "PNG")


if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the training set!')
    coco_path = dataset_dir / "coco"
    mask_dir = coco_path / "stuffthingmaps"
    out_mask_dir = coco_path / "stuffthingmaps_detectron2"
    for name in ["train2017"]:
        os.makedirs((out_mask_dir / name), exist_ok=True)
    train_list = glob(osp.join(mask_dir, "train2017", "*.png"))
    for file in tqdm.tqdm(train_list):
        convert_to_trainID(file, out_mask_dir, is_train=True)
def convert_pc59(mask_path, new_mask_path, pc59_dict):
    """Write the 59-class Pascal-Context mask for one .mat annotation.

    ``pc59_dict`` maps train id -> raw class id; pixels whose raw label is
    not in the mapping become 255 (ignore).
    """
    mat = scipy.io.loadmat(mask_path)
    mask = mat['LabelMap']

    mask_copy = np.ones_like(mask, dtype=np.uint8) * 255
    for trID, clsID in pc59_dict.items():
        mask_copy[mask == clsID] = trID

    min_value = np.amin(mask_copy)
    # BUGFIX: the assert message used to be `print(min_value)`, which
    # evaluates to None and reads as "AssertionError: None" when it fires.
    assert min_value >= 0, f"negative label {min_value}"
    Image.fromarray(mask_copy).save(new_mask_path, "PNG")


def convert_pc459(mask_path, new_mask_path):
    """Write the 459-class Pascal-Context mask (raw 1-based labels shifted by -1)."""
    mat = scipy.io.loadmat(mask_path)
    mask = mat['LabelMap']
    mask = mask - 1
    min_value = np.amin(mask)
    assert min_value >= 0, f"negative label {min_value}"
    Image.fromarray(mask).save(new_mask_path, "TIFF")


if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the validation set!')
    pc_path = dataset_dir / "VOCdevkit/VOC2010"

    # BUGFIX: the three files were opened and never closed; use context
    # managers so the handles are released.
    with open(pc_path / "pascalcontext_val.txt", "r") as f:
        val_ids = [line.strip() for line in f]

    # labels.txt has one "<id>: <name>" entry per line -> name -> raw 459 id.
    pc459_dict = {}
    with open(pc_path / "labels.txt", "r") as pc459_labels:
        for line in pc459_labels:
            if ':' in line:
                idx, name = line.split(':')
                pc459_dict[name.strip()] = int(idx.strip())

    # 59_labels.txt: line number -> raw class id (via the name lookup).
    pc59_dict = {}
    with open(pc_path / "59_labels.txt", "r") as pc59_labels:
        for i, line in enumerate(pc59_labels):
            name = line.split(':')[-1].strip()
            # BUGFIX: was `name is not ''` — identity comparison with a
            # literal (SyntaxWarning, implementation-defined).
            if name != '':
                pc59_dict[i] = pc459_dict[name]

    pc459_dir = pc_path / "annotations_detectron2" / "pc459_val"
    pc459_dir.mkdir(parents=True, exist_ok=True)
    pc59_dir = pc_path / "annotations_detectron2" / "pc59_val"
    pc59_dir.mkdir(parents=True, exist_ok=True)

    for fileid in tqdm.tqdm(val_ids):
        ori_mask = f'{pc_path}/trainval/{fileid}.mat'
        pc459_dst = f'{pc459_dir}/{fileid}.tif'
        pc59_dst = f'{pc59_dir}/{fileid}.png'
        if osp.exists(ori_mask):
            convert_pc459(ori_mask, pc459_dst)
            convert_pc59(ori_mask, pc59_dst, pc59_dict)
# VOC class ids 1..20 map to contiguous train ids 0..19; the background
# label (0) and the existing ignore value (255) both map to ignore (255).
# (This reproduces the original hand-written table.)
clsID_to_trID = {0: 255, **{i: i - 1 for i in range(1, 21)}, 255: 255}


def convert_to_trainID(
    maskpath, out_mask_dir, is_train, clsID_to_trID=clsID_to_trID, suffix=""
):
    """Rewrite one VOC segmentation PNG with train ids instead of class ids.

    Pixels whose class id is not in ``clsID_to_trID`` become 255 (ignore).
    Masks that end up fully ignored are skipped (nothing is written).
    """
    mask = np.array(Image.open(maskpath))
    mask_copy = np.full_like(mask, 255, dtype=np.uint8)
    for clsID, trID in clsID_to_trID.items():
        mask_copy[mask == clsID] = trID
    seg_filename = (
        osp.join(out_mask_dir, "train" + suffix, osp.basename(maskpath))
        if is_train
        else osp.join(out_mask_dir, "val" + suffix, osp.basename(maskpath))
    )
    # Hoisted: compute the unique values once instead of twice.
    unique_vals = np.unique(mask_copy)
    if len(unique_vals) == 1 and unique_vals[0] == 255:
        return
    Image.fromarray(mask_copy).save(seg_filename, "PNG")


if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the validation set!')
    voc_path = dataset_dir / "VOCdevkit" / "VOC2012"
    out_mask_dir = voc_path / "annotations_detectron2"
    out_image_dir = voc_path / "images_detectron2"
    for name in ["val"]:
        os.makedirs((out_mask_dir / name), exist_ok=True)
        os.makedirs((out_image_dir / name), exist_ok=True)
    val_list = [
        osp.join(voc_path, "SegmentationClassAug", f + ".png")
        # BUGFIX: `np.str` was removed from NumPy (>=1.24); the builtin
        # `str` is the documented drop-in replacement for the old alias.
        for f in np.loadtxt(
            osp.join(voc_path, "ImageSets/Segmentation/val.txt"), dtype=str
        ).tolist()
    ]
    for file in tqdm.tqdm(val_list):
        convert_to_trainID(file, out_mask_dir, is_train=False)
'counter', 'desk', 'curtain', 'refrigerator', + 'shower curtain', 'toilet', 'sink', 'bathtub', 'otherfurniture') + +SCANNET_COLOR_MAP_20 = { + 0: (0., 0., 0.), + 1: (174., 199., 232.), + 2: (152., 223., 138.), + 3: (31., 119., 180.), + 4: (255., 187., 120.), + 5: (188., 189., 34.), + 6: (140., 86., 75.), + 7: (255., 152., 150.), + 8: (214., 39., 40.), + 9: (197., 176., 213.), + 10: (148., 103., 189.), + 11: (196., 156., 148.), + 12: (23., 190., 207.), + 14: (247., 182., 210.), + 15: (66., 188., 102.), + 16: (219., 219., 141.), + 17: (140., 57., 197.), + 18: (202., 185., 52.), + 19: (51., 176., 203.), + 20: (200., 54., 131.), + 21: (92., 193., 61.), + 22: (78., 71., 183.), + 23: (172., 114., 82.), + 24: (255., 127., 14.), + 25: (91., 163., 138.), + 26: (153., 98., 156.), + 27: (140., 153., 101.), + 28: (158., 218., 229.), + 29: (100., 125., 154.), + 30: (178., 127., 135.), + 32: (146., 111., 194.), + 33: (44., 160., 44.), + 34: (112., 128., 144.), + 35: (96., 207., 209.), + 36: (227., 119., 194.), + 37: (213., 92., 176.), + 38: (94., 106., 211.), + 39: (82., 84., 163.), + 40: (100., 85., 144.), +} + +# ScanNet200 Benchmark constants +VALID_CLASS_IDS_200 = ( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, + 36, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 87, 88, 89, 90, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 110, 112, 115, 116, 118, 120, 121, 122, 125, 128, 130, 131, 132, 134, 136, 138, 139, 140, 141, + 145, 148, 154, 155, 156, 157, 159, 161, 163, 165, 166, 168, 169, 170, 177, 180, 185, 188, 191, 193, 195, 202, 208, + 213, 214, 221, 229, 230, 232, 233, 242, 250, 261, 264, 276, 283, 286, 300, 304, 312, 323, 325, 331, 342, 356, 370, + 392, 395, 399, 408, 417, 488, 540, 562, 570, 572, 581, 609, 748, 776, 1156, 1163, 1164, 1165, 1166, 1167, 1168, + 
1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, + 1189, 1190, 1191) + +CLASS_LABELS_200 = ( + 'wall', 'chair', 'floor', 'table', 'door', 'couch', 'cabinet', 'shelf', 'desk', 'office chair', 'bed', 'pillow', + 'sink', 'picture', 'window', 'toilet', 'bookshelf', 'monitor', 'curtain', 'book', 'armchair', 'coffee table', 'box', + 'refrigerator', 'lamp', 'kitchen cabinet', 'towel', 'clothes', 'tv', 'nightstand', 'counter', 'dresser', 'stool', + 'cushion', 'plant', 'ceiling', 'bathtub', 'end table', 'dining table', 'keyboard', 'bag', 'backpack', + 'toilet paper', 'printer', 'tv stand', 'whiteboard', 'blanket', 'shower curtain', 'trash can', 'closet', 'stairs', + 'microwave', 'stove', 'shoe', 'computer tower', 'bottle', 'bin', 'ottoman', 'bench', 'board', 'washing machine', + 'mirror', 'copier', 'basket', 'sofa chair', 'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person', + 'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', 'blackboard', 'piano', 'suitcase', 'rail', 'radiator', + 'recycling bin', 'container', 'wardrobe', 'soap dispenser', 'telephone', 'bucket', 'clock', 'stand', 'light', + 'laundry basket', 'pipe', 'clothes dryer', 'guitar', 'toilet paper holder', 'seat', 'speaker', 'column', 'bicycle', + 'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket', 'storage bin', 'coffee maker', 'dishwasher', + 'paper towel roll', 'machine', 'mat', 'windowsill', 'bar', 'toaster', 'bulletin board', 'ironing board', + 'fireplace', 'soap dish', 'kitchen counter', 'doorframe', 'toilet paper dispenser', 'mini fridge', + 'fire extinguisher', 'ball', 'hat', 'shower curtain rod', 'water cooler', 'paper cutter', 'tray', 'shower door', + 'pillar', 'ledge', 'toaster oven', 'mouse', 'toilet seat cover dispenser', 'furniture', 'cart', 'storage container', + 'scale', 'tissue box', 'light switch', 'crate', 'power outlet', 'decoration', 'sign', 'projector', 'closet door', + 'vacuum cleaner', 'candle', 
'plunger', 'stuffed animal', 'headphones', 'dish rack', 'broom', 'guitar case', + 'range hood', 'dustpan', 'hair dryer', 'water bottle', 'handicap bar', 'purse', 'vent', 'shower floor', + 'water pitcher', 'mailbox', 'bowl', 'paper bag', 'alarm clock', 'music stand', 'projector screen', 'divider', + 'laundry detergent', 'bathroom counter', 'object', 'bathroom vanity', 'closet wall', 'laundry hamper', + 'bathroom stall door', 'ceiling light', 'trash bin', 'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', + 'cd case', 'closet rod', 'coffee kettle', 'structure', 'shower head', 'keyboard piano', 'case of water bottles', + 'coat rack', 'storage organizer', 'folded chair', 'fire alarm', 'power strip', 'calendar', 'poster', 'potted plant', + 'luggage', 'mattress') + +SCANNET_COLOR_MAP_200 = { + 0: (0., 0., 0.), + 1: (174., 199., 232.), + 2: (188., 189., 34.), + 3: (152., 223., 138.), + 4: (255., 152., 150.), + 5: (214., 39., 40.), + 6: (91., 135., 229.), + 7: (31., 119., 180.), + 8: (229., 91., 104.), + 9: (247., 182., 210.), + 10: (91., 229., 110.), + 11: (255., 187., 120.), + 13: (141., 91., 229.), + 14: (112., 128., 144.), + 15: (196., 156., 148.), + 16: (197., 176., 213.), + 17: (44., 160., 44.), + 18: (148., 103., 189.), + 19: (229., 91., 223.), + 21: (219., 219., 141.), + 22: (192., 229., 91.), + 23: (88., 218., 137.), + 24: (58., 98., 137.), + 26: (177., 82., 239.), + 27: (255., 127., 14.), + 28: (237., 204., 37.), + 29: (41., 206., 32.), + 31: (62., 143., 148.), + 32: (34., 14., 130.), + 33: (143., 45., 115.), + 34: (137., 63., 14.), + 35: (23., 190., 207.), + 36: (16., 212., 139.), + 38: (90., 119., 201.), + 39: (125., 30., 141.), + 40: (150., 53., 56.), + 41: (186., 197., 62.), + 42: (227., 119., 194.), + 44: (38., 100., 128.), + 45: (120., 31., 243.), + 46: (154., 59., 103.), + 47: (169., 137., 78.), + 48: (143., 245., 111.), + 49: (37., 230., 205.), + 50: (14., 16., 155.), + 51: (196., 51., 182.), + 52: (237., 80., 38.), + 54: (138., 175., 62.), + 55: 
(158., 218., 229.), + 56: (38., 96., 167.), + 57: (190., 77., 246.), + 58: (208., 49., 84.), + 59: (208., 193., 72.), + 62: (55., 220., 57.), + 63: (10., 125., 140.), + 64: (76., 38., 202.), + 65: (191., 28., 135.), + 66: (211., 120., 42.), + 67: (118., 174., 76.), + 68: (17., 242., 171.), + 69: (20., 65., 247.), + 70: (208., 61., 222.), + 71: (162., 62., 60.), + 72: (210., 235., 62.), + 73: (45., 152., 72.), + 74: (35., 107., 149.), + 75: (160., 89., 237.), + 76: (227., 56., 125.), + 77: (169., 143., 81.), + 78: (42., 143., 20.), + 79: (25., 160., 151.), + 80: (82., 75., 227.), + 82: (253., 59., 222.), + 84: (240., 130., 89.), + 86: (123., 172., 47.), + 87: (71., 194., 133.), + 88: (24., 94., 205.), + 89: (134., 16., 179.), + 90: (159., 32., 52.), + 93: (213., 208., 88.), + 95: (64., 158., 70.), + 96: (18., 163., 194.), + 97: (65., 29., 153.), + 98: (177., 10., 109.), + 99: (152., 83., 7.), + 100: (83., 175., 30.), + 101: (18., 199., 153.), + 102: (61., 81., 208.), + 103: (213., 85., 216.), + 104: (170., 53., 42.), + 105: (161., 192., 38.), + 106: (23., 241., 91.), + 107: (12., 103., 170.), + 110: (151., 41., 245.), + 112: (133., 51., 80.), + 115: (184., 162., 91.), + 116: (50., 138., 38.), + 118: (31., 237., 236.), + 120: (39., 19., 208.), + 121: (223., 27., 180.), + 122: (254., 141., 85.), + 125: (97., 144., 39.), + 128: (106., 231., 176.), + 130: (12., 61., 162.), + 131: (124., 66., 140.), + 132: (137., 66., 73.), + 134: (250., 253., 26.), + 136: (55., 191., 73.), + 138: (60., 126., 146.), + 139: (153., 108., 234.), + 140: (184., 58., 125.), + 141: (135., 84., 14.), + 145: (139., 248., 91.), + 148: (53., 200., 172.), + 154: (63., 69., 134.), + 155: (190., 75., 186.), + 156: (127., 63., 52.), + 157: (141., 182., 25.), + 159: (56., 144., 89.), + 161: (64., 160., 250.), + 163: (182., 86., 245.), + 165: (139., 18., 53.), + 166: (134., 120., 54.), + 168: (49., 165., 42.), + 169: (51., 128., 133.), + 170: (44., 21., 163.), + 177: (232., 93., 193.), + 180: (176., 
102., 54.), + 185: (116., 217., 17.), + 188: (54., 209., 150.), + 191: (60., 99., 204.), + 193: (129., 43., 144.), + 195: (252., 100., 106.), + 202: (187., 196., 73.), + 208: (13., 158., 40.), + 213: (52., 122., 152.), + 214: (128., 76., 202.), + 221: (187., 50., 115.), + 229: (180., 141., 71.), + 230: (77., 208., 35.), + 232: (72., 183., 168.), + 233: (97., 99., 203.), + 242: (172., 22., 158.), + 250: (155., 64., 40.), + 261: (118., 159., 30.), + 264: (69., 252., 148.), + 276: (45., 103., 173.), + 283: (111., 38., 149.), + 286: (184., 9., 49.), + 300: (188., 174., 67.), + 304: (53., 206., 53.), + 312: (97., 235., 252.), + 323: (66., 32., 182.), + 325: (236., 114., 195.), + 331: (241., 154., 83.), + 342: (133., 240., 52.), + 356: (16., 205., 144.), + 370: (75., 101., 198.), + 392: (237., 95., 251.), + 395: (191., 52., 49.), + 399: (227., 254., 54.), + 408: (49., 206., 87.), + 417: (48., 113., 150.), + 488: (125., 73., 182.), + 540: (229., 32., 114.), + 562: (158., 119., 28.), + 570: (60., 205., 27.), + 572: (18., 215., 201.), + 581: (79., 76., 153.), + 609: (134., 13., 116.), + 748: (192., 97., 63.), + 776: (108., 163., 18.), + 1156: (95., 220., 156.), + 1163: (98., 141., 208.), + 1164: (144., 19., 193.), + 1165: (166., 36., 57.), + 1166: (212., 202., 34.), + 1167: (23., 206., 34.), + 1168: (91., 211., 236.), + 1169: (79., 55., 137.), + 1170: (182., 19., 117.), + 1171: (134., 76., 14.), + 1172: (87., 185., 28.), + 1173: (82., 224., 187.), + 1174: (92., 110., 214.), + 1175: (168., 80., 171.), + 1176: (197., 63., 51.), + 1178: (175., 199., 77.), + 1179: (62., 180., 98.), + 1180: (8., 91., 150.), + 1181: (77., 15., 130.), + 1182: (154., 65., 96.), + 1183: (197., 152., 11.), + 1184: (59., 155., 45.), + 1185: (12., 147., 145.), + 1186: (54., 35., 219.), + 1187: (210., 73., 181.), + 1188: (221., 124., 77.), + 1189: (149., 214., 66.), + 1190: (72., 185., 134.), + 1191: (42., 94., 198.), +} + +# For instance segmentation the non-object categories +VALID_PANOPTIC_IDS = (1, 
3) + +CLASS_LABELS_PANOPTIC = ('wall', 'floor') diff --git a/datasets/scannet_preprocess/meta_data/scannet200_splits.py b/datasets/scannet_preprocess/meta_data/scannet200_splits.py new file mode 100644 index 0000000000000000000000000000000000000000..9e66fc81f2c48e7df5ce328dc89f11ad3f4eb98a --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannet200_splits.py @@ -0,0 +1,18 @@ +# This file contains the HEAD - COMMON - TAIL split category ids for ScanNet 200 + +HEAD_CATS_SCANNET_200 = ['tv stand', 'curtain', 'blinds', 'shower curtain', 'bookshelf', 'tv', 'kitchen cabinet', 'pillow', 'lamp', 'dresser', 'monitor', 'object', 'ceiling', 'board', 'stove', 'closet wall', 'couch', 'office chair', 'kitchen counter', 'shower', 'closet', 'doorframe', 'sofa chair', 'mailbox', 'nightstand', 'washing machine', 'picture', 'book', 'sink', 'recycling bin', 'table', 'backpack', 'shower wall', 'toilet', 'copier', 'counter', 'stool', 'refrigerator', 'window', 'file cabinet', 'chair', 'wall', 'plant', 'coffee table', 'stairs', 'armchair', 'cabinet', 'bathroom vanity', 'bathroom stall', 'mirror', 'blackboard', 'trash can', 'stair rail', 'box', 'towel', 'door', 'clothes', 'whiteboard', 'bed', 'floor', 'bathtub', 'desk', 'wardrobe', 'clothes dryer', 'radiator', 'shelf'] +COMMON_CATS_SCANNET_200 = ["cushion", "end table", "dining table", "keyboard", "bag", "toilet paper", "printer", "blanket", "microwave", "shoe", "computer tower", "bottle", "bin", "ottoman", "bench", "basket", "fan", "laptop", "person", "paper towel dispenser", "oven", "rack", "piano", "suitcase", "rail", "container", "telephone", "stand", "light", "laundry basket", "pipe", "seat", "column", "bicycle", "ladder", "jacket", "storage bin", "coffee maker", "dishwasher", "machine", "mat", "windowsill", "bulletin board", "fireplace", "mini fridge", "water cooler", "shower door", "pillar", "ledge", "furniture", "cart", "decoration", "closet door", "vacuum cleaner", "dish rack", "range hood", "projector screen", "divider", 
"bathroom counter", "laundry hamper", "bathroom stall door", "ceiling light", "trash bin", "bathroom cabinet", "structure", "storage organizer", "potted plant", "mattress"] +TAIL_CATS_SCANNET_200 = ["paper", "plate", "soap dispenser", "bucket", "clock", "guitar", "toilet paper holder", "speaker", "cup", "paper towel roll", "bar", "toaster", "ironing board", "soap dish", "toilet paper dispenser", "fire extinguisher", "ball", "hat", "shower curtain rod", "paper cutter", "tray", "toaster oven", "mouse", "toilet seat cover dispenser", "storage container", "scale", "tissue box", "light switch", "crate", "power outlet", "sign", "projector", "candle", "plunger", "stuffed animal", "headphones", "broom", "guitar case", "dustpan", "hair dryer", "water bottle", "handicap bar", "purse", "vent", "shower floor", "water pitcher", "bowl", "paper bag", "alarm clock", "music stand", "laundry detergent", "dumbbell", "tube", "cd case", "closet rod", "coffee kettle", "shower head", "keyboard piano", "case of water bottles", "coat rack", "folded chair", "fire alarm", "power strip", "calendar", "poster", "luggage"] + + +# Given the different size of the official train and val sets, not all ScanNet200 categories are present in the validation set. 
+# Here we list the categories (labels and IDs) present in both the train and validation sets, and the remaining categories that are present in train but not in val +# We don't evaluate on unseen validation categories in this benchmark + +VALID_CLASS_IDS_200_VALIDATION = ('wall', 'chair', 'floor', 'table', 'door', 'couch', 'cabinet', 'shelf', 'desk', 'office chair', 'bed', 'pillow', 'sink', 'picture', 'window', 'toilet', 'bookshelf', 'monitor', 'curtain', 'book', 'armchair', 'coffee table', 'box', 'refrigerator', 'lamp', 'kitchen cabinet', 'towel', 'clothes', 'tv', 'nightstand', 'counter', 'dresser', 'stool', 'cushion', 'plant', 'ceiling', 'bathtub', 'end table', 'dining table', 'keyboard', 'bag', 'backpack', 'toilet paper', 'printer', 'tv stand', 'whiteboard', 'blanket', 'shower curtain', 'trash can', 'closet', 'stairs', 'microwave', 'stove', 'shoe', 'computer tower', 'bottle', 'bin', 'ottoman', 'bench', 'board', 'washing machine', 'mirror', 'copier', 'basket', 'sofa chair', 'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person', 'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', 'blackboard', 'piano', 'suitcase', 'rail', 'radiator', 'recycling bin', 'container', 'wardrobe', 'soap dispenser', 'telephone', 'bucket', 'clock', 'stand', 'light', 'laundry basket', 'pipe', 'clothes dryer', 'guitar', 'toilet paper holder', 'seat', 'speaker', 'column', 'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket', 'storage bin', 'coffee maker', 'dishwasher', 'paper towel roll', 'machine', 'mat', 'windowsill', 'bar', 'toaster', 'bulletin board', 'ironing board', 'fireplace', 'soap dish', 'kitchen counter', 'doorframe', 'toilet paper dispenser', 'mini fridge', 'fire extinguisher', 'ball', 'hat', 'shower curtain rod', 'water cooler', 'paper cutter', 'tray', 'shower door', 'pillar', 'ledge', 'toaster oven', 'mouse', 'toilet seat cover dispenser', 'furniture', 'cart', 'scale', 'tissue box', 'light switch', 'crate', 'power outlet', 'decoration', 'sign', 'projector', 
'closet door', 'vacuum cleaner', 'plunger', 'stuffed animal', 'headphones', 'dish rack', 'broom', 'range hood', 'dustpan', 'hair dryer', 'water bottle', 'handicap bar', 'vent', 'shower floor', 'water pitcher', 'mailbox', 'bowl', 'paper bag', 'projector screen', 'divider', 'laundry detergent', 'bathroom counter', 'object', 'bathroom vanity', 'closet wall', 'laundry hamper', 'bathroom stall door', 'ceiling light', 'trash bin', 'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', 'closet rod', 'coffee kettle', 'shower head', 'keyboard piano', 'case of water bottles', 'coat rack', 'folded chair', 'fire alarm', 'power strip', 'calendar', 'poster', 'potted plant', 'mattress') + +CLASS_LABELS_200_VALIDATION = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 87, 88, 89, 90, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 110, 112, 115, 116, 118, 120, 122, 125, 128, 130, 131, 132, 134, 136, 138, 139, 140, 141, 145, 148, 154, 155, 156, 157, 159, 161, 163, 165, 166, 168, 169, 170, 177, 180, 185, 188, 191, 193, 195, 202, 208, 213, 214, 229, 230, 232, 233, 242, 250, 261, 264, 276, 283, 300, 304, 312, 323, 325, 342, 356, 370, 392, 395, 408, 417, 488, 540, 562, 570, 609, 748, 776, 1156, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1175, 1176, 1179, 1180, 1181, 1182, 1184, 1185, 1186, 1187, 1188, 1189, 1191) + +VALID_CLASS_IDS_200_TRAIN_ONLY = ('bicycle', 'storage container', 'candle', 'guitar case', 'purse', 'alarm clock', 'music stand', 'cd case', 'structure', 'storage organizer', 'luggage') + +CLASS_LABELS_200_TRAIN_ONLY = (121, 221, 286, 331, 399, 572, 581, 1174, 1178, 1183, 1190) \ No newline at end of file diff --git a/datasets/scannet_preprocess/meta_data/scannet_means.npz 
b/datasets/scannet_preprocess/meta_data/scannet_means.npz new file mode 100644 index 0000000000000000000000000000000000000000..d9bbb4f7c3b72dbe81fbeb86f594066b883fafaf --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannet_means.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df5c2bd40e8518e982c7d7b4b39020b07ac774695038bf49cb28b44e5760457e +size 676 diff --git a/datasets/scannet_preprocess/meta_data/scannetv1_test.txt b/datasets/scannet_preprocess/meta_data/scannetv1_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9e7d9205321e8ca047a527466f4b7100c9c9d2c --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv1_test.txt @@ -0,0 +1,312 @@ +scene0568_00 +scene0568_01 +scene0568_02 +scene0304_00 +scene0488_00 +scene0488_01 +scene0412_00 +scene0412_01 +scene0217_00 +scene0019_00 +scene0019_01 +scene0414_00 +scene0575_00 +scene0575_01 +scene0575_02 +scene0426_00 +scene0426_01 +scene0426_02 +scene0426_03 +scene0549_00 +scene0549_01 +scene0578_00 +scene0578_01 +scene0578_02 +scene0665_00 +scene0665_01 +scene0050_00 +scene0050_01 +scene0050_02 +scene0257_00 +scene0025_00 +scene0025_01 +scene0025_02 +scene0583_00 +scene0583_01 +scene0583_02 +scene0701_00 +scene0701_01 +scene0701_02 +scene0580_00 +scene0580_01 +scene0565_00 +scene0169_00 +scene0169_01 +scene0655_00 +scene0655_01 +scene0655_02 +scene0063_00 +scene0221_00 +scene0221_01 +scene0591_00 +scene0591_01 +scene0591_02 +scene0678_00 +scene0678_01 +scene0678_02 +scene0462_00 +scene0427_00 +scene0595_00 +scene0193_00 +scene0193_01 +scene0164_00 +scene0164_01 +scene0164_02 +scene0164_03 +scene0598_00 +scene0598_01 +scene0598_02 +scene0599_00 +scene0599_01 +scene0599_02 +scene0328_00 +scene0300_00 +scene0300_01 +scene0354_00 +scene0458_00 +scene0458_01 +scene0423_00 +scene0423_01 +scene0423_02 +scene0307_00 +scene0307_01 +scene0307_02 +scene0606_00 +scene0606_01 +scene0606_02 +scene0432_00 +scene0432_01 +scene0608_00 +scene0608_01 +scene0608_02 
+scene0651_00 +scene0651_01 +scene0651_02 +scene0430_00 +scene0430_01 +scene0689_00 +scene0357_00 +scene0357_01 +scene0574_00 +scene0574_01 +scene0574_02 +scene0329_00 +scene0329_01 +scene0329_02 +scene0153_00 +scene0153_01 +scene0616_00 +scene0616_01 +scene0671_00 +scene0671_01 +scene0618_00 +scene0382_00 +scene0382_01 +scene0490_00 +scene0621_00 +scene0607_00 +scene0607_01 +scene0149_00 +scene0695_00 +scene0695_01 +scene0695_02 +scene0695_03 +scene0389_00 +scene0377_00 +scene0377_01 +scene0377_02 +scene0342_00 +scene0139_00 +scene0629_00 +scene0629_01 +scene0629_02 +scene0496_00 +scene0633_00 +scene0633_01 +scene0518_00 +scene0652_00 +scene0406_00 +scene0406_01 +scene0406_02 +scene0144_00 +scene0144_01 +scene0494_00 +scene0278_00 +scene0278_01 +scene0316_00 +scene0609_00 +scene0609_01 +scene0609_02 +scene0609_03 +scene0084_00 +scene0084_01 +scene0084_02 +scene0696_00 +scene0696_01 +scene0696_02 +scene0351_00 +scene0351_01 +scene0643_00 +scene0644_00 +scene0645_00 +scene0645_01 +scene0645_02 +scene0081_00 +scene0081_01 +scene0081_02 +scene0647_00 +scene0647_01 +scene0535_00 +scene0353_00 +scene0353_01 +scene0353_02 +scene0559_00 +scene0559_01 +scene0559_02 +scene0593_00 +scene0593_01 +scene0246_00 +scene0653_00 +scene0653_01 +scene0064_00 +scene0064_01 +scene0356_00 +scene0356_01 +scene0356_02 +scene0030_00 +scene0030_01 +scene0030_02 +scene0222_00 +scene0222_01 +scene0338_00 +scene0338_01 +scene0338_02 +scene0378_00 +scene0378_01 +scene0378_02 +scene0660_00 +scene0553_00 +scene0553_01 +scene0553_02 +scene0527_00 +scene0663_00 +scene0663_01 +scene0663_02 +scene0664_00 +scene0664_01 +scene0664_02 +scene0334_00 +scene0334_01 +scene0334_02 +scene0046_00 +scene0046_01 +scene0046_02 +scene0203_00 +scene0203_01 +scene0203_02 +scene0088_00 +scene0088_01 +scene0088_02 +scene0088_03 +scene0086_00 +scene0086_01 +scene0086_02 +scene0670_00 +scene0670_01 +scene0256_00 +scene0256_01 +scene0256_02 +scene0249_00 +scene0441_00 +scene0658_00 +scene0704_00 +scene0704_01 
+scene0187_00 +scene0187_01 +scene0131_00 +scene0131_01 +scene0131_02 +scene0207_00 +scene0207_01 +scene0207_02 +scene0461_00 +scene0011_00 +scene0011_01 +scene0343_00 +scene0251_00 +scene0077_00 +scene0077_01 +scene0684_00 +scene0684_01 +scene0550_00 +scene0686_00 +scene0686_01 +scene0686_02 +scene0208_00 +scene0500_00 +scene0500_01 +scene0552_00 +scene0552_01 +scene0648_00 +scene0648_01 +scene0435_00 +scene0435_01 +scene0435_02 +scene0435_03 +scene0690_00 +scene0690_01 +scene0693_00 +scene0693_01 +scene0693_02 +scene0700_00 +scene0700_01 +scene0700_02 +scene0699_00 +scene0231_00 +scene0231_01 +scene0231_02 +scene0697_00 +scene0697_01 +scene0697_02 +scene0697_03 +scene0474_00 +scene0474_01 +scene0474_02 +scene0474_03 +scene0474_04 +scene0474_05 +scene0355_00 +scene0355_01 +scene0146_00 +scene0146_01 +scene0146_02 +scene0196_00 +scene0702_00 +scene0702_01 +scene0702_02 +scene0314_00 +scene0277_00 +scene0277_01 +scene0277_02 +scene0095_00 +scene0095_01 +scene0015_00 +scene0100_00 +scene0100_01 +scene0100_02 +scene0558_00 +scene0558_01 +scene0558_02 +scene0685_00 +scene0685_01 +scene0685_02 diff --git a/datasets/scannet_preprocess/meta_data/scannetv1_train.txt b/datasets/scannet_preprocess/meta_data/scannetv1_train.txt new file mode 100644 index 0000000000000000000000000000000000000000..7520948c8170df9ae1a9e8a40bc444fcc7cc0772 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv1_train.txt @@ -0,0 +1,1045 @@ +scene0191_00 +scene0191_01 +scene0191_02 +scene0119_00 +scene0230_00 +scene0528_00 +scene0528_01 +scene0705_00 +scene0705_01 +scene0705_02 +scene0415_00 +scene0415_01 +scene0415_02 +scene0007_00 +scene0141_00 +scene0141_01 +scene0141_02 +scene0515_00 +scene0515_01 +scene0515_02 +scene0447_00 +scene0447_01 +scene0447_02 +scene0531_00 +scene0503_00 +scene0285_00 +scene0069_00 +scene0584_00 +scene0584_01 +scene0584_02 +scene0581_00 +scene0581_01 +scene0581_02 +scene0620_00 +scene0620_01 +scene0263_00 +scene0263_01 +scene0481_00 +scene0481_01 
+scene0020_00 +scene0020_01 +scene0291_00 +scene0291_01 +scene0291_02 +scene0469_00 +scene0469_01 +scene0469_02 +scene0659_00 +scene0659_01 +scene0024_00 +scene0024_01 +scene0024_02 +scene0564_00 +scene0117_00 +scene0027_00 +scene0027_01 +scene0027_02 +scene0028_00 +scene0330_00 +scene0418_00 +scene0418_01 +scene0418_02 +scene0233_00 +scene0233_01 +scene0673_00 +scene0673_01 +scene0673_02 +scene0673_03 +scene0673_04 +scene0673_05 +scene0585_00 +scene0585_01 +scene0362_00 +scene0362_01 +scene0362_02 +scene0362_03 +scene0035_00 +scene0035_01 +scene0358_00 +scene0358_01 +scene0358_02 +scene0037_00 +scene0194_00 +scene0321_00 +scene0293_00 +scene0293_01 +scene0623_00 +scene0623_01 +scene0592_00 +scene0592_01 +scene0569_00 +scene0569_01 +scene0413_00 +scene0313_00 +scene0313_01 +scene0313_02 +scene0480_00 +scene0480_01 +scene0401_00 +scene0517_00 +scene0517_01 +scene0517_02 +scene0032_00 +scene0032_01 +scene0613_00 +scene0613_01 +scene0613_02 +scene0306_00 +scene0306_01 +scene0052_00 +scene0052_01 +scene0052_02 +scene0053_00 +scene0444_00 +scene0444_01 +scene0055_00 +scene0055_01 +scene0055_02 +scene0560_00 +scene0589_00 +scene0589_01 +scene0589_02 +scene0610_00 +scene0610_01 +scene0610_02 +scene0364_00 +scene0364_01 +scene0383_00 +scene0383_01 +scene0383_02 +scene0006_00 +scene0006_01 +scene0006_02 +scene0275_00 +scene0451_00 +scene0451_01 +scene0451_02 +scene0451_03 +scene0451_04 +scene0451_05 +scene0135_00 +scene0065_00 +scene0065_01 +scene0065_02 +scene0104_00 +scene0674_00 +scene0674_01 +scene0448_00 +scene0448_01 +scene0448_02 +scene0502_00 +scene0502_01 +scene0502_02 +scene0440_00 +scene0440_01 +scene0440_02 +scene0071_00 +scene0072_00 +scene0072_01 +scene0072_02 +scene0509_00 +scene0509_01 +scene0509_02 +scene0649_00 +scene0649_01 +scene0602_00 +scene0694_00 +scene0694_01 +scene0101_00 +scene0101_01 +scene0101_02 +scene0101_03 +scene0101_04 +scene0101_05 +scene0218_00 +scene0218_01 +scene0579_00 +scene0579_01 +scene0579_02 +scene0039_00 +scene0039_01 
+scene0493_00 +scene0493_01 +scene0242_00 +scene0242_01 +scene0242_02 +scene0083_00 +scene0083_01 +scene0127_00 +scene0127_01 +scene0662_00 +scene0662_01 +scene0662_02 +scene0018_00 +scene0087_00 +scene0087_01 +scene0087_02 +scene0332_00 +scene0332_01 +scene0332_02 +scene0628_00 +scene0628_01 +scene0628_02 +scene0134_00 +scene0134_01 +scene0134_02 +scene0238_00 +scene0238_01 +scene0092_00 +scene0092_01 +scene0092_02 +scene0092_03 +scene0092_04 +scene0022_00 +scene0022_01 +scene0467_00 +scene0392_00 +scene0392_01 +scene0392_02 +scene0424_00 +scene0424_01 +scene0424_02 +scene0646_00 +scene0646_01 +scene0646_02 +scene0098_00 +scene0098_01 +scene0044_00 +scene0044_01 +scene0044_02 +scene0510_00 +scene0510_01 +scene0510_02 +scene0571_00 +scene0571_01 +scene0166_00 +scene0166_01 +scene0166_02 +scene0563_00 +scene0172_00 +scene0172_01 +scene0388_00 +scene0388_01 +scene0215_00 +scene0215_01 +scene0252_00 +scene0287_00 +scene0668_00 +scene0572_00 +scene0572_01 +scene0572_02 +scene0026_00 +scene0224_00 +scene0113_00 +scene0113_01 +scene0551_00 +scene0381_00 +scene0381_01 +scene0381_02 +scene0371_00 +scene0371_01 +scene0460_00 +scene0118_00 +scene0118_01 +scene0118_02 +scene0417_00 +scene0008_00 +scene0634_00 +scene0521_00 +scene0123_00 +scene0123_01 +scene0123_02 +scene0045_00 +scene0045_01 +scene0511_00 +scene0511_01 +scene0114_00 +scene0114_01 +scene0114_02 +scene0070_00 +scene0029_00 +scene0029_01 +scene0029_02 +scene0129_00 +scene0103_00 +scene0103_01 +scene0002_00 +scene0002_01 +scene0132_00 +scene0132_01 +scene0132_02 +scene0124_00 +scene0124_01 +scene0143_00 +scene0143_01 +scene0143_02 +scene0604_00 +scene0604_01 +scene0604_02 +scene0507_00 +scene0105_00 +scene0105_01 +scene0105_02 +scene0428_00 +scene0428_01 +scene0311_00 +scene0140_00 +scene0140_01 +scene0182_00 +scene0182_01 +scene0182_02 +scene0142_00 +scene0142_01 +scene0399_00 +scene0399_01 +scene0012_00 +scene0012_01 +scene0012_02 +scene0060_00 +scene0060_01 +scene0370_00 +scene0370_01 +scene0370_02 
+scene0310_00 +scene0310_01 +scene0310_02 +scene0661_00 +scene0650_00 +scene0152_00 +scene0152_01 +scene0152_02 +scene0158_00 +scene0158_01 +scene0158_02 +scene0482_00 +scene0482_01 +scene0600_00 +scene0600_01 +scene0600_02 +scene0393_00 +scene0393_01 +scene0393_02 +scene0562_00 +scene0174_00 +scene0174_01 +scene0157_00 +scene0157_01 +scene0161_00 +scene0161_01 +scene0161_02 +scene0159_00 +scene0254_00 +scene0254_01 +scene0115_00 +scene0115_01 +scene0115_02 +scene0162_00 +scene0163_00 +scene0163_01 +scene0523_00 +scene0523_01 +scene0523_02 +scene0459_00 +scene0459_01 +scene0175_00 +scene0085_00 +scene0085_01 +scene0279_00 +scene0279_01 +scene0279_02 +scene0201_00 +scene0201_01 +scene0201_02 +scene0283_00 +scene0456_00 +scene0456_01 +scene0429_00 +scene0043_00 +scene0043_01 +scene0419_00 +scene0419_01 +scene0419_02 +scene0368_00 +scene0368_01 +scene0348_00 +scene0348_01 +scene0348_02 +scene0442_00 +scene0178_00 +scene0380_00 +scene0380_01 +scene0380_02 +scene0165_00 +scene0165_01 +scene0165_02 +scene0181_00 +scene0181_01 +scene0181_02 +scene0181_03 +scene0333_00 +scene0614_00 +scene0614_01 +scene0614_02 +scene0404_00 +scene0404_01 +scene0404_02 +scene0185_00 +scene0126_00 +scene0126_01 +scene0126_02 +scene0519_00 +scene0236_00 +scene0236_01 +scene0189_00 +scene0075_00 +scene0267_00 +scene0192_00 +scene0192_01 +scene0192_02 +scene0281_00 +scene0420_00 +scene0420_01 +scene0420_02 +scene0195_00 +scene0195_01 +scene0195_02 +scene0597_00 +scene0597_01 +scene0597_02 +scene0041_00 +scene0041_01 +scene0111_00 +scene0111_01 +scene0111_02 +scene0666_00 +scene0666_01 +scene0666_02 +scene0200_00 +scene0200_01 +scene0200_02 +scene0536_00 +scene0536_01 +scene0536_02 +scene0390_00 +scene0280_00 +scene0280_01 +scene0280_02 +scene0344_00 +scene0344_01 +scene0205_00 +scene0205_01 +scene0205_02 +scene0484_00 +scene0484_01 +scene0009_00 +scene0009_01 +scene0009_02 +scene0302_00 +scene0302_01 +scene0209_00 +scene0209_01 +scene0209_02 +scene0210_00 +scene0210_01 +scene0395_00 
+scene0395_01 +scene0395_02 +scene0683_00 +scene0601_00 +scene0601_01 +scene0214_00 +scene0214_01 +scene0214_02 +scene0477_00 +scene0477_01 +scene0439_00 +scene0439_01 +scene0468_00 +scene0468_01 +scene0468_02 +scene0546_00 +scene0466_00 +scene0466_01 +scene0220_00 +scene0220_01 +scene0220_02 +scene0122_00 +scene0122_01 +scene0130_00 +scene0110_00 +scene0110_01 +scene0110_02 +scene0327_00 +scene0156_00 +scene0266_00 +scene0266_01 +scene0001_00 +scene0001_01 +scene0228_00 +scene0199_00 +scene0219_00 +scene0464_00 +scene0232_00 +scene0232_01 +scene0232_02 +scene0299_00 +scene0299_01 +scene0530_00 +scene0363_00 +scene0453_00 +scene0453_01 +scene0570_00 +scene0570_01 +scene0570_02 +scene0183_00 +scene0239_00 +scene0239_01 +scene0239_02 +scene0373_00 +scene0373_01 +scene0241_00 +scene0241_01 +scene0241_02 +scene0188_00 +scene0622_00 +scene0622_01 +scene0244_00 +scene0244_01 +scene0691_00 +scene0691_01 +scene0206_00 +scene0206_01 +scene0206_02 +scene0247_00 +scene0247_01 +scene0061_00 +scene0061_01 +scene0082_00 +scene0250_00 +scene0250_01 +scene0250_02 +scene0501_00 +scene0501_01 +scene0501_02 +scene0320_00 +scene0320_01 +scene0320_02 +scene0320_03 +scene0631_00 +scene0631_01 +scene0631_02 +scene0255_00 +scene0255_01 +scene0255_02 +scene0047_00 +scene0265_00 +scene0265_01 +scene0265_02 +scene0004_00 +scene0336_00 +scene0336_01 +scene0058_00 +scene0058_01 +scene0260_00 +scene0260_01 +scene0260_02 +scene0243_00 +scene0603_00 +scene0603_01 +scene0093_00 +scene0093_01 +scene0093_02 +scene0109_00 +scene0109_01 +scene0434_00 +scene0434_01 +scene0434_02 +scene0290_00 +scene0627_00 +scene0627_01 +scene0470_00 +scene0470_01 +scene0137_00 +scene0137_01 +scene0137_02 +scene0270_00 +scene0270_01 +scene0270_02 +scene0271_00 +scene0271_01 +scene0504_00 +scene0274_00 +scene0274_01 +scene0274_02 +scene0036_00 +scene0036_01 +scene0276_00 +scene0276_01 +scene0272_00 +scene0272_01 +scene0499_00 +scene0698_00 +scene0698_01 +scene0051_00 +scene0051_01 +scene0051_02 +scene0051_03 
+scene0108_00 +scene0245_00 +scene0369_00 +scene0369_01 +scene0369_02 +scene0284_00 +scene0289_00 +scene0289_01 +scene0286_00 +scene0286_01 +scene0286_02 +scene0286_03 +scene0031_00 +scene0031_01 +scene0031_02 +scene0545_00 +scene0545_01 +scene0545_02 +scene0557_00 +scene0557_01 +scene0557_02 +scene0533_00 +scene0533_01 +scene0116_00 +scene0116_01 +scene0116_02 +scene0611_00 +scene0611_01 +scene0688_00 +scene0294_00 +scene0294_01 +scene0294_02 +scene0295_00 +scene0295_01 +scene0296_00 +scene0296_01 +scene0596_00 +scene0596_01 +scene0596_02 +scene0532_00 +scene0532_01 +scene0637_00 +scene0638_00 +scene0121_00 +scene0121_01 +scene0121_02 +scene0040_00 +scene0040_01 +scene0197_00 +scene0197_01 +scene0197_02 +scene0410_00 +scene0410_01 +scene0305_00 +scene0305_01 +scene0615_00 +scene0615_01 +scene0703_00 +scene0703_01 +scene0555_00 +scene0297_00 +scene0297_01 +scene0297_02 +scene0582_00 +scene0582_01 +scene0582_02 +scene0023_00 +scene0094_00 +scene0013_00 +scene0013_01 +scene0013_02 +scene0136_00 +scene0136_01 +scene0136_02 +scene0407_00 +scene0407_01 +scene0062_00 +scene0062_01 +scene0062_02 +scene0386_00 +scene0318_00 +scene0554_00 +scene0554_01 +scene0497_00 +scene0213_00 +scene0258_00 +scene0323_00 +scene0323_01 +scene0324_00 +scene0324_01 +scene0016_00 +scene0016_01 +scene0016_02 +scene0681_00 +scene0398_00 +scene0398_01 +scene0227_00 +scene0090_00 +scene0066_00 +scene0262_00 +scene0262_01 +scene0155_00 +scene0155_01 +scene0155_02 +scene0352_00 +scene0352_01 +scene0352_02 +scene0038_00 +scene0038_01 +scene0038_02 +scene0335_00 +scene0335_01 +scene0335_02 +scene0261_00 +scene0261_01 +scene0261_02 +scene0261_03 +scene0640_00 +scene0640_01 +scene0640_02 +scene0080_00 +scene0080_01 +scene0080_02 +scene0403_00 +scene0403_01 +scene0282_00 +scene0282_01 +scene0282_02 +scene0682_00 +scene0173_00 +scene0173_01 +scene0173_02 +scene0522_00 +scene0687_00 +scene0345_00 +scene0345_01 +scene0612_00 +scene0612_01 +scene0411_00 +scene0411_01 +scene0411_02 +scene0625_00 
+scene0625_01 +scene0211_00 +scene0211_01 +scene0211_02 +scene0211_03 +scene0676_00 +scene0676_01 +scene0179_00 +scene0498_00 +scene0498_01 +scene0498_02 +scene0547_00 +scene0547_01 +scene0547_02 +scene0269_00 +scene0269_01 +scene0269_02 +scene0366_00 +scene0680_00 +scene0680_01 +scene0588_00 +scene0588_01 +scene0588_02 +scene0588_03 +scene0346_00 +scene0346_01 +scene0359_00 +scene0359_01 +scene0014_00 +scene0120_00 +scene0120_01 +scene0212_00 +scene0212_01 +scene0212_02 +scene0176_00 +scene0049_00 +scene0259_00 +scene0259_01 +scene0586_00 +scene0586_01 +scene0586_02 +scene0309_00 +scene0309_01 +scene0125_00 +scene0455_00 +scene0177_00 +scene0177_01 +scene0177_02 +scene0326_00 +scene0372_00 +scene0171_00 +scene0171_01 +scene0374_00 +scene0654_00 +scene0654_01 +scene0445_00 +scene0445_01 +scene0475_00 +scene0475_01 +scene0475_02 +scene0349_00 +scene0349_01 +scene0234_00 +scene0669_00 +scene0669_01 +scene0375_00 +scene0375_01 +scene0375_02 +scene0387_00 +scene0387_01 +scene0387_02 +scene0312_00 +scene0312_01 +scene0312_02 +scene0384_00 +scene0385_00 +scene0385_01 +scene0385_02 +scene0000_00 +scene0000_01 +scene0000_02 +scene0376_00 +scene0376_01 +scene0376_02 +scene0301_00 +scene0301_01 +scene0301_02 +scene0322_00 +scene0542_00 +scene0079_00 +scene0079_01 +scene0099_00 +scene0099_01 +scene0476_00 +scene0476_01 +scene0476_02 +scene0394_00 +scene0394_01 +scene0147_00 +scene0147_01 +scene0067_00 +scene0067_01 +scene0067_02 +scene0397_00 +scene0397_01 +scene0337_00 +scene0337_01 +scene0337_02 +scene0431_00 +scene0223_00 +scene0223_01 +scene0223_02 +scene0010_00 +scene0010_01 +scene0402_00 +scene0268_00 +scene0268_01 +scene0268_02 +scene0679_00 +scene0679_01 +scene0405_00 +scene0128_00 +scene0408_00 +scene0408_01 +scene0190_00 +scene0107_00 +scene0076_00 +scene0167_00 +scene0361_00 +scene0361_01 +scene0361_02 +scene0216_00 +scene0202_00 +scene0303_00 +scene0303_01 +scene0303_02 +scene0446_00 +scene0446_01 +scene0089_00 +scene0089_01 +scene0089_02 +scene0360_00 
+scene0150_00 +scene0150_01 +scene0150_02 +scene0421_00 +scene0421_01 +scene0421_02 +scene0454_00 +scene0626_00 +scene0626_01 +scene0626_02 +scene0186_00 +scene0186_01 +scene0538_00 +scene0479_00 +scene0479_01 +scene0479_02 +scene0656_00 +scene0656_01 +scene0656_02 +scene0656_03 +scene0525_00 +scene0525_01 +scene0525_02 +scene0308_00 +scene0396_00 +scene0396_01 +scene0396_02 +scene0624_00 +scene0292_00 +scene0292_01 +scene0632_00 +scene0253_00 +scene0021_00 +scene0325_00 +scene0325_01 +scene0437_00 +scene0437_01 +scene0438_00 +scene0590_00 +scene0590_01 +scene0400_00 +scene0400_01 +scene0541_00 +scene0541_01 +scene0541_02 +scene0677_00 +scene0677_01 +scene0677_02 +scene0443_00 +scene0315_00 +scene0288_00 +scene0288_01 +scene0288_02 +scene0422_00 +scene0672_00 +scene0672_01 +scene0184_00 +scene0449_00 +scene0449_01 +scene0449_02 +scene0048_00 +scene0048_01 +scene0138_00 +scene0452_00 +scene0452_01 +scene0452_02 +scene0667_00 +scene0667_01 +scene0667_02 +scene0463_00 +scene0463_01 +scene0078_00 +scene0078_01 +scene0078_02 +scene0636_00 +scene0457_00 +scene0457_01 +scene0457_02 +scene0465_00 +scene0465_01 +scene0577_00 +scene0151_00 +scene0151_01 +scene0339_00 +scene0573_00 +scene0573_01 +scene0154_00 +scene0096_00 +scene0096_01 +scene0096_02 +scene0235_00 +scene0168_00 +scene0168_01 +scene0168_02 +scene0594_00 +scene0587_00 +scene0587_01 +scene0587_02 +scene0587_03 +scene0229_00 +scene0229_01 +scene0229_02 +scene0512_00 +scene0106_00 +scene0106_01 +scene0106_02 +scene0472_00 +scene0472_01 +scene0472_02 +scene0489_00 +scene0489_01 +scene0489_02 +scene0425_00 +scene0425_01 +scene0641_00 +scene0526_00 +scene0526_01 +scene0317_00 +scene0317_01 +scene0544_00 +scene0017_00 +scene0017_01 +scene0017_02 +scene0042_00 +scene0042_01 +scene0042_02 +scene0576_00 +scene0576_01 +scene0576_02 +scene0347_00 +scene0347_01 +scene0347_02 +scene0436_00 +scene0226_00 +scene0226_01 +scene0485_00 +scene0486_00 +scene0487_00 +scene0487_01 +scene0619_00 +scene0097_00 +scene0367_00 
+scene0367_01 +scene0491_00 +scene0492_00 +scene0492_01 +scene0005_00 +scene0005_01 +scene0543_00 +scene0543_01 +scene0543_02 +scene0657_00 +scene0341_00 +scene0341_01 diff --git a/datasets/scannet_preprocess/meta_data/scannetv1_val.txt b/datasets/scannet_preprocess/meta_data/scannetv1_val.txt new file mode 100644 index 0000000000000000000000000000000000000000..965ff258035f857446c30b10e9a6be49f71d3dc7 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv1_val.txt @@ -0,0 +1,156 @@ +scene0534_00 +scene0534_01 +scene0319_00 +scene0273_00 +scene0273_01 +scene0225_00 +scene0198_00 +scene0003_00 +scene0003_01 +scene0003_02 +scene0409_00 +scene0409_01 +scene0331_00 +scene0331_01 +scene0505_00 +scene0505_01 +scene0505_02 +scene0505_03 +scene0505_04 +scene0506_00 +scene0057_00 +scene0057_01 +scene0074_00 +scene0074_01 +scene0074_02 +scene0091_00 +scene0112_00 +scene0112_01 +scene0112_02 +scene0240_00 +scene0102_00 +scene0102_01 +scene0513_00 +scene0514_00 +scene0514_01 +scene0537_00 +scene0516_00 +scene0516_01 +scene0495_00 +scene0617_00 +scene0133_00 +scene0520_00 +scene0520_01 +scene0635_00 +scene0635_01 +scene0054_00 +scene0473_00 +scene0473_01 +scene0524_00 +scene0524_01 +scene0379_00 +scene0471_00 +scene0471_01 +scene0471_02 +scene0566_00 +scene0248_00 +scene0248_01 +scene0248_02 +scene0529_00 +scene0529_01 +scene0529_02 +scene0391_00 +scene0264_00 +scene0264_01 +scene0264_02 +scene0675_00 +scene0675_01 +scene0350_00 +scene0350_01 +scene0350_02 +scene0450_00 +scene0068_00 +scene0068_01 +scene0237_00 +scene0237_01 +scene0365_00 +scene0365_01 +scene0365_02 +scene0605_00 +scene0605_01 +scene0539_00 +scene0539_01 +scene0539_02 +scene0540_00 +scene0540_01 +scene0540_02 +scene0170_00 +scene0170_01 +scene0170_02 +scene0433_00 +scene0340_00 +scene0340_01 +scene0340_02 +scene0160_00 +scene0160_01 +scene0160_02 +scene0160_03 +scene0160_04 +scene0059_00 +scene0059_01 +scene0059_02 +scene0056_00 +scene0056_01 +scene0478_00 +scene0478_01 +scene0548_00 +scene0548_01 
+scene0548_02 +scene0204_00 +scene0204_01 +scene0204_02 +scene0033_00 +scene0145_00 +scene0483_00 +scene0508_00 +scene0508_01 +scene0508_02 +scene0180_00 +scene0148_00 +scene0556_00 +scene0556_01 +scene0416_00 +scene0416_01 +scene0416_02 +scene0416_03 +scene0416_04 +scene0073_00 +scene0073_01 +scene0073_02 +scene0073_03 +scene0034_00 +scene0034_01 +scene0034_02 +scene0639_00 +scene0561_00 +scene0561_01 +scene0298_00 +scene0692_00 +scene0692_01 +scene0692_02 +scene0692_03 +scene0692_04 +scene0642_00 +scene0642_01 +scene0642_02 +scene0642_03 +scene0630_00 +scene0630_01 +scene0630_02 +scene0630_03 +scene0630_04 +scene0630_05 +scene0630_06 +scene0706_00 +scene0567_00 +scene0567_01 diff --git a/datasets/scannet_preprocess/meta_data/scannetv2-labels-old.combined.tsv b/datasets/scannet_preprocess/meta_data/scannetv2-labels-old.combined.tsv new file mode 100644 index 0000000000000000000000000000000000000000..05c006e98066aa78d126bebcfb3654200d351b93 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2-labels-old.combined.tsv @@ -0,0 +1,608 @@ +id raw_category category count nyu40id eigen13id nyuClass nyu40class eigen13class ModelNet40 ModelNet10 ShapeNetCore55 synsetoffset wnsynsetid wnsynsetkey mpcat40 mpcat40index +1 wall wall 8277 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +2 chair chair 4646 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +22 books book 1678 23 2 book books Books n02870526 book.n.11 objects 39 +3 floor floor 1553 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +5 door door 1483 8 12 door door Wall door n03221720 door.n.01 door 4 +1163 object object 1313 40 7 otherprop Objects objects 39 +16 window window 1209 9 13 window window Window n04587648 window.n.01 window 9 +4 table table 1170 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +56 trash can trash can 1090 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +13 pillow pillow 937 
18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +15 picture picture 862 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +41 ceiling ceiling 806 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +26 box box 775 29 7 box box Objects n02883344 box.n.01 objects 39 +161 doorframe doorframe 768 8 12 door door Wall door doorframe.n.01 door 4 +19 monitor monitor 765 40 7 monitor otherprop Objects monitor monitor tv or monitor 3211117 n03782190 monitor.n.04 objects 39 +7 cabinet cabinet 731 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +9 desk desk 680 14 10 desk desk Table desk desk table 4379243 n03179701 desk.n.01 table 5 +8 shelf shelf 641 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +10 office chair office chair 595 5 4 chair chair Chair chair chair chair 3001627 n04373704 swivel_chair.n.01 chair 3 +31 towel towel 570 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +6 couch couch 502 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +14 sink sink 488 34 7 sink sink Objects sink n04223580 sink.n.01 sink 15 +48 backpack backpack 479 40 7 backpack otherprop Objects n02769748 backpack.n.01 objects 39 +28 lamp lamp 419 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +11 bed bed 370 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +18 bookshelf bookshelf 360 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +71 mirror mirror 349 19 7 mirror mirror Objects n03773035 mirror.n.01 mirror 21 +21 curtain curtain 347 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +40 plant plant 331 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +52 whiteboard whiteboard 327 30 7 whiteboard whiteboard Objects n03211616 display_panel.n.01 board_panel 35 +96 radiator radiator 322 39 6 radiator otherfurniture 
Furniture n04041069 radiator.n.02 misc 40 +22 book book 318 23 2 book books Books n02870526 book.n.11 objects 39 +29 kitchen cabinet kitchen cabinet 310 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7 +49 toilet paper toilet paper 291 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +29 kitchen cabinets kitchen cabinet 289 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +23 armchair armchair 281 5 4 chair chair Chair chair chair chair 3001627 n02738535 armchair.n.01 chair 3 +63 shoes shoe 272 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +24 coffee table coffee table 258 7 10 coffee table table Table table table table 4379243 n03063968 coffee_table.n.01 table 5 +17 toilet toilet 256 33 7 toilet toilet Objects toilet toilet n04446276 toilet.n.01 toilet 18 +47 bag bag 252 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +32 clothes clothes 248 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +46 keyboard keyboard 246 40 7 keyboard otherprop Objects keyboard computer keyboard 3085013 n03085013 computer_keyboard.n.01 objects 39 +65 bottle bottle 226 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +97 recycling bin recycling bin 225 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +34 nightstand nightstand 224 32 6 night stand night stand Furniture night_stand night_stand n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +38 stool stool 221 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +33 tv tv 219 25 11 television television TV tv or monitor 3211117 n03211117 display.n.06 tv_monitor 22 +75 file cabinet file cabinet 217 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +36 dresser dresser 213 17 6 dresser dresser Furniture dresser dresser n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +64 computer tower computer tower 
203 40 7 computer otherprop Objects n03082979 computer.n.01 objects 39 +32 clothing clothes 165 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +101 telephone telephone 164 40 7 telephone otherprop Objects telephone 4401088 n04401088 telephone.n.01 objects 39 +130 cup cup 157 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +27 refrigerator refrigerator 154 24 6 refridgerator refridgerator Furniture n04070727 refrigerator.n.01 appliances 37 +44 end table end table 147 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +131 jacket jacket 146 40 7 jacket otherprop Objects n03589791 jacket.n.01 clothes 38 +55 shower curtain shower curtain 144 28 7 shower curtain shower curtain Objects curtain n04209239 shower_curtain.n.01 curtain 12 +42 bathtub bathtub 144 36 7 bathtub bathtub Objects bathtub bathtub tub 2808440 n02808440 bathtub.n.01 bathtub 25 +59 microwave microwave 141 40 7 microwave otherprop Objects microwave 3761084 n03761084 microwave.n.02 appliances 37 +159 kitchen counter kitchen counter 140 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +74 sofa chair sofa chair 129 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +82 paper towel dispenser paper towel dispenser 129 40 7 paper towel dispenser otherprop Objects objects 39 +1164 bathroom vanity bathroom vanity 126 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 table 5 +93 suitcase suitcase 118 40 7 luggage otherprop Objects n02773838 bag.n.06 objects 39 +77 laptop laptop 111 40 7 laptop otherprop Objects laptop laptop 3642806 n03642806 laptop.n.01 objects 39 +67 ottoman ottoman 111 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +128 shower walls shower wall 109 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +50 printer printer 106 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 
+35 counter counter 104 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +69 board board 100 38 7 board otherstructure Objects board_panel 35 +100 soap dispenser soap dispenser 99 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +62 stove stove 95 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +105 light light 93 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +1165 closet wall closet wall 90 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +165 mini fridge mini fridge 87 24 6 refridgerator refridgerator Furniture n03273913 electric_refrigerator.n.01 appliances 37 +7 cabinets cabinet 79 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +5 doors door 76 8 12 door door Wall door n03221720 door.n.01 door 4 +76 fan fan 75 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +230 tissue box tissue box 73 40 7 tissue box otherprop Objects n02883344 box.n.01 objects 39 +54 blanket blanket 72 40 7 blanket otherprop Objects n02849154 blanket.n.01 objects 39 +125 bathroom stall bathroom stall 71 38 7 otherstructure Objects n02873839 booth.n.02 misc 40 +72 copier copier 70 40 7 otherprop Objects n03257586 duplicator.n.01 appliances 37 +68 bench bench 66 39 6 bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +145 bar bar 66 38 7 bar otherstructure Objects n02788689 bar.n.03 misc 40 +157 soap dish soap dish 65 40 7 soap dish otherprop Objects n04254009 soap_dish.n.01 objects 39 +1166 laundry hamper laundry hamper 65 40 7 laundry basket otherprop Objects objects 39 +132 storage bin storage bin 63 40 7 storage bin otherprop Objects objects 39 +1167 bathroom stall door bathroom stall door 62 8 12 door door Wall door n03221720 door.n.01 door 4 +232 light switch light switch 61 38 7 light switch otherstructure Objects n04372370 switch.n.01 misc 40 +134 coffee maker coffee maker 61 40 7 otherprop Objects n03063338 
coffee_maker.n.01 appliances 37 +51 tv stand tv stand 61 39 6 tv stand otherfurniture Furniture tv_stand n03290653 entertainment_center.n.01 furniture 36 +250 decoration decoration 60 40 7 otherprop Objects n03169390 decoration.n.01 misc 40 +1168 ceiling light ceiling light 59 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +342 range hood range hood 59 38 7 range hood otherstructure Objects range_hood n04053677 range_hood.n.01 misc 40 +89 blackboard blackboard 58 38 7 blackboard otherstructure Objects n02846511 blackboard.n.01 board_panel 35 +103 clock clock 58 40 7 clock otherprop Objects clock 3046257 n03046257 clock.n.01 objects 39 +99 wardrobe closet wardrobe 54 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +95 rail rail 53 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +154 bulletin board bulletin board 53 38 7 board otherstructure Objects n03211616 display_panel.n.01 board_panel 35 +140 mat mat 52 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1169 trash bin trash bin 52 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +193 ledge ledge 51 38 7 otherstructure Objects n09337253 ledge.n.01 misc 40 +116 seat seat 49 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +202 mouse mouse 49 40 7 mouse otherprop Objects n03793489 mouse.n.04 objects 39 +73 basket basket 48 40 7 basket otherprop Objects basket 2801938 n02801938 basket.n.01 objects 39 +78 shower shower 48 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +1170 dumbbell dumbbell 48 40 7 otherprop Objects n03255030 dumbbell.n.01 objects 39 +79 paper paper 46 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +80 person person 46 31 7 person person Objects person n05217688 person.n.02 misc 40 +141 windowsill windowsill 45 38 7 otherstructure Objects n04590263 windowsill.n.01 window 9 +57 closet closet 45 39 6 wardrobe otherfurniture 
Furniture wardrobe misc 40 +102 bucket bucket 45 40 7 bucket otherprop Objects n02909870 bucket.n.01 misc 40 +261 sign sign 44 40 7 sign otherprop Objects n04217882 signboard.n.01 objects 39 +118 speaker speaker 43 40 7 speaker otherprop Objects speaker 3691459 n03691459 loudspeaker.n.01 objects 39 +136 dishwasher dishwasher 43 38 7 dishwasher otherstructure Objects dishwasher 3207941 n03207941 dishwasher.n.01 appliances 37 +98 container container 43 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1171 stair rail stair rail 42 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +170 shower curtain rod shower curtain rod 42 40 7 otherprop Objects curtain 12 +1172 tube tube 41 40 7 otherprop Objects misc 40 +1173 bathroom cabinet bathroom cabinet 39 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +79 papers paper 39 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +221 storage container storage container 39 40 7 container otherprop Objects objects 39 +570 paper bag paper bag 39 37 7 bag bag Objects n04122825 sack.n.01 objects 39 +138 paper towel roll paper towel roll 39 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +168 ball ball 39 40 7 ball otherprop Objects objects 39 +276 closet doors closet door 38 8 12 door door Wall door n03221720 door.n.01 door 4 +106 laundry basket laundry basket 37 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +214 cart cart 37 40 7 cart otherprop Objects n03484083 handcart.n.01 shelving 31 +276 closet door closet door 35 8 12 door door Wall door n03221720 door.n.01 door 4 +323 dish rack dish rack 35 40 7 dish rack otherprop Objects n03207630 dish_rack.n.01 objects 39 +58 stairs stairs 35 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +86 blinds blinds 35 13 13 blinds blinds Window n02851099 blind.n.03 blinds 32 +2 stack of chairs chair 35 5 4 chair chair Chair chair 
chair chair 3001627 n03001627 chair.n.01 chair 3 +399 purse purse 34 40 7 purse otherprop Objects n02774152 bag.n.04 objects 39 +121 bicycle bicycle 33 40 7 bicycle otherprop Objects bicycle 2834778 n02834778 bicycle.n.01 objects 39 +185 tray tray 32 40 7 tray otherprop Objects n04476259 tray.n.01 objects 39 +300 plunger plunger 30 40 7 otherprop Objects n03970156 plunger.n.03 objects 39 +180 paper cutter paper cutter 30 40 7 paper cutter otherprop Objects n03886940 paper_cutter.n.01 objects 39 +163 toilet paper dispenser toilet paper dispenser 29 40 7 otherprop Objects objects 39 +26 boxes box 29 29 7 box box Objects n02883344 box.n.01 objects 39 +66 bin bin 28 40 7 bin otherprop Objects n02839910 bin.n.01 objects 39 +208 toilet seat cover dispenser toilet seat cover dispenser 28 40 7 otherprop Objects objects 39 +112 guitar guitar 28 40 7 guitar otherprop Objects guitar guitar 3467517 n03467517 guitar.n.01 objects 39 +540 mailboxes mailbox 28 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +395 handicap bar handicap bar 27 38 7 bar otherstructure Objects misc 40 +166 fire extinguisher fire extinguisher 27 40 7 fire extinguisher otherprop Objects n03345837 fire_extinguisher.n.01 misc 40 +122 ladder ladder 27 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 stairs 16 +120 column column 26 38 7 column otherstructure Objects n03074380 column.n.06 column 24 +107 pipe pipe 25 40 7 pipe otherprop Objects n03944672 pipe.n.02 misc 40 +283 vacuum cleaner vacuum cleaner 25 40 7 otherprop Objects n04517823 vacuum.n.04 objects 39 +88 plate plate 24 40 7 plate otherprop Objects n03959485 plate.n.04 objects 39 +90 piano piano 24 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +177 water cooler water cooler 24 39 6 water cooler otherfurniture Furniture n04559166 water_cooler.n.01 misc 40 +1174 cd case cd case 24 40 7 otherprop Objects objects 39 +562 bowl bowl 24 40 7 bowl otherprop Objects bowl bowl 
2880940 n02880940 bowl.n.03 objects 39 +1175 closet rod closet rod 24 40 7 otherprop Objects n04100174 rod.n.01 misc 40 +1156 bathroom counter bathroom counter 24 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +84 oven oven 23 38 7 oven otherstructure Objects n03862676 oven.n.01 appliances 37 +104 stand stand 23 39 6 stand otherfurniture Furniture table table table 4379243 n04301000 stand.n.04 table 5 +229 scale scale 23 40 7 scale otherprop Objects n04141975 scale.n.07 objects 39 +70 washing machine washing machine 23 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +325 broom broom 22 40 7 broom otherprop Objects n02906734 broom.n.01 objects 39 +169 hat hat 22 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +128 shower wall shower wall 22 1 12 wall wall Wall n04208936 shower.n.01 wall 1 +331 guitar case guitar case 21 40 7 guitar case otherprop Objects objects 39 +87 rack rack 21 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +488 water pitcher water pitcher 21 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +776 laundry detergent laundry detergent 21 40 7 otherprop Objects objects 39 +370 hair dryer hair dryer 21 40 7 hair dryer otherprop Objects n03483316 hand_blower.n.01 objects 39 +191 pillar pillar 21 38 7 column otherstructure Objects n03073977 column.n.07 column 24 +748 divider divider 20 40 7 otherprop Objects wall 1 +242 power outlet power outlet 19 40 7 otherprop Objects misc 40 +45 dining table dining table 19 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +417 shower floor shower floor 19 2 5 floor floor Floor n04208936 shower.n.01 floor 2 +70 washing machines washing machine 19 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +188 shower door shower door 19 8 12 door door Wall door n04208936 shower.n.01 door 4 +1176 
coffee kettle coffee kettle 18 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39 +1177 wardrobe cabinet wardrobe 18 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1178 structure structure 18 38 7 otherstructure Objects misc 40 +18 bookshelves bookshelf 17 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +110 clothes dryer clothes dryer 17 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +148 toaster toaster 17 40 7 toaster otherprop Objects n04442312 toaster.n.02 appliances 37 +63 shoe shoe 17 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +155 ironing board ironing board 16 39 6 ironing board otherfurniture Furniture n03586090 ironing_board.n.01 objects 39 +572 alarm clock alarm clock 16 40 7 alarm clock otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39 +1179 shower head shower head 15 38 7 otherstructure Objects shower 23 +28 lamp base lamp 15 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +392 water bottle water bottle 15 40 7 bottle otherprop Objects bottle bottle 2876657 n04557648 water_bottle.n.01 objects 39 +1180 keyboard piano keyboard piano 15 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +609 projector screen projector screen 15 38 7 projector screen otherstructure Objects misc 40 +1181 case of water bottles case of water bottles 15 40 7 otherprop Objects objects 39 +195 toaster oven toaster oven 14 40 7 toaster oven otherprop Objects n04442441 toaster_oven.n.01 appliances 37 +581 music stand music stand 14 39 6 music stand otherfurniture Furniture n03801760 music_stand.n.01 furniture 36 +58 staircase stairs 14 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +1182 coat rack coat rack 14 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 3 +1183 storage organizer storage organizer 14 40 7 otherprop Objects shelving 3 
+139 machine machine 14 40 7 machine otherprop Objects n03699975 machine.n.01 appliances 37 +1184 folded chair folded chair 14 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1185 fire alarm fire alarm 14 40 7 otherprop Objects n03343737 fire_alarm.n.02 misc 40 +156 fireplace fireplace 13 38 7 fireplace otherstructure Objects n03346455 fireplace.n.01 fireplace 27 +408 vent vent 13 40 7 otherprop Objects n04526241 vent.n.01 misc 40 +213 furniture furniture 13 39 6 furniture otherfurniture Furniture n03405725 furniture.n.01 furniture 36 +1186 power strip power strip 13 40 7 otherprop Objects objects 39 +1187 calendar calendar 13 40 7 otherprop Objects objects 39 +1188 poster poster 13 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +115 toilet paper holder toilet paper holder 13 40 7 toilet paper holder otherprop Objects objects 39 +1189 potted plant potted plant 12 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +304 stuffed animal stuffed animal 12 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +1190 luggage luggage 12 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39 +21 curtains curtain 12 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +312 headphones headphones 12 40 7 otherprop Objects n03261776 earphone.n.01 objects 39 +233 crate crate 12 39 6 crate otherfurniture Furniture n03127925 crate.n.01 objects 39 +286 candle candle 12 40 7 candle otherprop Objects lamp n02948072 candle.n.01 objects 39 +264 projector projector 12 40 7 projector otherprop Objects n04009552 projector.n.02 objects 39 +110 clothes dryers clothes dryer 12 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +1191 mattress mattress 12 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +356 dustpan dustpan 12 40 7 otherprop Objects n03259009 dustpan.n.02 objects 39 +25 drawer drawer 11 39 6 drawer otherfurniture Furniture n03233905 drawer.n.01 
furniture 36 +750 rod rod 11 40 7 otherprop Objects pistol 3948459 n03427202 gat.n.01 misc 40 +269 globe globe 11 40 7 globe otherprop Objects objects 39 +307 footrest footrest 11 39 6 foot rest otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +410 piano bench piano bench 11 39 6 piano bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +730 breakfast bar breakfast bar 11 38 7 bar otherstructure Objects counter 26 +216 step stool step stool 11 40 7 step stool otherprop Objects stool n04315713 step_stool.n.01 stool 19 +1192 hand rail hand rail 11 38 7 railing otherstructure Objects railing 30 +119 vending machine vending machine 11 40 7 machine otherprop Objects n04525305 vending_machine.n.01 appliances 37 +682 ceiling fan ceiling fan 11 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +434 swiffer swiffer 11 40 7 otherprop Objects objects 39 +126 foosball table foosball table 11 39 6 foosball table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +919 jar jar 11 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +85 footstool footstool 11 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +1193 folded table folded table 10 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +108 round table round table 10 7 10 table table Table table table table 4379243 n04114554 round_table.n.02 table 5 +135 hamper hamper 10 40 7 basket otherprop Objects basket 2801938 n03482405 hamper.n.02 objects 39 +1194 poster tube poster tube 10 40 7 otherprop Objects objects 39 +432 case case 10 40 7 case otherprop Objects objects 39 +53 carpet carpet 10 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +1195 thermostat thermostat 10 40 7 otherprop Objects n04422875 thermostat.n.01 misc 40 +111 coat coat 10 40 7 jacket otherprop Objects n03057021 coat.n.01 clothes 38 +305 water fountain water fountain 10 38 7 water fountain 
otherstructure Objects n03241335 drinking_fountain.n.01 misc 40 +1125 smoke detector smoke detector 10 40 7 otherprop Objects misc 40 +13 pillows pillow 9 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +1196 flip flops flip flops 9 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1197 cloth cloth 9 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1198 banner banner 9 40 7 otherprop Objects n02788021 banner.n.01 misc 40 +1199 clothes hanger clothes hanger 9 40 7 otherprop Objects n03057920 coat_hanger.n.01 objects 39 +1200 whiteboard eraser whiteboard eraser 9 40 7 otherprop Objects objects 39 +378 iron iron 9 40 7 otherprop Objects n03584829 iron.n.04 objects 39 +591 instrument case instrument case 9 40 7 case otherprop Objects objects 39 +49 toilet paper rolls toilet paper 9 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +92 soap soap 9 40 7 soap otherprop Objects n04253437 soap.n.01 objects 39 +1098 block block 9 40 7 otherprop Objects misc 40 +291 wall hanging wall hanging 8 40 7 otherprop Objects n03491178 hanging.n.01 picture 6 +1063 kitchen island kitchen island 8 38 7 kitchen island otherstructure Objects n03620600 kitchen_island.n.01 counter 26 +107 pipes pipe 8 38 7 otherstructure Objects misc 40 +1135 toothbrush toothbrush 8 40 7 toothbrush otherprop Objects n04453156 toothbrush.n.01 objects 39 +189 shirt shirt 8 40 7 otherprop Objects n04197391 shirt.n.01 clothes 38 +245 cutting board cutting board 8 40 7 cutting board otherprop Objects n03025513 chopping_board.n.01 objects 39 +194 vase vase 8 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39 +1201 shower control valve shower control valve 8 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +386 exercise machine exercise machine 8 40 7 machine otherprop Objects gym_equipment 33 +1202 compost bin compost bin 8 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 
ashcan.n.01 objects 39 +857 shorts shorts 8 40 7 shorts otherprop Objects clothes 38 +452 tire tire 8 40 7 otherprop Objects n04440749 tire.n.01 objects 39 +1203 teddy bear teddy bear 7 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +346 bathrobe bathrobe 7 40 7 otherprop Objects n02807616 bathrobe.n.01 clothes 38 +152 handrail handrail 7 38 7 railing otherstructure Objects n02788148 bannister.n.02 railing 30 +83 faucet faucet 7 40 7 faucet otherprop Objects faucet 3325088 n03325088 faucet.n.01 misc 40 +1204 pantry wall pantry wall 7 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +726 thermos thermos 7 40 7 flask otherprop Objects bottle bottle 2876657 n04422727 thermos.n.01 objects 39 +61 rug rug 7 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +39 couch cushions cushion 7 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1117 tripod tripod 7 39 6 stand otherfurniture Furniture n04485082 tripod.n.01 objects 39 +540 mailbox mailbox 7 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +1205 tupperware tupperware 7 40 7 otherprop Objects objects 39 +415 shoe rack shoe rack 7 40 7 shoe rack otherprop Objects shelving 31 +31 towels towel 6 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +1206 beer bottles beer bottle 6 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +153 treadmill treadmill 6 39 6 treadmill otherfurniture Furniture n04477387 treadmill.n.01 gym_equipment 33 +1207 salt salt 6 40 7 otherprop Objects objects 39 +129 chest chest 6 39 6 chest otherfurniture Furniture dresser dresser chest_of_drawers 13 +220 dispenser dispenser 6 40 7 otherprop Objects n03210683 dispenser.n.01 objects 39 +1208 mirror doors mirror door 6 8 12 door door Wall door n03221720 door.n.01 door 4 +231 remote remote 6 40 7 otherprop Objects remote_control 4074963 n04074963 remote_control.n.01 objects 39 +1209 folded ladder folded ladder 6 39 6 ladder otherfurniture Furniture stairs 
n03632277 ladder.n.01 misc 40 +39 cushion cushion 6 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1210 carton carton 6 40 7 otherprop Objects objects 39 +117 step step 6 38 7 otherstructure Objects n04314914 step.n.04 misc 40 +822 drying rack drying rack 6 39 6 drying rack otherfurniture Furniture shelving 31 +238 slippers slipper 6 40 7 shoe otherprop Objects n04241394 slipper.n.01 clothes 38 +143 pool table pool table 6 39 6 pool table otherfurniture Furniture table table table 4379243 n03982430 pool_table.n.01 table 5 +1211 soda stream soda stream 6 40 7 otherprop Objects objects 39 +228 toilet brush toilet brush 6 40 7 toilet brush otherprop Objects objects 39 +494 loft bed loft bed 6 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +226 cooking pot cooking pot 6 40 7 pot otherprop Objects objects 39 +91 heater heater 6 39 6 heater otherfurniture Furniture n03508101 heater.n.01 misc 40 +1072 messenger bag messenger bag 6 37 7 bag bag Objects objects 39 +435 stapler stapler 6 40 7 stapler otherprop Objects n04303497 stapler.n.01 objects 39 +1165 closet walls closet wall 5 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +345 scanner scanner 5 40 7 otherprop Objects appliances 37 +893 elliptical machine elliptical machine 5 40 7 machine otherprop Objects gym_equipment 33 +621 kettle kettle 5 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39 +1212 metronome metronome 5 40 7 otherprop Objects n03757604 metronome.n.01 objects 39 +297 dumbell dumbell 5 40 7 otherprop Objects objects 39 +1213 music book music book 5 23 2 book books Books n02870526 book.n.11 objects 39 +1214 rice cooker rice cooker 5 40 7 otherprop Objects objects 39 +1215 dart board dart board 5 38 7 board otherstructure Objects n03162940 dartboard.n.01 objects 39 +529 sewing machine sewing machine 5 40 7 sewing machine otherprop Objects n04179913 sewing_machine.n.01 objects 39 +1216 grab bar grab bar 5 38 7 railing otherstructure Objects railing 30 +1217 flowerpot 
flowerpot 5 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39 +1218 painting painting 5 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1219 railing railing 5 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +1220 stair stair 5 38 7 stairs otherstructure Objects stairs n04314914 step.n.04 stairs 16 +525 toolbox toolbox 5 39 6 chest otherfurniture Furniture n04452615 toolbox.n.01 objects 39 +204 nerf gun nerf gun 5 40 7 otherprop Objects objects 39 +693 binders binder 5 40 7 binder otherprop Objects objects 39 +179 desk lamp desk lamp 5 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1221 quadcopter quadcopter 5 40 7 otherprop Objects objects 39 +1222 pitcher pitcher 5 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +1223 hanging hanging 5 40 7 otherprop Objects misc 40 +1224 mail mail 5 40 7 otherprop Objects misc 40 +1225 closet ceiling closet ceiling 5 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +1226 hoverboard hoverboard 5 40 7 otherprop Objects objects 39 +1227 beanbag chair beanbag chair 5 39 6 bean bag otherfurniture Furniture n02816656 beanbag.n.01 chair 3 +571 water heater water heater 5 40 7 water heater otherprop Objects n04560113 water_heater.n.01 misc 40 +1228 spray bottle spray bottle 5 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +556 rope rope 5 40 7 rope otherprop Objects n04108268 rope.n.01 objects 39 +280 plastic container plastic container 5 40 7 container otherprop Objects objects 39 +1229 soap bottle soap bottle 5 40 7 soap otherprop Objects objects 39 +1230 ikea bag ikea bag 4 37 7 bag bag Objects 2773838 n02773838 bag.n.06 objects 39 +1231 sleeping bag sleeping bag 4 40 7 otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +1232 duffel bag duffel bag 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +746 frying pan frying pan 4 40 7 frying pan otherprop 
Objects n03400231 frying_pan.n.01 objects 39 +1233 oven mitt oven mitt 4 40 7 otherprop Objects objects 39 +1234 pot pot 4 40 7 pot otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +144 hand dryer hand dryer 4 40 7 otherprop Objects objects 39 +282 dollhouse dollhouse 4 39 6 doll house otherfurniture Furniture n03219483 dollhouse.n.01 objects 39 +167 shampoo bottle shampoo bottle 4 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1235 hair brush hair brush 4 40 7 otherprop Objects n02908217 brush.n.02 objects 39 +1236 tennis racket tennis racket 4 40 7 otherprop Objects n04409806 tennis_racket.n.01 objects 39 +1237 display case display case 4 40 7 case otherprop Objects objects 39 +234 ping pong table ping pong table 4 39 6 ping pong table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +563 boiler boiler 4 40 7 otherprop Objects misc 40 +1238 bag of coffee beans bag of coffee beans 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +1239 bananas banana 4 40 7 otherprop Objects n00021265 food.n.01 objects 39 +1240 carseat carseat 4 40 7 otherprop Objects misc 40 +366 helmet helmet 4 40 7 otherprop Objects helmet 3513137 n03513137 helmet.n.02 clothes 38 +816 umbrella umbrella 4 40 7 umbrella otherprop Objects n04507155 umbrella.n.01 objects 39 +1241 coffee box coffee box 4 40 7 otherprop Objects objects 39 +719 envelope envelope 4 40 7 envelope otherprop Objects n03291819 envelope.n.01 objects 39 +284 wet floor sign wet floor sign 4 40 7 sign otherprop Objects misc 40 +1242 clothing rack clothing rack 4 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +247 controller controller 4 40 7 otherprop Objects n03096960 control.n.09 objects 39 +1243 bath walls bathroom wall 4 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +1244 podium podium 4 39 6 otherfurniture Furniture n03159640 dais.n.01 furniture 36 +1245 storage box storage box 4 29 7 box box Objects 
n02883344 box.n.01 objects 39 +1246 dolly dolly 4 40 7 otherprop Objects misc 40 +1247 shampoo shampoo 3 40 7 otherprop Objects n04183516 shampoo.n.01 objects 39 +592 paper tray paper tray 3 40 7 paper tray otherprop Objects objects 39 +385 cabinet door cabinet door 3 8 12 door door Wall door door 4 +1248 changing station changing station 3 40 7 otherprop Objects misc 40 +1249 poster printer poster printer 3 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 +133 screen screen 3 40 7 otherprop Objects n03151077 curtain.n.01 curtain 12 +301 soap bar soap bar 3 38 7 bar otherstructure Objects objects 39 +1250 crutches crutches 3 40 7 otherprop Objects n03141823 crutch.n.01 objects 39 +379 studio light studio light 3 38 7 light otherstructure Objects lighting 28 +130 stack of cups cup 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +1251 toilet flush button toilet flush button 3 40 7 otherprop Objects objects 39 +450 trunk trunk 3 40 7 otherprop Objects misc 40 +1252 grocery bag grocery bag 3 37 7 bag bag Objects suitcase 2773838 n03461288 grocery_bag.n.01 objects 39 +316 plastic bin plastic bin 3 40 7 bin otherprop Objects objects 39 +1253 pizza box pizza box 3 29 7 box box Objects objects 39 +385 cabinet doors cabinet door 3 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 door 4 +1254 legs legs 3 31 7 person person Objects person n05217688 person.n.02 misc 40 +461 car car 3 40 7 car otherprop Objects car car 2958343 n02958343 car.n.01 misc 40 +1255 shaving cream shaving cream 3 40 7 otherprop Objects n04186051 shaving_cream.n.01 objects 39 +1256 luggage stand luggage stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +599 shredder shredder 3 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +281 statue statue 3 40 7 sculpture otherprop Objects n04306847 statue.n.01 misc 40 +1257 urinal urinal 3 33 7 toilet toilet Objects toilet toilet n04515991 
urinal.n.01 toilet 18 +1258 hose hose 3 40 7 otherprop Objects n03539875 hose.n.03 misc 40 +1259 bike pump bike pump 3 40 7 otherprop Objects objects 39 +319 coatrack coatrack 3 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1260 bear bear 3 40 7 otherprop Objects objects 39 +28 wall lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1261 humidifier humidifier 3 40 7 otherprop Objects objects 39 +546 toothpaste toothpaste 3 40 7 toothpaste otherprop Objects objects 39 +1262 mouthwash bottle mouthwash bottle 3 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1263 poster cutter poster cutter 3 40 7 otherprop Objects objects 39 +1264 golf bag golf bag 3 37 7 bag bag Objects suitcase 2773838 n03445617 golf_bag.n.01 objects 39 +1265 food container food container 3 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1266 camera camera 3 40 7 otherprop Objects objects 39 +28 table lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n04380533 table_lamp.n.01 lighting 28 +1267 yoga mat yoga mat 3 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1268 card card 3 40 7 otherprop Objects objects 39 +1269 mug mug 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +188 shower doors shower door 3 38 7 otherstructure Objects n04208936 shower.n.01 door 4 +689 cardboard cardboard 3 40 7 otherprop Objects objects 39 +1270 rack stand rack stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +1271 boxes of paper boxes of paper 3 29 7 box box Objects n02883344 box.n.01 objects 39 +1272 flag flag 3 40 7 otherprop Objects misc 40 +354 futon futon 3 39 6 mattress otherfurniture Furniture n03408444 futon.n.01 sofa 10 +339 magazine magazine 3 40 7 magazine otherprop Objects n06595351 magazine.n.01 objects 39 +1009 exit sign exit sign 3 40 7 exit sign otherprop Objects misc 40 +1273 rolled poster rolled poster 3 40 7 otherprop 
Objects objects 39 +1274 wheel wheel 3 40 7 otherprop Objects objects 39 +15 pictures picture 3 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1275 blackboard eraser blackboard eraser 3 40 7 eraser otherprop Objects n03294833 eraser.n.01 objects 39 +361 organizer organizer 3 40 7 otherprop Objects n03918737 personal_digital_assistant.n.01 objects 39 +1276 doll doll 3 40 7 toy otherprop Objects n03219135 doll.n.01 objects 39 +326 book rack book rack 3 39 6 bookrack otherfurniture Furniture objects 39 +1277 laundry bag laundry bag 3 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +1278 sponge sponge 3 40 7 otherprop Objects n01906749 sponge.n.04 objects 39 +116 seating seat 3 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +1184 folded chairs folded chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1279 lotion bottle lotion bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +212 can can 2 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1280 lunch box lunch box 2 40 7 otherprop Objects objects 39 +1281 food display food display 2 40 7 otherprop Objects misc 40 +794 storage shelf storage shelf 2 40 7 otherprop Objects shelving 31 +1282 sliding wood door sliding wood door 2 40 7 otherprop Objects door 4 +955 pants pants 2 40 7 otherprop Objects n04489008 trouser.n.01 clothes 38 +387 wood wood 2 40 7 otherprop Objects misc 40 +69 boards board 2 38 7 board otherstructure Objects board_panel 35 +65 bottles bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +523 washcloth washcloth 2 40 7 otherprop Objects n04554523 washcloth.n.01 towel 20 +389 workbench workbench 2 39 6 bench otherfurniture Furniture bench table 4379243 n04600486 workbench.n.01 table 5 +29 open kitchen cabinet kitchen cabinet 2 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 
cabinet 7 +1283 organizer shelf organizer shelf 2 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +146 frame frame 2 38 7 otherstructure Objects misc 40 +130 cups cup 2 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +372 exercise ball exercise ball 2 40 7 ball otherprop Objects n04285146 sports_equipment.n.01 gym_equipment 33 +289 easel easel 2 39 6 stand otherfurniture Furniture n03262809 easel.n.01 furniture 36 +440 garbage bag garbage bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +321 roomba roomba 2 40 7 otherprop Objects objects 39 +976 garage door garage door 2 38 7 garage door otherstructure Objects door door 4 +1256 luggage rack luggage stand 2 39 6 stand otherfurniture Furniture n04038440 shelving 31 +1284 bike lock bike lock 2 40 7 otherprop Objects objects 39 +1285 briefcase briefcase 2 40 7 otherprop Objects n02900705 briefcase.n.01 objects 39 +357 hand towel hand towel 2 27 7 towel towel Objects n03490006 hand_towel.n.01 towel 20 +1286 bath products bath product 2 40 7 otherprop Objects objects 39 +1287 star star 2 40 7 otherprop Objects n09444783 star.n.03 misc 40 +365 map map 2 40 7 map otherprop Objects n03720163 map.n.01 misc 40 +1288 coffee bean bag coffee bean bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +81 headboard headboard 2 39 6 headboard otherfurniture Furniture n03502200 headboard.n.01 bed 11 +1289 ipad ipad 2 40 7 otherprop Objects objects 39 +1290 display rack display rack 2 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +948 traffic cone traffic cone 2 40 7 cone otherprop Objects cone objects 39 +174 toiletry toiletry 2 40 7 otherprop Objects n04447443 toiletry.n.01 objects 39 +1028 canopy canopy 2 40 7 otherprop Objects misc 40 +1291 massage chair massage chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1292 paper organizer paper organizer 2 
40 7 otherprop Objects objects 39 +1005 barricade barricade 2 40 7 otherprop Objects misc 40 +235 platform platform 2 38 7 otherstructure Objects misc 40 +1293 cap cap 2 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +1294 dumbbell plates dumbbell plates 2 40 7 otherprop Objects objects 39 +1295 elevator elevator 2 38 7 otherstructure Objects misc 40 +1296 cooking pan cooking pan 2 40 7 pan otherprop Objects n03880531 pan.n.01 objects 39 +1297 trash bag trash bag 2 37 7 bag bag Objects objects 39 +1298 santa santa 2 40 7 otherprop Objects misc 40 +1299 jewelry box jewelry box 2 29 7 box box Objects n02883344 box.n.01 objects 39 +1300 boat boat 2 40 7 otherprop Objects misc 40 +1301 sock sock 2 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1051 kinect kinect 2 40 7 kinect otherprop Objects objects 39 +566 crib crib 2 39 6 crib otherfurniture Furniture furniture 36 +1302 plastic storage bin plastic storage bin 2 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1062 cooler cooler 2 24 6 refridgerator refridgerator Furniture n03102654 cooler.n.01 appliances 37 +1303 kitchen apron kitchen apron 2 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1304 dishwashing soap bottle dishwashing soap bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1305 xbox controller xbox controller 2 40 7 otherprop Objects objects 39 +1306 banana holder banana holder 2 40 7 otherprop Objects objects 39 +298 ping pong paddle ping pong paddle 2 40 7 otherprop Objects table 5 +1307 airplane airplane 2 40 7 otherprop Objects misc 40 +1308 conditioner bottle conditioner bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1309 tea kettle tea kettle 2 40 7 tea kettle otherprop Objects n04397768 teakettle.n.01 objects 39 +43 bedframe bedframe 2 39 6 otherfurniture Furniture n02822579 bedstead.n.01 bed 11 +1310 wood beam wood beam 2 38 7 otherstructure 
Objects beam 29 +593 toilet paper package toilet paper package 2 40 7 otherprop Objects objects 39 +1311 wall mounted coat rack wall mounted coat rack 2 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1312 film light film light 2 40 7 otherprop Objects lighting 28 +749 ceiling lamp ceiling lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +623 chain chain 1 40 7 otherprop Objects chair 3 +1313 sofa sofa 1 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +99 closet wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +265 sweater sweater 1 40 7 otherprop Objects n04370048 sweater.n.01 clothes 38 +1314 kitchen mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +99 wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1315 water softener water softener 1 40 7 otherprop Objects misc 40 +448 banister banister 1 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +257 trolley trolley 1 40 7 trolley otherprop Objects n04335435 streetcar.n.01 misc 40 +1316 pantry shelf pantry shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +786 sofa bed sofa bed 1 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +801 loofa loofa 1 40 7 otherprop Objects objects 39 +972 shower faucet handle shower faucet handle 1 40 7 handle otherprop Objects shower 23 +1317 toy piano toy piano 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1318 fish fish 1 40 7 otherprop Objects n02512053 fish.n.01 objects 39 +75 file cabinets file cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n03337140 file.n.03 cabinet 7 +657 cat litter box cat litter box 1 29 7 box box Objects objects 39 +561 electric panel electric panel 1 40 7 otherprop Objects misc 40 +93 suitcases suitcase 1 40 7 luggage otherprop Objects n02774630 baggage.n.01 
objects 39 +513 curtain rod curtain rod 1 38 7 curtain rod otherstructure Objects curtain 12 +411 bunk bed bunk bed 1 39 6 bunk bed otherfurniture Furniture bed bed bed 2818832 n02920259 bunk_bed.n.01 bed 11 +1122 chandelier chandelier 1 38 7 chandelier otherstructure Objects n03005285 chandelier.n.01 lighting 28 +922 tape tape 1 40 7 tape otherprop Objects objects 39 +88 plates plate 1 40 7 otherprop Objects n03959485 plate.n.04 objects 39 +518 alarm alarm 1 40 7 alarm otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39 +814 fire hose fire hose 1 40 7 otherprop Objects n03346004 fire_hose.n.01 misc 40 +1319 toy dinosaur toy dinosaur 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1320 cone cone 1 40 7 otherprop Objects objects 39 +649 glass doors glass door 1 8 12 door door Wall door n03221720 door.n.01 door 4 +607 hatrack hatrack 1 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +819 subwoofer subwoofer 1 40 7 speaker otherprop Objects speaker 3691459 n04349401 subwoofer.n.01 objects 39 +1321 fire sprinkler fire sprinkler 1 40 7 otherprop Objects misc 40 +1322 trash cabinet trash cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +1204 pantry walls pantry wall 1 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +227 photo photo 1 40 7 photo otherprop Objects n03925226 photograph.n.01 picture 6 +817 barrier barrier 1 40 7 otherprop Objects n02796623 barrier.n.01 misc 40 +130 stacks of cups cup 1 40 7 otherprop Objects n03147509 cup.n.01 objects 39 +712 beachball beachball 1 40 7 ball otherprop Objects n02814224 beach_ball.n.01 objects 39 +1323 folded boxes folded boxes 1 40 7 otherprop Objects objects 39 +1324 contact lens solution bottle contact lens solution bottle 1 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +673 covered box covered box 1 29 7 box box Objects objects 39 +459 folder folder 1 40 7 folder otherprop Objects n03376279 
folder.n.02 objects 39 +643 mail trays mail tray 1 40 7 mail tray otherprop Objects objects 39 +238 slipper slipper 1 40 7 otherprop Objects n04241394 slipper.n.01 clothes 38 +765 magazine rack magazine rack 1 39 6 stand otherfurniture Furniture n03704549 magazine_rack.n.01 shelving 31 +1008 sticker sticker 1 40 7 sticker otherprop Objects n07272545 gummed_label.n.01 objects 39 +225 lotion lotion 1 40 7 otherprop Objects n03690938 lotion.n.01 objects 39 +1083 buddha buddha 1 40 7 otherprop Objects objects 39 +813 file organizer file organizer 1 40 7 otherprop Objects objects 39 +138 paper towel rolls paper towel roll 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +1145 night lamp night lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +796 fuse box fuse box 1 40 7 otherprop Objects misc 40 +1325 knife block knife block 1 40 7 otherprop Objects objects 39 +363 furnace furnace 1 39 6 furnace otherfurniture Furniture n03404449 furnace.n.01 +1174 cd cases cd case 1 40 7 otherprop Objects objects 39 +38 stools stool 1 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +1326 hand sanitzer dispenser hand sanitzer dispenser 1 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +997 teapot teapot 1 40 7 tea pot otherprop Objects n04398044 teapot.n.01 objects 39 +1327 pen holder pen holder 1 40 7 otherprop Objects objects 39 +1328 tray rack tray rack 1 40 7 otherprop Objects objects 39 +1329 wig wig 1 40 7 otherprop Objects n04584207 wig.n.01 objects 39 +182 switch switch 1 40 7 otherprop Objects n04372370 switch.n.01 misc 40 +280 plastic containers plastic container 1 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1330 night light night light 1 40 7 otherprop Objects lighting 28 +1331 notepad notepad 1 40 7 otherprop Objects objects 39 +1332 mail bin mail bin 1 40 7 otherprop Objects misc 40 +1333 elevator button elevator button 1 40 7 otherprop Objects misc 40 +939 
gaming wheel gaming wheel 1 40 7 otherprop Objects objects 39 +1334 drum set drum set 1 40 7 otherprop Objects objects 39 +480 cosmetic bag cosmetic bag 1 37 7 bag bag Objects objects 39 +907 coffee mug coffee mug 1 40 7 vessel otherprop Objects cup or mug 3797390 n03063599 coffee_mug.n.01 objects 39 +1335 closet shelf closet shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +1336 baby mobile baby mobile 1 40 7 otherprop Objects objects 39 +829 diaper bin diaper bin 1 40 7 bin otherprop Objects objects 39 +947 door wall door wall 1 1 12 wall wall Wall wall 1 +1116 stepstool stepstool 1 40 7 step stool otherprop Objects objects 39 +599 paper shredder shredder 1 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +733 dress rack dress rack 1 40 7 otherprop Objects n03238762 dress_rack.n.01 misc 40 +123 cover cover 1 40 7 blanket otherprop Objects objects 39 +506 shopping bag shopping bag 1 37 7 bag bag Objects n04204081 shopping_bag.n.01 objects 39 +569 sliding door sliding door 1 8 12 door door Wall door n04239074 sliding_door.n.01 door 4 +1337 exercise bike exercise bike 1 40 7 machine otherprop Objects n04210120 shredder.n.01 gym_equipment 33 +1338 recliner chair recliner chair 1 5 4 chair chair Chair chair chair chair 3001627 n03238762 dress_rack.n.01 chair 3 +1314 kitchenaid mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +1339 soda can soda can 1 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1340 stovetop stovetop 1 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +851 stepladder stepladder 1 39 6 ladder otherfurniture Furniture stairs n04315599 step_ladder.n.01 stairs 16 +142 tap tap 1 40 7 faucet otherprop Objects faucet 3325088 n04559451 water_faucet.n.01 objects 39 +436 cable cable 1 40 7 cables otherprop Objects objects 39 +1341 baby changing station baby changing station 1 39 6 otherfurniture Furniture furniture 36 +1342 costume 
costume 1 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +885 rocking chair rocking chair 1 5 4 chair chair Chair chair chair chair 3001627 n04099969 rocking_chair.n.01 chair 3 +693 binder binder 1 40 7 binder otherprop Objects objects 39 +815 media center media center 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +401 towel rack towel rack 1 40 7 otherprop Objects n04459773 towel_rack.n.01 misc 40 +1343 medal medal 1 40 7 otherprop Objects objects 39 +1184 stack of folded chairs folded chair 1 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1344 telescope telescope 1 40 7 otherprop Objects n04403638 telescope.n.01 objects 39 +1345 closet doorframe closet doorframe 1 8 12 door door Wall door door 4 +160 glass glass 1 38 7 glass otherstructure Objects n03438257 glass.n.02 misc 40 +1126 baseball cap baseball cap 1 40 7 otherprop Objects cap 2954340 n02799323 baseball_cap.n.01 clothes 38 +1346 battery disposal jar battery disposal jar 1 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +332 mop mop 1 40 7 otherprop Objects n04367480 swab.n.02 objects 39 +397 tank tank 1 40 7 otherprop Objects objects 39 +643 mail tray mail tray 1 40 7 mail tray otherprop Objects objects 39 +551 centerpiece centerpiece 1 40 7 centerpiece otherprop Objects n02994419 centerpiece.n.02 objects 39 +1163 stick stick 1 40 7 stick otherprop Objects objects 39 +1347 closet floor closet floor 1 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +1348 dryer sheets dryer sheets 1 40 7 otherprop Objects objects 39 +803 bycicle bycicle 1 40 7 otherprop Objects misc 40 +484 flower stand flower stand 1 39 6 stand otherfurniture Furniture furniture 36 +1349 air mattress air mattress 1 4 1 bed bed Bed bed bed bed 2818832 n02690809 air_mattress.n.01 bed 11 +1350 clip clip 1 40 7 otherprop Objects objects 39 +222 side table side table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 
table 5 +1253 pizza boxes pizza box 1 29 7 box box Objects n02883344 box.n.01 objects 39 +1351 display display 1 39 7 otherfurniture Furniture n03211117 display.n.06 misc 40 +1352 postcard postcard 1 40 7 otherprop Objects objects 39 +828 display sign display sign 1 40 7 sign otherprop Objects misc 40 +1353 paper towel paper towel 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +612 boots boot 1 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1354 tennis racket bag tennis racket bag 1 40 7 otherprop Objects objects 39 +1355 air hockey table air hockey table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +1301 socks sock 1 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1356 food bag food bag 1 37 7 bag bag Objects objects 39 +1199 clothes hangers clothes hanger 1 40 7 otherprop Objects n03057920 coat_hanger.n.01 misc 40 +1357 starbucks cup starbucks cup 1 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 diff --git a/datasets/scannet_preprocess/meta_data/scannetv2-labels.combined.tsv b/datasets/scannet_preprocess/meta_data/scannetv2-labels.combined.tsv new file mode 100644 index 0000000000000000000000000000000000000000..cff61b132f3ebf4edd513445b76fd39db54462d2 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2-labels.combined.tsv @@ -0,0 +1,608 @@ +id raw_category category count nyu40id eigen13id nyuClass nyu40class eigen13class ModelNet40 ModelNet10 ShapeNetCore55 synsetoffset wnsynsetid wnsynsetkey mpcat40 mpcat40index +1 wall wall 8277 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +2 chair chair 4646 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +22 books book 1678 23 2 book books Books n02870526 book.n.11 objects 39 +3 floor floor 1553 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +5 door door 1483 8 12 door door Wall door n03221720 door.n.01 door 4 +1163 object object 1313 40 7 otherprop Objects 
objects 39 +16 window window 1209 9 13 window window Window n04587648 window.n.01 window 9 +4 table table 1170 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +56 trash can trash can 1090 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +13 pillow pillow 937 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +15 picture picture 862 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +41 ceiling ceiling 806 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +26 box box 775 29 7 box box Objects n02883344 box.n.01 objects 39 +161 doorframe doorframe 768 8 12 door door Wall door doorframe.n.01 door 4 +19 monitor monitor 765 40 7 monitor otherprop Objects monitor monitor tv or monitor 3211117 n03782190 monitor.n.04 objects 39 +7 cabinet cabinet 731 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +9 desk desk 680 14 10 desk desk Table desk desk table 4379243 n03179701 desk.n.01 table 5 +8 shelf shelf 641 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +10 office chair office chair 595 5 4 chair chair Chair chair chair chair 3001627 n04373704 swivel_chair.n.01 chair 3 +31 towel towel 570 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +6 couch couch 502 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +14 sink sink 488 34 7 sink sink Objects sink n04223580 sink.n.01 sink 15 +48 backpack backpack 479 40 7 backpack otherprop Objects n02769748 backpack.n.01 objects 39 +28 lamp lamp 419 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +11 bed bed 370 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +18 bookshelf bookshelf 360 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +71 mirror mirror 349 19 7 mirror mirror Objects n03773035 mirror.n.01 mirror 21 +21 curtain 
curtain 347 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +40 plant plant 331 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +52 whiteboard whiteboard 327 30 7 whiteboard whiteboard Objects n03211616 display_panel.n.01 board_panel 35 +96 radiator radiator 322 39 6 radiator otherfurniture Furniture n04041069 radiator.n.02 misc 40 +22 book book 318 23 2 book books Books n02870526 book.n.11 objects 39 +29 kitchen cabinet kitchen cabinet 310 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7 +49 toilet paper toilet paper 291 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +29 kitchen cabinets kitchen cabinet 289 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +23 armchair armchair 281 5 4 chair chair Chair chair chair chair 3001627 n02738535 armchair.n.01 chair 3 +63 shoes shoe 272 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +24 coffee table coffee table 258 7 10 coffee table table Table table table table 4379243 n03063968 coffee_table.n.01 table 5 +17 toilet toilet 256 33 7 toilet toilet Objects toilet toilet n04446276 toilet.n.01 toilet 18 +47 bag bag 252 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +32 clothes clothes 248 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +46 keyboard keyboard 246 40 7 keyboard otherprop Objects keyboard computer keyboard 3085013 n03085013 computer_keyboard.n.01 objects 39 +65 bottle bottle 226 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +97 recycling bin recycling bin 225 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +34 nightstand nightstand 224 32 6 night stand night stand Furniture night_stand night_stand n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +38 stool stool 221 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +33 tv tv 219 25 11 television television 
TV tv or monitor 3211117 n03211117 display.n.06 tv_monitor 22 +75 file cabinet file cabinet 217 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +36 dresser dresser 213 17 6 dresser dresser Furniture dresser dresser n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +64 computer tower computer tower 203 40 7 computer otherprop Objects n03082979 computer.n.01 objects 39 +32 clothing clothes 165 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +101 telephone telephone 164 40 7 telephone otherprop Objects telephone 4401088 n04401088 telephone.n.01 objects 39 +130 cup cup 157 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +27 refrigerator refrigerator 154 24 6 refridgerator refridgerator Furniture n04070727 refrigerator.n.01 appliances 37 +44 end table end table 147 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +131 jacket jacket 146 40 7 jacket otherprop Objects n03589791 jacket.n.01 clothes 38 +55 shower curtain shower curtain 144 28 7 shower curtain shower curtain Objects curtain n04209239 shower_curtain.n.01 curtain 12 +42 bathtub bathtub 144 36 7 bathtub bathtub Objects bathtub bathtub tub 2808440 n02808440 bathtub.n.01 bathtub 25 +59 microwave microwave 141 40 7 microwave otherprop Objects microwave 3761084 n03761084 microwave.n.02 appliances 37 +159 kitchen counter kitchen counter 140 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +74 sofa chair sofa chair 129 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +82 paper towel dispenser paper towel dispenser 129 40 7 paper towel dispenser otherprop Objects objects 39 +1164 bathroom vanity bathroom vanity 126 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 table 5 +93 suitcase suitcase 118 40 7 luggage otherprop Objects n02773838 bag.n.06 objects 39 +77 laptop laptop 111 40 7 laptop otherprop Objects laptop laptop 
3642806 n03642806 laptop.n.01 objects 39 +67 ottoman ottoman 111 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +128 shower walls shower wall 109 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +50 printer printer 106 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 +35 counter counter 104 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +69 board board 100 38 7 board otherstructure Objects board_panel 35 +100 soap dispenser soap dispenser 99 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +62 stove stove 95 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +105 light light 93 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +1165 closet wall closet wall 90 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +165 mini fridge mini fridge 87 24 6 refridgerator refridgerator Furniture n03273913 electric_refrigerator.n.01 appliances 37 +7 cabinets cabinet 79 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +5 doors door 76 8 12 door door Wall door n03221720 door.n.01 door 4 +76 fan fan 75 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +230 tissue box tissue box 73 40 7 tissue box otherprop Objects n02883344 box.n.01 objects 39 +54 blanket blanket 72 40 7 blanket otherprop Objects n02849154 blanket.n.01 objects 39 +125 bathroom stall bathroom stall 71 38 7 otherstructure Objects n02873839 booth.n.02 misc 40 +72 copier copier 70 40 7 otherprop Objects n03257586 duplicator.n.01 appliances 37 +68 bench bench 66 39 6 bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +145 bar bar 66 38 7 bar otherstructure Objects n02788689 bar.n.03 misc 40 +157 soap dish soap dish 65 40 7 soap dish otherprop Objects n04254009 soap_dish.n.01 objects 39 +1166 laundry hamper laundry hamper 65 40 7 laundry basket otherprop Objects objects 39 +132 storage bin 
storage bin 63 40 7 storage bin otherprop Objects objects 39 +1167 bathroom stall door bathroom stall door 62 8 12 door door Wall door n03221720 door.n.01 door 4 +232 light switch light switch 61 38 7 light switch otherstructure Objects n04372370 switch.n.01 misc 40 +134 coffee maker coffee maker 61 40 7 otherprop Objects n03063338 coffee_maker.n.01 appliances 37 +51 tv stand tv stand 61 39 6 tv stand otherfurniture Furniture tv_stand n03290653 entertainment_center.n.01 furniture 36 +250 decoration decoration 60 40 7 otherprop Objects n03169390 decoration.n.01 misc 40 +1168 ceiling light ceiling light 59 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +342 range hood range hood 59 38 7 range hood otherstructure Objects range_hood n04053677 range_hood.n.01 misc 40 +89 blackboard blackboard 58 38 7 blackboard otherstructure Objects n02846511 blackboard.n.01 board_panel 35 +103 clock clock 58 40 7 clock otherprop Objects clock 3046257 n03046257 clock.n.01 objects 39 +99 wardrobe closet wardrobe 54 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +95 rail rail 53 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +154 bulletin board bulletin board 53 38 7 board otherstructure Objects n03211616 display_panel.n.01 board_panel 35 +140 mat mat 52 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1169 trash bin trash bin 52 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +193 ledge ledge 51 38 7 otherstructure Objects n09337253 ledge.n.01 misc 40 +116 seat seat 49 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +202 mouse mouse 49 40 7 mouse otherprop Objects n03793489 mouse.n.04 objects 39 +73 basket basket 48 40 7 basket otherprop Objects basket 2801938 n02801938 basket.n.01 objects 39 +78 shower shower 48 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +1170 dumbbell dumbbell 48 40 7 otherprop Objects 
n03255030 dumbbell.n.01 objects 39 +79 paper paper 46 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +80 person person 46 31 7 person person Objects person n05217688 person.n.02 misc 40 +141 windowsill windowsill 45 38 7 otherstructure Objects n04590263 windowsill.n.01 window 9 +57 closet closet 45 39 6 wardrobe otherfurniture Furniture wardrobe misc 40 +102 bucket bucket 45 40 7 bucket otherprop Objects n02909870 bucket.n.01 misc 40 +261 sign sign 44 40 7 sign otherprop Objects n04217882 signboard.n.01 objects 39 +118 speaker speaker 43 40 7 speaker otherprop Objects speaker 3691459 n03691459 loudspeaker.n.01 objects 39 +136 dishwasher dishwasher 43 38 7 dishwasher otherstructure Objects dishwasher 3207941 n03207941 dishwasher.n.01 appliances 37 +98 container container 43 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1171 stair rail stair rail 42 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +170 shower curtain rod shower curtain rod 42 40 7 otherprop Objects curtain 12 +1172 tube tube 41 40 7 otherprop Objects misc 40 +1173 bathroom cabinet bathroom cabinet 39 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +79 papers paper 39 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +221 storage container storage container 39 40 7 container otherprop Objects objects 39 +570 paper bag paper bag 39 37 7 bag bag Objects n04122825 sack.n.01 objects 39 +138 paper towel roll paper towel roll 39 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +168 ball ball 39 40 7 ball otherprop Objects objects 39 +276 closet doors closet door 38 8 12 door door Wall door n03221720 door.n.01 door 4 +106 laundry basket laundry basket 37 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +214 cart cart 37 40 7 cart otherprop Objects n03484083 handcart.n.01 shelving 31 +276 closet door closet door 35 8 12 door door Wall door n03221720 
door.n.01 door 4 +323 dish rack dish rack 35 40 7 dish rack otherprop Objects n03207630 dish_rack.n.01 objects 39 +58 stairs stairs 35 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +86 blinds blinds 35 13 13 blinds blinds Window n02851099 blind.n.03 blinds 32 +2 stack of chairs chair 35 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +399 purse purse 34 40 7 purse otherprop Objects n02774152 bag.n.04 objects 39 +121 bicycle bicycle 33 40 7 bicycle otherprop Objects bicycle 2834778 n02834778 bicycle.n.01 objects 39 +185 tray tray 32 40 7 tray otherprop Objects n04476259 tray.n.01 objects 39 +300 plunger plunger 30 40 7 otherprop Objects n03970156 plunger.n.03 objects 39 +180 paper cutter paper cutter 30 40 7 paper cutter otherprop Objects n03886940 paper_cutter.n.01 objects 39 +163 toilet paper dispenser toilet paper dispenser 29 40 7 otherprop Objects objects 39 +26 boxes box 29 29 7 box box Objects n02883344 box.n.01 objects 39 +66 bin bin 28 40 7 bin otherprop Objects n02839910 bin.n.01 objects 39 +208 toilet seat cover dispenser toilet seat cover dispenser 28 40 7 otherprop Objects objects 39 +112 guitar guitar 28 40 7 guitar otherprop Objects guitar guitar 3467517 n03467517 guitar.n.01 objects 39 +540 mailboxes mailbox 28 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +395 handicap bar handicap bar 27 38 7 bar otherstructure Objects misc 40 +166 fire extinguisher fire extinguisher 27 40 7 fire extinguisher otherprop Objects n03345837 fire_extinguisher.n.01 misc 40 +122 ladder ladder 27 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 stairs 16 +120 column column 26 38 7 column otherstructure Objects n03074380 column.n.06 column 24 +107 pipe pipe 25 40 7 pipe otherprop Objects n03944672 pipe.n.02 misc 40 +283 vacuum cleaner vacuum cleaner 25 40 7 otherprop Objects n04517823 vacuum.n.04 objects 39 +88 plate plate 24 40 7 plate otherprop Objects n03959485 plate.n.04 objects 
39 +90 piano piano 24 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +177 water cooler water cooler 24 39 6 water cooler otherfurniture Furniture n04559166 water_cooler.n.01 misc 40 +1174 cd case cd case 24 40 7 otherprop Objects objects 39 +562 bowl bowl 24 40 7 bowl otherprop Objects bowl bowl 2880940 n02880940 bowl.n.03 objects 39 +1175 closet rod closet rod 24 40 7 otherprop Objects n04100174 rod.n.01 misc 40 +1156 bathroom counter bathroom counter 24 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +84 oven oven 23 38 7 oven otherstructure Objects n03862676 oven.n.01 appliances 37 +104 stand stand 23 39 6 stand otherfurniture Furniture table table table 4379243 n04301000 stand.n.04 table 5 +229 scale scale 23 40 7 scale otherprop Objects n04141975 scale.n.07 objects 39 +70 washing machine washing machine 23 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +325 broom broom 22 40 7 broom otherprop Objects n02906734 broom.n.01 objects 39 +169 hat hat 22 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +128 shower wall shower wall 22 1 12 wall wall Wall n04208936 shower.n.01 wall 1 +331 guitar case guitar case 21 40 7 guitar case otherprop Objects objects 39 +87 rack rack 21 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +488 water pitcher water pitcher 21 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +776 laundry detergent laundry detergent 21 40 7 otherprop Objects objects 39 +370 hair dryer hair dryer 21 40 7 hair dryer otherprop Objects n03483316 hand_blower.n.01 objects 39 +191 pillar pillar 21 38 7 column otherstructure Objects n03073977 column.n.07 column 24 +748 divider divider 20 40 7 otherprop Objects wall 1 +242 power outlet power outlet 19 40 7 otherprop Objects misc 40 +45 dining table dining table 19 7 10 table table Table table table table 4379243 n04379243 
table.n.02 table 5 +417 shower floor shower floor 19 2 5 floor floor Floor n04208936 shower.n.01 floor 2 +70 washing machines washing machine 19 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +188 shower door shower door 19 8 12 door door Wall door n04208936 shower.n.01 door 4 +1176 coffee kettle coffee kettle 18 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39 +1177 wardrobe cabinet wardrobe 18 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1178 structure structure 18 38 7 otherstructure Objects misc 40 +18 bookshelves bookshelf 17 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +110 clothes dryer clothes dryer 17 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +148 toaster toaster 17 40 7 toaster otherprop Objects n04442312 toaster.n.02 appliances 37 +63 shoe shoe 17 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +155 ironing board ironing board 16 39 6 ironing board otherfurniture Furniture n03586090 ironing_board.n.01 objects 39 +572 alarm clock alarm clock 16 40 7 alarm clock otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39 +1179 shower head shower head 15 38 7 otherstructure Objects shower 23 +28 lamp base lamp 15 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +392 water bottle water bottle 15 40 7 bottle otherprop Objects bottle bottle 2876657 n04557648 water_bottle.n.01 objects 39 +1180 keyboard piano keyboard piano 15 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +609 projector screen projector screen 15 38 7 projector screen otherstructure Objects misc 40 +1181 case of water bottles case of water bottles 15 40 7 otherprop Objects objects 39 +195 toaster oven toaster oven 14 40 7 toaster oven otherprop Objects n04442441 toaster_oven.n.01 appliances 37 +581 music stand music stand 
14 39 6 music stand otherfurniture Furniture n03801760 music_stand.n.01 furniture 36 +58 staircase stairs 14 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +1182 coat rack coat rack 14 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 3 +1183 storage organizer storage organizer 14 40 7 otherprop Objects shelving 3 +139 machine machine 14 40 7 machine otherprop Objects n03699975 machine.n.01 appliances 37 +1184 folded chair folded chair 14 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1185 fire alarm fire alarm 14 40 7 otherprop Objects n03343737 fire_alarm.n.02 misc 40 +156 fireplace fireplace 13 38 7 fireplace otherstructure Objects n03346455 fireplace.n.01 fireplace 27 +408 vent vent 13 40 7 otherprop Objects n04526241 vent.n.01 misc 40 +213 furniture furniture 13 39 6 furniture otherfurniture Furniture n03405725 furniture.n.01 furniture 36 +1186 power strip power strip 13 40 7 otherprop Objects objects 39 +1187 calendar calendar 13 40 7 otherprop Objects objects 39 +1188 poster poster 13 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +115 toilet paper holder toilet paper holder 13 40 7 toilet paper holder otherprop Objects objects 39 +1189 potted plant potted plant 12 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +304 stuffed animal stuffed animal 12 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +1190 luggage luggage 12 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39 +21 curtains curtain 12 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +312 headphones headphones 12 40 7 otherprop Objects n03261776 earphone.n.01 objects 39 +233 crate crate 12 39 6 crate otherfurniture Furniture n03127925 crate.n.01 objects 39 +286 candle candle 12 40 7 candle otherprop Objects lamp n02948072 candle.n.01 objects 39 +264 projector projector 12 40 7 projector otherprop Objects n04009552 projector.n.02 objects 39 +110 clothes 
dryers clothes dryer 12 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +1191 mattress mattress 12 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +356 dustpan dustpan 12 40 7 otherprop Objects n03259009 dustpan.n.02 objects 39 +25 drawer drawer 11 39 6 drawer otherfurniture Furniture n03233905 drawer.n.01 furniture 36 +750 rod rod 11 40 7 otherprop Objects pistol 3948459 n03427202 gat.n.01 misc 40 +269 globe globe 11 40 7 globe otherprop Objects objects 39 +307 footrest footrest 11 39 6 foot rest otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +410 piano bench piano bench 11 39 6 piano bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +730 breakfast bar breakfast bar 11 38 7 bar otherstructure Objects counter 26 +216 step stool step stool 11 40 7 step stool otherprop Objects stool n04315713 step_stool.n.01 stool 19 +1192 hand rail hand rail 11 38 7 railing otherstructure Objects railing 30 +119 vending machine vending machine 11 40 7 machine otherprop Objects n04525305 vending_machine.n.01 appliances 37 +682 ceiling fan ceiling fan 11 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +434 swiffer swiffer 11 40 7 otherprop Objects objects 39 +126 foosball table foosball table 11 39 6 foosball table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +919 jar jar 11 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +85 footstool footstool 11 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +1193 folded table folded table 10 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +108 round table round table 10 7 10 table table Table table table table 4379243 n04114554 round_table.n.02 table 5 +135 hamper hamper 10 40 7 basket otherprop Objects basket 2801938 n03482405 hamper.n.02 objects 39 +1194 poster tube poster tube 10 40 7 otherprop Objects objects 39 +432 case case 10 40 7 case 
otherprop Objects objects 39 +53 carpet carpet 10 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +1195 thermostat thermostat 10 40 7 otherprop Objects n04422875 thermostat.n.01 misc 40 +111 coat coat 10 40 7 jacket otherprop Objects n03057021 coat.n.01 clothes 38 +305 water fountain water fountain 10 38 7 water fountain otherstructure Objects n03241335 drinking_fountain.n.01 misc 40 +1125 smoke detector smoke detector 10 40 7 otherprop Objects misc 40 +13 pillows pillow 9 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +1196 flip flops flip flops 9 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1197 cloth cloth 9 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1198 banner banner 9 40 7 otherprop Objects n02788021 banner.n.01 misc 40 +1199 clothes hanger clothes hanger 9 40 7 otherprop Objects n03057920 coat_hanger.n.01 objects 39 +1200 whiteboard eraser whiteboard eraser 9 40 7 otherprop Objects objects 39 +378 iron iron 9 40 7 otherprop Objects n03584829 iron.n.04 objects 39 +591 instrument case instrument case 9 40 7 case otherprop Objects objects 39 +49 toilet paper rolls toilet paper 9 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +92 soap soap 9 40 7 soap otherprop Objects n04253437 soap.n.01 objects 39 +1098 block block 9 40 7 otherprop Objects misc 40 +291 wall hanging wall hanging 8 40 7 otherprop Objects n03491178 hanging.n.01 picture 6 +1063 kitchen island kitchen island 8 38 7 kitchen island otherstructure Objects n03620600 kitchen_island.n.01 counter 26 +107 pipes pipe 8 38 7 otherstructure Objects misc 40 +1135 toothbrush toothbrush 8 40 7 toothbrush otherprop Objects n04453156 toothbrush.n.01 objects 39 +189 shirt shirt 8 40 7 otherprop Objects n04197391 shirt.n.01 clothes 38 +245 cutting board cutting board 8 40 7 cutting board otherprop Objects n03025513 chopping_board.n.01 objects 39 +194 vase vase 8 40 7 vase otherprop Objects vase jar 3593526 n04522168 
vase.n.01 objects 39 +1201 shower control valve shower control valve 8 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +386 exercise machine exercise machine 8 40 7 machine otherprop Objects gym_equipment 33 +1202 compost bin compost bin 8 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +857 shorts shorts 8 40 7 shorts otherprop Objects clothes 38 +452 tire tire 8 40 7 otherprop Objects n04440749 tire.n.01 objects 39 +1203 teddy bear teddy bear 7 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +346 bathrobe bathrobe 7 40 7 otherprop Objects n02807616 bathrobe.n.01 clothes 38 +152 handrail handrail 7 38 7 railing otherstructure Objects n02788148 bannister.n.02 railing 30 +83 faucet faucet 7 40 7 faucet otherprop Objects faucet 3325088 n03325088 faucet.n.01 misc 40 +1204 pantry wall pantry wall 7 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +726 thermos thermos 7 40 7 flask otherprop Objects bottle bottle 2876657 n04422727 thermos.n.01 objects 39 +61 rug rug 7 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +39 couch cushions cushion 7 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1117 tripod tripod 7 39 6 stand otherfurniture Furniture n04485082 tripod.n.01 objects 39 +540 mailbox mailbox 7 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +1205 tupperware tupperware 7 40 7 otherprop Objects objects 39 +415 shoe rack shoe rack 7 40 7 shoe rack otherprop Objects shelving 31 +31 towels towel 6 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +1206 beer bottles beer bottle 6 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +153 treadmill treadmill 6 39 6 treadmill otherfurniture Furniture n04477387 treadmill.n.01 gym_equipment 33 +1207 salt salt 6 40 7 otherprop Objects objects 39 +129 chest chest 6 39 6 chest otherfurniture Furniture dresser dresser chest_of_drawers 13 +220 dispenser dispenser 6 40 7 
otherprop Objects n03210683 dispenser.n.01 objects 39 +1208 mirror doors mirror door 6 8 12 door door Wall door n03221720 door.n.01 door 4 +231 remote remote 6 40 7 otherprop Objects remote_control 4074963 n04074963 remote_control.n.01 objects 39 +1209 folded ladder folded ladder 6 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 misc 40 +39 cushion cushion 6 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1210 carton carton 6 40 7 otherprop Objects objects 39 +117 step step 6 38 7 otherstructure Objects n04314914 step.n.04 misc 40 +822 drying rack drying rack 6 39 6 drying rack otherfurniture Furniture shelving 31 +238 slippers slipper 6 40 7 shoe otherprop Objects n04241394 slipper.n.01 clothes 38 +143 pool table pool table 6 39 6 pool table otherfurniture Furniture table table table 4379243 n03982430 pool_table.n.01 table 5 +1211 soda stream soda stream 6 40 7 otherprop Objects objects 39 +228 toilet brush toilet brush 6 40 7 toilet brush otherprop Objects objects 39 +494 loft bed loft bed 6 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +226 cooking pot cooking pot 6 40 7 pot otherprop Objects objects 39 +91 heater heater 6 39 6 heater otherfurniture Furniture n03508101 heater.n.01 misc 40 +1072 messenger bag messenger bag 6 37 7 bag bag Objects objects 39 +435 stapler stapler 6 40 7 stapler otherprop Objects n04303497 stapler.n.01 objects 39 +1165 closet walls closet wall 5 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +345 scanner scanner 5 40 7 otherprop Objects appliances 37 +893 elliptical machine elliptical machine 5 40 7 machine otherprop Objects gym_equipment 33 +621 kettle kettle 5 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39 +1212 metronome metronome 5 40 7 otherprop Objects n03757604 metronome.n.01 objects 39 +297 dumbell dumbell 5 40 7 otherprop Objects objects 39 +1213 music book music book 5 23 2 book books Books n02870526 book.n.11 objects 39 +1214 rice cooker rice cooker 5 40 7 
otherprop Objects objects 39 +1215 dart board dart board 5 38 7 board otherstructure Objects n03162940 dartboard.n.01 objects 39 +529 sewing machine sewing machine 5 40 7 sewing machine otherprop Objects n04179913 sewing_machine.n.01 objects 39 +1216 grab bar grab bar 5 38 7 railing otherstructure Objects railing 30 +1217 flowerpot flowerpot 5 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39 +1218 painting painting 5 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1219 railing railing 5 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +1220 stair stair 5 38 7 stairs otherstructure Objects stairs n04314914 step.n.04 stairs 16 +525 toolbox toolbox 5 39 6 chest otherfurniture Furniture n04452615 toolbox.n.01 objects 39 +204 nerf gun nerf gun 5 40 7 otherprop Objects objects 39 +693 binders binder 5 40 7 binder otherprop Objects objects 39 +179 desk lamp desk lamp 5 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1221 quadcopter quadcopter 5 40 7 otherprop Objects objects 39 +1222 pitcher pitcher 5 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +1223 hanging hanging 5 40 7 otherprop Objects misc 40 +1224 mail mail 5 40 7 otherprop Objects misc 40 +1225 closet ceiling closet ceiling 5 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +1226 hoverboard hoverboard 5 40 7 otherprop Objects objects 39 +1227 beanbag chair beanbag chair 5 39 6 bean bag otherfurniture Furniture n02816656 beanbag.n.01 chair 3 +571 water heater water heater 5 40 7 water heater otherprop Objects n04560113 water_heater.n.01 misc 40 +1228 spray bottle spray bottle 5 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +556 rope rope 5 40 7 rope otherprop Objects n04108268 rope.n.01 objects 39 +280 plastic container plastic container 5 40 7 container otherprop Objects objects 39 +1229 soap bottle soap bottle 5 40 7 soap otherprop Objects objects 39 
+1230 ikea bag ikea bag 4 37 7 bag bag Objects 2773838 n02773838 bag.n.06 objects 39 +1231 sleeping bag sleeping bag 4 40 7 otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +1232 duffel bag duffel bag 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +746 frying pan frying pan 4 40 7 frying pan otherprop Objects n03400231 frying_pan.n.01 objects 39 +1233 oven mitt oven mitt 4 40 7 otherprop Objects objects 39 +1234 pot pot 4 40 7 pot otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +144 hand dryer hand dryer 4 40 7 otherprop Objects objects 39 +282 dollhouse dollhouse 4 39 6 doll house otherfurniture Furniture n03219483 dollhouse.n.01 objects 39 +167 shampoo bottle shampoo bottle 4 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1235 hair brush hair brush 4 40 7 otherprop Objects n02908217 brush.n.02 objects 39 +1236 tennis racket tennis racket 4 40 7 otherprop Objects n04409806 tennis_racket.n.01 objects 39 +1237 display case display case 4 40 7 case otherprop Objects objects 39 +234 ping pong table ping pong table 4 39 6 ping pong table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +563 boiler boiler 4 40 7 otherprop Objects misc 40 +1238 bag of coffee beans bag of coffee beans 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +1239 bananas banana 4 40 7 otherprop Objects n00021265 food.n.01 objects 39 +1240 carseat carseat 4 40 7 otherprop Objects misc 40 +366 helmet helmet 4 40 7 otherprop Objects helmet 3513137 n03513137 helmet.n.02 clothes 38 +816 umbrella umbrella 4 40 7 umbrella otherprop Objects n04507155 umbrella.n.01 objects 39 +1241 coffee box coffee box 4 40 7 otherprop Objects objects 39 +719 envelope envelope 4 40 7 envelope otherprop Objects n03291819 envelope.n.01 objects 39 +284 wet floor sign wet floor sign 4 40 7 sign otherprop Objects misc 40 +1242 clothing rack clothing rack 4 39 6 stand otherfurniture Furniture 
n04038440 rack.n.05 shelving 31 +247 controller controller 4 40 7 otherprop Objects n03096960 control.n.09 objects 39 +1243 bath walls bathroom wall 4 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +1244 podium podium 4 39 6 otherfurniture Furniture n03159640 dais.n.01 furniture 36 +1245 storage box storage box 4 29 7 box box Objects n02883344 box.n.01 objects 39 +1246 dolly dolly 4 40 7 otherprop Objects misc 40 +1247 shampoo shampoo 3 40 7 otherprop Objects n04183516 shampoo.n.01 objects 39 +592 paper tray paper tray 3 40 7 paper tray otherprop Objects objects 39 +385 cabinet door cabinet door 3 8 12 door door Wall door door 4 +1248 changing station changing station 3 40 7 otherprop Objects misc 40 +1249 poster printer poster printer 3 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 +133 screen screen 3 40 7 otherprop Objects n03151077 curtain.n.01 curtain 12 +301 soap bar soap bar 3 38 7 bar otherstructure Objects objects 39 +1250 crutches crutches 3 40 7 otherprop Objects n03141823 crutch.n.01 objects 39 +379 studio light studio light 3 38 7 light otherstructure Objects lighting 28 +130 stack of cups cup 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +1251 toilet flush button toilet flush button 3 40 7 otherprop Objects objects 39 +450 trunk trunk 3 40 7 otherprop Objects misc 40 +1252 grocery bag grocery bag 3 37 7 bag bag Objects suitcase 2773838 n03461288 grocery_bag.n.01 objects 39 +316 plastic bin plastic bin 3 40 7 bin otherprop Objects objects 39 +1253 pizza box pizza box 3 29 7 box box Objects objects 39 +385 cabinet doors cabinet door 3 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 door 4 +1254 legs legs 3 31 7 person person Objects person n05217688 person.n.02 misc 40 +461 car car 3 40 7 car otherprop Objects car car 2958343 n02958343 car.n.01 misc 40 +1255 shaving cream shaving cream 3 40 7 otherprop Objects n04186051 shaving_cream.n.01 objects 39 +1256 
luggage stand luggage stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +599 shredder shredder 3 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +281 statue statue 3 40 7 sculpture otherprop Objects n04306847 statue.n.01 misc 40 +1257 urinal urinal 3 33 7 toilet toilet Objects toilet toilet n04515991 urinal.n.01 toilet 18 +1258 hose hose 3 40 7 otherprop Objects n03539875 hose.n.03 misc 40 +1259 bike pump bike pump 3 40 7 otherprop Objects objects 39 +319 coatrack coatrack 3 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1260 bear bear 3 40 7 otherprop Objects objects 39 +28 wall lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1261 humidifier humidifier 3 40 7 otherprop Objects objects 39 +546 toothpaste toothpaste 3 40 7 toothpaste otherprop Objects objects 39 +1262 mouthwash bottle mouthwash bottle 3 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1263 poster cutter poster cutter 3 40 7 otherprop Objects objects 39 +1264 golf bag golf bag 3 37 7 bag bag Objects suitcase 2773838 n03445617 golf_bag.n.01 objects 39 +1265 food container food container 3 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1266 camera camera 3 40 7 otherprop Objects objects 39 +28 table lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n04380533 table_lamp.n.01 lighting 28 +1267 yoga mat yoga mat 3 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1268 card card 3 40 7 otherprop Objects objects 39 +1269 mug mug 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +188 shower doors shower door 3 38 7 otherstructure Objects n04208936 shower.n.01 door 4 +689 cardboard cardboard 3 40 7 otherprop Objects objects 39 +1270 rack stand rack stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +1271 boxes of paper boxes of paper 3 29 7 box box Objects n02883344 box.n.01 objects 39 +1272 flag 
flag 3 40 7 otherprop Objects misc 40 +354 futon futon 3 39 6 mattress otherfurniture Furniture n03408444 futon.n.01 sofa 10 +339 magazine magazine 3 40 7 magazine otherprop Objects n06595351 magazine.n.01 objects 39 +1009 exit sign exit sign 3 40 7 exit sign otherprop Objects misc 40 +1273 rolled poster rolled poster 3 40 7 otherprop Objects objects 39 +1274 wheel wheel 3 40 7 otherprop Objects objects 39 +15 pictures picture 3 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1275 blackboard eraser blackboard eraser 3 40 7 eraser otherprop Objects n03294833 eraser.n.01 objects 39 +361 organizer organizer 3 40 7 otherprop Objects n03918737 personal_digital_assistant.n.01 objects 39 +1276 doll doll 3 40 7 toy otherprop Objects n03219135 doll.n.01 objects 39 +326 book rack book rack 3 39 6 bookrack otherfurniture Furniture objects 39 +1277 laundry bag laundry bag 3 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +1278 sponge sponge 3 40 7 otherprop Objects n01906749 sponge.n.04 objects 39 +116 seating seat 3 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +1184 folded chairs folded chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1279 lotion bottle lotion bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +212 can can 2 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1280 lunch box lunch box 2 40 7 otherprop Objects objects 39 +1281 food display food display 2 40 7 otherprop Objects misc 40 +794 storage shelf storage shelf 2 40 7 otherprop Objects shelving 31 +1282 sliding wood door sliding wood door 2 40 7 otherprop Objects door 4 +955 pants pants 2 40 7 otherprop Objects n04489008 trouser.n.01 clothes 38 +387 wood wood 2 40 7 otherprop Objects misc 40 +69 boards board 2 38 7 board otherstructure Objects board_panel 35 +65 bottles bottle 2 40 7 bottle otherprop Objects bottle bottle 
2876657 n02876657 bottle.n.01 objects 39 +523 washcloth washcloth 2 40 7 otherprop Objects n04554523 washcloth.n.01 towel 20 +389 workbench workbench 2 39 6 bench otherfurniture Furniture bench table 4379243 n04600486 workbench.n.01 table 5 +29 open kitchen cabinet kitchen cabinet 2 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7 +1283 organizer shelf organizer shelf 2 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +146 frame frame 2 38 7 otherstructure Objects misc 40 +130 cups cup 2 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +372 exercise ball exercise ball 2 40 7 ball otherprop Objects n04285146 sports_equipment.n.01 gym_equipment 33 +289 easel easel 2 39 6 stand otherfurniture Furniture n03262809 easel.n.01 furniture 36 +440 garbage bag garbage bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +321 roomba roomba 2 40 7 otherprop Objects objects 39 +976 garage door garage door 2 38 7 garage door otherstructure Objects door door 4 +1256 luggage rack luggage stand 2 39 6 stand otherfurniture Furniture n04038440 shelving 31 +1284 bike lock bike lock 2 40 7 otherprop Objects objects 39 +1285 briefcase briefcase 2 40 7 otherprop Objects n02900705 briefcase.n.01 objects 39 +357 hand towel hand towel 2 27 7 towel towel Objects n03490006 hand_towel.n.01 towel 20 +1286 bath products bath product 2 40 7 otherprop Objects objects 39 +1287 star star 2 40 7 otherprop Objects n09444783 star.n.03 misc 40 +365 map map 2 40 7 map otherprop Objects n03720163 map.n.01 misc 40 +1288 coffee bean bag coffee bean bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +81 headboard headboard 2 39 6 headboard otherfurniture Furniture n03502200 headboard.n.01 bed 11 +1289 ipad ipad 2 40 7 otherprop Objects objects 39 +1290 display rack display rack 2 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +948 traffic cone traffic 
cone 2 40 7 cone otherprop Objects cone objects 39 +174 toiletry toiletry 2 40 7 otherprop Objects n04447443 toiletry.n.01 objects 39 +1028 canopy canopy 2 40 7 otherprop Objects misc 40 +1291 massage chair massage chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1292 paper organizer paper organizer 2 40 7 otherprop Objects objects 39 +1005 barricade barricade 2 40 7 otherprop Objects misc 40 +235 platform platform 2 38 7 otherstructure Objects misc 40 +1293 cap cap 2 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +1294 dumbbell plates dumbbell plates 2 40 7 otherprop Objects objects 39 +1295 elevator elevator 2 38 7 otherstructure Objects misc 40 +1296 cooking pan cooking pan 2 40 7 pan otherprop Objects n03880531 pan.n.01 objects 39 +1297 trash bag trash bag 2 37 7 bag bag Objects objects 39 +1298 santa santa 2 40 7 otherprop Objects misc 40 +1299 jewelry box jewelry box 2 29 7 box box Objects n02883344 box.n.01 objects 39 +1300 boat boat 2 40 7 otherprop Objects misc 40 +1301 sock sock 2 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1051 kinect kinect 2 40 7 kinect otherprop Objects objects 39 +566 crib crib 2 39 6 crib otherfurniture Furniture furniture 36 +1302 plastic storage bin plastic storage bin 2 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1062 cooler cooler 2 24 6 refridgerator refridgerator Furniture n03102654 cooler.n.01 appliances 37 +1303 kitchen apron kitchen apron 2 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1304 dishwashing soap bottle dishwashing soap bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1305 xbox controller xbox controller 2 40 7 otherprop Objects objects 39 +1306 banana holder banana holder 2 40 7 otherprop Objects objects 39 +298 ping pong paddle ping pong paddle 2 40 7 otherprop Objects table 5 +1307 airplane airplane 2 40 7 otherprop Objects misc 40 +1308 conditioner bottle 
conditioner bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1309 tea kettle tea kettle 2 40 7 tea kettle otherprop Objects n04397768 teakettle.n.01 objects 39 +43 bedframe bedframe 2 39 6 otherfurniture Furniture n02822579 bedstead.n.01 bed 11 +1310 wood beam wood beam 2 38 7 otherstructure Objects beam 29 +593 toilet paper package toilet paper package 2 40 7 otherprop Objects objects 39 +1311 wall mounted coat rack wall mounted coat rack 2 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1312 film light film light 2 40 7 otherprop Objects lighting 28 +749 ceiling lamp ceiling lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +623 chain chain 1 40 7 otherprop Objects chair 3 +1313 sofa sofa 1 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +99 closet wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +265 sweater sweater 1 40 7 otherprop Objects n04370048 sweater.n.01 clothes 38 +1314 kitchen mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +99 wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1315 water softener water softener 1 40 7 otherprop Objects misc 40 +448 banister banister 1 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +257 trolley trolley 1 40 7 trolley otherprop Objects n04335435 streetcar.n.01 misc 40 +1316 pantry shelf pantry shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +786 sofa bed sofa bed 1 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +801 loofa loofa 1 40 7 otherprop Objects objects 39 +972 shower faucet handle shower faucet handle 1 40 7 handle otherprop Objects shower 23 +1317 toy piano toy piano 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1318 fish fish 1 40 7 otherprop Objects n02512053 
fish.n.01 objects 39 +75 file cabinets file cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n03337140 file.n.03 cabinet 7 +657 cat litter box cat litter box 1 29 7 box box Objects objects 39 +561 electric panel electric panel 1 40 7 otherprop Objects misc 40 +93 suitcases suitcase 1 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39 +513 curtain rod curtain rod 1 38 7 curtain rod otherstructure Objects curtain 12 +411 bunk bed bunk bed 1 39 6 bunk bed otherfurniture Furniture bed bed bed 2818832 n02920259 bunk_bed.n.01 bed 11 +1122 chandelier chandelier 1 38 7 chandelier otherstructure Objects n03005285 chandelier.n.01 lighting 28 +922 tape tape 1 40 7 tape otherprop Objects objects 39 +88 plates plate 1 40 7 otherprop Objects n03959485 plate.n.04 objects 39 +518 alarm alarm 1 40 7 alarm otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39 +814 fire hose fire hose 1 40 7 otherprop Objects n03346004 fire_hose.n.01 misc 40 +1319 toy dinosaur toy dinosaur 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1320 cone cone 1 40 7 otherprop Objects objects 39 +649 glass doors glass door 1 8 12 door door Wall door n03221720 door.n.01 door 4 +607 hatrack hatrack 1 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +819 subwoofer subwoofer 1 40 7 speaker otherprop Objects speaker 3691459 n04349401 subwoofer.n.01 objects 39 +1321 fire sprinkler fire sprinkler 1 40 7 otherprop Objects misc 40 +1322 trash cabinet trash cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +1204 pantry walls pantry wall 1 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +227 photo photo 1 40 7 photo otherprop Objects n03925226 photograph.n.01 picture 6 +817 barrier barrier 1 40 7 otherprop Objects n02796623 barrier.n.01 misc 40 +130 stacks of cups cup 1 40 7 otherprop Objects n03147509 cup.n.01 objects 39 +712 beachball beachball 1 40 7 ball otherprop Objects n02814224 beach_ball.n.01 objects 39 +1323 
folded boxes folded boxes 1 40 7 otherprop Objects objects 39 +1324 contact lens solution bottle contact lens solution bottle 1 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +673 covered box covered box 1 29 7 box box Objects objects 39 +459 folder folder 1 40 7 folder otherprop Objects n03376279 folder.n.02 objects 39 +643 mail trays mail tray 1 40 7 mail tray otherprop Objects objects 39 +238 slipper slipper 1 40 7 otherprop Objects n04241394 slipper.n.01 clothes 38 +765 magazine rack magazine rack 1 39 6 stand otherfurniture Furniture n03704549 magazine_rack.n.01 shelving 31 +1008 sticker sticker 1 40 7 sticker otherprop Objects n07272545 gummed_label.n.01 objects 39 +225 lotion lotion 1 40 7 otherprop Objects n03690938 lotion.n.01 objects 39 +1083 buddha buddha 1 40 7 otherprop Objects objects 39 +813 file organizer file organizer 1 40 7 otherprop Objects objects 39 +138 paper towel rolls paper towel roll 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +1145 night lamp night lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +796 fuse box fuse box 1 40 7 otherprop Objects misc 40 +1325 knife block knife block 1 40 7 otherprop Objects objects 39 +363 furnace furnace 1 39 6 furnace otherfurniture Furniture n03404449 furnace.n.01 +1174 cd cases cd case 1 40 7 otherprop Objects objects 39 +38 stools stool 1 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +1326 hand sanitzer dispenser hand sanitzer dispenser 1 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +997 teapot teapot 1 40 7 tea pot otherprop Objects n04398044 teapot.n.01 objects 39 +1327 pen holder pen holder 1 40 7 otherprop Objects objects 39 +1328 tray rack tray rack 1 40 7 otherprop Objects objects 39 +1329 wig wig 1 40 7 otherprop Objects n04584207 wig.n.01 objects 39 +182 switch switch 1 40 7 otherprop Objects n04372370 switch.n.01 misc 40 +280 plastic containers plastic 
container 1 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1330 night light night light 1 40 7 otherprop Objects lighting 28 +1331 notepad notepad 1 40 7 otherprop Objects objects 39 +1332 mail bin mail bin 1 40 7 otherprop Objects misc 40 +1333 elevator button elevator button 1 40 7 otherprop Objects misc 40 +939 gaming wheel gaming wheel 1 40 7 otherprop Objects objects 39 +1334 drum set drum set 1 40 7 otherprop Objects objects 39 +480 cosmetic bag cosmetic bag 1 37 7 bag bag Objects objects 39 +907 coffee mug coffee mug 1 40 7 vessel otherprop Objects cup or mug 3797390 n03063599 coffee_mug.n.01 objects 39 +1335 closet shelf closet shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +1336 baby mobile baby mobile 1 40 7 otherprop Objects objects 39 +829 diaper bin diaper bin 1 40 7 bin otherprop Objects objects 39 +947 door wall door wall 1 1 12 wall wall Wall wall 1 +1116 stepstool stepstool 1 40 7 step stool otherprop Objects objects 39 +599 paper shredder shredder 1 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +733 dress rack dress rack 1 40 7 otherprop Objects n03238762 dress_rack.n.01 misc 40 +123 cover cover 1 40 7 blanket otherprop Objects objects 39 +506 shopping bag shopping bag 1 37 7 bag bag Objects n04204081 shopping_bag.n.01 objects 39 +569 sliding door sliding door 1 8 12 door door Wall door n04239074 sliding_door.n.01 door 4 +1337 exercise bike exercise bike 1 40 7 machine otherprop Objects n04210120 shredder.n.01 gym_equipment 33 +1338 recliner chair recliner chair 1 5 4 chair chair Chair chair chair chair 3001627 n03238762 dress_rack.n.01 chair 3 +1314 kitchenaid mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +1339 soda can soda can 1 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1340 stovetop stovetop 1 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +851 stepladder stepladder 1 39 6 
ladder otherfurniture Furniture stairs n04315599 step_ladder.n.01 stairs 16 +142 tap tap 1 40 7 faucet otherprop Objects faucet 3325088 n04559451 water_faucet.n.01 objects 39 +436 cable cable 1 40 7 cables otherprop Objects objects 39 +1341 baby changing station baby changing station 1 39 6 otherfurniture Furniture furniture 36 +1342 costume costume 1 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +885 rocking chair rocking chair 1 5 4 chair chair Chair chair chair chair 3001627 n04099969 rocking_chair.n.01 chair 3 +693 binder binder 1 40 7 binder otherprop Objects objects 39 +815 media center media center 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +401 towel rack towel rack 1 40 7 otherprop Objects n04459773 towel_rack.n.01 misc 40 +1343 medal medal 1 40 7 otherprop Objects objects 39 +1184 stack of folded chairs folded chair 1 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1344 telescope telescope 1 40 7 otherprop Objects n04403638 telescope.n.01 objects 39 +1345 closet doorframe closet doorframe 1 8 12 door door Wall door door 4 +160 glass glass 1 38 7 glass otherstructure Objects n03438257 glass.n.02 misc 40 +1126 baseball cap baseball cap 1 40 7 otherprop Objects cap 2954340 n02799323 baseball_cap.n.01 clothes 38 +1346 battery disposal jar battery disposal jar 1 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +332 mop mop 1 40 7 otherprop Objects n04367480 swab.n.02 objects 39 +397 tank tank 1 40 7 otherprop Objects objects 39 +643 mail tray mail tray 1 40 7 mail tray otherprop Objects objects 39 +551 centerpiece centerpiece 1 40 7 centerpiece otherprop Objects n02994419 centerpiece.n.02 objects 39 +1163 object stick 1 40 7 stick otherprop Objects objects 39 +1347 closet floor closet floor 1 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +1348 dryer sheets dryer sheets 1 40 7 otherprop Objects objects 39 +803 bycicle bycicle 1 40 7 otherprop Objects 
misc 40 +484 flower stand flower stand 1 39 6 stand otherfurniture Furniture furniture 36 +1349 air mattress air mattress 1 4 1 bed bed Bed bed bed bed 2818832 n02690809 air_mattress.n.01 bed 11 +1350 clip clip 1 40 7 otherprop Objects objects 39 +222 side table side table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +1253 pizza boxes pizza box 1 29 7 box box Objects n02883344 box.n.01 objects 39 +1351 display display 1 39 7 otherfurniture Furniture n03211117 display.n.06 misc 40 +1352 postcard postcard 1 40 7 otherprop Objects objects 39 +828 display sign display sign 1 40 7 sign otherprop Objects misc 40 +1353 paper towel paper towel 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +612 boots boot 1 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1354 tennis racket bag tennis racket bag 1 40 7 otherprop Objects objects 39 +1355 air hockey table air hockey table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +1301 socks sock 1 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1356 food bag food bag 1 37 7 bag bag Objects objects 39 +1199 clothes hangers clothes hanger 1 40 7 otherprop Objects n03057920 coat_hanger.n.01 misc 40 +1357 starbucks cup starbucks cup 1 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 \ No newline at end of file diff --git a/datasets/scannet_preprocess/meta_data/scannetv2_test.txt b/datasets/scannet_preprocess/meta_data/scannetv2_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..79d15b0ee4afa889883562a722b837b78ee8ce4b --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2_test.txt @@ -0,0 +1,100 @@ +scene0707_00 +scene0708_00 +scene0709_00 +scene0710_00 +scene0711_00 +scene0712_00 +scene0713_00 +scene0714_00 +scene0715_00 +scene0716_00 +scene0717_00 +scene0718_00 +scene0719_00 +scene0720_00 +scene0721_00 +scene0722_00 +scene0723_00 +scene0724_00 
+scene0725_00 +scene0726_00 +scene0727_00 +scene0728_00 +scene0729_00 +scene0730_00 +scene0731_00 +scene0732_00 +scene0733_00 +scene0734_00 +scene0735_00 +scene0736_00 +scene0737_00 +scene0738_00 +scene0739_00 +scene0740_00 +scene0741_00 +scene0742_00 +scene0743_00 +scene0744_00 +scene0745_00 +scene0746_00 +scene0747_00 +scene0748_00 +scene0749_00 +scene0750_00 +scene0751_00 +scene0752_00 +scene0753_00 +scene0754_00 +scene0755_00 +scene0756_00 +scene0757_00 +scene0758_00 +scene0759_00 +scene0760_00 +scene0761_00 +scene0762_00 +scene0763_00 +scene0764_00 +scene0765_00 +scene0766_00 +scene0767_00 +scene0768_00 +scene0769_00 +scene0770_00 +scene0771_00 +scene0772_00 +scene0773_00 +scene0774_00 +scene0775_00 +scene0776_00 +scene0777_00 +scene0778_00 +scene0779_00 +scene0780_00 +scene0781_00 +scene0782_00 +scene0783_00 +scene0784_00 +scene0785_00 +scene0786_00 +scene0787_00 +scene0788_00 +scene0789_00 +scene0790_00 +scene0791_00 +scene0792_00 +scene0793_00 +scene0794_00 +scene0795_00 +scene0796_00 +scene0797_00 +scene0798_00 +scene0799_00 +scene0800_00 +scene0801_00 +scene0802_00 +scene0803_00 +scene0804_00 +scene0805_00 +scene0806_00 diff --git a/datasets/scannet_preprocess/meta_data/scannetv2_train.txt b/datasets/scannet_preprocess/meta_data/scannetv2_train.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef625f120b812fea5ac507d3b7049fc7ebd2e7e4 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2_train.txt @@ -0,0 +1,1201 @@ +scene0191_00 +scene0191_01 +scene0191_02 +scene0119_00 +scene0230_00 +scene0528_00 +scene0528_01 +scene0705_00 +scene0705_01 +scene0705_02 +scene0415_00 +scene0415_01 +scene0415_02 +scene0007_00 +scene0141_00 +scene0141_01 +scene0141_02 +scene0515_00 +scene0515_01 +scene0515_02 +scene0447_00 +scene0447_01 +scene0447_02 +scene0531_00 +scene0503_00 +scene0285_00 +scene0069_00 +scene0584_00 +scene0584_01 +scene0584_02 +scene0581_00 +scene0581_01 +scene0581_02 +scene0620_00 +scene0620_01 +scene0263_00 
+scene0263_01 +scene0481_00 +scene0481_01 +scene0020_00 +scene0020_01 +scene0291_00 +scene0291_01 +scene0291_02 +scene0469_00 +scene0469_01 +scene0469_02 +scene0659_00 +scene0659_01 +scene0024_00 +scene0024_01 +scene0024_02 +scene0564_00 +scene0117_00 +scene0027_00 +scene0027_01 +scene0027_02 +scene0028_00 +scene0330_00 +scene0418_00 +scene0418_01 +scene0418_02 +scene0233_00 +scene0233_01 +scene0673_00 +scene0673_01 +scene0673_02 +scene0673_03 +scene0673_04 +scene0673_05 +scene0585_00 +scene0585_01 +scene0362_00 +scene0362_01 +scene0362_02 +scene0362_03 +scene0035_00 +scene0035_01 +scene0358_00 +scene0358_01 +scene0358_02 +scene0037_00 +scene0194_00 +scene0321_00 +scene0293_00 +scene0293_01 +scene0623_00 +scene0623_01 +scene0592_00 +scene0592_01 +scene0569_00 +scene0569_01 +scene0413_00 +scene0313_00 +scene0313_01 +scene0313_02 +scene0480_00 +scene0480_01 +scene0401_00 +scene0517_00 +scene0517_01 +scene0517_02 +scene0032_00 +scene0032_01 +scene0613_00 +scene0613_01 +scene0613_02 +scene0306_00 +scene0306_01 +scene0052_00 +scene0052_01 +scene0052_02 +scene0053_00 +scene0444_00 +scene0444_01 +scene0055_00 +scene0055_01 +scene0055_02 +scene0560_00 +scene0589_00 +scene0589_01 +scene0589_02 +scene0610_00 +scene0610_01 +scene0610_02 +scene0364_00 +scene0364_01 +scene0383_00 +scene0383_01 +scene0383_02 +scene0006_00 +scene0006_01 +scene0006_02 +scene0275_00 +scene0451_00 +scene0451_01 +scene0451_02 +scene0451_03 +scene0451_04 +scene0451_05 +scene0135_00 +scene0065_00 +scene0065_01 +scene0065_02 +scene0104_00 +scene0674_00 +scene0674_01 +scene0448_00 +scene0448_01 +scene0448_02 +scene0502_00 +scene0502_01 +scene0502_02 +scene0440_00 +scene0440_01 +scene0440_02 +scene0071_00 +scene0072_00 +scene0072_01 +scene0072_02 +scene0509_00 +scene0509_01 +scene0509_02 +scene0649_00 +scene0649_01 +scene0602_00 +scene0694_00 +scene0694_01 +scene0101_00 +scene0101_01 +scene0101_02 +scene0101_03 +scene0101_04 +scene0101_05 +scene0218_00 +scene0218_01 +scene0579_00 +scene0579_01 
+scene0579_02 +scene0039_00 +scene0039_01 +scene0493_00 +scene0493_01 +scene0242_00 +scene0242_01 +scene0242_02 +scene0083_00 +scene0083_01 +scene0127_00 +scene0127_01 +scene0662_00 +scene0662_01 +scene0662_02 +scene0018_00 +scene0087_00 +scene0087_01 +scene0087_02 +scene0332_00 +scene0332_01 +scene0332_02 +scene0628_00 +scene0628_01 +scene0628_02 +scene0134_00 +scene0134_01 +scene0134_02 +scene0238_00 +scene0238_01 +scene0092_00 +scene0092_01 +scene0092_02 +scene0092_03 +scene0092_04 +scene0022_00 +scene0022_01 +scene0467_00 +scene0392_00 +scene0392_01 +scene0392_02 +scene0424_00 +scene0424_01 +scene0424_02 +scene0646_00 +scene0646_01 +scene0646_02 +scene0098_00 +scene0098_01 +scene0044_00 +scene0044_01 +scene0044_02 +scene0510_00 +scene0510_01 +scene0510_02 +scene0571_00 +scene0571_01 +scene0166_00 +scene0166_01 +scene0166_02 +scene0563_00 +scene0172_00 +scene0172_01 +scene0388_00 +scene0388_01 +scene0215_00 +scene0215_01 +scene0252_00 +scene0287_00 +scene0668_00 +scene0572_00 +scene0572_01 +scene0572_02 +scene0026_00 +scene0224_00 +scene0113_00 +scene0113_01 +scene0551_00 +scene0381_00 +scene0381_01 +scene0381_02 +scene0371_00 +scene0371_01 +scene0460_00 +scene0118_00 +scene0118_01 +scene0118_02 +scene0417_00 +scene0008_00 +scene0634_00 +scene0521_00 +scene0123_00 +scene0123_01 +scene0123_02 +scene0045_00 +scene0045_01 +scene0511_00 +scene0511_01 +scene0114_00 +scene0114_01 +scene0114_02 +scene0070_00 +scene0029_00 +scene0029_01 +scene0029_02 +scene0129_00 +scene0103_00 +scene0103_01 +scene0002_00 +scene0002_01 +scene0132_00 +scene0132_01 +scene0132_02 +scene0124_00 +scene0124_01 +scene0143_00 +scene0143_01 +scene0143_02 +scene0604_00 +scene0604_01 +scene0604_02 +scene0507_00 +scene0105_00 +scene0105_01 +scene0105_02 +scene0428_00 +scene0428_01 +scene0311_00 +scene0140_00 +scene0140_01 +scene0182_00 +scene0182_01 +scene0182_02 +scene0142_00 +scene0142_01 +scene0399_00 +scene0399_01 +scene0012_00 +scene0012_01 +scene0012_02 +scene0060_00 +scene0060_01 
+scene0370_00 +scene0370_01 +scene0370_02 +scene0310_00 +scene0310_01 +scene0310_02 +scene0661_00 +scene0650_00 +scene0152_00 +scene0152_01 +scene0152_02 +scene0158_00 +scene0158_01 +scene0158_02 +scene0482_00 +scene0482_01 +scene0600_00 +scene0600_01 +scene0600_02 +scene0393_00 +scene0393_01 +scene0393_02 +scene0562_00 +scene0174_00 +scene0174_01 +scene0157_00 +scene0157_01 +scene0161_00 +scene0161_01 +scene0161_02 +scene0159_00 +scene0254_00 +scene0254_01 +scene0115_00 +scene0115_01 +scene0115_02 +scene0162_00 +scene0163_00 +scene0163_01 +scene0523_00 +scene0523_01 +scene0523_02 +scene0459_00 +scene0459_01 +scene0175_00 +scene0085_00 +scene0085_01 +scene0279_00 +scene0279_01 +scene0279_02 +scene0201_00 +scene0201_01 +scene0201_02 +scene0283_00 +scene0456_00 +scene0456_01 +scene0429_00 +scene0043_00 +scene0043_01 +scene0419_00 +scene0419_01 +scene0419_02 +scene0368_00 +scene0368_01 +scene0348_00 +scene0348_01 +scene0348_02 +scene0442_00 +scene0178_00 +scene0380_00 +scene0380_01 +scene0380_02 +scene0165_00 +scene0165_01 +scene0165_02 +scene0181_00 +scene0181_01 +scene0181_02 +scene0181_03 +scene0333_00 +scene0614_00 +scene0614_01 +scene0614_02 +scene0404_00 +scene0404_01 +scene0404_02 +scene0185_00 +scene0126_00 +scene0126_01 +scene0126_02 +scene0519_00 +scene0236_00 +scene0236_01 +scene0189_00 +scene0075_00 +scene0267_00 +scene0192_00 +scene0192_01 +scene0192_02 +scene0281_00 +scene0420_00 +scene0420_01 +scene0420_02 +scene0195_00 +scene0195_01 +scene0195_02 +scene0597_00 +scene0597_01 +scene0597_02 +scene0041_00 +scene0041_01 +scene0111_00 +scene0111_01 +scene0111_02 +scene0666_00 +scene0666_01 +scene0666_02 +scene0200_00 +scene0200_01 +scene0200_02 +scene0536_00 +scene0536_01 +scene0536_02 +scene0390_00 +scene0280_00 +scene0280_01 +scene0280_02 +scene0344_00 +scene0344_01 +scene0205_00 +scene0205_01 +scene0205_02 +scene0484_00 +scene0484_01 +scene0009_00 +scene0009_01 +scene0009_02 +scene0302_00 +scene0302_01 +scene0209_00 +scene0209_01 +scene0209_02 
+scene0210_00 +scene0210_01 +scene0395_00 +scene0395_01 +scene0395_02 +scene0683_00 +scene0601_00 +scene0601_01 +scene0214_00 +scene0214_01 +scene0214_02 +scene0477_00 +scene0477_01 +scene0439_00 +scene0439_01 +scene0468_00 +scene0468_01 +scene0468_02 +scene0546_00 +scene0466_00 +scene0466_01 +scene0220_00 +scene0220_01 +scene0220_02 +scene0122_00 +scene0122_01 +scene0130_00 +scene0110_00 +scene0110_01 +scene0110_02 +scene0327_00 +scene0156_00 +scene0266_00 +scene0266_01 +scene0001_00 +scene0001_01 +scene0228_00 +scene0199_00 +scene0219_00 +scene0464_00 +scene0232_00 +scene0232_01 +scene0232_02 +scene0299_00 +scene0299_01 +scene0530_00 +scene0363_00 +scene0453_00 +scene0453_01 +scene0570_00 +scene0570_01 +scene0570_02 +scene0183_00 +scene0239_00 +scene0239_01 +scene0239_02 +scene0373_00 +scene0373_01 +scene0241_00 +scene0241_01 +scene0241_02 +scene0188_00 +scene0622_00 +scene0622_01 +scene0244_00 +scene0244_01 +scene0691_00 +scene0691_01 +scene0206_00 +scene0206_01 +scene0206_02 +scene0247_00 +scene0247_01 +scene0061_00 +scene0061_01 +scene0082_00 +scene0250_00 +scene0250_01 +scene0250_02 +scene0501_00 +scene0501_01 +scene0501_02 +scene0320_00 +scene0320_01 +scene0320_02 +scene0320_03 +scene0631_00 +scene0631_01 +scene0631_02 +scene0255_00 +scene0255_01 +scene0255_02 +scene0047_00 +scene0265_00 +scene0265_01 +scene0265_02 +scene0004_00 +scene0336_00 +scene0336_01 +scene0058_00 +scene0058_01 +scene0260_00 +scene0260_01 +scene0260_02 +scene0243_00 +scene0603_00 +scene0603_01 +scene0093_00 +scene0093_01 +scene0093_02 +scene0109_00 +scene0109_01 +scene0434_00 +scene0434_01 +scene0434_02 +scene0290_00 +scene0627_00 +scene0627_01 +scene0470_00 +scene0470_01 +scene0137_00 +scene0137_01 +scene0137_02 +scene0270_00 +scene0270_01 +scene0270_02 +scene0271_00 +scene0271_01 +scene0504_00 +scene0274_00 +scene0274_01 +scene0274_02 +scene0036_00 +scene0036_01 +scene0276_00 +scene0276_01 +scene0272_00 +scene0272_01 +scene0499_00 +scene0698_00 +scene0698_01 +scene0051_00 
+scene0051_01 +scene0051_02 +scene0051_03 +scene0108_00 +scene0245_00 +scene0369_00 +scene0369_01 +scene0369_02 +scene0284_00 +scene0289_00 +scene0289_01 +scene0286_00 +scene0286_01 +scene0286_02 +scene0286_03 +scene0031_00 +scene0031_01 +scene0031_02 +scene0545_00 +scene0545_01 +scene0545_02 +scene0557_00 +scene0557_01 +scene0557_02 +scene0533_00 +scene0533_01 +scene0116_00 +scene0116_01 +scene0116_02 +scene0611_00 +scene0611_01 +scene0688_00 +scene0294_00 +scene0294_01 +scene0294_02 +scene0295_00 +scene0295_01 +scene0296_00 +scene0296_01 +scene0596_00 +scene0596_01 +scene0596_02 +scene0532_00 +scene0532_01 +scene0637_00 +scene0638_00 +scene0121_00 +scene0121_01 +scene0121_02 +scene0040_00 +scene0040_01 +scene0197_00 +scene0197_01 +scene0197_02 +scene0410_00 +scene0410_01 +scene0305_00 +scene0305_01 +scene0615_00 +scene0615_01 +scene0703_00 +scene0703_01 +scene0555_00 +scene0297_00 +scene0297_01 +scene0297_02 +scene0582_00 +scene0582_01 +scene0582_02 +scene0023_00 +scene0094_00 +scene0013_00 +scene0013_01 +scene0013_02 +scene0136_00 +scene0136_01 +scene0136_02 +scene0407_00 +scene0407_01 +scene0062_00 +scene0062_01 +scene0062_02 +scene0386_00 +scene0318_00 +scene0554_00 +scene0554_01 +scene0497_00 +scene0213_00 +scene0258_00 +scene0323_00 +scene0323_01 +scene0324_00 +scene0324_01 +scene0016_00 +scene0016_01 +scene0016_02 +scene0681_00 +scene0398_00 +scene0398_01 +scene0227_00 +scene0090_00 +scene0066_00 +scene0262_00 +scene0262_01 +scene0155_00 +scene0155_01 +scene0155_02 +scene0352_00 +scene0352_01 +scene0352_02 +scene0038_00 +scene0038_01 +scene0038_02 +scene0335_00 +scene0335_01 +scene0335_02 +scene0261_00 +scene0261_01 +scene0261_02 +scene0261_03 +scene0640_00 +scene0640_01 +scene0640_02 +scene0080_00 +scene0080_01 +scene0080_02 +scene0403_00 +scene0403_01 +scene0282_00 +scene0282_01 +scene0282_02 +scene0682_00 +scene0173_00 +scene0173_01 +scene0173_02 +scene0522_00 +scene0687_00 +scene0345_00 +scene0345_01 +scene0612_00 +scene0612_01 +scene0411_00 
+scene0411_01 +scene0411_02 +scene0625_00 +scene0625_01 +scene0211_00 +scene0211_01 +scene0211_02 +scene0211_03 +scene0676_00 +scene0676_01 +scene0179_00 +scene0498_00 +scene0498_01 +scene0498_02 +scene0547_00 +scene0547_01 +scene0547_02 +scene0269_00 +scene0269_01 +scene0269_02 +scene0366_00 +scene0680_00 +scene0680_01 +scene0588_00 +scene0588_01 +scene0588_02 +scene0588_03 +scene0346_00 +scene0346_01 +scene0359_00 +scene0359_01 +scene0014_00 +scene0120_00 +scene0120_01 +scene0212_00 +scene0212_01 +scene0212_02 +scene0176_00 +scene0049_00 +scene0259_00 +scene0259_01 +scene0586_00 +scene0586_01 +scene0586_02 +scene0309_00 +scene0309_01 +scene0125_00 +scene0455_00 +scene0177_00 +scene0177_01 +scene0177_02 +scene0326_00 +scene0372_00 +scene0171_00 +scene0171_01 +scene0374_00 +scene0654_00 +scene0654_01 +scene0445_00 +scene0445_01 +scene0475_00 +scene0475_01 +scene0475_02 +scene0349_00 +scene0349_01 +scene0234_00 +scene0669_00 +scene0669_01 +scene0375_00 +scene0375_01 +scene0375_02 +scene0387_00 +scene0387_01 +scene0387_02 +scene0312_00 +scene0312_01 +scene0312_02 +scene0384_00 +scene0385_00 +scene0385_01 +scene0385_02 +scene0000_00 +scene0000_01 +scene0000_02 +scene0376_00 +scene0376_01 +scene0376_02 +scene0301_00 +scene0301_01 +scene0301_02 +scene0322_00 +scene0542_00 +scene0079_00 +scene0079_01 +scene0099_00 +scene0099_01 +scene0476_00 +scene0476_01 +scene0476_02 +scene0394_00 +scene0394_01 +scene0147_00 +scene0147_01 +scene0067_00 +scene0067_01 +scene0067_02 +scene0397_00 +scene0397_01 +scene0337_00 +scene0337_01 +scene0337_02 +scene0431_00 +scene0223_00 +scene0223_01 +scene0223_02 +scene0010_00 +scene0010_01 +scene0402_00 +scene0268_00 +scene0268_01 +scene0268_02 +scene0679_00 +scene0679_01 +scene0405_00 +scene0128_00 +scene0408_00 +scene0408_01 +scene0190_00 +scene0107_00 +scene0076_00 +scene0167_00 +scene0361_00 +scene0361_01 +scene0361_02 +scene0216_00 +scene0202_00 +scene0303_00 +scene0303_01 +scene0303_02 +scene0446_00 +scene0446_01 +scene0089_00 
+scene0089_01 +scene0089_02 +scene0360_00 +scene0150_00 +scene0150_01 +scene0150_02 +scene0421_00 +scene0421_01 +scene0421_02 +scene0454_00 +scene0626_00 +scene0626_01 +scene0626_02 +scene0186_00 +scene0186_01 +scene0538_00 +scene0479_00 +scene0479_01 +scene0479_02 +scene0656_00 +scene0656_01 +scene0656_02 +scene0656_03 +scene0525_00 +scene0525_01 +scene0525_02 +scene0308_00 +scene0396_00 +scene0396_01 +scene0396_02 +scene0624_00 +scene0292_00 +scene0292_01 +scene0632_00 +scene0253_00 +scene0021_00 +scene0325_00 +scene0325_01 +scene0437_00 +scene0437_01 +scene0438_00 +scene0590_00 +scene0590_01 +scene0400_00 +scene0400_01 +scene0541_00 +scene0541_01 +scene0541_02 +scene0677_00 +scene0677_01 +scene0677_02 +scene0443_00 +scene0315_00 +scene0288_00 +scene0288_01 +scene0288_02 +scene0422_00 +scene0672_00 +scene0672_01 +scene0184_00 +scene0449_00 +scene0449_01 +scene0449_02 +scene0048_00 +scene0048_01 +scene0138_00 +scene0452_00 +scene0452_01 +scene0452_02 +scene0667_00 +scene0667_01 +scene0667_02 +scene0463_00 +scene0463_01 +scene0078_00 +scene0078_01 +scene0078_02 +scene0636_00 +scene0457_00 +scene0457_01 +scene0457_02 +scene0465_00 +scene0465_01 +scene0577_00 +scene0151_00 +scene0151_01 +scene0339_00 +scene0573_00 +scene0573_01 +scene0154_00 +scene0096_00 +scene0096_01 +scene0096_02 +scene0235_00 +scene0168_00 +scene0168_01 +scene0168_02 +scene0594_00 +scene0587_00 +scene0587_01 +scene0587_02 +scene0587_03 +scene0229_00 +scene0229_01 +scene0229_02 +scene0512_00 +scene0106_00 +scene0106_01 +scene0106_02 +scene0472_00 +scene0472_01 +scene0472_02 +scene0489_00 +scene0489_01 +scene0489_02 +scene0425_00 +scene0425_01 +scene0641_00 +scene0526_00 +scene0526_01 +scene0317_00 +scene0317_01 +scene0544_00 +scene0017_00 +scene0017_01 +scene0017_02 +scene0042_00 +scene0042_01 +scene0042_02 +scene0576_00 +scene0576_01 +scene0576_02 +scene0347_00 +scene0347_01 +scene0347_02 +scene0436_00 +scene0226_00 +scene0226_01 +scene0485_00 +scene0486_00 +scene0487_00 +scene0487_01 
+scene0619_00 +scene0097_00 +scene0367_00 +scene0367_01 +scene0491_00 +scene0492_00 +scene0492_01 +scene0005_00 +scene0005_01 +scene0543_00 +scene0543_01 +scene0543_02 +scene0657_00 +scene0341_00 +scene0341_01 +scene0534_00 +scene0534_01 +scene0319_00 +scene0273_00 +scene0273_01 +scene0225_00 +scene0198_00 +scene0003_00 +scene0003_01 +scene0003_02 +scene0409_00 +scene0409_01 +scene0331_00 +scene0331_01 +scene0505_00 +scene0505_01 +scene0505_02 +scene0505_03 +scene0505_04 +scene0506_00 +scene0057_00 +scene0057_01 +scene0074_00 +scene0074_01 +scene0074_02 +scene0091_00 +scene0112_00 +scene0112_01 +scene0112_02 +scene0240_00 +scene0102_00 +scene0102_01 +scene0513_00 +scene0514_00 +scene0514_01 +scene0537_00 +scene0516_00 +scene0516_01 +scene0495_00 +scene0617_00 +scene0133_00 +scene0520_00 +scene0520_01 +scene0635_00 +scene0635_01 +scene0054_00 +scene0473_00 +scene0473_01 +scene0524_00 +scene0524_01 +scene0379_00 +scene0471_00 +scene0471_01 +scene0471_02 +scene0566_00 +scene0248_00 +scene0248_01 +scene0248_02 +scene0529_00 +scene0529_01 +scene0529_02 +scene0391_00 +scene0264_00 +scene0264_01 +scene0264_02 +scene0675_00 +scene0675_01 +scene0350_00 +scene0350_01 +scene0350_02 +scene0450_00 +scene0068_00 +scene0068_01 +scene0237_00 +scene0237_01 +scene0365_00 +scene0365_01 +scene0365_02 +scene0605_00 +scene0605_01 +scene0539_00 +scene0539_01 +scene0539_02 +scene0540_00 +scene0540_01 +scene0540_02 +scene0170_00 +scene0170_01 +scene0170_02 +scene0433_00 +scene0340_00 +scene0340_01 +scene0340_02 +scene0160_00 +scene0160_01 +scene0160_02 +scene0160_03 +scene0160_04 +scene0059_00 +scene0059_01 +scene0059_02 +scene0056_00 +scene0056_01 +scene0478_00 +scene0478_01 +scene0548_00 +scene0548_01 +scene0548_02 +scene0204_00 +scene0204_01 +scene0204_02 +scene0033_00 +scene0145_00 +scene0483_00 +scene0508_00 +scene0508_01 +scene0508_02 +scene0180_00 +scene0148_00 +scene0556_00 +scene0556_01 +scene0416_00 +scene0416_01 +scene0416_02 +scene0416_03 +scene0416_04 +scene0073_00 
+scene0073_01 +scene0073_02 +scene0073_03 +scene0034_00 +scene0034_01 +scene0034_02 +scene0639_00 +scene0561_00 +scene0561_01 +scene0298_00 +scene0692_00 +scene0692_01 +scene0692_02 +scene0692_03 +scene0692_04 +scene0642_00 +scene0642_01 +scene0642_02 +scene0642_03 +scene0630_00 +scene0630_01 +scene0630_02 +scene0630_03 +scene0630_04 +scene0630_05 +scene0630_06 +scene0706_00 +scene0567_00 +scene0567_01 diff --git a/datasets/scannet_preprocess/meta_data/scannetv2_val.txt b/datasets/scannet_preprocess/meta_data/scannetv2_val.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9e7d9205321e8ca047a527466f4b7100c9c9d2c --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2_val.txt @@ -0,0 +1,312 @@ +scene0568_00 +scene0568_01 +scene0568_02 +scene0304_00 +scene0488_00 +scene0488_01 +scene0412_00 +scene0412_01 +scene0217_00 +scene0019_00 +scene0019_01 +scene0414_00 +scene0575_00 +scene0575_01 +scene0575_02 +scene0426_00 +scene0426_01 +scene0426_02 +scene0426_03 +scene0549_00 +scene0549_01 +scene0578_00 +scene0578_01 +scene0578_02 +scene0665_00 +scene0665_01 +scene0050_00 +scene0050_01 +scene0050_02 +scene0257_00 +scene0025_00 +scene0025_01 +scene0025_02 +scene0583_00 +scene0583_01 +scene0583_02 +scene0701_00 +scene0701_01 +scene0701_02 +scene0580_00 +scene0580_01 +scene0565_00 +scene0169_00 +scene0169_01 +scene0655_00 +scene0655_01 +scene0655_02 +scene0063_00 +scene0221_00 +scene0221_01 +scene0591_00 +scene0591_01 +scene0591_02 +scene0678_00 +scene0678_01 +scene0678_02 +scene0462_00 +scene0427_00 +scene0595_00 +scene0193_00 +scene0193_01 +scene0164_00 +scene0164_01 +scene0164_02 +scene0164_03 +scene0598_00 +scene0598_01 +scene0598_02 +scene0599_00 +scene0599_01 +scene0599_02 +scene0328_00 +scene0300_00 +scene0300_01 +scene0354_00 +scene0458_00 +scene0458_01 +scene0423_00 +scene0423_01 +scene0423_02 +scene0307_00 +scene0307_01 +scene0307_02 +scene0606_00 +scene0606_01 +scene0606_02 +scene0432_00 +scene0432_01 +scene0608_00 +scene0608_01 
+scene0608_02 +scene0651_00 +scene0651_01 +scene0651_02 +scene0430_00 +scene0430_01 +scene0689_00 +scene0357_00 +scene0357_01 +scene0574_00 +scene0574_01 +scene0574_02 +scene0329_00 +scene0329_01 +scene0329_02 +scene0153_00 +scene0153_01 +scene0616_00 +scene0616_01 +scene0671_00 +scene0671_01 +scene0618_00 +scene0382_00 +scene0382_01 +scene0490_00 +scene0621_00 +scene0607_00 +scene0607_01 +scene0149_00 +scene0695_00 +scene0695_01 +scene0695_02 +scene0695_03 +scene0389_00 +scene0377_00 +scene0377_01 +scene0377_02 +scene0342_00 +scene0139_00 +scene0629_00 +scene0629_01 +scene0629_02 +scene0496_00 +scene0633_00 +scene0633_01 +scene0518_00 +scene0652_00 +scene0406_00 +scene0406_01 +scene0406_02 +scene0144_00 +scene0144_01 +scene0494_00 +scene0278_00 +scene0278_01 +scene0316_00 +scene0609_00 +scene0609_01 +scene0609_02 +scene0609_03 +scene0084_00 +scene0084_01 +scene0084_02 +scene0696_00 +scene0696_01 +scene0696_02 +scene0351_00 +scene0351_01 +scene0643_00 +scene0644_00 +scene0645_00 +scene0645_01 +scene0645_02 +scene0081_00 +scene0081_01 +scene0081_02 +scene0647_00 +scene0647_01 +scene0535_00 +scene0353_00 +scene0353_01 +scene0353_02 +scene0559_00 +scene0559_01 +scene0559_02 +scene0593_00 +scene0593_01 +scene0246_00 +scene0653_00 +scene0653_01 +scene0064_00 +scene0064_01 +scene0356_00 +scene0356_01 +scene0356_02 +scene0030_00 +scene0030_01 +scene0030_02 +scene0222_00 +scene0222_01 +scene0338_00 +scene0338_01 +scene0338_02 +scene0378_00 +scene0378_01 +scene0378_02 +scene0660_00 +scene0553_00 +scene0553_01 +scene0553_02 +scene0527_00 +scene0663_00 +scene0663_01 +scene0663_02 +scene0664_00 +scene0664_01 +scene0664_02 +scene0334_00 +scene0334_01 +scene0334_02 +scene0046_00 +scene0046_01 +scene0046_02 +scene0203_00 +scene0203_01 +scene0203_02 +scene0088_00 +scene0088_01 +scene0088_02 +scene0088_03 +scene0086_00 +scene0086_01 +scene0086_02 +scene0670_00 +scene0670_01 +scene0256_00 +scene0256_01 +scene0256_02 +scene0249_00 +scene0441_00 +scene0658_00 +scene0704_00 
+scene0704_01 +scene0187_00 +scene0187_01 +scene0131_00 +scene0131_01 +scene0131_02 +scene0207_00 +scene0207_01 +scene0207_02 +scene0461_00 +scene0011_00 +scene0011_01 +scene0343_00 +scene0251_00 +scene0077_00 +scene0077_01 +scene0684_00 +scene0684_01 +scene0550_00 +scene0686_00 +scene0686_01 +scene0686_02 +scene0208_00 +scene0500_00 +scene0500_01 +scene0552_00 +scene0552_01 +scene0648_00 +scene0648_01 +scene0435_00 +scene0435_01 +scene0435_02 +scene0435_03 +scene0690_00 +scene0690_01 +scene0693_00 +scene0693_01 +scene0693_02 +scene0700_00 +scene0700_01 +scene0700_02 +scene0699_00 +scene0231_00 +scene0231_01 +scene0231_02 +scene0697_00 +scene0697_01 +scene0697_02 +scene0697_03 +scene0474_00 +scene0474_01 +scene0474_02 +scene0474_03 +scene0474_04 +scene0474_05 +scene0355_00 +scene0355_01 +scene0146_00 +scene0146_01 +scene0146_02 +scene0196_00 +scene0702_00 +scene0702_01 +scene0702_02 +scene0314_00 +scene0277_00 +scene0277_01 +scene0277_02 +scene0095_00 +scene0095_01 +scene0015_00 +scene0100_00 +scene0100_01 +scene0100_02 +scene0558_00 +scene0558_01 +scene0558_02 +scene0685_00 +scene0685_01 +scene0685_02 diff --git a/datasets/scannet_preprocess/prepare_2d_data/SensorData.py b/datasets/scannet_preprocess/prepare_2d_data/SensorData.py new file mode 100644 index 0000000000000000000000000000000000000000..dec0d116a63d4ea55a72979266a75e9be89a9fc9 --- /dev/null +++ b/datasets/scannet_preprocess/prepare_2d_data/SensorData.py @@ -0,0 +1,121 @@ + +import os, struct +import numpy as np +import zlib +import imageio +import cv2 + +COMPRESSION_TYPE_COLOR = {-1:'unknown', 0:'raw', 1:'png', 2:'jpeg'} +COMPRESSION_TYPE_DEPTH = {-1:'unknown', 0:'raw_ushort', 1:'zlib_ushort', 2:'occi_ushort'} + +class RGBDFrame(): + + def load(self, file_handle): + self.camera_to_world = np.asarray(struct.unpack('f'*16, file_handle.read(16*4)), dtype=np.float32).reshape(4, 4) + self.timestamp_color = struct.unpack('Q', file_handle.read(8))[0] + self.timestamp_depth = struct.unpack('Q', 
file_handle.read(8))[0] + self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.color_data = b''.join(struct.unpack('c'*self.color_size_bytes, file_handle.read(self.color_size_bytes))) + self.depth_data = b''.join(struct.unpack('c'*self.depth_size_bytes, file_handle.read(self.depth_size_bytes))) + + + def decompress_depth(self, compression_type): + if compression_type == 'zlib_ushort': + return self.decompress_depth_zlib() + else: + raise + + + def decompress_depth_zlib(self): + return zlib.decompress(self.depth_data) + + + def decompress_color(self, compression_type): + if compression_type == 'jpeg': + return self.decompress_color_jpeg() + else: + raise + + + def decompress_color_jpeg(self): + return imageio.imread(self.color_data) + + +class SensorData: + + def __init__(self, filename): + self.version = 4 + self.load(filename) + + + def load(self, filename): + with open(filename, 'rb') as f: + version = struct.unpack('I', f.read(4))[0] + assert self.version == version + strlen = struct.unpack('Q', f.read(8))[0] + self.sensor_name = b''.join(struct.unpack('c'*strlen, f.read(strlen))) + self.intrinsic_color = np.asarray(struct.unpack('f'*16, f.read(16*4)), dtype=np.float32).reshape(4, 4) + self.extrinsic_color = np.asarray(struct.unpack('f'*16, f.read(16*4)), dtype=np.float32).reshape(4, 4) + self.intrinsic_depth = np.asarray(struct.unpack('f'*16, f.read(16*4)), dtype=np.float32).reshape(4, 4) + self.extrinsic_depth = np.asarray(struct.unpack('f'*16, f.read(16*4)), dtype=np.float32).reshape(4, 4) + self.color_compression_type = COMPRESSION_TYPE_COLOR[struct.unpack('i', f.read(4))[0]] + self.depth_compression_type = COMPRESSION_TYPE_DEPTH[struct.unpack('i', f.read(4))[0]] + self.color_width = struct.unpack('I', f.read(4))[0] + self.color_height = struct.unpack('I', f.read(4))[0] + self.depth_width = struct.unpack('I', f.read(4))[0] + self.depth_height = struct.unpack('I', 
f.read(4))[0] + self.depth_shift = struct.unpack('f', f.read(4))[0] + num_frames = struct.unpack('Q', f.read(8))[0] + self.frames = [] + for i in range(num_frames): + frame = RGBDFrame() + frame.load(f) + self.frames.append(frame) + + + def export_depth_images(self, output_path, image_size=None, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + # print 'exporting', len(self.frames)//frame_skip, ' depth frames to', output_path + for f in range(0, len(self.frames), frame_skip): + depth_data = self.frames[f].decompress_depth(self.depth_compression_type) + depth = np.fromstring(depth_data, dtype=np.uint16).reshape(self.depth_height, self.depth_width) + if image_size is not None: + depth = cv2.resize(depth, (image_size[1], image_size[0]), interpolation=cv2.INTER_NEAREST) + imageio.imwrite(os.path.join(output_path, str(f) + '.png'), depth) + + + def export_color_images(self, output_path, image_size=None, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + # print 'exporting', len(self.frames)//frame_skip, 'color frames to', output_path + for f in range(0, len(self.frames), frame_skip): + color = self.frames[f].decompress_color(self.color_compression_type) + if image_size is not None: + color = cv2.resize(color, (image_size[1], image_size[0]), interpolation=cv2.INTER_NEAREST) + imageio.imwrite(os.path.join(output_path, str(f) + '.jpg'), color) + + + def save_mat_to_file(self, matrix, filename): + with open(filename, 'w') as f: + for line in matrix: + np.savetxt(f, line[np.newaxis], fmt='%f') + + + def export_poses(self, output_path, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + # print 'exporting', len(self.frames)//frame_skip, 'camera poses to', output_path + for f in range(0, len(self.frames), frame_skip): + self.save_mat_to_file(self.frames[f].camera_to_world, os.path.join(output_path, str(f) + '.txt')) + + + def export_intrinsics(self, output_path): + if not 
os.path.exists(output_path): + os.makedirs(output_path) + # print 'exporting camera intrinsics to', output_path + self.save_mat_to_file(self.intrinsic_color, os.path.join(output_path, 'intrinsic_color.txt')) + self.save_mat_to_file(self.extrinsic_color, os.path.join(output_path, 'extrinsic_color.txt')) + self.save_mat_to_file(self.intrinsic_depth, os.path.join(output_path, 'intrinsic_depth.txt')) + self.save_mat_to_file(self.extrinsic_depth, os.path.join(output_path, 'extrinsic_depth.txt')) \ No newline at end of file diff --git a/datasets/scannet_preprocess/prepare_2d_data/prepare_2d_data.py b/datasets/scannet_preprocess/prepare_2d_data/prepare_2d_data.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc37236cefe0201d89be8eb8feb9d6369505446 --- /dev/null +++ b/datasets/scannet_preprocess/prepare_2d_data/prepare_2d_data.py @@ -0,0 +1,123 @@ +# pre-process ScanNet 2D data +# note: depends on the sens file reader from ScanNet: +# https://github.com/ScanNet/ScanNet/blob/master/SensReader/python/SensorData.py +# if export_label_images flag is on: +# - depends on https://github.com/ScanNet/ScanNet/tree/master/BenchmarkScripts/util.py +# - also assumes that label images are unzipped as scene*/label*/*.png +# expected file structure: +# - prepare_2d_data.py +# - https://github.com/ScanNet/ScanNet/tree/master/BenchmarkScripts/util.py +# - https://github.com/ScanNet/ScanNet/blob/master/SensReader/python/SensorData.py +# +# example usage: +# python prepare_2d_data.py --scannet_path data/scannetv2 --output_path data/scannetv2_images --export_label_images + +import argparse +import os, sys +import numpy as np +import skimage.transform as sktf +import imageio +from SensorData import SensorData +import util +# try: +# from prepare_2d_data.SensorData import SensorData +# except: +# print('Failed to import SensorData (from ScanNet code toolbox)') +# sys.exit(-1) +# try: +# from prepare_2d_data import util +# except: +# print('Failed to import ScanNet code 
toolbox util') +# sys.exit(-1) + +# params +parser = argparse.ArgumentParser() +parser.add_argument('--scannet_path', required=True, help='path to scannet data') +parser.add_argument('--output_path', required=True, help='where to output 2d data') +parser.add_argument('--export_label_images', dest='export_label_images', action='store_true') +parser.add_argument('--label_type', default='label-filt', help='which labels (label or label-filt)') +parser.add_argument('--frame_skip', type=int, default=20, help='export every nth frame') +parser.add_argument('--label_map_file', default='scannet-preprocess/meta_data/scannetv2-labels.combined.tsv', + help='path to scannetv2-labels.combined.tsv (required for label export only)') +parser.add_argument('--output_image_width', type=int, default=640, help='export image width') +parser.add_argument('--output_image_height', type=int, default=480, help='export image height') + +parser.set_defaults(export_label_images=False) +opt = parser.parse_args() +if opt.export_label_images: + assert opt.label_map_file != '' +print(opt) + + +def print_error(message): + sys.stderr.write('ERROR: ' + str(message) + '\n') + sys.exit(-1) + + +# from https://github.com/ScanNet/ScanNet/tree/master/BenchmarkScripts/2d_helpers/convert_scannet_label_image.py +def map_label_image(image, label_mapping): + mapped = np.copy(image) + for k, v in label_mapping.iteritems(): + mapped[image == k] = v + return mapped.astype(np.uint8) + + +def main(): + if not os.path.exists(opt.output_path): + os.makedirs(opt.output_path) + + label_mapping = None + if opt.export_label_images: + label_map = util.read_label_mapping(opt.label_map_file, label_from='id', label_to='nyu40id') + + scenes = [d for d in os.listdir(opt.scannet_path) if os.path.isdir(os.path.join(opt.scannet_path, d))] + print('Found %d scenes' % len(scenes)) + for i in range(0,len(scenes)): + if scenes[i] != 'scene0000_00': continue + sens_file = os.path.join(opt.scannet_path, scenes[i], scenes[i] + '.sens') + 
label_path = os.path.join(opt.scannet_path, scenes[i], opt.label_type) + if opt.export_label_images and not os.path.isdir(label_path): + print_error('Error: using export_label_images option but label path %s does not exist' % label_path) + output_color_path = os.path.join(opt.output_path, scenes[i], 'color') + if not os.path.isdir(output_color_path): + os.makedirs(output_color_path) + output_depth_path = os.path.join(opt.output_path, scenes[i], 'depth') + if not os.path.isdir(output_depth_path): + os.makedirs(output_depth_path) + output_pose_path = os.path.join(opt.output_path, scenes[i], 'pose') + if not os.path.isdir(output_pose_path): + os.makedirs(output_pose_path) + output_label_path = os.path.join(opt.output_path, scenes[i], 'label') + if opt.export_label_images and not os.path.isdir(output_label_path): + os.makedirs(output_label_path) + output_intrinsics_path = os.path.join(opt.output_path, scenes[i], 'intrinsics') + if opt.export_label_images and not os.path.isdir(output_label_path): + os.makedirs(output_label_path) + + # read and export + sys.stdout.write('\r[ %d | %d ] %s\tloading...' % ((i + 1), len(scenes), scenes[i])) + sys.stdout.flush() + sd = SensorData(sens_file) + sys.stdout.write('\r[ %d | %d ] %s\texporting...' 
% ((i + 1), len(scenes), scenes[i])) + sys.stdout.flush() + sd.export_color_images(output_color_path, image_size=[opt.output_image_height, opt.output_image_width], + frame_skip=opt.frame_skip) + sd.export_depth_images(output_depth_path, image_size=[opt.output_image_height, opt.output_image_width], + frame_skip=opt.frame_skip) + sd.export_poses(output_pose_path, frame_skip=opt.frame_skip) + sd.export_intrinsics(output_intrinsics_path) + + if opt.export_label_images: + + for f in range(0, len(sd.frames), opt.frame_skip): + label_file = os.path.join(label_path, str(f) + '.png') + image = np.array(imageio.imread(label_file)) + image = sktf.resize(image, [opt.output_image_height, opt.output_image_width], order=0, + preserve_range=True) + mapped_image = map_label_image(image, label_map) + imageio.imwrite(os.path.join(output_label_path, str(f) + '.png'), mapped_image) + print('') + + +if __name__ == '__main__': + main() diff --git a/datasets/scannet_preprocess/prepare_2d_data/util.py b/datasets/scannet_preprocess/prepare_2d_data/util.py new file mode 100644 index 0000000000000000000000000000000000000000..0b781c2559a62e24df5859e462b90eac8d894d0b --- /dev/null +++ b/datasets/scannet_preprocess/prepare_2d_data/util.py @@ -0,0 +1,127 @@ +import os, sys +import csv + +try: + import numpy as np +except: + # print "Failed to import numpy package." 
+ sys.exit(-1) +try: + import imageio +except: + print("Please install the module 'imageio' for image processing, e.g.") + print("pip install imageio") + sys.exit(-1) + + +# print an error message and quit +def print_error(message, user_fault=False): + sys.stderr.write('ERROR: ' + str(message) + '\n') + if user_fault: + sys.exit(2) + sys.exit(-1) + + +# if string s represents an int +def represents_int(s): + try: + int(s) + return True + except ValueError: + return False + + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert os.path.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = int(row[label_to]) + # if ints convert + if represents_int(list(mapping.keys())[0]): + mapping = {int(k): v for k, v in mapping.items()} + return mapping + + +# input: scene_types.txt or scene_types_all.txt +def read_scene_types_mapping(filename, remove_spaces=True): + assert os.path.isfile(filename) + mapping = dict() + lines = open(filename).read().splitlines() + lines = [line.split('\t') for line in lines] + if remove_spaces: + mapping = {x[1].strip(): int(x[0]) for x in lines} + else: + mapping = {x[1]: int(x[0]) for x in lines} + return mapping + + +# color by label +def visualize_label_image(filename, image): + height = image.shape[0] + width = image.shape[1] + vis_image = np.zeros([height, width, 3], dtype=np.uint8) + color_palette = create_color_palette() + for idx, color in enumerate(color_palette): + vis_image[image == idx] = color + imageio.imwrite(filename, vis_image) + + +# color by different instances (mod length of color palette) +def visualize_instance_image(filename, image): + height = image.shape[0] + width = image.shape[1] + vis_image = np.zeros([height, width, 3], dtype=np.uint8) + color_palette = create_color_palette() + instances = np.unique(image) + for idx, inst in enumerate(instances): + vis_image[image 
== inst] = color_palette[inst % len(color_palette)] + imageio.imwrite(filename, vis_image) + + +# color palette for nyu40 labels +def create_color_palette(): + return [ + (0, 0, 0), + (174, 199, 232), # wall + (152, 223, 138), # floor + (31, 119, 180), # cabinet + (255, 187, 120), # bed + (188, 189, 34), # chair + (140, 86, 75), # sofa + (255, 152, 150), # table + (214, 39, 40), # door + (197, 176, 213), # window + (148, 103, 189), # bookshelf + (196, 156, 148), # picture + (23, 190, 207), # counter + (178, 76, 76), + (247, 182, 210), # desk + (66, 188, 102), + (219, 219, 141), # curtain + (140, 57, 197), + (202, 185, 52), + (51, 176, 203), + (200, 54, 131), + (92, 193, 61), + (78, 71, 183), + (172, 114, 82), + (255, 127, 14), # refrigerator + (91, 163, 138), + (153, 98, 156), + (140, 153, 101), + (158, 218, 229), # shower curtain + (100, 125, 154), + (178, 127, 135), + (120, 185, 128), + (146, 111, 194), + (44, 160, 44), # toilet + (112, 128, 144), # sink + (96, 207, 209), + (227, 119, 194), # bathtub + (213, 92, 176), + (94, 106, 211), + (82, 84, 163), # otherfurn + (100, 85, 144) + ] diff --git a/datasets/scannet_preprocess/preprocess_scannet.py b/datasets/scannet_preprocess/preprocess_scannet.py new file mode 100644 index 0000000000000000000000000000000000000000..714a18d9ceeb7825824ce1c8e0f81d7e33969894 --- /dev/null +++ b/datasets/scannet_preprocess/preprocess_scannet.py @@ -0,0 +1,215 @@ +""" +Preprocessing Script for ScanNet 20/200 + +Author: Xiaoyang Wu (xiaoyang.wu.cs@gmail.com) +Please cite our work if the code is helpful to you. 
+""" + +import warnings + +import torch + +warnings.filterwarnings("ignore", category=DeprecationWarning) + +import sys +import os +import argparse +import glob +import json +import plyfile +import numpy as np +import pandas as pd +import multiprocessing as mp +from concurrent.futures import ProcessPoolExecutor +from itertools import repeat + +# Load external constants +from meta_data.scannet200_constants import VALID_CLASS_IDS_200, VALID_CLASS_IDS_20 + +CLOUD_FILE_PFIX = '_vh_clean_2' +SEGMENTS_FILE_PFIX = '.0.010000.segs.json' +AGGREGATIONS_FILE_PFIX = '.aggregation.json' +CLASS_IDS200 = VALID_CLASS_IDS_200 +CLASS_IDS20 = VALID_CLASS_IDS_20 +IGNORE_INDEX = -1 + + +def read_plymesh(filepath): + """Read ply file and return it as numpy array. Returns None if emtpy.""" + with open(filepath, 'rb') as f: + plydata = plyfile.PlyData.read(f) + if plydata.elements: + vertices = pd.DataFrame(plydata['vertex'].data).values + faces = np.stack(plydata['face'].data['vertex_indices'], axis=0) + return vertices, faces + + +# Map the raw category id to the point cloud +def point_indices_from_group(seg_indices, group, labels_pd): + group_segments = np.array(group['segments']) + label = group['label'] + + # Map the category name to id + label_id20 = labels_pd[labels_pd['raw_category'] == label]['nyu40id'] + label_id20 = int(label_id20.iloc[0]) if len(label_id20) > 0 else 0 + label_id200 = labels_pd[labels_pd['raw_category'] == label]['id'] + label_id200 = int(label_id200.iloc[0]) if len(label_id200) > 0 else 0 + + # Only store for the valid categories + if label_id20 in CLASS_IDS20: + label_id20 = CLASS_IDS20.index(label_id20) + else: + label_id20 = IGNORE_INDEX + + if label_id200 in CLASS_IDS200: + label_id200 = CLASS_IDS200.index(label_id200) + else: + label_id200 = IGNORE_INDEX + + # get points, where segment indices (points labelled with segment ids) are in the group segment list + point_idx = np.where(np.isin(seg_indices, group_segments))[0] + return point_idx, label_id20, 
label_id200 + + +def face_normal(vertex, face): + v01 = vertex[face[:, 1]] - vertex[face[:, 0]] + v02 = vertex[face[:, 2]] - vertex[face[:, 0]] + vec = np.cross(v01, v02) + length = np.sqrt(np.sum(vec ** 2, axis=1, keepdims=True)) + 1.0e-8 + nf = vec / length + area = length * 0.5 + return nf, area + + +def vertex_normal(vertex, face): + nf, area = face_normal(vertex, face) + nf = nf * area + + nv = np.zeros_like(vertex) + for i in range(face.shape[0]): + nv[face[i]] += nf[i] + + length = np.sqrt(np.sum(nv ** 2, axis=1, keepdims=True)) + 1.0e-8 + nv = nv / length + return nv + + +def handle_process(scene_path, output_path, labels_pd, train_scenes, val_scenes, parse_normals=True): + scene_id = os.path.basename(scene_path) + mesh_path = os.path.join(scene_path, f'{scene_id}{CLOUD_FILE_PFIX}.ply') + segments_file = os.path.join(scene_path, f'{scene_id}{CLOUD_FILE_PFIX}{SEGMENTS_FILE_PFIX}') + aggregations_file = os.path.join(scene_path, f'{scene_id}{AGGREGATIONS_FILE_PFIX}') + info_file = os.path.join(scene_path, f'{scene_id}.txt') + + if scene_id in train_scenes: + output_file = os.path.join(output_path, 'train', f'{scene_id}.pth') + split_name = 'train' + elif scene_id in val_scenes: + output_file = os.path.join(output_path, 'val', f'{scene_id}.pth') + split_name = 'val' + else: + output_file = os.path.join(output_path, 'test', f'{scene_id}.pth') + split_name = 'test' + + print(f'Processing: {scene_id} in {split_name}') + + vertices, faces = read_plymesh(mesh_path) + coords = vertices[:, :3] + colors = vertices[:, 3:6] + save_dict = dict(coord=coords, color=colors, scene_id=scene_id) + + # # Rotating the mesh to axis aligned + # info_dict = {} + # with open(info_file) as f: + # for line in f: + # (key, val) = line.split(" = ") + # info_dict[key] = np.fromstring(val, sep=' ') + # + # if 'axisAlignment' not in info_dict: + # rot_matrix = np.identity(4) + # else: + # rot_matrix = info_dict['axisAlignment'].reshape(4, 4) + # r_coords = coords.transpose() + # r_coords = 
np.append(r_coords, np.ones((1, r_coords.shape[1])), axis=0) + # r_coords = np.dot(rot_matrix, r_coords) + # coords = r_coords + + # Parse Normals + if parse_normals: + save_dict["normal"] = vertex_normal(coords, faces) + + # Load segments file + if split_name != "test": + with open(segments_file) as f: + segments = json.load(f) + seg_indices = np.array(segments['segIndices']) + + # Load Aggregations file + with open(aggregations_file) as f: + aggregation = json.load(f) + seg_groups = np.array(aggregation['segGroups']) + + # Generate new labels + semantic_gt20 = np.ones((vertices.shape[0])) * IGNORE_INDEX + semantic_gt200 = np.ones((vertices.shape[0])) * IGNORE_INDEX + instance_ids = np.ones((vertices.shape[0])) * IGNORE_INDEX + for group in seg_groups: + point_idx, label_id20, label_id200 = \ + point_indices_from_group(seg_indices, group, labels_pd) + + semantic_gt20[point_idx] = label_id20 + semantic_gt200[point_idx] = label_id200 + instance_ids[point_idx] = group['id'] + + semantic_gt20 = semantic_gt20.astype(int) + semantic_gt200 = semantic_gt200.astype(int) + instance_ids = instance_ids.astype(int) + + save_dict["semantic_gt20"] = semantic_gt20 + save_dict["semantic_gt200"] = semantic_gt200 + save_dict["instance_gt"] = instance_ids + + # Concatenate with original cloud + processed_vertices = np.hstack((semantic_gt200, instance_ids)) + + if np.any(np.isnan(processed_vertices)) or not np.all(np.isfinite(processed_vertices)): + raise ValueError(f'Find NaN in Scene: {scene_id}') + + # Save processed data + torch.save(save_dict, output_file) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--dataset_root', required=True, help='Path to the ScanNet dataset containing scene folders') + parser.add_argument('--output_root', required=True, help='Output path where train/val folders will be located') + parser.add_argument('--parse_normals', default=True, type=bool, help='Whether parse point normals') + config = parser.parse_args() 
+ + # Load label map + labels_pd = pd.read_csv('scannet-preprocess/meta_data/scannetv2-labels.combined.tsv', + sep='\t', header=0) + + # Load train/val splits + with open('scannet-preprocess/meta_data/scannetv2_train.txt') as train_file: + train_scenes = train_file.read().splitlines() + with open('scannet-preprocess/meta_data/scannetv2_val.txt') as val_file: + val_scenes = val_file.read().splitlines() + + # Create output directories + train_output_dir = os.path.join(config.output_root, 'train') + os.makedirs(train_output_dir, exist_ok=True) + val_output_dir = os.path.join(config.output_root, 'val') + os.makedirs(val_output_dir, exist_ok=True) + test_output_dir = os.path.join(config.output_root, 'test') + os.makedirs(test_output_dir, exist_ok=True) + + # Load scene paths + scene_paths = sorted(glob.glob(config.dataset_root + '/scans*/scene*')) + + # Preprocess data. + print('Processing scenes...') + pool = ProcessPoolExecutor(max_workers=mp.cpu_count()) + # pool = ProcessPoolExecutor(max_workers=1) + _ = list(pool.map(handle_process, scene_paths, repeat(config.output_root), repeat(labels_pd), repeat(train_scenes), + repeat(val_scenes), repeat(config.parse_normals))) diff --git a/datasets/scannet_preprocess/scannet_pair/SensorData.py b/datasets/scannet_preprocess/scannet_pair/SensorData.py new file mode 100644 index 0000000000000000000000000000000000000000..6a35bfb52a4f389d3368e4d3ce014d008674f21b --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/SensorData.py @@ -0,0 +1,121 @@ +import os, struct +import numpy as np +import zlib +import imageio +import cv2 + +COMPRESSION_TYPE_COLOR = {-1: 'unknown', 0: 'raw', 1: 'png', 2: 'jpeg'} +COMPRESSION_TYPE_DEPTH = {-1: 'unknown', 0: 'raw_ushort', 1: 'zlib_ushort', 2: 'occi_ushort'} + + +class RGBDFrame(): + + def load(self, file_handle): + self.camera_to_world = np.asarray(struct.unpack('f' * 16, file_handle.read(16 * 4)), dtype=np.float32).reshape( + 4, 4) + self.timestamp_color = struct.unpack('Q', 
file_handle.read(8))[0] + self.timestamp_depth = struct.unpack('Q', file_handle.read(8))[0] + self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.color_data = b''.join(struct.unpack('c' * self.color_size_bytes, file_handle.read(self.color_size_bytes))) + self.depth_data = b''.join(struct.unpack('c' * self.depth_size_bytes, file_handle.read(self.depth_size_bytes))) + + def decompress_depth(self, compression_type): + if compression_type == 'zlib_ushort': + return self.decompress_depth_zlib() + else: + raise + + def decompress_depth_zlib(self): + return zlib.decompress(self.depth_data) + + def decompress_color(self, compression_type): + if compression_type == 'jpeg': + return self.decompress_color_jpeg() + else: + raise + + def decompress_color_jpeg(self): + return imageio.imread(self.color_data) + + +class SensorData: + def __init__(self, filename): + self.version = 4 + self.load(filename) + + def load(self, filename): + with open(filename, 'rb') as f: + version = struct.unpack('I', f.read(4))[0] + assert self.version == version + strlen = struct.unpack('Q', f.read(8))[0] + self.sensor_name = b''.join(struct.unpack('c' * strlen, f.read(strlen))) + self.intrinsic_color = np.asarray(struct.unpack('f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4) + self.extrinsic_color = np.asarray(struct.unpack('f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4) + self.intrinsic_depth = np.asarray(struct.unpack('f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4) + self.extrinsic_depth = np.asarray(struct.unpack('f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4) + self.color_compression_type = COMPRESSION_TYPE_COLOR[struct.unpack('i', f.read(4))[0]] + self.depth_compression_type = COMPRESSION_TYPE_DEPTH[struct.unpack('i', f.read(4))[0]] + self.color_width = struct.unpack('I', f.read(4))[0] + self.color_height = struct.unpack('I', f.read(4))[0] + self.depth_width = 
struct.unpack('I', f.read(4))[0] + self.depth_height = struct.unpack('I', f.read(4))[0] + self.depth_shift = struct.unpack('f', f.read(4))[0] + num_frames = struct.unpack('Q', f.read(8))[0] + self.frames = [] + for i in range(num_frames): + frame = RGBDFrame() + frame.load(f) + self.frames.append(frame) + + def export_depth_images(self, output_path, image_size=None, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + print('exporting', len(self.frames) // frame_skip, ' depth frames to', output_path) + for f in range(0, len(self.frames), frame_skip): + if os.path.exists((os.path.join(output_path, str(f) + '.png'))): + continue + if f % 100 == 0: + print('exporting', f, 'th depth frames to', os.path.join(output_path, str(f) + '.png')) + + depth_data = self.frames[f].decompress_depth(self.depth_compression_type) + depth = np.fromstring(depth_data, dtype=np.uint16).reshape(self.depth_height, self.depth_width) + if image_size is not None: + depth = cv2.resize(depth, (image_size[1], image_size[0]), interpolation=cv2.INTER_NEAREST) + imageio.imwrite(os.path.join(output_path, str(f) + '.png'), depth) + + def export_color_images(self, output_path, image_size=None, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + print('exporting', len(self.frames) // frame_skip, 'color frames to', output_path) + for f in range(0, len(self.frames), frame_skip): + if os.path.exists((os.path.join(output_path, str(f) + '.png'))): + continue + if f % 100 == 0: + print('exporting', f, 'th color frames to', os.path.join(output_path, str(f) + '.png')) + color = self.frames[f].decompress_color(self.color_compression_type) + if image_size is not None: + color = cv2.resize(color, (image_size[1], image_size[0]), interpolation=cv2.INTER_NEAREST) + # imageio.imwrite(os.path.join(output_path, str(f) + '.jpg'), color) + imageio.imwrite(os.path.join(output_path, str(f) + '.png'), color) + + def save_mat_to_file(self, matrix, filename): + with 
open(filename, 'w') as f: + for line in matrix: + np.savetxt(f, line[np.newaxis], fmt='%f') + + def export_poses(self, output_path, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + print('exporting', len(self.frames) // frame_skip, 'camera poses to', output_path) + for f in range(0, len(self.frames), frame_skip): + self.save_mat_to_file(self.frames[f].camera_to_world, os.path.join(output_path, str(f) + '.txt')) + + def export_intrinsics(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + print('exporting camera intrinsics to', output_path) + self.save_mat_to_file(self.intrinsic_color, os.path.join(output_path, 'intrinsic_color.txt')) + self.save_mat_to_file(self.extrinsic_color, os.path.join(output_path, 'extrinsic_color.txt')) + self.save_mat_to_file(self.intrinsic_depth, os.path.join(output_path, 'intrinsic_depth.txt')) + self.save_mat_to_file(self.extrinsic_depth, os.path.join(output_path, 'extrinsic_depth.txt')) diff --git a/datasets/scannet_preprocess/scannet_pair/compute_full_overlapping.py b/datasets/scannet_preprocess/scannet_pair/compute_full_overlapping.py new file mode 100644 index 0000000000000000000000000000000000000000..1b6e2ec653a4019ac85491ca9b847a257a8b3b0f --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/compute_full_overlapping.py @@ -0,0 +1,80 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import copy
import torch
import numpy as np
import math
import glob, os
import argparse
import open3d as o3d


def make_open3d_point_cloud(xyz, color=None, voxel_size=None):
    """Build an Open3D point cloud from an (N, >=3) coordinate array.

    Returns None when the coordinates contain any NaN (the caller filters
    such clouds out). Optionally attaches per-point colors and/or
    voxel-downsamples the cloud.
    """
    if np.isnan(xyz).any():
        return None

    xyz = xyz[:, :3]
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(xyz)
    if color is not None:
        pcd.colors = o3d.utility.Vector3dVector(color)
    if voxel_size is not None:
        pcd = pcd.voxel_down_sample(voxel_size)

    return pcd


def compute_overlap_ratio(pcd0, pcd1, voxel_size):
    """Return the symmetric overlap ratio between two point clouds.

    Both clouds are voxel-downsampled, then the fraction of points in each
    cloud that has a neighbor in the other (within 1.5 * voxel_size) is
    computed; the larger of the two fractions is returned.
    """
    pcd0_down = pcd0.voxel_down_sample(voxel_size)
    pcd1_down = pcd1.voxel_down_sample(voxel_size)
    # BUG FIX: get_matching_indices() expects a KDTreeFlann as its second
    # argument (it calls search_radius_vector_3d on it). The original code
    # passed the raw downsampled PointCloud, which would raise
    # AttributeError on every call. Build the KD-trees explicitly, matching
    # how compute_full_overlapping() below uses the helper.
    pcd0_tree = o3d.geometry.KDTreeFlann(pcd0_down)
    pcd1_tree = o3d.geometry.KDTreeFlann(pcd1_down)
    matching01 = get_matching_indices(pcd0_down, pcd1_tree, voxel_size * 1.5, 1)
    matching10 = get_matching_indices(pcd1_down, pcd0_tree, voxel_size * 1.5, 1)
    overlap0 = float(len(matching01)) / float(len(pcd0_down.points))
    overlap1 = float(len(matching10)) / float(len(pcd1_down.points))
    return max(overlap0, overlap1)


def get_matching_indices(source, pcd_tree, search_voxel_size, K=None):
    """Return (i, j) index pairs matching `source` points into `pcd_tree`.

    `pcd_tree` must be an o3d.geometry.KDTreeFlann built over the target
    cloud. For each source point, up to K neighbors (all if K is None)
    within `search_voxel_size` are paired with it.
    """
    match_inds = []
    for i, point in enumerate(source.points):
        [_, idx, _] = pcd_tree.search_radius_vector_3d(point, search_voxel_size)
        if K is not None:
            idx = idx[:K]
        for j in idx:
            match_inds.append((i, j))
    return match_inds


def compute_full_overlapping(data_root, scene_id, voxel_size=0.05):
    """Compute pairwise overlap ratios for all frame point clouds of a scene.

    Loads every `<data_root>/<scene_id>/pcd/*.pth` cloud (NaN clouds are
    filtered out), fills a full matching matrix, and writes one
    "<name0> <name1> <overlap>" line per unordered pair to
    `<data_root>/<scene_id>/pcd/overlap.txt`.
    """
    _points = [
        (pcd_name, make_open3d_point_cloud(torch.load(pcd_name)['coord'], voxel_size=voxel_size))
        for pcd_name in glob.glob(os.path.join(data_root, scene_id, "pcd", "*.pth"))
    ]
    points = [(pcd_name, pcd) for (pcd_name, pcd) in _points if pcd is not None]
    print('load {} point clouds ({} invalid has been filtered), computing matching/overlapping'.format(
        len(points), len(_points) - len(points)))

    matching_matrix = np.zeros((len(points), len(points)))
    for i, (pcd0_name, pcd0) in enumerate(points):
        print('matching to...{}'.format(pcd0_name))
        # KD-tree over cloud i; deepcopy keeps the tree independent of pcd0.
        pcd0_tree = o3d.geometry.KDTreeFlann(copy.deepcopy(pcd0))
        for j, (pcd1_name, pcd1) in enumerate(points):
            if i == j:
                continue
            # fraction of cloud j's points with a neighbor in cloud i
            matching_matrix[i, j] = float(len(get_matching_indices(pcd1, pcd0_tree, 1.5 * voxel_size, 1))) / float(
                len(pcd1.points))

    # write to file: symmetric overlap = max of the two directed fractions
    with open(os.path.join(data_root, scene_id, "pcd", "overlap.txt"), 'w') as f:
        for i, (pcd0_name, pcd0) in enumerate(points):
            for j, (pcd1_name, pcd1) in enumerate(points):
                if i < j:
                    overlap = max(matching_matrix[i, j], matching_matrix[j, i])
                    f.write("{} {} {}\n".format(
                        pcd0_name.replace(data_root, ""), pcd1_name.replace(data_root, ""), overlap
                    ))


# --- patch residue: header of the next file in the original diff ---
# diff --git a/datasets/scannet_preprocess/scannet_pair/generage_list.py
#          b/datasets/scannet_preprocess/scannet_pair/generage_list.py
# new file mode 100644 (index 0000000..8faf9fe)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
+ + +import argparse +import glob, os, sys + +from SensorData import SensorData + +# params +parser = argparse.ArgumentParser() +# data paths +parser.add_argument('--target_dir', required=True, help='path to the target dir') + +opt = parser.parse_args() +print(opt) + +def main(): + overlaps = glob.glob(os.path.join(opt.target_dir, "*/pcd/overlap.txt")) + with open(os.path.join(opt.target_dir, 'overlap30.txt'), 'w') as f: + for fo in overlaps: + for line in open(fo): + pcd0, pcd1, op = line.strip().split() + if float(op) >= 0.3: + print('{} {} {}'.format(pcd0, pcd1, op), file=f) + print('done') + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/datasets/scannet_preprocess/scannet_pair/plyfile.py b/datasets/scannet_preprocess/scannet_pair/plyfile.py new file mode 100644 index 0000000000000000000000000000000000000000..69c2aa9e898a999406ee4ecfa856c715f14b9251 --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/plyfile.py @@ -0,0 +1,916 @@ +# Copyright 2014 Darsh Ranjan +# +# This file is part of python-plyfile. +# +# python-plyfile is free software: you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# python-plyfile is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with python-plyfile. If not, see +# . 
+ +from itertools import islice as _islice + +import numpy as _np +from sys import byteorder as _byteorder + + +try: + _range = xrange +except NameError: + _range = range + + +# Many-many relation +_data_type_relation = [ + ('int8', 'i1'), + ('char', 'i1'), + ('uint8', 'u1'), + ('uchar', 'b1'), + ('uchar', 'u1'), + ('int16', 'i2'), + ('short', 'i2'), + ('uint16', 'u2'), + ('ushort', 'u2'), + ('int32', 'i4'), + ('int', 'i4'), + ('uint32', 'u4'), + ('uint', 'u4'), + ('float32', 'f4'), + ('float', 'f4'), + ('float64', 'f8'), + ('double', 'f8') +] + +_data_types = dict(_data_type_relation) +_data_type_reverse = dict((b, a) for (a, b) in _data_type_relation) + +_types_list = [] +_types_set = set() +for (_a, _b) in _data_type_relation: + if _a not in _types_set: + _types_list.append(_a) + _types_set.add(_a) + if _b not in _types_set: + _types_list.append(_b) + _types_set.add(_b) + + +_byte_order_map = { + 'ascii': '=', + 'binary_little_endian': '<', + 'binary_big_endian': '>' +} + +_byte_order_reverse = { + '<': 'binary_little_endian', + '>': 'binary_big_endian' +} + +_native_byte_order = {'little': '<', 'big': '>'}[_byteorder] + + +def _lookup_type(type_str): + if type_str not in _data_type_reverse: + try: + type_str = _data_types[type_str] + except KeyError: + raise ValueError("field type %r not in %r" % + (type_str, _types_list)) + + return _data_type_reverse[type_str] + + +def _split_line(line, n): + fields = line.split(None, n) + if len(fields) == n: + fields.append('') + + assert len(fields) == n + 1 + + return fields + + +def make2d(array, cols=None, dtype=None): + ''' + Make a 2D array from an array of arrays. The `cols' and `dtype' + arguments can be omitted if the array is not empty. 
+ + ''' + if (cols is None or dtype is None) and not len(array): + raise RuntimeError("cols and dtype must be specified for empty " + "array") + + if cols is None: + cols = len(array[0]) + + if dtype is None: + dtype = array[0].dtype + + return _np.fromiter(array, [('_', dtype, (cols,))], + count=len(array))['_'] + + +class PlyParseError(Exception): + + ''' + Raised when a PLY file cannot be parsed. + + The attributes `element', `row', `property', and `message' give + additional information. + + ''' + + def __init__(self, message, element=None, row=None, prop=None): + self.message = message + self.element = element + self.row = row + self.prop = prop + + s = '' + if self.element: + s += 'element %r: ' % self.element.name + if self.row is not None: + s += 'row %d: ' % self.row + if self.prop: + s += 'property %r: ' % self.prop.name + s += self.message + + Exception.__init__(self, s) + + def __repr__(self): + return ('PlyParseError(%r, element=%r, row=%r, prop=%r)' % + self.message, self.element, self.row, self.prop) + + +class PlyData(object): + + ''' + PLY file header and data. + + A PlyData instance is created in one of two ways: by the static + method PlyData.read (to read a PLY file), or directly from __init__ + given a sequence of elements (which can then be written to a PLY + file). + + ''' + + def __init__(self, elements=[], text=False, byte_order='=', + comments=[], obj_info=[]): + ''' + elements: sequence of PlyElement instances. + + text: whether the resulting PLY file will be text (True) or + binary (False). + + byte_order: '<' for little-endian, '>' for big-endian, or '=' + for native. This is only relevant if `text' is False. + + comments: sequence of strings that will be placed in the header + between the 'ply' and 'format ...' lines. + + obj_info: like comments, but will be placed in the header with + "obj_info ..." instead of "comment ...". 
+ + ''' + if byte_order == '=' and not text: + byte_order = _native_byte_order + + self.byte_order = byte_order + self.text = text + + self.comments = list(comments) + self.obj_info = list(obj_info) + self.elements = elements + + def _get_elements(self): + return self._elements + + def _set_elements(self, elements): + self._elements = tuple(elements) + self._index() + + elements = property(_get_elements, _set_elements) + + def _get_byte_order(self): + return self._byte_order + + def _set_byte_order(self, byte_order): + if byte_order not in ['<', '>', '=']: + raise ValueError("byte order must be '<', '>', or '='") + + self._byte_order = byte_order + + byte_order = property(_get_byte_order, _set_byte_order) + + def _index(self): + self._element_lookup = dict((elt.name, elt) for elt in + self._elements) + if len(self._element_lookup) != len(self._elements): + raise ValueError("two elements with same name") + + @staticmethod + def _parse_header(stream): + ''' + Parse a PLY header from a readable file-like stream. 
+ + ''' + lines = [] + comments = {'comment': [], 'obj_info': []} + while True: + line = stream.readline().decode('ascii').strip() + fields = _split_line(line, 1) + + if fields[0] == 'end_header': + break + + elif fields[0] in comments.keys(): + lines.append(fields) + else: + lines.append(line.split()) + + a = 0 + if lines[a] != ['ply']: + raise PlyParseError("expected 'ply'") + + a += 1 + while lines[a][0] in comments.keys(): + comments[lines[a][0]].append(lines[a][1]) + a += 1 + + if lines[a][0] != 'format': + raise PlyParseError("expected 'format'") + + if lines[a][2] != '1.0': + raise PlyParseError("expected version '1.0'") + + if len(lines[a]) != 3: + raise PlyParseError("too many fields after 'format'") + + fmt = lines[a][1] + + if fmt not in _byte_order_map: + raise PlyParseError("don't understand format %r" % fmt) + + byte_order = _byte_order_map[fmt] + text = fmt == 'ascii' + + a += 1 + while a < len(lines) and lines[a][0] in comments.keys(): + comments[lines[a][0]].append(lines[a][1]) + a += 1 + + return PlyData(PlyElement._parse_multi(lines[a:]), + text, byte_order, + comments['comment'], comments['obj_info']) + + @staticmethod + def read(stream): + ''' + Read PLY data from a readable file-like object or filename. + + ''' + (must_close, stream) = _open_stream(stream, 'read') + try: + data = PlyData._parse_header(stream) + for elt in data: + elt._read(stream, data.text, data.byte_order) + finally: + if must_close: + stream.close() + + return data + + def write(self, stream): + ''' + Write PLY data to a writeable file-like object or filename. + + ''' + (must_close, stream) = _open_stream(stream, 'write') + try: + stream.write(self.header.encode('ascii')) + stream.write(b'\r\n') + for elt in self: + elt._write(stream, self.text, self.byte_order) + finally: + if must_close: + stream.close() + + @property + def header(self): + ''' + Provide PLY-formatted metadata for the instance. 
+ + ''' + lines = ['ply'] + + if self.text: + lines.append('format ascii 1.0') + else: + lines.append('format ' + + _byte_order_reverse[self.byte_order] + + ' 1.0') + + # Some information is lost here, since all comments are placed + # between the 'format' line and the first element. + for c in self.comments: + lines.append('comment ' + c) + + for c in self.obj_info: + lines.append('obj_info ' + c) + + lines.extend(elt.header for elt in self.elements) + lines.append('end_header') + return '\r\n'.join(lines) + + def __iter__(self): + return iter(self.elements) + + def __len__(self): + return len(self.elements) + + def __contains__(self, name): + return name in self._element_lookup + + def __getitem__(self, name): + return self._element_lookup[name] + + def __str__(self): + return self.header + + def __repr__(self): + return ('PlyData(%r, text=%r, byte_order=%r, ' + 'comments=%r, obj_info=%r)' % + (self.elements, self.text, self.byte_order, + self.comments, self.obj_info)) + + +def _open_stream(stream, read_or_write): + if hasattr(stream, read_or_write): + return (False, stream) + try: + return (True, open(stream, read_or_write[0] + 'b')) + except TypeError: + raise RuntimeError("expected open file or filename") + + +class PlyElement(object): + + ''' + PLY file element. + + A client of this library doesn't normally need to instantiate this + directly, so the following is only for the sake of documenting the + internals. + + Creating a PlyElement instance is generally done in one of two ways: + as a byproduct of PlyData.read (when reading a PLY file) and by + PlyElement.describe (before writing a PLY file). + + ''' + + def __init__(self, name, properties, count, comments=[]): + ''' + This is not part of the public interface. The preferred methods + of obtaining PlyElement instances are PlyData.read (to read from + a file) and PlyElement.describe (to construct from a numpy + array). 
+ + ''' + self._name = str(name) + self._check_name() + self._count = count + + self._properties = tuple(properties) + self._index() + + self.comments = list(comments) + + self._have_list = any(isinstance(p, PlyListProperty) + for p in self.properties) + + @property + def count(self): + return self._count + + def _get_data(self): + return self._data + + def _set_data(self, data): + self._data = data + self._count = len(data) + self._check_sanity() + + data = property(_get_data, _set_data) + + def _check_sanity(self): + for prop in self.properties: + if prop.name not in self._data.dtype.fields: + raise ValueError("dangling property %r" % prop.name) + + def _get_properties(self): + return self._properties + + def _set_properties(self, properties): + self._properties = tuple(properties) + self._check_sanity() + self._index() + + properties = property(_get_properties, _set_properties) + + def _index(self): + self._property_lookup = dict((prop.name, prop) + for prop in self._properties) + if len(self._property_lookup) != len(self._properties): + raise ValueError("two properties with same name") + + def ply_property(self, name): + return self._property_lookup[name] + + @property + def name(self): + return self._name + + def _check_name(self): + if any(c.isspace() for c in self._name): + msg = "element name %r contains spaces" % self._name + raise ValueError(msg) + + def dtype(self, byte_order='='): + ''' + Return the numpy dtype of the in-memory representation of the + data. (If there are no list properties, and the PLY format is + binary, then this also accurately describes the on-disk + representation of the element.) + + ''' + return [(prop.name, prop.dtype(byte_order)) + for prop in self.properties] + + @staticmethod + def _parse_multi(header_lines): + ''' + Parse a list of PLY element definitions. 
+ + ''' + elements = [] + while header_lines: + (elt, header_lines) = PlyElement._parse_one(header_lines) + elements.append(elt) + + return elements + + @staticmethod + def _parse_one(lines): + ''' + Consume one element definition. The unconsumed input is + returned along with a PlyElement instance. + + ''' + a = 0 + line = lines[a] + + if line[0] != 'element': + raise PlyParseError("expected 'element'") + if len(line) > 3: + raise PlyParseError("too many fields after 'element'") + if len(line) < 3: + raise PlyParseError("too few fields after 'element'") + + (name, count) = (line[1], int(line[2])) + + comments = [] + properties = [] + while True: + a += 1 + if a >= len(lines): + break + + if lines[a][0] == 'comment': + comments.append(lines[a][1]) + elif lines[a][0] == 'property': + properties.append(PlyProperty._parse_one(lines[a])) + else: + break + + return (PlyElement(name, properties, count, comments), + lines[a:]) + + @staticmethod + def describe(data, name, len_types={}, val_types={}, + comments=[]): + ''' + Construct a PlyElement from an array's metadata. + + len_types and val_types can be given as mappings from list + property names to type strings (like 'u1', 'f4', etc., or + 'int8', 'float32', etc.). These can be used to define the length + and value types of list properties. List property lengths + always default to type 'u1' (8-bit unsigned integer), and value + types default to 'i4' (32-bit integer). + + ''' + if not isinstance(data, _np.ndarray): + raise TypeError("only numpy arrays are supported") + + if len(data.shape) != 1: + raise ValueError("only one-dimensional arrays are " + "supported") + + count = len(data) + + properties = [] + descr = data.dtype.descr + + for t in descr: + if not isinstance(t[1], str): + raise ValueError("nested records not supported") + + if not t[0]: + raise ValueError("field with empty name") + + if len(t) != 2 or t[1][1] == 'O': + # non-scalar field, which corresponds to a list + # property in PLY. 
+ + if t[1][1] == 'O': + if len(t) != 2: + raise ValueError("non-scalar object fields not " + "supported") + + len_str = _data_type_reverse[len_types.get(t[0], 'u1')] + if t[1][1] == 'O': + val_type = val_types.get(t[0], 'i4') + val_str = _lookup_type(val_type) + else: + val_str = _lookup_type(t[1][1:]) + + prop = PlyListProperty(t[0], len_str, val_str) + else: + val_str = _lookup_type(t[1][1:]) + prop = PlyProperty(t[0], val_str) + + properties.append(prop) + + elt = PlyElement(name, properties, count, comments) + elt.data = data + + return elt + + def _read(self, stream, text, byte_order): + ''' + Read the actual data from a PLY file. + + ''' + if text: + self._read_txt(stream) + else: + if self._have_list: + # There are list properties, so a simple load is + # impossible. + self._read_bin(stream, byte_order) + else: + # There are no list properties, so loading the data is + # much more straightforward. + self._data = _np.fromfile(stream, + self.dtype(byte_order), + self.count) + + if len(self._data) < self.count: + k = len(self._data) + del self._data + raise PlyParseError("early end-of-file", self, k) + + self._check_sanity() + + def _write(self, stream, text, byte_order): + ''' + Write the data to a PLY file. + + ''' + if text: + self._write_txt(stream) + else: + if self._have_list: + # There are list properties, so serialization is + # slightly complicated. + self._write_bin(stream, byte_order) + else: + # no list properties, so serialization is + # straightforward. + self.data.astype(self.dtype(byte_order), + copy=False).tofile(stream) + + def _read_txt(self, stream): + ''' + Load a PLY element from an ASCII-format PLY file. The element + may contain list properties. 
+ + ''' + self._data = _np.empty(self.count, dtype=self.dtype()) + + k = 0 + for line in _islice(iter(stream.readline, b''), self.count): + fields = iter(line.strip().split()) + for prop in self.properties: + try: + self._data[prop.name][k] = prop._from_fields(fields) + except StopIteration: + raise PlyParseError("early end-of-line", + self, k, prop) + except ValueError: + raise PlyParseError("malformed input", + self, k, prop) + try: + next(fields) + except StopIteration: + pass + else: + raise PlyParseError("expected end-of-line", self, k) + k += 1 + + if k < self.count: + del self._data + raise PlyParseError("early end-of-file", self, k) + + def _write_txt(self, stream): + ''' + Save a PLY element to an ASCII-format PLY file. The element may + contain list properties. + + ''' + for rec in self.data: + fields = [] + for prop in self.properties: + fields.extend(prop._to_fields(rec[prop.name])) + + _np.savetxt(stream, [fields], '%.18g', newline='\r\n') + + def _read_bin(self, stream, byte_order): + ''' + Load a PLY element from a binary PLY file. The element may + contain list properties. + + ''' + self._data = _np.empty(self.count, dtype=self.dtype(byte_order)) + + for k in _range(self.count): + for prop in self.properties: + try: + self._data[prop.name][k] = \ + prop._read_bin(stream, byte_order) + except StopIteration: + raise PlyParseError("early end-of-file", + self, k, prop) + + def _write_bin(self, stream, byte_order): + ''' + Save a PLY element to a binary PLY file. The element may + contain list properties. + + ''' + for rec in self.data: + for prop in self.properties: + prop._write_bin(rec[prop.name], stream, byte_order) + + @property + def header(self): + ''' + Format this element's metadata as it would appear in a PLY + header. + + ''' + lines = ['element %s %d' % (self.name, self.count)] + + # Some information is lost here, since all comments are placed + # between the 'element' line and the first property definition. 
+ for c in self.comments: + lines.append('comment ' + c) + + lines.extend(list(map(str, self.properties))) + + return '\r\n'.join(lines) + + def __getitem__(self, key): + return self.data[key] + + def __setitem__(self, key, value): + self.data[key] = value + + def __str__(self): + return self.header + + def __repr__(self): + return ('PlyElement(%r, %r, count=%d, comments=%r)' % + (self.name, self.properties, self.count, + self.comments)) + + +class PlyProperty(object): + + ''' + PLY property description. This class is pure metadata; the data + itself is contained in PlyElement instances. + + ''' + + def __init__(self, name, val_dtype): + self._name = str(name) + self._check_name() + self.val_dtype = val_dtype + + def _get_val_dtype(self): + return self._val_dtype + + def _set_val_dtype(self, val_dtype): + self._val_dtype = _data_types[_lookup_type(val_dtype)] + + val_dtype = property(_get_val_dtype, _set_val_dtype) + + @property + def name(self): + return self._name + + def _check_name(self): + if any(c.isspace() for c in self._name): + msg = "Error: property name %r contains spaces" % self._name + raise RuntimeError(msg) + + @staticmethod + def _parse_one(line): + assert line[0] == 'property' + + if line[1] == 'list': + if len(line) > 5: + raise PlyParseError("too many fields after " + "'property list'") + if len(line) < 5: + raise PlyParseError("too few fields after " + "'property list'") + + return PlyListProperty(line[4], line[2], line[3]) + + else: + if len(line) > 3: + raise PlyParseError("too many fields after " + "'property'") + if len(line) < 3: + raise PlyParseError("too few fields after " + "'property'") + + return PlyProperty(line[2], line[1]) + + def dtype(self, byte_order='='): + ''' + Return the numpy dtype description for this property (as a tuple + of strings). + + ''' + return byte_order + self.val_dtype + + def _from_fields(self, fields): + ''' + Parse from generator. Raise StopIteration if the property could + not be read. 
+ + ''' + return _np.dtype(self.dtype()).type(next(fields)) + + def _to_fields(self, data): + ''' + Return generator over one item. + + ''' + yield _np.dtype(self.dtype()).type(data) + + def _read_bin(self, stream, byte_order): + ''' + Read data from a binary stream. Raise StopIteration if the + property could not be read. + + ''' + try: + return _np.fromfile(stream, self.dtype(byte_order), 1)[0] + except IndexError: + raise StopIteration + + def _write_bin(self, data, stream, byte_order): + ''' + Write data to a binary stream. + + ''' + _np.dtype(self.dtype(byte_order)).type(data).tofile(stream) + + def __str__(self): + val_str = _data_type_reverse[self.val_dtype] + return 'property %s %s' % (val_str, self.name) + + def __repr__(self): + return 'PlyProperty(%r, %r)' % (self.name, + _lookup_type(self.val_dtype)) + + +class PlyListProperty(PlyProperty): + + ''' + PLY list property description. + + ''' + + def __init__(self, name, len_dtype, val_dtype): + PlyProperty.__init__(self, name, val_dtype) + + self.len_dtype = len_dtype + + def _get_len_dtype(self): + return self._len_dtype + + def _set_len_dtype(self, len_dtype): + self._len_dtype = _data_types[_lookup_type(len_dtype)] + + len_dtype = property(_get_len_dtype, _set_len_dtype) + + def dtype(self, byte_order='='): + ''' + List properties always have a numpy dtype of "object". + + ''' + return '|O' + + def list_dtype(self, byte_order='='): + ''' + Return the pair (len_dtype, val_dtype) (both numpy-friendly + strings). + + ''' + return (byte_order + self.len_dtype, + byte_order + self.val_dtype) + + def _from_fields(self, fields): + (len_t, val_t) = self.list_dtype() + + n = int(_np.dtype(len_t).type(next(fields))) + + data = _np.loadtxt(list(_islice(fields, n)), val_t, ndmin=1) + if len(data) < n: + raise StopIteration + + return data + + def _to_fields(self, data): + ''' + Return generator over the (numerical) PLY representation of the + list data (length followed by actual data). 
+ + ''' + (len_t, val_t) = self.list_dtype() + + data = _np.asarray(data, dtype=val_t).ravel() + + yield _np.dtype(len_t).type(data.size) + for x in data: + yield x + + def _read_bin(self, stream, byte_order): + (len_t, val_t) = self.list_dtype(byte_order) + + try: + n = _np.fromfile(stream, len_t, 1)[0] + except IndexError: + raise StopIteration + + data = _np.fromfile(stream, val_t, n) + if len(data) < n: + raise StopIteration + + return data + + def _write_bin(self, data, stream, byte_order): + ''' + Write data to a binary stream. + + ''' + (len_t, val_t) = self.list_dtype(byte_order) + + data = _np.asarray(data, dtype=val_t).ravel() + + _np.array(data.size, dtype=len_t).tofile(stream) + data.tofile(stream) + + def __str__(self): + len_str = _data_type_reverse[self.len_dtype] + val_str = _data_type_reverse[self.val_dtype] + return 'property list %s %s %s' % (len_str, val_str, self.name) + + def __repr__(self): + return ('PlyListProperty(%r, %r, %r)' % + (self.name, + _lookup_type(self.len_dtype), + _lookup_type(self.val_dtype))) diff --git a/datasets/scannet_preprocess/scannet_pair/point_cloud_extractor.py b/datasets/scannet_preprocess/scannet_pair/point_cloud_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..e2b8568eef6413e2e024025b21f3d07401b0f223 --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/point_cloud_extractor.py @@ -0,0 +1,89 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import glob, os +import numpy as np +import cv2 +import torch + + +def extractor(input_path, output_path): + if not os.path.exists(output_path): + os.mkdir(output_path) + + # Load Depth Camera Intrinsic + depth_intrinsic = np.loadtxt(input_path + '/intrinsic/intrinsic_depth.txt') + print('Depth intrinsic: ') + print(depth_intrinsic) + + # Compute Camrea Distance (just for demo, so you can choose the camera distance in frame sampling) + poses = sorted(glob.glob(input_path + '/pose/*.txt'), key=lambda a: int(os.path.basename(a).split('.')[0])) + depths = sorted(glob.glob(input_path + '/depth/*.png'), key=lambda a: int(os.path.basename(a).split('.')[0])) + colors = sorted(glob.glob(input_path + '/color/*.png'), key=lambda a: int(os.path.basename(a).split('.')[0])) + + # # Get Aligned Point Clouds. + for ind, (pose, depth, color) in enumerate(zip(poses, depths, colors)): + name = os.path.basename(pose).split('.')[0] + + if os.path.exists(output_path + '/{}.npz'.format(name)): + continue + + try: + print('=' * 50, ': {}'.format(pose)) + depth_img = cv2.imread(depth, -1) # read 16bit grayscale image + mask = (depth_img != 0) + color_image = cv2.imread(color) + color_image = cv2.resize(color_image, (640, 480)) + color_image = np.reshape(color_image[mask], [-1, 3]) + colors = np.zeros_like(color_image) + colors[:, 0] = color_image[:, 2] + colors[:, 1] = color_image[:, 1] + colors[:, 2] = color_image[:, 0] + + pose = np.loadtxt(poses[ind]) + print('Camera pose: ') + print(pose) + + depth_shift = 1000.0 + x, y = np.meshgrid(np.linspace(0, depth_img.shape[1] - 1, depth_img.shape[1]), + np.linspace(0, depth_img.shape[0] - 1, depth_img.shape[0])) + uv_depth = np.zeros((depth_img.shape[0], depth_img.shape[1], 3)) + uv_depth[:, :, 0] = x + uv_depth[:, :, 1] = y + uv_depth[:, :, 2] = depth_img / depth_shift + uv_depth = np.reshape(uv_depth, [-1, 3]) + uv_depth = uv_depth[np.where(uv_depth[:, 2] != 0), :].squeeze() + + intrinsic_inv = np.linalg.inv(depth_intrinsic) + fx = 
depth_intrinsic[0, 0] + fy = depth_intrinsic[1, 1] + cx = depth_intrinsic[0, 2] + cy = depth_intrinsic[1, 2] + bx = depth_intrinsic[0, 3] + by = depth_intrinsic[1, 3] + point_list = [] + n = uv_depth.shape[0] + points = np.ones((n, 4)) + X = (uv_depth[:, 0] - cx) * uv_depth[:, 2] / fx + bx + Y = (uv_depth[:, 1] - cy) * uv_depth[:, 2] / fy + by + points[:, 0] = X + points[:, 1] = Y + points[:, 2] = uv_depth[:, 2] + points_world = np.dot(points, np.transpose(pose)) + print(points_world.shape) + + pcd = dict(coord=points_world[:, :3], color=colors) + # pcd_save = np.zeros((points_world.shape[0], 7)) + # pcd_save[:, :3] = points_world[:, :3] + # pcd_save[:, 3:6] = colors + + # print('Saving npz file...') + # np.savez(output_path + '/{}.npz'.format(name), pcd=pcd_save) + torch.save(pcd, output_path + '/{}.pth'.format(name)) + except: + continue + + diff --git a/datasets/scannet_preprocess/scannet_pair/preprocess.py b/datasets/scannet_preprocess/scannet_pair/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..7c17a67936bf30ab41564625318079b92d173182 --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/preprocess.py @@ -0,0 +1,38 @@ +import os +import argparse +import glob +import multiprocessing as mp +from concurrent.futures import ProcessPoolExecutor +from itertools import repeat +from reader import reader +from point_cloud_extractor import extractor +from compute_full_overlapping import compute_full_overlapping + + +frame_skip = 25 + + +def parse_sens(sens_dir, output_dir): + scene_id = os.path.basename(os.path.dirname(sens_dir)) + print(f"Parsing sens data{sens_dir}") + reader(sens_dir, os.path.join(output_dir, scene_id), frame_skip, + export_color_images=True, export_depth_images=True, export_poses=True, export_intrinsics=True) + extractor(os.path.join(output_dir, scene_id), os.path.join(output_dir, scene_id, "pcd")) + compute_full_overlapping(output_dir, scene_id) + + +if __name__ == '__main__': + parser = 
argparse.ArgumentParser() + parser.add_argument('--dataset_root', required=True, help='Path to the ScanNet dataset containing scene folders') + parser.add_argument('--output_root', required=True, help='Output path where train/val folders will be located') + opt = parser.parse_args() + sens_list = sorted(glob.glob(os.path.join(opt.dataset_root, "scans/scene*/*.sens"))) + # Preprocess data. + pool = ProcessPoolExecutor(max_workers=mp.cpu_count()) + # pool = ProcessPoolExecutor(max_workers=1) + print('Processing scenes...') + _ = list(pool.map(parse_sens, sens_list, repeat(opt.output_root))) + + # sens_dir = "/home/gofinge/Documents/datasets/scannet/scans/scene0024_00/scene0024_00.sens" + # output_dir = "/home/gofinge/Downloads" + # parse_sens(sens_dir, output_dir) diff --git a/datasets/scannet_preprocess/scannet_pair/reader.py b/datasets/scannet_preprocess/scannet_pair/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..3077ec828988957c95fa85e8399ed382273a0257 --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/reader.py @@ -0,0 +1,27 @@ +import argparse +import os, sys + +from SensorData import SensorData + + +def reader(filename, + output_path, + frame_skip, + export_color_images=False, + export_depth_images=False, + export_poses=False, + export_intrinsics=False): + if not os.path.exists(output_path): + os.makedirs(output_path) + + # load the data + print('loading %s...' 
% filename) + sd = SensorData(filename) + if export_depth_images: + sd.export_depth_images(os.path.join(output_path, 'depth'), frame_skip=frame_skip) + if export_color_images: + sd.export_color_images(os.path.join(output_path, 'color'), frame_skip=frame_skip) + if export_poses: + sd.export_poses(os.path.join(output_path, 'pose'), frame_skip=frame_skip) + if export_intrinsics: + sd.export_intrinsics(os.path.join(output_path, 'intrinsic')) diff --git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..8055add6448d1ae28a79576bb4a738a7da7afb44 --- /dev/null +++ b/demo.py @@ -0,0 +1,123 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import argparse +import glob +import multiprocessing as mp +import os +import time +import cv2 +import tqdm +import numpy as np + +from detectron2.config import get_cfg + +from detectron2.projects.deeplab import add_deeplab_config +from detectron2.data.detection_utils import read_image +from detectron2.utils.logger import setup_logger +from open_vocab_seg import add_ovseg_config + +from open_vocab_seg.utils import VisualizationDemo + +# constants +WINDOW_NAME = "Open vocabulary segmentation" + + +def setup_cfg(args): + # load config from file and command-line arguments + cfg = get_cfg() + # for poly lr schedule + add_deeplab_config(cfg) + add_ovseg_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + return cfg + + +def get_parser(): + parser = argparse.ArgumentParser(description="Detectron2 demo for open vocabulary segmentation") + parser.add_argument( + "--config-file", + default="configs/ovseg_swinB_vitL_demo.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument( + "--input", + nargs="+", + help="A list of space separated input images; " + "or a single glob pattern such as 'directory/*.jpg'", + ) + parser.add_argument( + "--class-names", + nargs="+", + help="A 
list of user-defined class_names" + ) + parser.add_argument( + "--output", + help="A file or directory to save output visualizations. " + "If not given, will show output in an OpenCV window.", + ) + parser.add_argument( + "--opts", + help="Modify config options using the command-line 'KEY VALUE' pairs", + default=[], + nargs=argparse.REMAINDER, + ) + return parser + + +if __name__ == "__main__": + mp.set_start_method("spawn", force=True) + args = get_parser().parse_args() + setup_logger(name="fvcore") + logger = setup_logger() + logger.info("Arguments: " + str(args)) + + cfg = setup_cfg(args) + + demo = VisualizationDemo(cfg) + class_names = args.class_names + if args.input: + if len(args.input) == 1: + args.input = glob.glob(os.path.expanduser(args.input[0])) + assert args.input, "The input path(s) was not found" + for path in tqdm.tqdm(args.input, disable=not args.output): + # use PIL, to be consistent with evaluation + start_time = time.time() + predictions, visualized_output_rgb, visualized_output_depth, visualized_output_rgb_sam, visualized_output_depth_sam = demo.run_on_image_sam(path, class_names) + logger.info( + "{}: {} in {:.2f}s".format( + path, + "detected {} instances".format(len(predictions["instances"])) + if "instances" in predictions + else "finished", + time.time() - start_time, + ) + ) + + if args.output: + if os.path.isdir(args.output): + assert os.path.isdir(args.output), args.output + out_filename = os.path.join(args.output, os.path.basename(path)) + else: + assert len(args.input) == 1, "Please specify a directory with args.output" + out_filename = args.output + visualized_output_rgb.save('RGB_Semantic_SAM.png') + visualized_output_depth.save('Depth_Semantic_SAM.png') + visualized_output_rgb_sam.save('RGB_Semantic_SAM_Mask.png') + visualized_output_depth_sam.save('Depth_Semantic_SAM_Mask.png') + rgb_3d_sam = demo.get_xyzrgb('RGB_Semantic_SAM.png', path) + depth_3d_sam = demo.get_xyzrgb('Depth_Semantic_SAM.png', path) + rgb_3d_sam_mask = 
demo.get_xyzrgb('RGB_Semantic_SAM_Mask.png', path) + depth_3d_sam_mask = demo.get_xyzrgb('Depth_Semantic_SAM_Mask.png', path) + np.savez('xyzrgb.npz', rgb_3d_sam = rgb_3d_sam, depth_3d_sam = depth_3d_sam, rgb_3d_sam_mask = rgb_3d_sam_mask, depth_3d_sam_mask = depth_3d_sam_mask) + demo.render_3d_video('xyzrgb.npz', path) + else: + cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) + cv2.imshow(WINDOW_NAME, visualized_output_rgb.get_image()[:, :, ::-1]) + if cv2.waitKey(0) == 27: + break # esc to quit + else: + raise NotImplementedError diff --git a/flagged/log.csv b/flagged/log.csv new file mode 100644 index 0000000000000000000000000000000000000000..afcc908ea4df4a8b0353ab174a38818fc1d859b5 --- /dev/null +++ b/flagged/log.csv @@ -0,0 +1,3 @@ +name,output,flag,username,timestamp +t,/mnt/lustre/jkyang/PSG4D/segment_anything_sailvos3d/ov-seg/flagged/output/tmpii192qpn.png,,,2023-04-23 12:23:23.301078 +t,/mnt/lustre/jkyang/PSG4D/segment_anything_sailvos3d/ov-seg/flagged/output/tmpqm122tsi.png,,,2023-04-23 12:26:06.661559 diff --git a/flagged/output/tmpii192qpn.png b/flagged/output/tmpii192qpn.png new file mode 100644 index 0000000000000000000000000000000000000000..fbb8f921b3dae75f2a719d74170f8f4c87b127c0 Binary files /dev/null and b/flagged/output/tmpii192qpn.png differ diff --git a/flagged/output/tmpqm122tsi.png b/flagged/output/tmpqm122tsi.png new file mode 100644 index 0000000000000000000000000000000000000000..fbb8f921b3dae75f2a719d74170f8f4c87b127c0 Binary files /dev/null and b/flagged/output/tmpqm122tsi.png differ diff --git a/open_vocab_seg/__init__.py b/open_vocab_seg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b54fce14b8a029f1355bc8b74c20884e880ee9c4 --- /dev/null +++ b/open_vocab_seg/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from . import data +from . 
import modeling +from .config import add_ovseg_config + +from .test_time_augmentation import SemanticSegmentorWithTTA +from .ovseg_model import OVSeg, OVSegDEMO diff --git a/open_vocab_seg/config.py b/open_vocab_seg/config.py new file mode 100644 index 0000000000000000000000000000000000000000..400e9a05d4995e3f3401b34a22ae687b2c9c90e0 --- /dev/null +++ b/open_vocab_seg/config.py @@ -0,0 +1,133 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from detectron2.config import CfgNode as CN + + +def add_mask_former_default_config(cfg): + # data config + # select the dataset mapper + cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" + # Color augmentation + cfg.INPUT.COLOR_AUG_SSD = False + # We retry random cropping until no single category in semantic segmentation GT occupies more + # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. + cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 + # Pad image and segmentation GT in dataset mapper. 
+ cfg.INPUT.SIZE_DIVISIBILITY = -1 + + # solver config + # test batch size + cfg.SOLVER.TEST_IMS_PER_BATCH = 1 + # weight decay on embedding + cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 + # optimizer + cfg.SOLVER.OPTIMIZER = "ADAMW" + cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 + + # mask_former model config + cfg.MODEL.MASK_FORMER = CN() + + # loss + cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True + cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 + cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 + cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 + + # transformer config + cfg.MODEL.MASK_FORMER.NHEADS = 8 + cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 + cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 + cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 + cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 + cfg.MODEL.MASK_FORMER.PRE_NORM = False + + cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 + cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 + + cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" + cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False + + # mask_former inference config + cfg.MODEL.MASK_FORMER.TEST = CN() + cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False + cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 + cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 + cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False + + # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. 
ResNet) + # you can use this config to override + cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 + + # pixel decoder config + cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 + # adding transformer in pixel decoder + cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 + # pixel decoder + cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" + + # swin transformer backbone + cfg.MODEL.SWIN = CN() + cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 + cfg.MODEL.SWIN.PATCH_SIZE = 4 + cfg.MODEL.SWIN.EMBED_DIM = 96 + cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] + cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] + cfg.MODEL.SWIN.WINDOW_SIZE = 7 + cfg.MODEL.SWIN.MLP_RATIO = 4.0 + cfg.MODEL.SWIN.QKV_BIAS = True + cfg.MODEL.SWIN.QK_SCALE = None + cfg.MODEL.SWIN.NORM_INDICES = None + cfg.MODEL.SWIN.PROJECTION = False + cfg.MODEL.SWIN.PROJECT_DIM = 256 + cfg.MODEL.SWIN.DROP_RATE = 0.0 + cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 + cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 + cfg.MODEL.SWIN.APE = False + cfg.MODEL.SWIN.PATCH_NORM = True + cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] + + +def add_our_config(cfg): + cfg.TEST.SLIDING_WINDOW = False + cfg.TEST.SLIDING_TILE_SIZE = 224 + cfg.TEST.SLIDING_OVERLAP = 2 / 3.0 + # whether to use dense crf + cfg.TEST.DENSE_CRF = False + cfg.DATASETS.SAMPLE_PER_CLASS = -1 + cfg.DATASETS.SAMPLE_SEED = 0 + # embedding head + cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM = 512 + cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM = 1024 + cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS = 2 + # clip_adapter + cfg.MODEL.CLIP_ADAPTER = CN() + cfg.MODEL.CLIP_ADAPTER.TEXT_TEMPLATES = "vild" + # for predefined + cfg.MODEL.CLIP_ADAPTER.PREDEFINED_PROMPT_TEMPLATES = ["a photo of a {}."] + # for learnable prompt + cfg.MODEL.CLIP_ADAPTER.PROMPT_CHECKPOINT = "" + cfg.MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME = "ViT-B/16" + cfg.MODEL.CLIP_ADAPTER.MASK_FILL = "mean" + cfg.MODEL.CLIP_ADAPTER.MASK_EXPAND_RATIO = 1.0 + cfg.MODEL.CLIP_ADAPTER.MASK_THR = 0.4 + cfg.MODEL.CLIP_ADAPTER.MASK_MATTING = False + 
cfg.MODEL.CLIP_ADAPTER.REGION_RESIZED = True + cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE = True + cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT = 0.7 + # for mask prompt + cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_DEPTH = 3 + cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD = False + + # wandb + cfg.WANDB = CN() + cfg.WANDB.PROJECT = "open_vocab_seg" + cfg.WANDB.NAME = None + + +def add_ovseg_config(cfg): + """ + Add config for open_vocab_seg. + """ + add_mask_former_default_config(cfg) + add_our_config(cfg) diff --git a/open_vocab_seg/data/__init__.py b/open_vocab_seg/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..970e2c8ce7f90afab089bf84e249af5ee7124951 --- /dev/null +++ b/open_vocab_seg/data/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from .dataset_mappers import * +from . import datasets +from .build import ( + build_detection_train_loader, + build_detection_test_loader, +) diff --git a/open_vocab_seg/data/augmentations.py b/open_vocab_seg/data/augmentations.py new file mode 100644 index 0000000000000000000000000000000000000000..44e4906d4827812fa707f50e703f253a64ab6e43 --- /dev/null +++ b/open_vocab_seg/data/augmentations.py @@ -0,0 +1,202 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import math +import numbers +import numpy as np +from detectron2.data.transforms.augmentation import Augmentation +from detectron2.data.transforms.transform import ( + CropTransform, + ResizeTransform, + TransformList, +) +from PIL import Image +from fvcore.transforms.transform import PadTransform + + +def mask2box(mask: np.ndarray): + # use naive way + row = np.nonzero(mask.sum(axis=0))[0] + if len(row) == 0: + return None + x1 = row.min() + x2 = row.max() + col = np.nonzero(mask.sum(axis=1))[0] + y1 = col.min() + y2 = col.max() + return x1, y1, x2 + 1 - x1, y2 + 1 - y1 + + +def expand_box(x, y, w, h, expand_ratio=1.0, max_h=None, max_w=None): + cx = x + 0.5 * w + cy = y + 0.5 * h + w = w * expand_ratio + h = h * expand_ratio + box = [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h] + if max_h is not None: + box[1] = max(0, box[1]) + box[3] = min(max_h - 1, box[3]) + if max_w is not None: + box[0] = max(0, box[0]) + box[2] = min(max_w - 1, box[2]) + box[2] = box[2] - box[0] + box[3] = box[3] - box[1] + + return [int(b) for b in box] + + +class CropImageWithMask(Augmentation): + def __init__(self, expand_ratio=1.0, mode="choice"): + if isinstance(expand_ratio, numbers.Number): + expand_ratio = (expand_ratio, expand_ratio) + self.mode = mode + self.expand_ratio = expand_ratio + if self.mode == "range": + assert len(expand_ratio) == 2 and expand_ratio[0] < expand_ratio[1] + + def get_transform(self, image, sem_seg, category_id): + input_size = image.shape[:2] + bin_mask = sem_seg == category_id + x, y, w, h = mask2box(bin_mask) + if self.mode == "choice": + expand_ratio = np.random.choice(self.expand_ratio) + else: + expand_ratio = np.random.uniform(self.expand_ratio[0], self.expand_ratio[1]) + x, y, w, h = expand_box(x, y, w, h, expand_ratio, *input_size) + w = max(w, 1) + h = max(h, 1) + return CropTransform(x, y, w, h, input_size[1], input_size[0]) + + +class CropImageWithBox(Augmentation): + def __init__(self, expand_ratio=1.0, 
mode="choice"): + if isinstance(expand_ratio, numbers.Number): + expand_ratio = (expand_ratio, expand_ratio) + self.mode = mode + self.expand_ratio = expand_ratio + if self.mode == "range": + assert len(expand_ratio) == 2 and expand_ratio[0] < expand_ratio[1] + + def get_transform(self, image, boxes): + input_size = image.shape[:2] + x, y, x2, y2 = boxes[0] + w = x2 - x + 1 + h = y2 - y + 1 + if self.mode == "choice": + expand_ratio = np.random.choice(self.expand_ratio) + else: + expand_ratio = np.random.uniform(self.expand_ratio[0], self.expand_ratio[1]) + x, y, w, h = expand_box(x, y, w, h, expand_ratio, *input_size) + w = max(w, 1) + h = max(h, 1) + return CropTransform(x, y, w, h, input_size[1], input_size[0]) + + +class RandomResizedCrop(Augmentation): + def __init__( + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation=Image.BILINEAR, + ): + if isinstance(size, int): + size = (size, size) + else: + assert isinstance(size, (tuple, list)) and len(size) == 2 + + self.size = size + + self.scale = scale + self.ratio = ratio + self.interpolation = interpolation + + def get_transform(self, image): + height, width = image.shape[:2] + area = height * width + + log_ratio = np.log(np.array(self.ratio)) + is_success = False + for _ in range(10): + target_area = area * np.random.uniform(self.scale[0], self.scale[1]) + aspect_ratio = np.exp(np.random.uniform(log_ratio[0], log_ratio[1])) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if 0 < w <= width and 0 < h <= height: + i = np.random.randint(0, width - w + 1) + j = np.random.randint(0, height - h + 1) + + is_success = True + break + + if not is_success: + # Fallback to central crop + in_ratio = float(width) / float(height) + if in_ratio < min(self.ratio): + w = width + h = int(round(w / min(self.ratio))) + elif in_ratio > max(self.ratio): + h = height + w = int(round(h * max(self.ratio))) + else: # whole image + w = 
width + h = height + i = (width - w) // 2 + j = (height - h) // 2 + return TransformList( + [ + CropTransform(i, j, w, h, width, height), + ResizeTransform( + h, w, self.size[1], self.size[0], interp=self.interpolation + ), + ] + ) + + +class CenterCrop(Augmentation): + def __init__(self, size, seg_ignore_label): + if isinstance(size, numbers.Number): + size = (int(size), int(size)) + elif isinstance(size, (tuple, list)) and len(size) == 1: + size = (size[0], size[0]) + self.size = size + self.seg_ignore_label = seg_ignore_label + + def get_transform(self, image): + + image_height, image_width = image.shape[:2] + crop_height, crop_width = self.size + + transforms = [] + if crop_width > image_width or crop_height > image_height: + padding_ltrb = [ + (crop_width - image_width) // 2 if crop_width > image_width else 0, + (crop_height - image_height) // 2 if crop_height > image_height else 0, + (crop_width - image_width + 1) // 2 if crop_width > image_width else 0, + (crop_height - image_height + 1) // 2 + if crop_height > image_height + else 0, + ] + transforms.append( + PadTransform( + *padding_ltrb, + orig_w=image_width, + orig_h=image_height, + seg_pad_value=self.seg_ignore_label + ) + ) + image_width, image_height = ( + image_width + padding_ltrb[0] + padding_ltrb[2], + image_height + padding_ltrb[1] + padding_ltrb[3], + ) + + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + transforms.append( + CropTransform( + crop_left, crop_top, crop_width, crop_height, image_width, image_height + ) + ) + return TransformList(transforms) diff --git a/open_vocab_seg/data/build.py b/open_vocab_seg/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..bcd3b9dcebb86c319b91a632c25bcf7827292c3f --- /dev/null +++ b/open_vocab_seg/data/build.py @@ -0,0 +1,344 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import itertools +import logging +import numpy as np +from collections import Counter +import torch.utils.data +from tabulate import tabulate +from termcolor import colored + +from detectron2.utils.logger import _log_api_usage, log_first_n +from detectron2.data.catalog import DatasetCatalog, MetadataCatalog +import torch.utils.data +from detectron2.config import configurable +from detectron2.data.build import ( + build_batch_data_loader, + trivial_batch_collator, + load_proposals_into_dataset, + filter_images_with_only_crowd_annotations, + filter_images_with_few_keypoints, + print_instances_class_histogram, +) + +from detectron2.data.common import DatasetFromList, MapDataset +from detectron2.data.dataset_mapper import DatasetMapper +from detectron2.data.detection_utils import check_metadata_consistency +from detectron2.data.samplers import ( + InferenceSampler, + RandomSubsetTrainingSampler, + RepeatFactorTrainingSampler, + TrainingSampler, +) + +""" +This file contains the default logic to build a dataloader for training or testing. +""" + +__all__ = [ + "build_detection_train_loader", + "build_detection_test_loader", +] + + +def print_classification_instances_class_histogram(dataset_dicts, class_names): + """ + Args: + dataset_dicts (list[dict]): list of dataset dicts. + class_names (list[str]): list of class names (zero-indexed). + """ + num_classes = len(class_names) + hist_bins = np.arange(num_classes + 1) + histogram = np.zeros((num_classes,), dtype=np.int) + for entry in dataset_dicts: + classes = np.asarray([entry["category_id"]], dtype=np.int) + if len(classes): + assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}" + assert ( + classes.max() < num_classes + ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes" + histogram += np.histogram(classes, bins=hist_bins)[0] + + N_COLS = min(6, len(class_names) * 2) + + def short_name(x): + # make long class names shorter. 
useful for lvis + if len(x) > 13: + return x[:11] + ".." + return x + + data = list( + itertools.chain( + *[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)] + ) + ) + total_num_instances = sum(data[1::2]) + data.extend([None] * (N_COLS - (len(data) % N_COLS))) + if num_classes > 1: + data.extend(["total", total_num_instances]) + data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) + table = tabulate( + data, + headers=["category", "#instances"] * (N_COLS // 2), + tablefmt="pipe", + numalign="left", + stralign="center", + ) + log_first_n( + logging.INFO, + "Distribution of instances among all {} categories:\n".format(num_classes) + + colored(table, "cyan"), + key="message", + ) + + +def wrap_metas(dataset_dict, **kwargs): + def _assign_attr(data_dict: dict, **kwargs): + assert not any( + [key in data_dict for key in kwargs] + ), "Assigned attributes should not exist in the original sample." + data_dict.update(kwargs) + return data_dict + + return [_assign_attr(sample, meta=kwargs) for sample in dataset_dict] + + +def get_detection_dataset_dicts( + names, filter_empty=True, min_keypoints=0, proposal_files=None +): + """ + Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation. + + Args: + names (str or list[str]): a dataset name or a list of dataset names + filter_empty (bool): whether to filter out images without instance annotations + min_keypoints (int): filter out images with fewer keypoints than + `min_keypoints`. Set to 0 to do nothing. + proposal_files (list[str]): if given, a list of object proposal files + that match each dataset in `names`. + + Returns: + list[dict]: a list of dicts following the standard dataset dict format. 
+ """ + if isinstance(names, str): + names = [names] + assert len(names), names + dataset_dicts = [ + wrap_metas(DatasetCatalog.get(dataset_name), dataset_name=dataset_name) + for dataset_name in names + ] + for dataset_name, dicts in zip(names, dataset_dicts): + assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) + + if proposal_files is not None: + assert len(names) == len(proposal_files) + # load precomputed proposals from proposal files + dataset_dicts = [ + load_proposals_into_dataset(dataset_i_dicts, proposal_file) + for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files) + ] + + dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) + + has_instances = "annotations" in dataset_dicts[0] + if filter_empty and has_instances: + dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts) + if min_keypoints > 0 and has_instances: + dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints) + + if has_instances: + try: + class_names = MetadataCatalog.get(names[0]).thing_classes + check_metadata_consistency("thing_classes", names) + print_instances_class_histogram(dataset_dicts, class_names) + except AttributeError: # class names are not available for this dataset + pass + + assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names)) + return dataset_dicts + + +def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None): + if dataset is None: + dataset = get_detection_dataset_dicts( + cfg.DATASETS.TRAIN, + filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, + min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE + if cfg.MODEL.KEYPOINT_ON + else 0, + proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN + if cfg.MODEL.LOAD_PROPOSALS + else None, + ) + _log_api_usage("dataset." 
+ cfg.DATASETS.TRAIN[0]) + + if mapper is None: + mapper = DatasetMapper(cfg, True) + + if sampler is None: + sampler_name = cfg.DATALOADER.SAMPLER_TRAIN + logger = logging.getLogger(__name__) + logger.info("Using training sampler {}".format(sampler_name)) + if sampler_name == "TrainingSampler": + sampler = TrainingSampler(len(dataset)) + elif sampler_name == "RepeatFactorTrainingSampler": + repeat_factors = ( + RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( + dataset, cfg.DATALOADER.REPEAT_THRESHOLD + ) + ) + sampler = RepeatFactorTrainingSampler(repeat_factors) + elif sampler_name == "RandomSubsetTrainingSampler": + sampler = RandomSubsetTrainingSampler( + len(dataset), cfg.DATALOADER.RANDOM_SUBSET_RATIO + ) + else: + raise ValueError("Unknown training sampler: {}".format(sampler_name)) + + return { + "dataset": dataset, + "sampler": sampler, + "mapper": mapper, + "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, + "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, + "num_workers": cfg.DATALOADER.NUM_WORKERS, + } + + +# TODO can allow dataset as an iterable or IterableDataset to make this function more general +@configurable(from_config=_train_loader_from_config) +def build_detection_train_loader( + dataset, + *, + mapper, + sampler=None, + total_batch_size, + aspect_ratio_grouping=True, + num_workers=0, +): + """ + Build a dataloader for object detection with some default features. + This interface is experimental. + + Args: + dataset (list or torch.utils.data.Dataset): a list of dataset dicts, + or a map-style pytorch dataset. They can be obtained by using + :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. + mapper (callable): a callable which takes a sample (dict) from dataset and + returns the format to be consumed by the model. + When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``. 
+ sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces + indices to be applied on ``dataset``. Default to :class:`TrainingSampler`, + which coordinates an infinite random shuffle sequence across all workers. + total_batch_size (int): total batch size across all workers. Batching + simply puts data into a list. + aspect_ratio_grouping (bool): whether to group images with similar + aspect ratio for efficiency. When enabled, it requires each + element in dataset be a dict with keys "width" and "height". + num_workers (int): number of parallel data loading workers + + Returns: + torch.utils.data.DataLoader: + a dataloader. Each output from it is a ``list[mapped_element]`` of length + ``total_batch_size / num_workers``, where ``mapped_element`` is produced + by the ``mapper``. + """ + if isinstance(dataset, list): + dataset = DatasetFromList(dataset, copy=False) + if mapper is not None: + dataset = MapDataset(dataset, mapper) + if sampler is None: + sampler = TrainingSampler(len(dataset)) + assert isinstance(sampler, torch.utils.data.sampler.Sampler) + return build_batch_data_loader( + dataset, + sampler, + total_batch_size, + aspect_ratio_grouping=aspect_ratio_grouping, + num_workers=num_workers, + ) + + +def _test_loader_from_config(cfg, dataset_name, mapper=None): + """ + Uses the given `dataset_name` argument (instead of the names in cfg), because the + standard practice is to evaluate each test set individually (not combining them). 
+ """ + if isinstance(dataset_name, str): + dataset_name = [dataset_name] + + dataset = get_detection_dataset_dicts( + dataset_name, + filter_empty=False, + proposal_files=[ + cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] + for x in dataset_name + ] + if cfg.MODEL.LOAD_PROPOSALS + else None, + ) + if mapper is None: + mapper = DatasetMapper(cfg, False) + return { + "dataset": dataset, + "mapper": mapper, + "num_workers": 0, + "samples_per_gpu": cfg.SOLVER.TEST_IMS_PER_BATCH, + } + + +@configurable(from_config=_test_loader_from_config) +def build_detection_test_loader( + dataset, *, mapper, sampler=None, num_workers=0, samples_per_gpu=1 +): + """ + Similar to `build_detection_train_loader`, but uses a batch size of 1, + and :class:`InferenceSampler`. This sampler coordinates all workers to + produce the exact set of all samples. + This interface is experimental. + + Args: + dataset (list or torch.utils.data.Dataset): a list of dataset dicts, + or a map-style pytorch dataset. They can be obtained by using + :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. + mapper (callable): a callable which takes a sample (dict) from dataset + and returns the format to be consumed by the model. + When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. + sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces + indices to be applied on ``dataset``. Default to :class:`InferenceSampler`, + which splits the dataset across all workers. + num_workers (int): number of parallel data loading workers + + Returns: + DataLoader: a torch DataLoader, that loads the given detection + dataset, with test-time transformation and batching. 
+ + Examples: + :: + data_loader = build_detection_test_loader( + DatasetRegistry.get("my_test"), + mapper=DatasetMapper(...)) + + # or, instantiate with a CfgNode: + data_loader = build_detection_test_loader(cfg, "my_test") + """ + if isinstance(dataset, list): + dataset = DatasetFromList(dataset, copy=False) + if mapper is not None: + dataset = MapDataset(dataset, mapper) + if sampler is None: + sampler = InferenceSampler(len(dataset)) + # Always use 1 image per worker during inference since this is the + # standard when reporting inference time in papers. + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, samples_per_gpu, drop_last=False + ) + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=trivial_batch_collator, + ) + return data_loader + diff --git a/open_vocab_seg/data/dataset_mappers/__init__.py b/open_vocab_seg/data/dataset_mappers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f63cd5c034fcb60af8c78431205ae9b410f33250 --- /dev/null +++ b/open_vocab_seg/data/dataset_mappers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper diff --git a/open_vocab_seg/data/dataset_mappers/mask_former_semantic_dataset_mapper.py b/open_vocab_seg/data/dataset_mappers/mask_former_semantic_dataset_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..2836579942cf91c726cb34cbbd2d137c975bee37 --- /dev/null +++ b/open_vocab_seg/data/dataset_mappers/mask_former_semantic_dataset_mapper.py @@ -0,0 +1,208 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import copy +import logging + +import numpy as np +import torch +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.data import MetadataCatalog +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T +from detectron2.projects.point_rend import ColorAugSSDTransform +from detectron2.structures import BitMasks, Instances + +__all__ = ["MaskFormerSemanticDatasetMapper"] + + +class MaskFormerSemanticDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by MaskFormer for semantic segmentation. + + The callable currently does the following: + + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + + @configurable + def __init__( + self, + is_train=True, + *, + augmentations, + image_format, + ignore_label, + size_divisibility, + ): + """ + NOTE: this interface is experimental. + Args: + is_train: for training or inference + augmentations: a list of augmentations or deterministic transforms to apply + image_format: an image format supported by :func:`detection_utils.read_image`. 
+ ignore_label: the label that is ignored to evaluation + size_divisibility: pad image size to be divisible by this value + """ + self.is_train = is_train + self.tfm_gens = augmentations + self.img_format = image_format + self.ignore_label = ignore_label + self.size_divisibility = size_divisibility + + logger = logging.getLogger(__name__) + mode = "training" if is_train else "inference" + logger.info( + f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}" + ) + + @classmethod + def from_config(cls, cfg, is_train=True): + # Build augmentation + if is_train: + augs = [ + T.ResizeShortestEdge( + cfg.INPUT.MIN_SIZE_TRAIN, + cfg.INPUT.MAX_SIZE_TRAIN, + cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, + ) + ] + if cfg.INPUT.CROP.ENABLED: + augs.append( + T.RandomCrop_CategoryAreaConstraint( + cfg.INPUT.CROP.TYPE, + cfg.INPUT.CROP.SIZE, + cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, + cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + ) + ) + if cfg.INPUT.COLOR_AUG_SSD: + augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) + augs.append(T.RandomFlip()) + + # Assume always applies to the training set. + dataset_names = cfg.DATASETS.TRAIN + else: + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + sample_style = "choice" + augs = [T.ResizeShortestEdge(min_size, max_size, sample_style)] + dataset_names = cfg.DATASETS.TEST + meta = MetadataCatalog.get(dataset_names[0]) + ignore_label = meta.ignore_label + + ret = { + "is_train": is_train, + "augmentations": augs, + "image_format": cfg.INPUT.FORMAT, + "ignore_label": ignore_label, + "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY if is_train else -1, + } + return ret + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + + Returns: + dict: a format that builtin models in detectron2 accept + """ + # assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 
+ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + if "sem_seg_file_name" in dataset_dict: + # PyTorch transformation not implemented for uint16, so converting it to double first + sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype( + "double" + ) + else: + sem_seg_gt = None + + if sem_seg_gt is None: + raise ValueError( + "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( + dataset_dict["file_name"] + ) + ) + + aug_input = T.AugInput(image, sem_seg=sem_seg_gt) + aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) + image = aug_input.image + sem_seg_gt = aug_input.sem_seg + + # Pad image and segmentation label here! + image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + if sem_seg_gt is not None: + sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) + + if self.size_divisibility > 0: + image_size = (image.shape[-2], image.shape[-1]) + padding_size = [ + 0, + self.size_divisibility - image_size[1], + 0, + self.size_divisibility - image_size[0], + ] + image = F.pad(image, padding_size, value=128).contiguous() + if sem_seg_gt is not None: + sem_seg_gt = F.pad( + sem_seg_gt, padding_size, value=self.ignore_label + ).contiguous() + + image_shape = (image.shape[-2], image.shape[-1]) # h, w + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = image + + if sem_seg_gt is not None: + dataset_dict["sem_seg"] = sem_seg_gt.long() + + if "annotations" in dataset_dict: + raise ValueError( + "Semantic segmentation dataset should not have 'annotations'." 
+ ) + + # Prepare per-category binary masks + if sem_seg_gt is not None: + sem_seg_gt = sem_seg_gt.numpy() + instances = Instances(image_shape) + classes = np.unique(sem_seg_gt) + # remove ignored region + classes = classes[classes != self.ignore_label] + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + + masks = [] + for class_id in classes: + masks.append(sem_seg_gt == class_id) + + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros( + (0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]) + ) + else: + masks = BitMasks( + torch.stack( + [ + torch.from_numpy(np.ascontiguousarray(x.copy())) + for x in masks + ] + ) + ) + instances.gt_masks = masks.tensor + + dataset_dict["instances"] = instances + + return dataset_dict diff --git a/open_vocab_seg/data/datasets/__init__.py b/open_vocab_seg/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..352792b6fcdbffefa229d5d67a5c7375769fa345 --- /dev/null +++ b/open_vocab_seg/data/datasets/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from . import register_coco_stuff, register_voc_seg +from . import register_cc3m +from . import register_ade20k_full +from . import register_pascal_context \ No newline at end of file diff --git a/open_vocab_seg/data/datasets/csv_data.py b/open_vocab_seg/data/datasets/csv_data.py new file mode 100644 index 0000000000000000000000000000000000000000..3a4c9e52b0b792d49c48fe8bc2693be5ea879581 --- /dev/null +++ b/open_vocab_seg/data/datasets/csv_data.py @@ -0,0 +1,459 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
import ast
import json
import logging
import math
import os
import random
import sys
import time
from dataclasses import dataclass
from multiprocessing import Value

import braceexpand
import numpy as np
import pandas as pd
import torch
import torchvision.datasets as datasets
import webdataset as wds
from PIL import Image
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, IterableDataset, get_worker_info
from torch.utils.data.distributed import DistributedSampler
from webdataset.filters import _shuffle
from webdataset.tariterators import base_plus_ext, url_opener, tar_file_expander, valid_sample

try:
    import horovod.torch as hvd
except ImportError:
    hvd = None

from clip import tokenize


class CsvDataset(Dataset):
    """Image/caption pairs read from a CSV (or TSV) manifest.

    Each row supplies an image path (``img_key`` column) and a caption
    (``caption_key`` column). Images are opened lazily in ``__getitem__``
    and captions are CLIP-tokenized.
    """

    def __init__(self, input_filename, transforms, img_key, caption_key, sep="\t"):
        logging.debug(f'Loading csv data from {input_filename}.')
        df = pd.read_csv(input_filename, sep=sep)

        self.images = df[img_key].tolist()
        self.captions = df[caption_key].tolist()
        self.transforms = transforms
        logging.debug('Done loading data.')

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        # NOTE(review): the image is not force-converted to RGB here; the
        # supplied `transforms` presumably handles mode differences — confirm
        # against the preprocessing pipeline used by callers.
        images = self.transforms(Image.open(str(self.images[idx])))
        texts = tokenize([str(self.captions[idx])])[0]
        return images, texts


class SharedEpoch:
    """Epoch counter stored in shared memory so dataloader worker processes
    see updates made by the main training process."""

    def __init__(self, epoch: int = 0):
        self.shared_epoch = Value('i', epoch)

    def set_value(self, epoch):
        self.shared_epoch.value = epoch

    def get_value(self):
        return self.shared_epoch.value


@dataclass
class DataInfo:
    """Bundle of a dataloader plus the objects needed to advance its epoch."""
    dataloader: DataLoader
    sampler: DistributedSampler = None
    shared_epoch: SharedEpoch = None

    def set_epoch(self, epoch):
        """Propagate the new epoch to the shared counter and/or the sampler."""
        if self.shared_epoch is not None:
            self.shared_epoch.set_value(epoch)
        if self.sampler is not None and isinstance(self.sampler, DistributedSampler):
            self.sampler.set_epoch(epoch)


def preprocess_txt(text):
    """CLIP-tokenize a single caption, returning a 1-D token tensor."""
    return tokenize([str(text)])[0]


def get_dataset_size(shards):
    """Return ``(total_samples_or_None, num_shards)`` for a shard spec.

    ``shards`` is a brace-notation URL pattern. The sample count is read
    from a sibling ``sizes.json`` (per-shard counts) or ``__len__`` file;
    if neither exists it is None and the caller must supply the count
    (e.g. via ``--train-num-samples``).
    """
    shards_list = list(braceexpand.braceexpand(shards))
    dir_path = os.path.dirname(shards)
    sizes_filename = os.path.join(dir_path, 'sizes.json')
    len_filename = os.path.join(dir_path, '__len__')
    if os.path.exists(sizes_filename):
        # Context managers close the metadata files promptly (the original
        # leaked the handles from bare open() calls).
        with open(sizes_filename, 'r') as f:
            sizes = json.load(f)
        total_size = sum(int(sizes[os.path.basename(shard)]) for shard in shards_list)
    elif os.path.exists(len_filename):
        # FIXME this used to be eval(open(...)) but that seemed rather unsafe
        with open(len_filename, 'r') as f:
            total_size = ast.literal_eval(f.read())
    else:
        total_size = None  # num samples undefined
    # some common dataset sizes (at time of authors last download)
    # CC3M (train): 2905954
    # CC12M: 10968539
    # LAION-400M: 407332084
    # LAION-2B (english): 2170337258
    num_shards = len(shards_list)
    return total_size, num_shards


def get_imagenet(args, preprocess_fns, split):
    """Build an ImageNet dataloader for ``split`` in {"train", "val", "v2"}.

    For "train", a fixed 50 images per class (1000 classes) are randomly
    subsampled to keep evaluation cheap; "v2" loads ImageNetV2 instead.
    Returns a DataInfo wrapping the dataloader (and sampler, if any).
    """
    assert split in ["train", "val", "v2"]
    is_train = split == "train"
    preprocess_train, preprocess_val = preprocess_fns

    if split == "v2":
        from imagenetv2_pytorch import ImageNetV2Dataset
        dataset = ImageNetV2Dataset(location=args.imagenet_v2, transform=preprocess_val)
    else:
        if is_train:
            data_path = args.imagenet_train
            preprocess_fn = preprocess_train
        else:
            data_path = args.imagenet_val
            preprocess_fn = preprocess_val
        assert data_path

        dataset = datasets.ImageFolder(data_path, transform=preprocess_fn)

    if is_train:
        # Mark at most k=50 samples per class; SubsetRandomSampler then
        # draws only the marked indices.
        idxs = np.zeros(len(dataset.targets))
        target_array = np.array(dataset.targets)
        k = 50
        for c in range(1000):
            m = target_array == c
            n = len(idxs[m])
            arr = np.zeros(n)
            arr[:k] = 1
            np.random.shuffle(arr)
            idxs[m] = arr

        idxs = idxs.astype('int')
        sampler = SubsetRandomSampler(np.where(idxs)[0])
    else:
        sampler = None

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=args.workers,
        sampler=sampler,
    )

    return DataInfo(dataloader=dataloader, sampler=sampler)
def count_samples(dataloader):
    """Iterate a (image, text) dataloader once and return (n_samples, n_batches)."""
    os.environ["WDS_EPOCH"] = "0"
    n_elements, n_batches = 0, 0
    for images, texts in dataloader:
        n_batches += 1
        n_elements += len(images)
        assert len(images) == len(texts)
    return n_elements, n_batches


def filter_no_caption(sample):
    """Keep only webdataset samples that carry a 'txt' (caption) entry."""
    return 'txt' in sample


def log_and_continue(exn):
    """Call in an exception handler to ignore any exception, issue a warning, and continue."""
    logging.warning(f'Handling webdataset error ({repr(exn)}). Ignoring.')
    return True


def group_by_keys_nothrow(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
    """Return function over iterator that groups key, value pairs into samples.

    :param keys: function that splits the key into key and extension (base_plus_ext)
    :param lcase: convert suffixes to lower case (Default value = True)
    """
    current_sample = None
    for filesample in data:
        assert isinstance(filesample, dict)
        fname, value = filesample["fname"], filesample["data"]
        prefix, suffix = keys(fname)
        if prefix is None:
            continue
        if lcase:
            suffix = suffix.lower()
        # FIXME webdataset version throws if suffix in current_sample, but we have a potential for
        # this happening in the current LAION400m dataset if a tar ends with same prefix as the next
        # begins, rare, but can happen since prefix aren't unique across tar files in that dataset
        if current_sample is None or prefix != current_sample["__key__"] or suffix in current_sample:
            if valid_sample(current_sample):
                yield current_sample
            current_sample = dict(__key__=prefix, __url__=filesample["__url__"])
        if suffixes is None or suffix in suffixes:
            current_sample[suffix] = value
    if valid_sample(current_sample):
        yield current_sample


def tarfile_to_samples_nothrow(src, handler=log_and_continue):
    """Expand tar shards into grouped samples without raising on bad entries."""
    # NOTE this is a re-impl of the webdataset impl with group_by_keys that doesn't throw
    streams = url_opener(src, handler=handler)
    files = tar_file_expander(streams, handler=handler)
    samples = group_by_keys_nothrow(files, handler=handler)
    return samples


def pytorch_worker_seed():
    """get dataloader worker seed from pytorch"""
    worker_info = get_worker_info()
    if worker_info is not None:
        # favour the seed already created for pytorch dataloader workers if it exists
        return worker_info.seed
    # fallback to wds rank based seed
    return wds.utils.pytorch_worker_seed()


# Shuffle-buffer sizes: shards are shuffled with a small buffer, individual
# samples with a larger one.
_SHARD_SHUFFLE_SIZE = 2000
_SHARD_SHUFFLE_INITIAL = 500
_SAMPLE_SHUFFLE_SIZE = 5000
_SAMPLE_SHUFFLE_INITIAL = 1000


class detshuffle2(wds.PipelineStage):
    """Deterministic shuffle stage whose RNG seed is derived from (seed, epoch),
    so every rank/worker reshuffles identically per epoch."""

    def __init__(
        self,
        bufsize=1000,
        initial=100,
        seed=0,
        epoch=-1,
    ):
        self.bufsize = bufsize
        self.initial = initial
        self.seed = seed
        self.epoch = epoch

    def run(self, src):
        if isinstance(self.epoch, SharedEpoch):
            epoch = self.epoch.get_value()
        else:
            # NOTE: this is epoch tracking is problematic in a multiprocess (dataloader workers or train)
            # situation as different workers may wrap at different times (or not at all).
            self.epoch += 1
            epoch = self.epoch
        rng = random.Random()
        if self.seed < 0:
            # Negative seed: derive from the per-worker pytorch seed instead.
            seed = pytorch_worker_seed() + epoch
        else:
            seed = self.seed + epoch
        rng.seed(seed)
        return _shuffle(src, self.bufsize, self.initial, rng)


class ResampledShards2(IterableDataset):
    """An iterable dataset yielding a list of urls."""

    def __init__(
        self,
        urls,
        nshards=sys.maxsize,
        worker_seed=None,
        deterministic=False,
        epoch=-1,
    ):
        """Sample shards from the shard list with replacement.

        :param urls: a list of URLs as a Python list or brace notation string
        """
        super().__init__()
        urls = wds.shardlists.expand_urls(urls)
        self.urls = urls
        assert isinstance(self.urls[0], str)
        self.nshards = nshards
        self.rng = random.Random()
        self.worker_seed = pytorch_worker_seed if worker_seed is None else worker_seed
        self.deterministic = deterministic
        self.epoch = epoch

    def __iter__(self):
        """Return an iterator over the shards."""
        if isinstance(self.epoch, SharedEpoch):
            epoch = self.epoch.get_value()
        else:
            # NOTE: this is epoch tracking is problematic in a multiprocess (dataloader workers or train)
            # situation as different workers may wrap at different times (or not at all).
            self.epoch += 1
            epoch = self.epoch
        if self.deterministic:
            # reset seed w/ epoch if deterministic, worker seed should be deterministic due to arg.seed
            self.rng.seed(self.worker_seed() + epoch)
        for _ in range(self.nshards):
            yield dict(url=self.rng.choice(self.urls))


def get_wds_dataset(args, preprocess_img, is_train, epoch=0, floor=False):
    """Build a webdataset pipeline + WebLoader for tar-shard data.

    Training pipelines shuffle shards and samples (or resample shards with
    replacement when ``args.dataset_resampled``); eval pipelines just split
    shards across workers. Batch counts are rounded so each worker/node sees
    the same number of full batches. Returns a DataInfo with a SharedEpoch
    used to re-seed shuffling each epoch.
    """
    input_shards = args.train_data if is_train else args.val_data
    assert input_shards is not None
    resampled = getattr(args, 'dataset_resampled', False) and is_train

    num_samples, num_shards = get_dataset_size(input_shards)
    if not num_samples:
        if is_train:
            num_samples = args.train_num_samples
            if not num_samples:
                raise RuntimeError(
                    'Currently, number of dataset samples must be specified for training dataset. '
                    'Please specify via `--train-num-samples` if no dataset length info present.')
        else:
            num_samples = args.val_num_samples or 0  # eval will just exhaust the iterator if not specified

    shared_epoch = SharedEpoch(epoch=epoch)  # create a shared epoch store to sync epoch to dataloader worker proc
    if resampled:
        pipeline = [ResampledShards2(input_shards, deterministic=True, epoch=shared_epoch)]
    else:
        pipeline = [wds.SimpleShardList(input_shards)]

    # at this point we have an iterator over all the shards
    if is_train:
        if not resampled:
            pipeline.extend([
                detshuffle2(
                    bufsize=_SHARD_SHUFFLE_SIZE,
                    initial=_SHARD_SHUFFLE_INITIAL,
                    seed=args.seed,
                    epoch=shared_epoch,
                ),
                wds.split_by_node,
                wds.split_by_worker,
            ])
        pipeline.extend([
            # at this point, we have an iterator over the shards assigned to each worker at each node
            tarfile_to_samples_nothrow,  # wds.tarfile_to_samples(handler=log_and_continue),
            wds.shuffle(
                bufsize=_SAMPLE_SHUFFLE_SIZE,
                initial=_SAMPLE_SHUFFLE_INITIAL,
            ),
        ])
    else:
        pipeline.extend([
            wds.split_by_worker,
            # at this point, we have an iterator over the shards assigned to each worker
            wds.tarfile_to_samples(handler=log_and_continue),
        ])
    pipeline.extend([
        wds.select(filter_no_caption),
        wds.decode("pilrgb", handler=log_and_continue),
        wds.rename(image="jpg;png", text="txt"),
        wds.map_dict(image=preprocess_img, text=preprocess_txt),
        wds.to_tuple("image", "text"),
        wds.batched(args.batch_size, partial=not is_train),
    ])

    dataset = wds.DataPipeline(*pipeline)
    if is_train:
        if not resampled:
            assert num_shards >= args.workers * args.world_size, 'number of shards must be >= total workers'
        # roll over and repeat a few samples to get same number of full batches on each node
        round_fn = math.floor if floor else math.ceil
        global_batch_size = args.batch_size * args.world_size
        num_batches = round_fn(num_samples / global_batch_size)
        num_workers = max(1, args.workers)
        num_worker_batches = round_fn(num_batches / num_workers)  # per dataloader worker
        num_batches = num_worker_batches * num_workers
        num_samples = num_batches * global_batch_size
        dataset = dataset.with_epoch(num_worker_batches)  # each worker is iterating over this
    else:
        # last batches are partial, eval is done on single (master) node
        num_batches = math.ceil(num_samples / args.batch_size)

    dataloader = wds.WebLoader(
        dataset,
        batch_size=None,
        shuffle=False,
        num_workers=args.workers,
        persistent_workers=True,
    )

    # FIXME not clear which approach is better, with_epoch before vs after dataloader?
    # hoping to resolve via https://github.com/webdataset/webdataset/issues/169
    # if is_train:
    #     # roll over and repeat a few samples to get same number of full batches on each node
    #     global_batch_size = args.batch_size * args.world_size
    #     num_batches = math.ceil(num_samples / global_batch_size)
    #     num_workers = max(1, args.workers)
    #     num_batches = math.ceil(num_batches / num_workers) * num_workers
    #     num_samples = num_batches * global_batch_size
    #     dataloader = dataloader.with_epoch(num_batches)
    # else:
    #     # last batches are partial, eval is done on single (master) node
    #     num_batches = math.ceil(num_samples / args.batch_size)

    # add meta-data to dataloader instance for convenience
    dataloader.num_batches = num_batches
    dataloader.num_samples = num_samples

    return DataInfo(dataloader=dataloader, shared_epoch=shared_epoch)


def get_csv_dataset(args, preprocess_fn, is_train, epoch=0):
    """Build a DataLoader over a CsvDataset; distributed-sharded when training.

    ``epoch`` is accepted for signature parity with get_wds_dataset but unused.
    """
    input_filename = args.train_data if is_train else args.val_data
    assert input_filename
    dataset = CsvDataset(
        input_filename,
        preprocess_fn,
        img_key=args.csv_img_key,
        caption_key=args.csv_caption_key,
        sep=args.csv_separator)
    num_samples = len(dataset)
    sampler = DistributedSampler(dataset) if args.distributed and is_train else None
    shuffle = is_train and sampler is None

    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=shuffle,
        num_workers=args.workers,
        pin_memory=True,
        sampler=sampler,
        drop_last=is_train,
    )
    dataloader.num_samples = num_samples
    dataloader.num_batches = len(dataloader)

    return DataInfo(dataloader, sampler)


def get_dataset_fn(data_path, dataset_type):
    """Map a dataset type ("webdataset" | "csv" | "auto") to its builder.

    "auto" infers the type from the file extension of ``data_path``.
    Raises ValueError for unknown types or un-inferable extensions.
    """
    if dataset_type == "webdataset":
        return get_wds_dataset
    elif dataset_type == "csv":
        return get_csv_dataset
    elif dataset_type == "auto":
        ext = data_path.split('.')[-1]
        if ext in ['csv', 'tsv']:
            return get_csv_dataset
        elif ext in ['tar']:
            return get_wds_dataset
        else:
            raise ValueError(
                f"Tried to figure out dataset type, but failed for extension {ext}.")
    else:
        raise ValueError(f"Unsupported dataset type: {dataset_type}")


def get_data(args, preprocess_fns, epoch=0):
    """Assemble the dict of DataInfo objects requested by ``args``.

    Keys: "train", "val", "imagenet-val", "imagenet-v2" — each present only
    when the corresponding args field is set.
    """
    preprocess_train, preprocess_val = preprocess_fns
    data = {}

    if args.train_data:
        data["train"] = get_dataset_fn(args.train_data, args.dataset_type)(
            args, preprocess_train, is_train=True, epoch=epoch)

    if args.val_data:
        data["val"] = get_dataset_fn(args.val_data, args.dataset_type)(
            args, preprocess_val, is_train=False)

    if args.imagenet_val is not None:
        data["imagenet-val"] = get_imagenet(args, preprocess_fns, "val")

    if args.imagenet_v2 is not None:
        data["imagenet-v2"] = get_imagenet(args, preprocess_fns, "v2")

    return data


# ---- next file in the original dump: open_vocab_seg/data/datasets/register_ade20k_full.py ----
# Copyright (c) Facebook, Inc. and its affiliates.
+import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + +ADE20K_SEM_SEG_FULL_CATEGORIES = [ + {"name": "wall", "id": 2978, "trainId": 0}, + {"name": "building, edifice", "id": 312, "trainId": 1}, + {"name": "sky", "id": 2420, "trainId": 2}, + {"name": "tree", "id": 2855, "trainId": 3}, + {"name": "road, route", "id": 2131, "trainId": 4}, + {"name": "floor, flooring", "id": 976, "trainId": 5}, + {"name": "ceiling", "id": 447, "trainId": 6}, + {"name": "bed", "id": 165, "trainId": 7}, + {"name": "sidewalk, pavement", "id": 2377, "trainId": 8}, + {"name": "earth, ground", "id": 838, "trainId": 9}, + {"name": "cabinet", "id": 350, "trainId": 10}, + { + "name": "person, individual, someone, somebody, mortal, soul", + "id": 1831, + "trainId": 11, + }, + {"name": "grass", "id": 1125, "trainId": 12}, + {"name": "windowpane, window", "id": 3055, "trainId": 13}, + {"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14}, + {"name": "mountain, mount", "id": 1610, "trainId": 15}, + {"name": "plant, flora, plant life", "id": 1910, "trainId": 16}, + {"name": "table", "id": 2684, "trainId": 17}, + {"name": "chair", "id": 471, "trainId": 18}, + {"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19}, + {"name": "door", "id": 774, "trainId": 20}, + {"name": "sofa, couch, lounge", "id": 2473, "trainId": 21}, + {"name": "sea", "id": 2264, "trainId": 22}, + {"name": "painting, picture", "id": 1735, "trainId": 23}, + {"name": "water", "id": 2994, "trainId": 24}, + {"name": "mirror", "id": 1564, "trainId": 25}, + {"name": "house", "id": 1276, "trainId": 26}, + {"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27}, + {"name": "shelf", "id": 2329, "trainId": 28}, + {"name": "armchair", "id": 57, "trainId": 29}, + {"name": "fence, fencing", "id": 907, "trainId": 30}, + {"name": "field", "id": 913, "trainId": 31}, + {"name": "lamp", "id": 1395, "trainId": 32}, + {"name": 
"rock, stone", "id": 2138, "trainId": 33}, + {"name": "seat", "id": 2272, "trainId": 34}, + {"name": "river", "id": 2128, "trainId": 35}, + {"name": "desk", "id": 724, "trainId": 36}, + {"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37}, + {"name": "railing, rail", "id": 2053, "trainId": 38}, + {"name": "signboard, sign", "id": 2380, "trainId": 39}, + {"name": "cushion", "id": 689, "trainId": 40}, + {"name": "path", "id": 1788, "trainId": 41}, + {"name": "work surface", "id": 3087, "trainId": 42}, + {"name": "stairs, steps", "id": 2530, "trainId": 43}, + {"name": "column, pillar", "id": 581, "trainId": 44}, + {"name": "sink", "id": 2388, "trainId": 45}, + {"name": "wardrobe, closet, press", "id": 2985, "trainId": 46}, + {"name": "snow", "id": 2454, "trainId": 47}, + {"name": "refrigerator, icebox", "id": 2096, "trainId": 48}, + {"name": "base, pedestal, stand", "id": 137, "trainId": 49}, + {"name": "bridge, span", "id": 294, "trainId": 50}, + {"name": "blind, screen", "id": 212, "trainId": 51}, + {"name": "runway", "id": 2185, "trainId": 52}, + {"name": "cliff, drop, drop-off", "id": 524, "trainId": 53}, + {"name": "sand", "id": 2212, "trainId": 54}, + {"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55}, + {"name": "pillow", "id": 1869, "trainId": 56}, + {"name": "screen door, screen", "id": 2251, "trainId": 57}, + { + "name": "toilet, can, commode, crapper, pot, potty, stool, throne", + "id": 2793, + "trainId": 58, + }, + {"name": "skyscraper", "id": 2423, "trainId": 59}, + {"name": "grandstand, covered stand", "id": 1121, "trainId": 60}, + {"name": "box", "id": 266, "trainId": 61}, + {"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62}, + {"name": "palm, palm tree", "id": 1744, "trainId": 63}, + {"name": "double door", "id": 783, "trainId": 64}, + {"name": "coffee table, cocktail table", "id": 571, "trainId": 65}, + {"name": "counter", "id": 627, "trainId": 66}, + {"name": "countertop", "id": 629, 
"trainId": 67}, + {"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68}, + {"name": "kitchen island", "id": 1374, "trainId": 69}, + {"name": "boat", "id": 223, "trainId": 70}, + {"name": "waterfall, falls", "id": 3016, "trainId": 71}, + { + "name": "stove, kitchen stove, range, kitchen range, cooking stove", + "id": 2598, + "trainId": 72, + }, + {"name": "flower", "id": 978, "trainId": 73}, + {"name": "bookcase", "id": 239, "trainId": 74}, + {"name": "controls", "id": 608, "trainId": 75}, + {"name": "book", "id": 236, "trainId": 76}, + {"name": "stairway, staircase", "id": 2531, "trainId": 77}, + {"name": "streetlight, street lamp", "id": 2616, "trainId": 78}, + { + "name": "computer, computing machine, computing device, data processor, electronic computer, information processing system", + "id": 591, + "trainId": 79, + }, + { + "name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle", + "id": 327, + "trainId": 80, + }, + {"name": "swivel chair", "id": 2679, "trainId": 81}, + {"name": "light, light source", "id": 1451, "trainId": 82}, + {"name": "bench", "id": 181, "trainId": 83}, + {"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84}, + {"name": "towel", "id": 2821, "trainId": 85}, + {"name": "fountain", "id": 1023, "trainId": 86}, + {"name": "embankment", "id": 855, "trainId": 87}, + { + "name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box", + "id": 2733, + "trainId": 88, + }, + {"name": "van", "id": 2928, "trainId": 89}, + {"name": "hill", "id": 1240, "trainId": 90}, + {"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91}, + {"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92}, + {"name": "truck, motortruck", "id": 2880, "trainId": 93}, + {"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94}, + {"name": "pole", "id": 1936, "trainId": 95}, + {"name": 
"tower", "id": 2828, "trainId": 96}, + {"name": "court", "id": 631, "trainId": 97}, + {"name": "ball", "id": 103, "trainId": 98}, + { + "name": "aircraft carrier, carrier, flattop, attack aircraft carrier", + "id": 3144, + "trainId": 99, + }, + {"name": "buffet, counter, sideboard", "id": 308, "trainId": 100}, + {"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101}, + {"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102}, + {"name": "minibike, motorbike", "id": 1563, "trainId": 103}, + { + "name": "animal, animate being, beast, brute, creature, fauna", + "id": 29, + "trainId": 104, + }, + {"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105}, + {"name": "step, stair", "id": 2569, "trainId": 106}, + {"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107}, + {"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108}, + {"name": "doorframe, doorcase", "id": 778, "trainId": 109}, + {"name": "sconce", "id": 2243, "trainId": 110}, + {"name": "pond", "id": 1941, "trainId": 111}, + {"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112}, + { + "name": "bannister, banister, balustrade, balusters, handrail", + "id": 120, + "trainId": 113, + }, + {"name": "bag", "id": 95, "trainId": 114}, + {"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115}, + {"name": "gazebo", "id": 1087, "trainId": 116}, + {"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117}, + {"name": "land, ground, soil", "id": 1401, "trainId": 118}, + {"name": "board, plank", "id": 220, "trainId": 119}, + {"name": "arcade machine", "id": 47, "trainId": 120}, + {"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121}, + {"name": "bar", "id": 123, "trainId": 122}, + {"name": "stall, stand, sales booth", "id": 2537, "trainId": 123}, + {"name": "playground", "id": 1927, "trainId": 124}, + {"name": "ship", "id": 2337, "trainId": 125}, + {"name": 
"ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126}, + { + "name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", + "id": 64, + "trainId": 127, + }, + {"name": "bottle", "id": 249, "trainId": 128}, + {"name": "cradle", "id": 642, "trainId": 129}, + {"name": "pot, flowerpot", "id": 1981, "trainId": 130}, + { + "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", + "id": 609, + "trainId": 131, + }, + {"name": "train, railroad train", "id": 2840, "trainId": 132}, + {"name": "stool", "id": 2586, "trainId": 133}, + {"name": "lake", "id": 1393, "trainId": 134}, + {"name": "tank, storage tank", "id": 2704, "trainId": 135}, + {"name": "ice, water ice", "id": 1304, "trainId": 136}, + {"name": "basket, handbasket", "id": 146, "trainId": 137}, + {"name": "manhole", "id": 1494, "trainId": 138}, + {"name": "tent, collapsible shelter", "id": 2739, "trainId": 139}, + {"name": "canopy", "id": 389, "trainId": 140}, + {"name": "microwave, microwave oven", "id": 1551, "trainId": 141}, + {"name": "barrel, cask", "id": 131, "trainId": 142}, + {"name": "dirt track", "id": 738, "trainId": 143}, + {"name": "beam", "id": 161, "trainId": 144}, + {"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145}, + {"name": "plate", "id": 1919, "trainId": 146}, + {"name": "screen, crt screen", "id": 3109, "trainId": 147}, + {"name": "ruins", "id": 2179, "trainId": 148}, + {"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149}, + {"name": "blanket, cover", "id": 206, "trainId": 150}, + {"name": "plaything, toy", "id": 1930, "trainId": 151}, + {"name": "food, solid food", "id": 1002, "trainId": 152}, + {"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153}, + {"name": "oven", "id": 1708, "trainId": 154}, + {"name": "stage", "id": 2526, "trainId": 155}, + {"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156}, 
+ {"name": "umbrella", "id": 2901, "trainId": 157}, + {"name": "sculpture", "id": 2262, "trainId": 158}, + {"name": "aqueduct", "id": 44, "trainId": 159}, + {"name": "container", "id": 597, "trainId": 160}, + {"name": "scaffolding, staging", "id": 2235, "trainId": 161}, + {"name": "hood, exhaust hood", "id": 1260, "trainId": 162}, + {"name": "curb, curbing, kerb", "id": 682, "trainId": 163}, + {"name": "roller coaster", "id": 2151, "trainId": 164}, + {"name": "horse, equus caballus", "id": 3107, "trainId": 165}, + {"name": "catwalk", "id": 432, "trainId": 166}, + {"name": "glass, drinking glass", "id": 1098, "trainId": 167}, + {"name": "vase", "id": 2932, "trainId": 168}, + {"name": "central reservation", "id": 461, "trainId": 169}, + {"name": "carousel", "id": 410, "trainId": 170}, + {"name": "radiator", "id": 2046, "trainId": 171}, + {"name": "closet", "id": 533, "trainId": 172}, + {"name": "machine", "id": 1481, "trainId": 173}, + {"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174}, + {"name": "fan", "id": 894, "trainId": 175}, + {"name": "inflatable bounce game", "id": 1322, "trainId": 176}, + {"name": "pitch", "id": 1891, "trainId": 177}, + {"name": "paper", "id": 1756, "trainId": 178}, + {"name": "arcade, colonnade", "id": 49, "trainId": 179}, + {"name": "hot tub", "id": 1272, "trainId": 180}, + {"name": "helicopter", "id": 1229, "trainId": 181}, + {"name": "tray", "id": 2850, "trainId": 182}, + {"name": "partition, divider", "id": 1784, "trainId": 183}, + {"name": "vineyard", "id": 2962, "trainId": 184}, + {"name": "bowl", "id": 259, "trainId": 185}, + {"name": "bullring", "id": 319, "trainId": 186}, + {"name": "flag", "id": 954, "trainId": 187}, + {"name": "pot", "id": 1974, "trainId": 188}, + {"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189}, + {"name": "shower", "id": 2356, "trainId": 190}, + { + "name": "bag, traveling bag, travelling bag, grip, suitcase", + "id": 97, + "trainId": 191, + }, + {"name": 
"bulletin board, notice board", "id": 318, "trainId": 192}, + {"name": "confessional booth", "id": 592, "trainId": 193}, + {"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194}, + {"name": "forest", "id": 1017, "trainId": 195}, + {"name": "elevator door", "id": 851, "trainId": 196}, + {"name": "laptop, laptop computer", "id": 1407, "trainId": 197}, + {"name": "instrument panel", "id": 1332, "trainId": 198}, + {"name": "bucket, pail", "id": 303, "trainId": 199}, + {"name": "tapestry, tapis", "id": 2714, "trainId": 200}, + {"name": "platform", "id": 1924, "trainId": 201}, + {"name": "jacket", "id": 1346, "trainId": 202}, + {"name": "gate", "id": 1081, "trainId": 203}, + {"name": "monitor, monitoring device", "id": 1583, "trainId": 204}, + { + "name": "telephone booth, phone booth, call box, telephone box, telephone kiosk", + "id": 2727, + "trainId": 205, + }, + {"name": "spotlight, spot", "id": 2509, "trainId": 206}, + {"name": "ring", "id": 2123, "trainId": 207}, + {"name": "control panel", "id": 602, "trainId": 208}, + {"name": "blackboard, chalkboard", "id": 202, "trainId": 209}, + {"name": "air conditioner, air conditioning", "id": 10, "trainId": 210}, + {"name": "chest", "id": 490, "trainId": 211}, + {"name": "clock", "id": 530, "trainId": 212}, + {"name": "sand dune", "id": 2213, "trainId": 213}, + {"name": "pipe, pipage, piping", "id": 1884, "trainId": 214}, + {"name": "vault", "id": 2934, "trainId": 215}, + {"name": "table football", "id": 2687, "trainId": 216}, + {"name": "cannon", "id": 387, "trainId": 217}, + {"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218}, + {"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219}, + {"name": "statue", "id": 2547, "trainId": 220}, + { + "name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", + "id": 1474, + "trainId": 221, + }, + {"name": "exhibitor", "id": 877, "trainId": 222}, + {"name": "ladder", "id": 1391, "trainId": 223}, + {"name": 
"carport", "id": 414, "trainId": 224}, + {"name": "dam", "id": 698, "trainId": 225}, + {"name": "pulpit", "id": 2019, "trainId": 226}, + {"name": "skylight, fanlight", "id": 2422, "trainId": 227}, + {"name": "water tower", "id": 3010, "trainId": 228}, + {"name": "grill, grille, grillwork", "id": 1139, "trainId": 229}, + {"name": "display board", "id": 753, "trainId": 230}, + {"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231}, + {"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232}, + {"name": "ice rink", "id": 1301, "trainId": 233}, + {"name": "fruit", "id": 1033, "trainId": 234}, + {"name": "patio", "id": 1789, "trainId": 235}, + {"name": "vending machine", "id": 2939, "trainId": 236}, + {"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237}, + {"name": "net", "id": 1652, "trainId": 238}, + { + "name": "backpack, back pack, knapsack, packsack, rucksack, haversack", + "id": 90, + "trainId": 239, + }, + {"name": "jar", "id": 1349, "trainId": 240}, + {"name": "track", "id": 2830, "trainId": 241}, + {"name": "magazine", "id": 1485, "trainId": 242}, + {"name": "shutter", "id": 2370, "trainId": 243}, + {"name": "roof", "id": 2155, "trainId": 244}, + {"name": "banner, streamer", "id": 118, "trainId": 245}, + {"name": "landfill", "id": 1402, "trainId": 246}, + {"name": "post", "id": 1957, "trainId": 247}, + {"name": "altarpiece, reredos", "id": 3130, "trainId": 248}, + {"name": "hat, chapeau, lid", "id": 1197, "trainId": 249}, + {"name": "arch, archway", "id": 52, "trainId": 250}, + {"name": "table game", "id": 2688, "trainId": 251}, + {"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252}, + {"name": "document, written document, papers", "id": 762, "trainId": 253}, + {"name": "dome", "id": 772, "trainId": 254}, + {"name": "pier", "id": 1857, "trainId": 255}, + {"name": "shanties", "id": 2315, "trainId": 256}, + {"name": "forecourt", "id": 1016, "trainId": 257}, + {"name": "crane", "id": 643, "trainId": 258}, 
+ {"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259}, + {"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260}, + {"name": "drawing", "id": 791, "trainId": 261}, + {"name": "cabin", "id": 349, "trainId": 262}, + { + "name": "ad, advertisement, advertizement, advertising, advertizing, advert", + "id": 6, + "trainId": 263, + }, + {"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264}, + {"name": "monument", "id": 1587, "trainId": 265}, + {"name": "henhouse", "id": 1233, "trainId": 266}, + {"name": "cockpit", "id": 559, "trainId": 267}, + {"name": "heater, warmer", "id": 1223, "trainId": 268}, + {"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269}, + {"name": "pool", "id": 1943, "trainId": 270}, + {"name": "elevator, lift", "id": 853, "trainId": 271}, + {"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272}, + {"name": "labyrinth", "id": 1390, "trainId": 273}, + {"name": "text, textual matter", "id": 2748, "trainId": 274}, + {"name": "printer", "id": 2007, "trainId": 275}, + {"name": "mezzanine, first balcony", "id": 1546, "trainId": 276}, + {"name": "mattress", "id": 1513, "trainId": 277}, + {"name": "straw", "id": 2600, "trainId": 278}, + {"name": "stalls", "id": 2538, "trainId": 279}, + {"name": "patio, terrace", "id": 1790, "trainId": 280}, + {"name": "billboard, hoarding", "id": 194, "trainId": 281}, + {"name": "bus stop", "id": 326, "trainId": 282}, + {"name": "trouser, pant", "id": 2877, "trainId": 283}, + {"name": "console table, console", "id": 594, "trainId": 284}, + {"name": "rack", "id": 2036, "trainId": 285}, + {"name": "notebook", "id": 1662, "trainId": 286}, + {"name": "shrine", "id": 2366, "trainId": 287}, + {"name": "pantry", "id": 1754, "trainId": 288}, + {"name": "cart", "id": 418, "trainId": 289}, + {"name": "steam shovel", "id": 2553, "trainId": 290}, + {"name": "porch", "id": 1951, "trainId": 291}, + {"name": "postbox, mailbox, letter 
box", "id": 1963, "trainId": 292}, + {"name": "figurine, statuette", "id": 918, "trainId": 293}, + {"name": "recycling bin", "id": 2086, "trainId": 294}, + {"name": "folding screen", "id": 997, "trainId": 295}, + {"name": "telescope", "id": 2731, "trainId": 296}, + {"name": "deck chair, beach chair", "id": 704, "trainId": 297}, + {"name": "kennel", "id": 1365, "trainId": 298}, + {"name": "coffee maker", "id": 569, "trainId": 299}, + {"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300}, + {"name": "fish", "id": 948, "trainId": 301}, + {"name": "easel", "id": 839, "trainId": 302}, + {"name": "artificial golf green", "id": 63, "trainId": 303}, + {"name": "iceberg", "id": 1305, "trainId": 304}, + {"name": "candlestick, candle holder", "id": 378, "trainId": 305}, + {"name": "shower stall, shower bath", "id": 2362, "trainId": 306}, + {"name": "television stand", "id": 2734, "trainId": 307}, + { + "name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle", + "id": 2982, + "trainId": 308, + }, + {"name": "skeleton", "id": 2398, "trainId": 309}, + {"name": "grand piano, grand", "id": 1119, "trainId": 310}, + {"name": "candy, confect", "id": 382, "trainId": 311}, + {"name": "grille door", "id": 1141, "trainId": 312}, + {"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313}, + {"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314}, + {"name": "shoe", "id": 2341, "trainId": 315}, + {"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316}, + {"name": "shanty", "id": 2316, "trainId": 317}, + {"name": "structure", "id": 2626, "trainId": 318}, + {"name": "rocking chair, rocker", "id": 3104, "trainId": 319}, + {"name": "bird", "id": 198, "trainId": 320}, + {"name": "place mat", "id": 1896, "trainId": 321}, + {"name": "tomb", "id": 2800, "trainId": 322}, + {"name": "big top", "id": 190, "trainId": 323}, + { + "name": "gas pump, gasoline pump, petrol pump, island dispenser", + 
"id": 3131, + "trainId": 324, + }, + {"name": "lockers", "id": 1463, "trainId": 325}, + {"name": "cage", "id": 357, "trainId": 326}, + {"name": "finger", "id": 929, "trainId": 327}, + {"name": "bleachers", "id": 209, "trainId": 328}, + {"name": "ferris wheel", "id": 912, "trainId": 329}, + {"name": "hairdresser chair", "id": 1164, "trainId": 330}, + {"name": "mat", "id": 1509, "trainId": 331}, + {"name": "stands", "id": 2539, "trainId": 332}, + {"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333}, + { + "name": "streetcar, tram, tramcar, trolley, trolley car", + "id": 2615, + "trainId": 334, + }, + {"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335}, + {"name": "dummy", "id": 818, "trainId": 336}, + {"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337}, + {"name": "sand trap", "id": 2217, "trainId": 338}, + {"name": "shop, store", "id": 2347, "trainId": 339}, + {"name": "table cloth", "id": 2686, "trainId": 340}, + {"name": "service station", "id": 2300, "trainId": 341}, + {"name": "coffin", "id": 572, "trainId": 342}, + {"name": "drawer", "id": 789, "trainId": 343}, + {"name": "cages", "id": 358, "trainId": 344}, + {"name": "slot machine, coin machine", "id": 2443, "trainId": 345}, + {"name": "balcony", "id": 101, "trainId": 346}, + {"name": "volleyball court", "id": 2969, "trainId": 347}, + {"name": "table tennis", "id": 2692, "trainId": 348}, + {"name": "control table", "id": 606, "trainId": 349}, + {"name": "shirt", "id": 2339, "trainId": 350}, + {"name": "merchandise, ware, product", "id": 1533, "trainId": 351}, + {"name": "railway", "id": 2060, "trainId": 352}, + {"name": "parterre", "id": 1782, "trainId": 353}, + {"name": "chimney", "id": 495, "trainId": 354}, + {"name": "can, tin, tin can", "id": 371, "trainId": 355}, + {"name": "tanks", "id": 2707, "trainId": 356}, + {"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357}, + {"name": "alga, algae", "id": 3156, 
"trainId": 358}, + {"name": "system", "id": 2683, "trainId": 359}, + {"name": "map", "id": 1499, "trainId": 360}, + {"name": "greenhouse", "id": 1135, "trainId": 361}, + {"name": "mug", "id": 1619, "trainId": 362}, + {"name": "barbecue", "id": 125, "trainId": 363}, + {"name": "trailer", "id": 2838, "trainId": 364}, + { + "name": "toilet tissue, toilet paper, bathroom tissue", + "id": 2792, + "trainId": 365, + }, + {"name": "organ", "id": 1695, "trainId": 366}, + {"name": "dishrag, dishcloth", "id": 746, "trainId": 367}, + {"name": "island", "id": 1343, "trainId": 368}, + {"name": "keyboard", "id": 1370, "trainId": 369}, + {"name": "trench", "id": 2858, "trainId": 370}, + {"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371}, + {"name": "steering wheel, wheel", "id": 2565, "trainId": 372}, + {"name": "pitcher, ewer", "id": 1892, "trainId": 373}, + {"name": "goal", "id": 1103, "trainId": 374}, + {"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375}, + {"name": "beds", "id": 170, "trainId": 376}, + {"name": "wood", "id": 3073, "trainId": 377}, + {"name": "file cabinet", "id": 922, "trainId": 378}, + {"name": "newspaper, paper", "id": 1655, "trainId": 379}, + {"name": "motorboat", "id": 1602, "trainId": 380}, + {"name": "rope", "id": 2160, "trainId": 381}, + {"name": "guitar", "id": 1151, "trainId": 382}, + {"name": "rubble", "id": 2176, "trainId": 383}, + {"name": "scarf", "id": 2239, "trainId": 384}, + {"name": "barrels", "id": 132, "trainId": 385}, + {"name": "cap", "id": 394, "trainId": 386}, + {"name": "leaves", "id": 1424, "trainId": 387}, + {"name": "control tower", "id": 607, "trainId": 388}, + {"name": "dashboard", "id": 700, "trainId": 389}, + {"name": "bandstand", "id": 116, "trainId": 390}, + {"name": "lectern", "id": 1425, "trainId": 391}, + {"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392}, + {"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393}, + {"name": "shower 
room", "id": 2360, "trainId": 394}, + {"name": "smoke", "id": 2449, "trainId": 395}, + {"name": "faucet, spigot", "id": 897, "trainId": 396}, + {"name": "bulldozer", "id": 317, "trainId": 397}, + {"name": "saucepan", "id": 2228, "trainId": 398}, + {"name": "shops", "id": 2351, "trainId": 399}, + {"name": "meter", "id": 1543, "trainId": 400}, + {"name": "crevasse", "id": 656, "trainId": 401}, + {"name": "gear", "id": 1088, "trainId": 402}, + {"name": "candelabrum, candelabra", "id": 373, "trainId": 403}, + {"name": "sofa bed", "id": 2472, "trainId": 404}, + {"name": "tunnel", "id": 2892, "trainId": 405}, + {"name": "pallet", "id": 1740, "trainId": 406}, + {"name": "wire, conducting wire", "id": 3067, "trainId": 407}, + {"name": "kettle, boiler", "id": 1367, "trainId": 408}, + {"name": "bidet", "id": 188, "trainId": 409}, + { + "name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher", + "id": 79, + "trainId": 410, + }, + {"name": "music stand", "id": 1633, "trainId": 411}, + {"name": "pipe, tube", "id": 1885, "trainId": 412}, + {"name": "cup", "id": 677, "trainId": 413}, + {"name": "parking meter", "id": 1779, "trainId": 414}, + {"name": "ice hockey rink", "id": 1297, "trainId": 415}, + {"name": "shelter", "id": 2334, "trainId": 416}, + {"name": "weeds", "id": 3027, "trainId": 417}, + {"name": "temple", "id": 2735, "trainId": 418}, + {"name": "patty, cake", "id": 1791, "trainId": 419}, + {"name": "ski slope", "id": 2405, "trainId": 420}, + {"name": "panel", "id": 1748, "trainId": 421}, + {"name": "wallet", "id": 2983, "trainId": 422}, + {"name": "wheel", "id": 3035, "trainId": 423}, + {"name": "towel rack, towel horse", "id": 2824, "trainId": 424}, + {"name": "roundabout", "id": 2168, "trainId": 425}, + {"name": "canister, cannister, tin", "id": 385, "trainId": 426}, + {"name": "rod", "id": 2148, "trainId": 427}, + {"name": "soap dispenser", "id": 2465, "trainId": 428}, + {"name": "bell", "id": 175, "trainId": 429}, + 
{"name": "canvas", "id": 390, "trainId": 430}, + {"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431}, + {"name": "teacup", "id": 2722, "trainId": 432}, + {"name": "trellis", "id": 2857, "trainId": 433}, + {"name": "workbench", "id": 3088, "trainId": 434}, + {"name": "valley, vale", "id": 2926, "trainId": 435}, + {"name": "toaster", "id": 2782, "trainId": 436}, + {"name": "knife", "id": 1378, "trainId": 437}, + {"name": "podium", "id": 1934, "trainId": 438}, + {"name": "ramp", "id": 2072, "trainId": 439}, + {"name": "tumble dryer", "id": 2889, "trainId": 440}, + {"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441}, + {"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442}, + {"name": "lab bench", "id": 1383, "trainId": 443}, + {"name": "equipment", "id": 867, "trainId": 444}, + {"name": "rocky formation", "id": 2145, "trainId": 445}, + {"name": "plastic", "id": 1915, "trainId": 446}, + {"name": "calendar", "id": 361, "trainId": 447}, + {"name": "caravan", "id": 402, "trainId": 448}, + {"name": "check-in-desk", "id": 482, "trainId": 449}, + {"name": "ticket counter", "id": 2761, "trainId": 450}, + {"name": "brush", "id": 300, "trainId": 451}, + {"name": "mill", "id": 1554, "trainId": 452}, + {"name": "covered bridge", "id": 636, "trainId": 453}, + {"name": "bowling alley", "id": 260, "trainId": 454}, + {"name": "hanger", "id": 1186, "trainId": 455}, + {"name": "excavator", "id": 871, "trainId": 456}, + {"name": "trestle", "id": 2859, "trainId": 457}, + {"name": "revolving door", "id": 2103, "trainId": 458}, + {"name": "blast furnace", "id": 208, "trainId": 459}, + {"name": "scale, weighing machine", "id": 2236, "trainId": 460}, + {"name": "projector", "id": 2012, "trainId": 461}, + {"name": "soap", "id": 2462, "trainId": 462}, + {"name": "locker", "id": 1462, "trainId": 463}, + {"name": "tractor", "id": 2832, "trainId": 464}, + {"name": "stretcher", "id": 2617, "trainId": 465}, + {"name": "frame", "id": 1024, 
"trainId": 466}, + {"name": "grating", "id": 1129, "trainId": 467}, + {"name": "alembic", "id": 18, "trainId": 468}, + {"name": "candle, taper, wax light", "id": 376, "trainId": 469}, + {"name": "barrier", "id": 134, "trainId": 470}, + {"name": "cardboard", "id": 407, "trainId": 471}, + {"name": "cave", "id": 434, "trainId": 472}, + {"name": "puddle", "id": 2017, "trainId": 473}, + {"name": "tarp", "id": 2717, "trainId": 474}, + {"name": "price tag", "id": 2005, "trainId": 475}, + {"name": "watchtower", "id": 2993, "trainId": 476}, + {"name": "meters", "id": 1545, "trainId": 477}, + { + "name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb", + "id": 1445, + "trainId": 478, + }, + {"name": "tracks", "id": 2831, "trainId": 479}, + {"name": "hair dryer", "id": 1161, "trainId": 480}, + {"name": "skirt", "id": 2411, "trainId": 481}, + {"name": "viaduct", "id": 2949, "trainId": 482}, + {"name": "paper towel", "id": 1769, "trainId": 483}, + {"name": "coat", "id": 552, "trainId": 484}, + {"name": "sheet", "id": 2327, "trainId": 485}, + {"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486}, + {"name": "water wheel", "id": 3013, "trainId": 487}, + {"name": "pottery, clayware", "id": 1986, "trainId": 488}, + {"name": "magazine rack", "id": 1486, "trainId": 489}, + {"name": "teapot", "id": 2723, "trainId": 490}, + {"name": "microphone, mike", "id": 1549, "trainId": 491}, + {"name": "support", "id": 2649, "trainId": 492}, + {"name": "forklift", "id": 1020, "trainId": 493}, + {"name": "canyon", "id": 392, "trainId": 494}, + {"name": "cash register, register", "id": 422, "trainId": 495}, + {"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496}, + {"name": "remote control, remote", "id": 2099, "trainId": 497}, + {"name": "soap dish", "id": 2464, "trainId": 498}, + {"name": "windshield, windscreen", "id": 3058, "trainId": 499}, + {"name": "cat", "id": 430, "trainId": 500}, + {"name": "cue, cue stick, pool 
cue, pool stick", "id": 675, "trainId": 501}, + {"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502}, + {"name": "videos", "id": 2955, "trainId": 503}, + {"name": "shovel", "id": 2355, "trainId": 504}, + {"name": "eaves", "id": 840, "trainId": 505}, + {"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506}, + {"name": "shipyard", "id": 2338, "trainId": 507}, + {"name": "hen, biddy", "id": 1232, "trainId": 508}, + {"name": "traffic cone", "id": 2834, "trainId": 509}, + {"name": "washing machines", "id": 2991, "trainId": 510}, + {"name": "truck crane", "id": 2879, "trainId": 511}, + {"name": "cds", "id": 444, "trainId": 512}, + {"name": "niche", "id": 1657, "trainId": 513}, + {"name": "scoreboard", "id": 2246, "trainId": 514}, + {"name": "briefcase", "id": 296, "trainId": 515}, + {"name": "boot", "id": 245, "trainId": 516}, + {"name": "sweater, jumper", "id": 2661, "trainId": 517}, + {"name": "hay", "id": 1202, "trainId": 518}, + {"name": "pack", "id": 1714, "trainId": 519}, + {"name": "bottle rack", "id": 251, "trainId": 520}, + {"name": "glacier", "id": 1095, "trainId": 521}, + {"name": "pergola", "id": 1828, "trainId": 522}, + {"name": "building materials", "id": 311, "trainId": 523}, + {"name": "television camera", "id": 2732, "trainId": 524}, + {"name": "first floor", "id": 947, "trainId": 525}, + {"name": "rifle", "id": 2115, "trainId": 526}, + {"name": "tennis table", "id": 2738, "trainId": 527}, + {"name": "stadium", "id": 2525, "trainId": 528}, + {"name": "safety belt", "id": 2194, "trainId": 529}, + {"name": "cover", "id": 634, "trainId": 530}, + {"name": "dish rack", "id": 740, "trainId": 531}, + {"name": "synthesizer", "id": 2682, "trainId": 532}, + {"name": "pumpkin", "id": 2020, "trainId": 533}, + {"name": "gutter", "id": 1156, "trainId": 534}, + {"name": "fruit stand", "id": 1036, "trainId": 535}, + {"name": "ice floe, floe", "id": 1295, "trainId": 536}, + {"name": "handle, grip, handgrip, hold", "id": 1181, 
"trainId": 537}, + {"name": "wheelchair", "id": 3037, "trainId": 538}, + {"name": "mousepad, mouse mat", "id": 1614, "trainId": 539}, + {"name": "diploma", "id": 736, "trainId": 540}, + {"name": "fairground ride", "id": 893, "trainId": 541}, + {"name": "radio", "id": 2047, "trainId": 542}, + {"name": "hotplate", "id": 1274, "trainId": 543}, + {"name": "junk", "id": 1361, "trainId": 544}, + {"name": "wheelbarrow", "id": 3036, "trainId": 545}, + {"name": "stream", "id": 2606, "trainId": 546}, + {"name": "toll plaza", "id": 2797, "trainId": 547}, + {"name": "punching bag", "id": 2022, "trainId": 548}, + {"name": "trough", "id": 2876, "trainId": 549}, + {"name": "throne", "id": 2758, "trainId": 550}, + {"name": "chair desk", "id": 472, "trainId": 551}, + {"name": "weighbridge", "id": 3028, "trainId": 552}, + {"name": "extractor fan", "id": 882, "trainId": 553}, + {"name": "hanging clothes", "id": 1189, "trainId": 554}, + {"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555}, + {"name": "alarm clock, alarm", "id": 3122, "trainId": 556}, + {"name": "ski lift", "id": 2401, "trainId": 557}, + {"name": "chain", "id": 468, "trainId": 558}, + {"name": "garage", "id": 1061, "trainId": 559}, + {"name": "mechanical shovel", "id": 1523, "trainId": 560}, + {"name": "wine rack", "id": 3059, "trainId": 561}, + {"name": "tramway", "id": 2843, "trainId": 562}, + {"name": "treadmill", "id": 2853, "trainId": 563}, + {"name": "menu", "id": 1529, "trainId": 564}, + {"name": "block", "id": 214, "trainId": 565}, + {"name": "well", "id": 3032, "trainId": 566}, + {"name": "witness stand", "id": 3071, "trainId": 567}, + {"name": "branch", "id": 277, "trainId": 568}, + {"name": "duck", "id": 813, "trainId": 569}, + {"name": "casserole", "id": 426, "trainId": 570}, + {"name": "frying pan", "id": 1039, "trainId": 571}, + {"name": "desk organizer", "id": 727, "trainId": 572}, + {"name": "mast", "id": 1508, "trainId": 573}, + {"name": "spectacles, specs, eyeglasses, 
glasses", "id": 2490, "trainId": 574}, + {"name": "service elevator", "id": 2299, "trainId": 575}, + {"name": "dollhouse", "id": 768, "trainId": 576}, + {"name": "hammock", "id": 1172, "trainId": 577}, + {"name": "clothes hanging", "id": 537, "trainId": 578}, + {"name": "photocopier", "id": 1847, "trainId": 579}, + {"name": "notepad", "id": 1664, "trainId": 580}, + {"name": "golf cart", "id": 1110, "trainId": 581}, + {"name": "footpath", "id": 1014, "trainId": 582}, + {"name": "cross", "id": 662, "trainId": 583}, + {"name": "baptismal font", "id": 121, "trainId": 584}, + {"name": "boiler", "id": 227, "trainId": 585}, + {"name": "skip", "id": 2410, "trainId": 586}, + {"name": "rotisserie", "id": 2165, "trainId": 587}, + {"name": "tables", "id": 2696, "trainId": 588}, + {"name": "water mill", "id": 3005, "trainId": 589}, + {"name": "helmet", "id": 1231, "trainId": 590}, + {"name": "cover curtain", "id": 635, "trainId": 591}, + {"name": "brick", "id": 292, "trainId": 592}, + {"name": "table runner", "id": 2690, "trainId": 593}, + {"name": "ashtray", "id": 65, "trainId": 594}, + {"name": "street box", "id": 2607, "trainId": 595}, + {"name": "stick", "id": 2574, "trainId": 596}, + {"name": "hangers", "id": 1188, "trainId": 597}, + {"name": "cells", "id": 456, "trainId": 598}, + {"name": "urinal", "id": 2913, "trainId": 599}, + {"name": "centerpiece", "id": 459, "trainId": 600}, + {"name": "portable fridge", "id": 1955, "trainId": 601}, + {"name": "dvds", "id": 827, "trainId": 602}, + {"name": "golf club", "id": 1111, "trainId": 603}, + {"name": "skirting board", "id": 2412, "trainId": 604}, + {"name": "water cooler", "id": 2997, "trainId": 605}, + {"name": "clipboard", "id": 528, "trainId": 606}, + {"name": "camera, photographic camera", "id": 366, "trainId": 607}, + {"name": "pigeonhole", "id": 1863, "trainId": 608}, + {"name": "chips", "id": 500, "trainId": 609}, + {"name": "food processor", "id": 1001, "trainId": 610}, + {"name": "post box", "id": 1958, "trainId": 
611}, + {"name": "lid", "id": 1441, "trainId": 612}, + {"name": "drum", "id": 809, "trainId": 613}, + {"name": "blender", "id": 210, "trainId": 614}, + {"name": "cave entrance", "id": 435, "trainId": 615}, + {"name": "dental chair", "id": 718, "trainId": 616}, + {"name": "obelisk", "id": 1674, "trainId": 617}, + {"name": "canoe", "id": 388, "trainId": 618}, + {"name": "mobile", "id": 1572, "trainId": 619}, + {"name": "monitors", "id": 1584, "trainId": 620}, + {"name": "pool ball", "id": 1944, "trainId": 621}, + {"name": "cue rack", "id": 674, "trainId": 622}, + {"name": "baggage carts", "id": 99, "trainId": 623}, + {"name": "shore", "id": 2352, "trainId": 624}, + {"name": "fork", "id": 1019, "trainId": 625}, + {"name": "paper filer", "id": 1763, "trainId": 626}, + {"name": "bicycle rack", "id": 185, "trainId": 627}, + {"name": "coat rack", "id": 554, "trainId": 628}, + {"name": "garland", "id": 1066, "trainId": 629}, + {"name": "sports bag", "id": 2508, "trainId": 630}, + {"name": "fish tank", "id": 951, "trainId": 631}, + {"name": "towel dispenser", "id": 2822, "trainId": 632}, + {"name": "carriage", "id": 415, "trainId": 633}, + {"name": "brochure", "id": 297, "trainId": 634}, + {"name": "plaque", "id": 1914, "trainId": 635}, + {"name": "stringer", "id": 2619, "trainId": 636}, + {"name": "iron", "id": 1338, "trainId": 637}, + {"name": "spoon", "id": 2505, "trainId": 638}, + {"name": "flag pole", "id": 955, "trainId": 639}, + {"name": "toilet brush", "id": 2786, "trainId": 640}, + {"name": "book stand", "id": 238, "trainId": 641}, + {"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642}, + {"name": "ticket office", "id": 2763, "trainId": 643}, + {"name": "broom", "id": 299, "trainId": 644}, + {"name": "dvd", "id": 822, "trainId": 645}, + {"name": "ice bucket", "id": 1288, "trainId": 646}, + {"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647}, + {"name": "tureen", "id": 2894, "trainId": 648}, + {"name": "folders", "id": 
992, "trainId": 649}, + {"name": "chess", "id": 489, "trainId": 650}, + {"name": "root", "id": 2157, "trainId": 651}, + {"name": "sewing machine", "id": 2309, "trainId": 652}, + {"name": "model", "id": 1576, "trainId": 653}, + {"name": "pen", "id": 1810, "trainId": 654}, + {"name": "violin", "id": 2964, "trainId": 655}, + {"name": "sweatshirt", "id": 2662, "trainId": 656}, + {"name": "recycling materials", "id": 2087, "trainId": 657}, + {"name": "mitten", "id": 1569, "trainId": 658}, + {"name": "chopping board, cutting board", "id": 503, "trainId": 659}, + {"name": "mask", "id": 1505, "trainId": 660}, + {"name": "log", "id": 1468, "trainId": 661}, + {"name": "mouse, computer mouse", "id": 1613, "trainId": 662}, + {"name": "grill", "id": 1138, "trainId": 663}, + {"name": "hole", "id": 1256, "trainId": 664}, + {"name": "target", "id": 2715, "trainId": 665}, + {"name": "trash bag", "id": 2846, "trainId": 666}, + {"name": "chalk", "id": 477, "trainId": 667}, + {"name": "sticks", "id": 2576, "trainId": 668}, + {"name": "balloon", "id": 108, "trainId": 669}, + {"name": "score", "id": 2245, "trainId": 670}, + {"name": "hair spray", "id": 1162, "trainId": 671}, + {"name": "roll", "id": 2149, "trainId": 672}, + {"name": "runner", "id": 2183, "trainId": 673}, + {"name": "engine", "id": 858, "trainId": 674}, + {"name": "inflatable glove", "id": 1324, "trainId": 675}, + {"name": "games", "id": 1055, "trainId": 676}, + {"name": "pallets", "id": 1741, "trainId": 677}, + {"name": "baskets", "id": 149, "trainId": 678}, + {"name": "coop", "id": 615, "trainId": 679}, + {"name": "dvd player", "id": 825, "trainId": 680}, + {"name": "rocking horse", "id": 2143, "trainId": 681}, + {"name": "buckets", "id": 304, "trainId": 682}, + {"name": "bread rolls", "id": 283, "trainId": 683}, + {"name": "shawl", "id": 2322, "trainId": 684}, + {"name": "watering can", "id": 3017, "trainId": 685}, + {"name": "spotlights", "id": 2510, "trainId": 686}, + {"name": "post-it", "id": 1960, "trainId": 687}, 
+ {"name": "bowls", "id": 265, "trainId": 688}, + {"name": "security camera", "id": 2282, "trainId": 689}, + {"name": "runner cloth", "id": 2184, "trainId": 690}, + {"name": "lock", "id": 1461, "trainId": 691}, + {"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692}, + {"name": "side", "id": 2372, "trainId": 693}, + {"name": "roulette", "id": 2166, "trainId": 694}, + {"name": "bone", "id": 232, "trainId": 695}, + {"name": "cutlery", "id": 693, "trainId": 696}, + {"name": "pool balls", "id": 1945, "trainId": 697}, + {"name": "wheels", "id": 3039, "trainId": 698}, + {"name": "spice rack", "id": 2494, "trainId": 699}, + {"name": "plant pots", "id": 1908, "trainId": 700}, + {"name": "towel ring", "id": 2827, "trainId": 701}, + {"name": "bread box", "id": 280, "trainId": 702}, + {"name": "video", "id": 2950, "trainId": 703}, + {"name": "funfair", "id": 1044, "trainId": 704}, + {"name": "breads", "id": 288, "trainId": 705}, + {"name": "tripod", "id": 2863, "trainId": 706}, + {"name": "ironing board", "id": 1342, "trainId": 707}, + {"name": "skimmer", "id": 2409, "trainId": 708}, + {"name": "hollow", "id": 1258, "trainId": 709}, + {"name": "scratching post", "id": 2249, "trainId": 710}, + {"name": "tricycle", "id": 2862, "trainId": 711}, + {"name": "file box", "id": 920, "trainId": 712}, + {"name": "mountain pass", "id": 1607, "trainId": 713}, + {"name": "tombstones", "id": 2802, "trainId": 714}, + {"name": "cooker", "id": 610, "trainId": 715}, + {"name": "card game, cards", "id": 3129, "trainId": 716}, + {"name": "golf bag", "id": 1108, "trainId": 717}, + {"name": "towel paper", "id": 2823, "trainId": 718}, + {"name": "chaise lounge", "id": 476, "trainId": 719}, + {"name": "sun", "id": 2641, "trainId": 720}, + {"name": "toilet paper holder", "id": 2788, "trainId": 721}, + {"name": "rake", "id": 2070, "trainId": 722}, + {"name": "key", "id": 1368, "trainId": 723}, + {"name": "umbrella stand", "id": 2903, "trainId": 724}, + {"name": "dartboard", "id": 
699, "trainId": 725}, + {"name": "transformer", "id": 2844, "trainId": 726}, + {"name": "fireplace utensils", "id": 942, "trainId": 727}, + {"name": "sweatshirts", "id": 2663, "trainId": 728}, + { + "name": "cellular telephone, cellular phone, cellphone, cell, mobile phone", + "id": 457, + "trainId": 729, + }, + {"name": "tallboy", "id": 2701, "trainId": 730}, + {"name": "stapler", "id": 2540, "trainId": 731}, + {"name": "sauna", "id": 2231, "trainId": 732}, + {"name": "test tube", "id": 2746, "trainId": 733}, + {"name": "palette", "id": 1738, "trainId": 734}, + {"name": "shopping carts", "id": 2350, "trainId": 735}, + {"name": "tools", "id": 2808, "trainId": 736}, + {"name": "push button, push, button", "id": 2025, "trainId": 737}, + {"name": "star", "id": 2541, "trainId": 738}, + {"name": "roof rack", "id": 2156, "trainId": 739}, + {"name": "barbed wire", "id": 126, "trainId": 740}, + {"name": "spray", "id": 2512, "trainId": 741}, + {"name": "ear", "id": 831, "trainId": 742}, + {"name": "sponge", "id": 2503, "trainId": 743}, + {"name": "racket", "id": 2039, "trainId": 744}, + {"name": "tins", "id": 2774, "trainId": 745}, + {"name": "eyeglasses", "id": 886, "trainId": 746}, + {"name": "file", "id": 919, "trainId": 747}, + {"name": "scarfs", "id": 2240, "trainId": 748}, + {"name": "sugar bowl", "id": 2636, "trainId": 749}, + {"name": "flip flop", "id": 963, "trainId": 750}, + {"name": "headstones", "id": 1218, "trainId": 751}, + {"name": "laptop bag", "id": 1406, "trainId": 752}, + {"name": "leash", "id": 1420, "trainId": 753}, + {"name": "climbing frame", "id": 526, "trainId": 754}, + {"name": "suit hanger", "id": 2639, "trainId": 755}, + {"name": "floor spotlight", "id": 975, "trainId": 756}, + {"name": "plate rack", "id": 1921, "trainId": 757}, + {"name": "sewer", "id": 2305, "trainId": 758}, + {"name": "hard drive", "id": 1193, "trainId": 759}, + {"name": "sprinkler", "id": 2517, "trainId": 760}, + {"name": "tools box", "id": 2809, "trainId": 761}, + {"name": 
"necklace", "id": 1647, "trainId": 762}, + {"name": "bulbs", "id": 314, "trainId": 763}, + {"name": "steel industry", "id": 2560, "trainId": 764}, + {"name": "club", "id": 545, "trainId": 765}, + {"name": "jack", "id": 1345, "trainId": 766}, + {"name": "door bars", "id": 775, "trainId": 767}, + { + "name": "control panel, instrument panel, control board, board, panel", + "id": 603, + "trainId": 768, + }, + {"name": "hairbrush", "id": 1163, "trainId": 769}, + {"name": "napkin holder", "id": 1641, "trainId": 770}, + {"name": "office", "id": 1678, "trainId": 771}, + {"name": "smoke detector", "id": 2450, "trainId": 772}, + {"name": "utensils", "id": 2915, "trainId": 773}, + {"name": "apron", "id": 42, "trainId": 774}, + {"name": "scissors", "id": 2242, "trainId": 775}, + {"name": "terminal", "id": 2741, "trainId": 776}, + {"name": "grinder", "id": 1143, "trainId": 777}, + {"name": "entry phone", "id": 862, "trainId": 778}, + {"name": "newspaper stand", "id": 1654, "trainId": 779}, + {"name": "pepper shaker", "id": 1826, "trainId": 780}, + {"name": "onions", "id": 1689, "trainId": 781}, + { + "name": "central processing unit, cpu, c p u , central processor, processor, mainframe", + "id": 3124, + "trainId": 782, + }, + {"name": "tape", "id": 2710, "trainId": 783}, + {"name": "bat", "id": 152, "trainId": 784}, + {"name": "coaster", "id": 549, "trainId": 785}, + {"name": "calculator", "id": 360, "trainId": 786}, + {"name": "potatoes", "id": 1982, "trainId": 787}, + {"name": "luggage rack", "id": 1478, "trainId": 788}, + {"name": "salt", "id": 2203, "trainId": 789}, + {"name": "street number", "id": 2612, "trainId": 790}, + {"name": "viewpoint", "id": 2956, "trainId": 791}, + {"name": "sword", "id": 2681, "trainId": 792}, + {"name": "cd", "id": 437, "trainId": 793}, + {"name": "rowing machine", "id": 2171, "trainId": 794}, + {"name": "plug", "id": 1933, "trainId": 795}, + {"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796}, + {"name": "pepper", "id": 
1824, "trainId": 797}, + {"name": "tongs", "id": 2803, "trainId": 798}, + {"name": "bonfire", "id": 234, "trainId": 799}, + {"name": "dog dish", "id": 764, "trainId": 800}, + {"name": "belt", "id": 177, "trainId": 801}, + {"name": "dumbbells", "id": 817, "trainId": 802}, + {"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803}, + {"name": "hook", "id": 1262, "trainId": 804}, + {"name": "envelopes", "id": 864, "trainId": 805}, + {"name": "shower faucet", "id": 2359, "trainId": 806}, + {"name": "watch", "id": 2992, "trainId": 807}, + {"name": "padlock", "id": 1725, "trainId": 808}, + {"name": "swimming pool ladder", "id": 2667, "trainId": 809}, + {"name": "spanners", "id": 2484, "trainId": 810}, + {"name": "gravy boat", "id": 1133, "trainId": 811}, + {"name": "notice board", "id": 1667, "trainId": 812}, + {"name": "trash bags", "id": 2847, "trainId": 813}, + {"name": "fire alarm", "id": 932, "trainId": 814}, + {"name": "ladle", "id": 1392, "trainId": 815}, + {"name": "stethoscope", "id": 2573, "trainId": 816}, + {"name": "rocket", "id": 2140, "trainId": 817}, + {"name": "funnel", "id": 1046, "trainId": 818}, + {"name": "bowling pins", "id": 264, "trainId": 819}, + {"name": "valve", "id": 2927, "trainId": 820}, + {"name": "thermometer", "id": 2752, "trainId": 821}, + {"name": "cups", "id": 679, "trainId": 822}, + {"name": "spice jar", "id": 2493, "trainId": 823}, + {"name": "night light", "id": 1658, "trainId": 824}, + {"name": "soaps", "id": 2466, "trainId": 825}, + {"name": "games table", "id": 1057, "trainId": 826}, + {"name": "slotted spoon", "id": 2444, "trainId": 827}, + {"name": "reel", "id": 2093, "trainId": 828}, + {"name": "scourer", "id": 2248, "trainId": 829}, + {"name": "sleeping robe", "id": 2432, "trainId": 830}, + {"name": "desk mat", "id": 726, "trainId": 831}, + {"name": "dumbbell", "id": 816, "trainId": 832}, + {"name": "hammer", "id": 1171, "trainId": 833}, + {"name": "tie", "id": 2766, "trainId": 834}, + {"name": "typewriter", "id": 
2900, "trainId": 835}, + {"name": "shaker", "id": 2313, "trainId": 836}, + {"name": "cheese dish", "id": 488, "trainId": 837}, + {"name": "sea star", "id": 2265, "trainId": 838}, + {"name": "racquet", "id": 2043, "trainId": 839}, + {"name": "butane gas cylinder", "id": 332, "trainId": 840}, + {"name": "paper weight", "id": 1771, "trainId": 841}, + {"name": "shaving brush", "id": 2320, "trainId": 842}, + {"name": "sunglasses", "id": 2646, "trainId": 843}, + {"name": "gear shift", "id": 1089, "trainId": 844}, + {"name": "towel rail", "id": 2826, "trainId": 845}, + {"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846}, +] + + +def _get_ade20k_full_meta(): + stuff_ids = [k["id"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES] + assert len(stuff_ids) == 847, len(stuff_ids) + + stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} + stuff_classes = [k["name"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES] + + ret = { + "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, + "stuff_classes": stuff_classes, + } + return ret + + +def register_all_ade20k_full(root): + meta = _get_ade20k_full_meta() + for name, dirname in [("val", "validation")]: + image_dir = os.path.join(root, "ADE20K_2021_17_01/images_detectron2", dirname) + gt_dir = os.path.join(root, "ADE20K_2021_17_01/annotations_detectron2", dirname) + name = f"ade20k_full_sem_seg_{name}" + DatasetCatalog.register( + name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="tif", image_ext="jpg" + ), + ) + MetadataCatalog.get(name).set( + stuff_classes=meta["stuff_classes"][:], + thing_classes=meta["stuff_classes"][:], # the same as stuff_classes + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=65535, # NOTE: gt is saved in 16-bit TIFF images + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_ade20k_full(_root) diff --git a/open_vocab_seg/data/datasets/register_cc3m.py 
b/open_vocab_seg/data/datasets/register_cc3m.py new file mode 100644 index 0000000000000000000000000000000000000000..8aa5cb07bc99b574505b6319835750789bb3ee26 --- /dev/null +++ b/open_vocab_seg/data/datasets/register_cc3m.py @@ -0,0 +1,457 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import os + +import pandas as pd +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg +from detectron2.utils.file_io import PathManager + + +COCO_CATEGORIES = [ + {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, + {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, + {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, + {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, + {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, + {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, + {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, + {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, + {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, + {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, + {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, + {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, + {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, + {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, + {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, + {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, + {"color": [174, 57, 
255], "isthing": 1, "id": 23, "name": "bear"}, + {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, + {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, + {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, + {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, + {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, + {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, + {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, + {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, + {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, + {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, + {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, + {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, + {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, + {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, + {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, + {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, + {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, + {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, + {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, + {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, + {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, + {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, + {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, + {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, + {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, + {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, + {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, + {"color": 
[255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, + {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, + {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, + {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, + {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, + {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, + {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, + {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, + {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, + {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, + {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, + {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, + {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, + {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, + {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, + {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, + {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, + {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, + {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, + {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, + {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, + {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, + {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, + {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, + {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, + {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, + {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, + {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, + {"color": [201, 57, 1], 
"isthing": 1, "id": 88, "name": "teddy bear"}, + {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, + {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, + {"id": 92, "name": "banner", "supercategory": "textile"}, + {"id": 93, "name": "blanket", "supercategory": "textile"}, + {"id": 94, "name": "branch", "supercategory": "plant"}, + {"id": 95, "name": "bridge", "supercategory": "building"}, + {"id": 96, "name": "building-other", "supercategory": "building"}, + {"id": 97, "name": "bush", "supercategory": "plant"}, + {"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"}, + {"id": 99, "name": "cage", "supercategory": "structural"}, + {"id": 100, "name": "cardboard", "supercategory": "raw-material"}, + {"id": 101, "name": "carpet", "supercategory": "floor"}, + {"id": 102, "name": "ceiling-other", "supercategory": "ceiling"}, + {"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"}, + {"id": 104, "name": "cloth", "supercategory": "textile"}, + {"id": 105, "name": "clothes", "supercategory": "textile"}, + {"id": 106, "name": "clouds", "supercategory": "sky"}, + {"id": 107, "name": "counter", "supercategory": "furniture-stuff"}, + {"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"}, + {"id": 109, "name": "curtain", "supercategory": "textile"}, + {"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"}, + {"id": 111, "name": "dirt", "supercategory": "ground"}, + {"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"}, + {"id": 113, "name": "fence", "supercategory": "structural"}, + {"id": 114, "name": "floor-marble", "supercategory": "floor"}, + {"id": 115, "name": "floor-other", "supercategory": "floor"}, + {"id": 116, "name": "floor-stone", "supercategory": "floor"}, + {"id": 117, "name": "floor-tile", "supercategory": "floor"}, + {"id": 118, "name": "floor-wood", "supercategory": "floor"}, + {"id": 119, "name": "flower", "supercategory": "plant"}, + {"id": 120, 
"name": "fog", "supercategory": "water"}, + {"id": 121, "name": "food-other", "supercategory": "food-stuff"}, + {"id": 122, "name": "fruit", "supercategory": "food-stuff"}, + {"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"}, + {"id": 124, "name": "grass", "supercategory": "plant"}, + {"id": 125, "name": "gravel", "supercategory": "ground"}, + {"id": 126, "name": "ground-other", "supercategory": "ground"}, + {"id": 127, "name": "hill", "supercategory": "solid"}, + {"id": 128, "name": "house", "supercategory": "building"}, + {"id": 129, "name": "leaves", "supercategory": "plant"}, + {"id": 130, "name": "light", "supercategory": "furniture-stuff"}, + {"id": 131, "name": "mat", "supercategory": "textile"}, + {"id": 132, "name": "metal", "supercategory": "raw-material"}, + {"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"}, + {"id": 134, "name": "moss", "supercategory": "plant"}, + {"id": 135, "name": "mountain", "supercategory": "solid"}, + {"id": 136, "name": "mud", "supercategory": "ground"}, + {"id": 137, "name": "napkin", "supercategory": "textile"}, + {"id": 138, "name": "net", "supercategory": "structural"}, + {"id": 139, "name": "paper", "supercategory": "raw-material"}, + {"id": 140, "name": "pavement", "supercategory": "ground"}, + {"id": 141, "name": "pillow", "supercategory": "textile"}, + {"id": 142, "name": "plant-other", "supercategory": "plant"}, + {"id": 143, "name": "plastic", "supercategory": "raw-material"}, + {"id": 144, "name": "platform", "supercategory": "ground"}, + {"id": 145, "name": "playingfield", "supercategory": "ground"}, + {"id": 146, "name": "railing", "supercategory": "structural"}, + {"id": 147, "name": "railroad", "supercategory": "ground"}, + {"id": 148, "name": "river", "supercategory": "water"}, + {"id": 149, "name": "road", "supercategory": "ground"}, + {"id": 150, "name": "rock", "supercategory": "solid"}, + {"id": 151, "name": "roof", "supercategory": "building"}, + {"id": 152, 
"name": "rug", "supercategory": "textile"}, + {"id": 153, "name": "salad", "supercategory": "food-stuff"}, + {"id": 154, "name": "sand", "supercategory": "ground"}, + {"id": 155, "name": "sea", "supercategory": "water"}, + {"id": 156, "name": "shelf", "supercategory": "furniture-stuff"}, + {"id": 157, "name": "sky-other", "supercategory": "sky"}, + {"id": 158, "name": "skyscraper", "supercategory": "building"}, + {"id": 159, "name": "snow", "supercategory": "ground"}, + {"id": 160, "name": "solid-other", "supercategory": "solid"}, + {"id": 161, "name": "stairs", "supercategory": "furniture-stuff"}, + {"id": 162, "name": "stone", "supercategory": "solid"}, + {"id": 163, "name": "straw", "supercategory": "plant"}, + {"id": 164, "name": "structural-other", "supercategory": "structural"}, + {"id": 165, "name": "table", "supercategory": "furniture-stuff"}, + {"id": 166, "name": "tent", "supercategory": "building"}, + {"id": 167, "name": "textile-other", "supercategory": "textile"}, + {"id": 168, "name": "towel", "supercategory": "textile"}, + {"id": 169, "name": "tree", "supercategory": "plant"}, + {"id": 170, "name": "vegetable", "supercategory": "food-stuff"}, + {"id": 171, "name": "wall-brick", "supercategory": "wall"}, + {"id": 172, "name": "wall-concrete", "supercategory": "wall"}, + {"id": 173, "name": "wall-other", "supercategory": "wall"}, + {"id": 174, "name": "wall-panel", "supercategory": "wall"}, + {"id": 175, "name": "wall-stone", "supercategory": "wall"}, + {"id": 176, "name": "wall-tile", "supercategory": "wall"}, + {"id": 177, "name": "wall-wood", "supercategory": "wall"}, + {"id": 178, "name": "water-other", "supercategory": "water"}, + {"id": 179, "name": "waterdrops", "supercategory": "water"}, + {"id": 180, "name": "window-blind", "supercategory": "window"}, + {"id": 181, "name": "window-other", "supercategory": "window"}, + {"id": 182, "name": "wood", "supercategory": "solid"}, +] + + +ADE20K_150_CATEGORIES = [ + {"color": [120, 120, 120], "id": 0, 
"isthing": 0, "name": "wall"}, + {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"}, + {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"}, + {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"}, + {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"}, + {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"}, + {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"}, + {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"}, + {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "}, + {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"}, + {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"}, + {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"}, + {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"}, + {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"}, + {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"}, + {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"}, + {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"}, + {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"}, + {"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"}, + {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"}, + {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"}, + {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"}, + {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"}, + {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"}, + {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"}, + {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"}, + {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"}, + {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"}, + {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": 
"rug"}, + {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"}, + {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"}, + {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"}, + {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"}, + {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"}, + {"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"}, + {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"}, + {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"}, + {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"}, + {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"}, + {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"}, + {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"}, + {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"}, + {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"}, + {"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"}, + { + "color": [6, 51, 255], + "id": 44, + "isthing": 1, + "name": "chest of drawers, chest, bureau, dresser", + }, + {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"}, + {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"}, + {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"}, + {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"}, + {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"}, + {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"}, + {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"}, + {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"}, + {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"}, + {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"}, + {"color": [0, 0, 255], "id": 55, "isthing": 1, 
"name": "case, display case, showcase, vitrine"}, + { + "color": [255, 71, 0], + "id": 56, + "isthing": 1, + "name": "pool table, billiard table, snooker table", + }, + {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"}, + {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"}, + {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"}, + {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"}, + {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"}, + {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"}, + {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"}, + {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"}, + { + "color": [0, 255, 133], + "id": 65, + "isthing": 1, + "name": "toilet, can, commode, crapper, pot, potty, stool, throne", + }, + {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"}, + {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"}, + {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"}, + {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"}, + {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"}, + {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"}, + {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"}, + {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"}, + {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"}, + {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"}, + {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"}, + {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"}, + {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"}, + {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"}, + {"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"}, + 
{"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"}, + {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"}, + {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"}, + {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"}, + {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"}, + {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"}, + {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"}, + {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"}, + {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"}, + {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"}, + {"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"}, + {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"}, + {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"}, + {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"}, + { + "color": [0, 122, 255], + "id": 95, + "isthing": 1, + "name": "bannister, banister, balustrade, balusters, handrail", + }, + { + "color": [0, 255, 163], + "id": 96, + "isthing": 0, + "name": "escalator, moving staircase, moving stairway", + }, + { + "color": [255, 153, 0], + "id": 97, + "isthing": 1, + "name": "ottoman, pouf, pouffe, puff, hassock", + }, + {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"}, + {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"}, + { + "color": [143, 255, 0], + "id": 100, + "isthing": 0, + "name": "poster, posting, placard, notice, bill, card", + }, + {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"}, + {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"}, + {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"}, + {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"}, + { + "color": [133, 0, 255], + "id": 105, + "isthing": 0, + "name": 
"conveyer belt, conveyor belt, conveyer, conveyor, transporter", + }, + {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"}, + { + "color": [184, 0, 255], + "id": 107, + "isthing": 1, + "name": "washer, automatic washer, washing machine", + }, + {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"}, + {"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"}, + {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"}, + {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"}, + {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"}, + {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"}, + {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"}, + {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"}, + {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"}, + {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"}, + {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"}, + {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"}, + {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"}, + {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"}, + {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"}, + {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"}, + {"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"}, + {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"}, + {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"}, + {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"}, + {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"}, + {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"}, + {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"}, + {"color": [20, 0, 255], "id": 131, 
"isthing": 0, "name": "blanket, cover"},
+    {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"},
+    {"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"},
+    {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"},
+    {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"},
+    {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"},
+    {"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"},
+    {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"},
+    {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"},
+    {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"},
+    {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"},
+    {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"},
+    {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"},
+    {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"},
+    {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"},
+    {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"},
+    {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"},
+    {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"},
+    {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"},
+]
+
+# Two-class toy category list; it is the metadata source for the "cc_3m_train"
+# dataset registered by register_cc_3m() below.
+TEST_CATEGORIES = [
+    {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "Oculus"},
+    {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "Ukulele"},
+]
+
+# Base/novel split of COCO_CATEGORIES for open-vocabulary experiments.
+# NOTE(review): the filter compares c["id"] - 1 against a fixed index list,
+# but COCO category ids above are non-contiguous (e.g. 12, 26, 29-30 are
+# absent), so id-1 is NOT the contiguous training index — confirm the split
+# list was authored against raw ids, not contiguous ids.
+COCO_BASE_CATEGORIES = [
+    c
+    for i, c in enumerate(COCO_CATEGORIES)
+    if c["id"] - 1
+    not in [20, 24, 32, 33, 40, 56, 86, 99, 105, 123, 144, 147, 148, 168, 171]
+]
+# Complement of COCO_BASE_CATEGORIES: the held-out ("novel") classes.
+COCO_NOVEL_CATEGORIES = [
+    c
+    for i, c in enumerate(COCO_CATEGORIES)
+    if c["id"] - 1
+    in [20, 24, 32, 33, 40, 56, 86, 99, 105, 123, 144, 147, 148, 168, 171]
+]
+
+
+def load_cc_image(csv_file, img_key='filepath', caption_key='title', sep="\t"):
+    """Read (image path, caption) records from a delimited file.
+
+    Args:
+        csv_file: path to a CSV/TSV file readable by pandas.
+        img_key: column holding image file paths.
+        caption_key: column holding the caption text.
+        sep: field separator (tab by default, i.e. a TSV file).
+
+    Returns a list of detectron2-style dataset dicts with keys
+    "file_name" and "caption".
+    """
+    print(f'Loading csv data from {csv_file}.')
+    df
= pd.read_csv(csv_file, sep=sep) + + input_files = df[img_key].tolist() + captions = df[caption_key].tolist() + + print("Loaded {} images".format(len(input_files))) + + dataset_dicts = [] + for (img_path, text) in zip(input_files, captions): + record = {} + record["file_name"] = img_path + record["caption"] = text + dataset_dicts.append(record) + + return dataset_dicts + + +def _get_coco_stuff_meta(cat_list): + # Id 0 is reserved for ignore_label, we change ignore_label for 0 + # to 255 in our pre-processing. + stuff_ids = [k["id"] for k in cat_list] + + # For semantic segmentation, this mapping maps from contiguous stuff id + # (in [0, 91], used in models) to ids in the dataset (used for processing results) + stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} + stuff_classes = [k["name"] for k in cat_list] + + ret = { + "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, + "stuff_classes": stuff_classes, + } + return ret + + +def register_cc_3m(csv_file): + + meta = _get_coco_stuff_meta(TEST_CATEGORIES) + name = "cc_3m_train" + + DatasetCatalog.register( + name, + lambda x=csv_file: load_cc_image(x), + ) + MetadataCatalog.get(name).set( + csv_file=csv_file, + evaluator_type="dummy", + ignore_label=255, + **meta, + ) + + +# _csv_file = "/home/jeffliang/zsseg/datasets/coco/coco_train_merge_captions.csv" +_csv_file = "/home/jeffliang/zsseg/configs/masked_images/pred/samples.csv" +register_cc_3m(_csv_file) diff --git a/open_vocab_seg/data/datasets/register_coco_stuff.py b/open_vocab_seg/data/datasets/register_coco_stuff.py new file mode 100644 index 0000000000000000000000000000000000000000..d1a0f5b571a971fe20ebc8932d27499de856a565 --- /dev/null +++ b/open_vocab_seg/data/datasets/register_coco_stuff.py @@ -0,0 +1,250 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + + +COCO_CATEGORIES = [ + {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, + {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, + {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, + {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, + {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, + {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, + {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, + {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, + {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, + {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, + {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, + {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, + {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, + {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, + {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, + {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, + {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, + {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, + {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, + {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, + {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, + {"color": [209, 0, 151], 
"isthing": 1, "id": 31, "name": "handbag"}, + {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, + {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, + {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, + {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, + {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, + {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, + {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, + {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, + {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, + {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, + {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, + {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, + {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, + {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, + {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, + {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, + {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, + {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, + {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, + {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, + {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, + {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, + {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, + {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, + {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, + {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, + {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, + {"color": [227, 
255, 205], "isthing": 1, "id": 60, "name": "donut"}, + {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, + {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, + {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, + {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, + {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, + {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, + {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, + {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, + {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, + {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, + {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, + {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, + {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, + {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, + {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, + {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, + {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, + {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, + {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, + {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, + {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, + {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, + {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, + {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, + {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, + {"id": 92, "name": "banner", "supercategory": "textile"}, + {"id": 93, "name": "blanket", "supercategory": "textile"}, + {"id": 94, "name": "branch", 
"supercategory": "plant"}, + {"id": 95, "name": "bridge", "supercategory": "building"}, + {"id": 96, "name": "building-other", "supercategory": "building"}, + {"id": 97, "name": "bush", "supercategory": "plant"}, + {"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"}, + {"id": 99, "name": "cage", "supercategory": "structural"}, + {"id": 100, "name": "cardboard", "supercategory": "raw-material"}, + {"id": 101, "name": "carpet", "supercategory": "floor"}, + {"id": 102, "name": "ceiling-other", "supercategory": "ceiling"}, + {"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"}, + {"id": 104, "name": "cloth", "supercategory": "textile"}, + {"id": 105, "name": "clothes", "supercategory": "textile"}, + {"id": 106, "name": "clouds", "supercategory": "sky"}, + {"id": 107, "name": "counter", "supercategory": "furniture-stuff"}, + {"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"}, + {"id": 109, "name": "curtain", "supercategory": "textile"}, + {"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"}, + {"id": 111, "name": "dirt", "supercategory": "ground"}, + {"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"}, + {"id": 113, "name": "fence", "supercategory": "structural"}, + {"id": 114, "name": "floor-marble", "supercategory": "floor"}, + {"id": 115, "name": "floor-other", "supercategory": "floor"}, + {"id": 116, "name": "floor-stone", "supercategory": "floor"}, + {"id": 117, "name": "floor-tile", "supercategory": "floor"}, + {"id": 118, "name": "floor-wood", "supercategory": "floor"}, + {"id": 119, "name": "flower", "supercategory": "plant"}, + {"id": 120, "name": "fog", "supercategory": "water"}, + {"id": 121, "name": "food-other", "supercategory": "food-stuff"}, + {"id": 122, "name": "fruit", "supercategory": "food-stuff"}, + {"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"}, + {"id": 124, "name": "grass", "supercategory": "plant"}, + {"id": 125, "name": "gravel", 
"supercategory": "ground"}, + {"id": 126, "name": "ground-other", "supercategory": "ground"}, + {"id": 127, "name": "hill", "supercategory": "solid"}, + {"id": 128, "name": "house", "supercategory": "building"}, + {"id": 129, "name": "leaves", "supercategory": "plant"}, + {"id": 130, "name": "light", "supercategory": "furniture-stuff"}, + {"id": 131, "name": "mat", "supercategory": "textile"}, + {"id": 132, "name": "metal", "supercategory": "raw-material"}, + {"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"}, + {"id": 134, "name": "moss", "supercategory": "plant"}, + {"id": 135, "name": "mountain", "supercategory": "solid"}, + {"id": 136, "name": "mud", "supercategory": "ground"}, + {"id": 137, "name": "napkin", "supercategory": "textile"}, + {"id": 138, "name": "net", "supercategory": "structural"}, + {"id": 139, "name": "paper", "supercategory": "raw-material"}, + {"id": 140, "name": "pavement", "supercategory": "ground"}, + {"id": 141, "name": "pillow", "supercategory": "textile"}, + {"id": 142, "name": "plant-other", "supercategory": "plant"}, + {"id": 143, "name": "plastic", "supercategory": "raw-material"}, + {"id": 144, "name": "platform", "supercategory": "ground"}, + {"id": 145, "name": "playingfield", "supercategory": "ground"}, + {"id": 146, "name": "railing", "supercategory": "structural"}, + {"id": 147, "name": "railroad", "supercategory": "ground"}, + {"id": 148, "name": "river", "supercategory": "water"}, + {"id": 149, "name": "road", "supercategory": "ground"}, + {"id": 150, "name": "rock", "supercategory": "solid"}, + {"id": 151, "name": "roof", "supercategory": "building"}, + {"id": 152, "name": "rug", "supercategory": "textile"}, + {"id": 153, "name": "salad", "supercategory": "food-stuff"}, + {"id": 154, "name": "sand", "supercategory": "ground"}, + {"id": 155, "name": "sea", "supercategory": "water"}, + {"id": 156, "name": "shelf", "supercategory": "furniture-stuff"}, + {"id": 157, "name": "sky-other", "supercategory": 
"sky"}, + {"id": 158, "name": "skyscraper", "supercategory": "building"}, + {"id": 159, "name": "snow", "supercategory": "ground"}, + {"id": 160, "name": "solid-other", "supercategory": "solid"}, + {"id": 161, "name": "stairs", "supercategory": "furniture-stuff"}, + {"id": 162, "name": "stone", "supercategory": "solid"}, + {"id": 163, "name": "straw", "supercategory": "plant"}, + {"id": 164, "name": "structural-other", "supercategory": "structural"}, + {"id": 165, "name": "table", "supercategory": "furniture-stuff"}, + {"id": 166, "name": "tent", "supercategory": "building"}, + {"id": 167, "name": "textile-other", "supercategory": "textile"}, + {"id": 168, "name": "towel", "supercategory": "textile"}, + {"id": 169, "name": "tree", "supercategory": "plant"}, + {"id": 170, "name": "vegetable", "supercategory": "food-stuff"}, + {"id": 171, "name": "wall-brick", "supercategory": "wall"}, + {"id": 172, "name": "wall-concrete", "supercategory": "wall"}, + {"id": 173, "name": "wall-other", "supercategory": "wall"}, + {"id": 174, "name": "wall-panel", "supercategory": "wall"}, + {"id": 175, "name": "wall-stone", "supercategory": "wall"}, + {"id": 176, "name": "wall-tile", "supercategory": "wall"}, + {"id": 177, "name": "wall-wood", "supercategory": "wall"}, + {"id": 178, "name": "water-other", "supercategory": "water"}, + {"id": 179, "name": "waterdrops", "supercategory": "water"}, + {"id": 180, "name": "window-blind", "supercategory": "window"}, + {"id": 181, "name": "window-other", "supercategory": "window"}, + {"id": 182, "name": "wood", "supercategory": "solid"}, +] + +def _get_coco_stuff_meta(cat_list): + # Id 0 is reserved for ignore_label, we change ignore_label for 0 + # to 255 in our pre-processing. 
+ stuff_ids = [k["id"] for k in cat_list] + + # For semantic segmentation, this mapping maps from contiguous stuff id + # (in [0, 91], used in models) to ids in the dataset (used for processing results) + stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} + stuff_classes = [k["name"] for k in cat_list] + + ret = { + "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, + "stuff_classes": stuff_classes, + } + return ret + + +def register_all_coco_stuff_10k(root): + root = os.path.join(root, "coco", "coco_stuff_10k") + meta = _get_coco_stuff_meta(COCO_CATEGORIES) + for name, image_dirname, sem_seg_dirname in [ + ("train", "images_detectron2/train", "annotations_detectron2/train"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + name = f"coco_2017_{name}_stuff_10k_sem_seg" + DatasetCatalog.register( + name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="png", image_ext="jpg" + ), + ) + MetadataCatalog.get(name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=255, + **meta, + ) + + +def register_all_coco_stuff(root): + root = os.path.join(root, "coco") + meta = _get_coco_stuff_meta(COCO_CATEGORIES) + + for name, image_dirname, sem_seg_dirname in [ + ("train", "train2017", "stuffthingmaps_detectron2/train2017"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + all_name = f"coco_2017_{name}_stuff_sem_seg" + DatasetCatalog.register( + all_name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="png", image_ext="jpg" + ), + ) + MetadataCatalog.get(all_name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=255, + **meta, + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_coco_stuff_10k(_root) +register_all_coco_stuff(_root) diff --git 
a/open_vocab_seg/data/datasets/register_pascal_context.py b/open_vocab_seg/data/datasets/register_pascal_context.py new file mode 100644 index 0000000000000000000000000000000000000000..e40f87c945da20e78c0a3ea230bc9f36d1800071 --- /dev/null +++ b/open_vocab_seg/data/datasets/register_pascal_context.py @@ -0,0 +1,588 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + +PASCALCONTEX59_NAMES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "table", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", + "bag", + "bed", + "bench", + "book", + "building", + "cabinet", + "ceiling", + "cloth", + "computer", + "cup", + "door", + "fence", + "floor", + "flower", + "food", + "grass", + "ground", + "keyboard", + "light", + "mountain", + "mouse", + "curtain", + "platform", + "sign", + "plate", + "road", + "rock", + "shelves", + "sidewalk", + "sky", + "snow", + "bedclothes", + "track", + "tree", + "truck", + "wall", + "water", + "window", + "wood", +) + +PASCALCONTEX459_NAMES = ( + "accordion", + "aeroplane", + "air conditioner", + "antenna", + "artillery", + "ashtray", + "atrium", + "baby carriage", + "bag", + "ball", + "balloon", + "bamboo weaving", + "barrel", + "baseball bat", + "basket", + "basketball backboard", + "bathtub", + "bed", + "bedclothes", + "beer", + "bell", + "bench", + "bicycle", + "binoculars", + "bird", + "bird cage", + "bird feeder", + "bird nest", + "blackboard", + "board", + "boat", + "bone", + "book", + "bottle", + "bottle opener", + "bowl", + "box", + "bracelet", + "brick", + "bridge", + "broom", + "brush", + "bucket", + "building", + "bus", + "cabinet", + "cabinet door", + "cage", + "cake", + "calculator", + "calendar", + "camel", + "camera", + "camera lens", + "can", + "candle", + "candle holder", + "cap", 
+ "car", + "card", + "cart", + "case", + "casette recorder", + "cash register", + "cat", + "cd", + "cd player", + "ceiling", + "cell phone", + "cello", + "chain", + "chair", + "chessboard", + "chicken", + "chopstick", + "clip", + "clippers", + "clock", + "closet", + "cloth", + "clothes tree", + "coffee", + "coffee machine", + "comb", + "computer", + "concrete", + "cone", + "container", + "control booth", + "controller", + "cooker", + "copying machine", + "coral", + "cork", + "corkscrew", + "counter", + "court", + "cow", + "crabstick", + "crane", + "crate", + "cross", + "crutch", + "cup", + "curtain", + "cushion", + "cutting board", + "dais", + "disc", + "disc case", + "dishwasher", + "dock", + "dog", + "dolphin", + "door", + "drainer", + "dray", + "drink dispenser", + "drinking machine", + "drop", + "drug", + "drum", + "drum kit", + "duck", + "dumbbell", + "earphone", + "earrings", + "egg", + "electric fan", + "electric iron", + "electric pot", + "electric saw", + "electronic keyboard", + "engine", + "envelope", + "equipment", + "escalator", + "exhibition booth", + "extinguisher", + "eyeglass", + "fan", + "faucet", + "fax machine", + "fence", + "ferris wheel", + "fire extinguisher", + "fire hydrant", + "fire place", + "fish", + "fish tank", + "fishbowl", + "fishing net", + "fishing pole", + "flag", + "flagstaff", + "flame", + "flashlight", + "floor", + "flower", + "fly", + "foam", + "food", + "footbridge", + "forceps", + "fork", + "forklift", + "fountain", + "fox", + "frame", + "fridge", + "frog", + "fruit", + "funnel", + "furnace", + "game controller", + "game machine", + "gas cylinder", + "gas hood", + "gas stove", + "gift box", + "glass", + "glass marble", + "globe", + "glove", + "goal", + "grandstand", + "grass", + "gravestone", + "ground", + "guardrail", + "guitar", + "gun", + "hammer", + "hand cart", + "handle", + "handrail", + "hanger", + "hard disk drive", + "hat", + "hay", + "headphone", + "heater", + "helicopter", + "helmet", + "holder", + "hook", + 
"horse", + "horse-drawn carriage", + "hot-air balloon", + "hydrovalve", + "ice", + "inflator pump", + "ipod", + "iron", + "ironing board", + "jar", + "kart", + "kettle", + "key", + "keyboard", + "kitchen range", + "kite", + "knife", + "knife block", + "ladder", + "ladder truck", + "ladle", + "laptop", + "leaves", + "lid", + "life buoy", + "light", + "light bulb", + "lighter", + "line", + "lion", + "lobster", + "lock", + "machine", + "mailbox", + "mannequin", + "map", + "mask", + "mat", + "match book", + "mattress", + "menu", + "metal", + "meter box", + "microphone", + "microwave", + "mirror", + "missile", + "model", + "money", + "monkey", + "mop", + "motorbike", + "mountain", + "mouse", + "mouse pad", + "musical instrument", + "napkin", + "net", + "newspaper", + "oar", + "ornament", + "outlet", + "oven", + "oxygen bottle", + "pack", + "pan", + "paper", + "paper box", + "paper cutter", + "parachute", + "parasol", + "parterre", + "patio", + "pelage", + "pen", + "pen container", + "pencil", + "person", + "photo", + "piano", + "picture", + "pig", + "pillar", + "pillow", + "pipe", + "pitcher", + "plant", + "plastic", + "plate", + "platform", + "player", + "playground", + "pliers", + "plume", + "poker", + "poker chip", + "pole", + "pool table", + "postcard", + "poster", + "pot", + "pottedplant", + "printer", + "projector", + "pumpkin", + "rabbit", + "racket", + "radiator", + "radio", + "rail", + "rake", + "ramp", + "range hood", + "receiver", + "recorder", + "recreational machines", + "remote control", + "road", + "robot", + "rock", + "rocket", + "rocking horse", + "rope", + "rug", + "ruler", + "runway", + "saddle", + "sand", + "saw", + "scale", + "scanner", + "scissors", + "scoop", + "screen", + "screwdriver", + "sculpture", + "scythe", + "sewer", + "sewing machine", + "shed", + "sheep", + "shell", + "shelves", + "shoe", + "shopping cart", + "shovel", + "sidecar", + "sidewalk", + "sign", + "signal light", + "sink", + "skateboard", + "ski", + "sky", + "sled", + 
"slippers", + "smoke", + "snail", + "snake", + "snow", + "snowmobiles", + "sofa", + "spanner", + "spatula", + "speaker", + "speed bump", + "spice container", + "spoon", + "sprayer", + "squirrel", + "stage", + "stair", + "stapler", + "stick", + "sticky note", + "stone", + "stool", + "stove", + "straw", + "stretcher", + "sun", + "sunglass", + "sunshade", + "surveillance camera", + "swan", + "sweeper", + "swim ring", + "swimming pool", + "swing", + "switch", + "table", + "tableware", + "tank", + "tap", + "tape", + "tarp", + "telephone", + "telephone booth", + "tent", + "tire", + "toaster", + "toilet", + "tong", + "tool", + "toothbrush", + "towel", + "toy", + "toy car", + "track", + "train", + "trampoline", + "trash bin", + "tray", + "tree", + "tricycle", + "tripod", + "trophy", + "truck", + "tube", + "turtle", + "tvmonitor", + "tweezers", + "typewriter", + "umbrella", + "unknown", + "vacuum cleaner", + "vending machine", + "video camera", + "video game console", + "video player", + "video tape", + "violin", + "wakeboard", + "wall", + "wallet", + "wardrobe", + "washing machine", + "watch", + "water", + "water dispenser", + "water pipe", + "water skate board", + "watermelon", + "whale", + "wharf", + "wheel", + "wheelchair", + "window", + "window blinds", + "wineglass", + "wire", + "wood", + "wool", + +) + + +def _get_voc_meta(cat_list): + ret = { + "stuff_classes": cat_list, + } + return ret + + +def register_pascal_context_59(root): + root = os.path.join(root, "VOCdevkit/VOC2010") + meta = _get_voc_meta(PASCALCONTEX59_NAMES) + for name, image_dirname, sem_seg_dirname in [ + ("val", "JPEGImages", "annotations_detectron2/pc59_val"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + all_name = f"pascal_context_59_sem_seg_{name}" + DatasetCatalog.register( + all_name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="png", image_ext="jpg" + ), + ) + MetadataCatalog.get(all_name).set( + image_root=image_dir, + 
sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=255, + **meta, + ) + +def register_pascal_context_459(root): + root = os.path.join(root, "VOCdevkit/VOC2010") + meta = _get_voc_meta(PASCALCONTEX459_NAMES) + for name, image_dirname, sem_seg_dirname in [ + ("val", "JPEGImages", "annotations_detectron2/pc459_val"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + all_name = f"pascal_context_459_sem_seg_{name}" + DatasetCatalog.register( + all_name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="tif", image_ext="jpg" + ), + ) + MetadataCatalog.get(all_name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=65535, # NOTE: gt is saved in 16-bit TIFF images + **meta, + ) + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_pascal_context_59(_root) +register_pascal_context_459(_root) diff --git a/open_vocab_seg/data/datasets/register_voc_seg.py b/open_vocab_seg/data/datasets/register_voc_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..b8c2be16f4bb5348de8f1051f3579e02e362488f --- /dev/null +++ b/open_vocab_seg/data/datasets/register_voc_seg.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + +PASCALVOC20_NAMES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +) + +def _get_voc_meta(cat_list): + ret = { + "stuff_classes": cat_list, + } + return ret + + +def register_pascalvoc(root): + root = os.path.join(root, "VOCdevkit/VOC2012") + meta = _get_voc_meta(PASCALVOC20_NAMES) + + for name, image_dirname, sem_seg_dirname in [ + ("val", "JPEGImages", "annotations_detectron2/val"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + all_name = f"pascalvoc20_sem_seg_{name}" + DatasetCatalog.register( + all_name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="png", image_ext="jpg" + ), + ) + MetadataCatalog.get(all_name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=255, + **meta, + ) + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_pascalvoc(_root) diff --git a/open_vocab_seg/evaluation/__init__.py b/open_vocab_seg/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b9d36d8e9659a1d31471273a6a0f82c2642ea982 --- /dev/null +++ b/open_vocab_seg/evaluation/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

import itertools
import json
import numpy as np
import os
from collections import OrderedDict
import PIL.Image as Image
import torch

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.comm import all_gather, is_main_process, synchronize
from detectron2.utils.file_io import PathManager

from detectron2.evaluation import SemSegEvaluator


class GeneralizedSemSegEvaluator(SemSegEvaluator):
    """
    Evaluate semantic segmentation metrics.

    Extends detectron2's SemSegEvaluator with an optional post-processing
    hook applied to each prediction and optional per-subset metric
    breakdowns driven by the dataset's ``evaluation_set`` metadata
    (e.g. seen vs. unseen categories in open-vocabulary settings).
    """

    def __init__(
        self,
        dataset_name,
        distributed=True,
        output_dir=None,
        *,
        num_classes=None,
        ignore_label=None,
        post_process_func=None,
    ):
        """
        Args:
            dataset_name (str): name of a registered dataset.
            distributed (bool): gather results from all ranks before evaluating.
            output_dir (str): optional directory for prediction/eval dumps.
            num_classes, ignore_label: forwarded to SemSegEvaluator.
            post_process_func (callable): optional ``f(sem_seg, image=...)``
                applied to each raw prediction; defaults to identity.
        """
        super().__init__(
            dataset_name,
            distributed=distributed,
            output_dir=output_dir,
            num_classes=num_classes,
            ignore_label=ignore_label,
        )
        meta = MetadataCatalog.get(dataset_name)
        # `evaluation_set` (optional metadata) maps a set name to the class
        # indices belonging to that set; absent for ordinary datasets.
        try:
            self._evaluation_set = meta.evaluation_set
        except AttributeError:
            self._evaluation_set = None
        self.post_process_func = (
            post_process_func
            if post_process_func is not None
            else lambda x, **kwargs: x
        )

    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to a model.
                It is a list of dicts. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name".
            outputs: the outputs of a model. It is either list of semantic
                segmentation predictions (Tensor [H, W]) or list of dicts with
                key "sem_seg" that contains semantic segmentation prediction
                in the same format.
        """
        for input, output in zip(inputs, outputs):
            output = self.post_process_func(
                output["sem_seg"], image=np.array(Image.open(input["file_name"]))
            )
            output = output.argmax(dim=0).to(self._cpu_device)
            # NOTE: np.int/np.float/np.bool aliases were removed in NumPy 1.24;
            # the builtin types are exact drop-in replacements here.
            pred = np.array(output, dtype=int)
            with PathManager.open(
                self.input_file_to_gt_file[input["file_name"]], "rb"
            ) as f:
                gt = np.array(Image.open(f), dtype=int)

            # Ignored pixels are folded into an extra (num_classes-th) bin of
            # the confusion matrix so they drop out of the per-class stats.
            gt[gt == self._ignore_label] = self._num_classes

            self._conf_matrix += np.bincount(
                (self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
                minlength=self._conf_matrix.size,
            ).reshape(self._conf_matrix.shape)

            self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))

    def evaluate(self):
        """
        Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

        * Mean intersection-over-union averaged across classes (mIoU)
        * Frequency Weighted IoU (fwIoU)
        * Mean pixel accuracy averaged across classes (mACC)
        * Pixel Accuracy (pACC)

        Returns:
            OrderedDict with a "sem_seg" entry mapping metric names to values,
            or None on non-main ranks in distributed mode.
        """
        if self._distributed:
            synchronize()
            conf_matrix_list = all_gather(self._conf_matrix)
            self._predictions = all_gather(self._predictions)
            self._predictions = list(itertools.chain(*self._predictions))
            if not is_main_process():
                return

            self._conf_matrix = np.zeros_like(self._conf_matrix)
            for conf_matrix in conf_matrix_list:
                self._conf_matrix += conf_matrix

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(self._predictions))

        # The last row/column of the confusion matrix holds ignored pixels and
        # is excluded from all statistics.
        acc = np.full(self._num_classes, np.nan, dtype=float)
        iou = np.full(self._num_classes, np.nan, dtype=float)
        tp = self._conf_matrix.diagonal()[:-1].astype(float)
        pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
        class_weights = pos_gt / np.sum(pos_gt)
        pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
        acc_valid = pos_gt > 0
        acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
        iou_valid = (pos_gt + pos_pred) > 0
        union = pos_gt + pos_pred - tp
        # NOTE(review): indexing with acc_valid (not iou_valid) mirrors
        # detectron2's SemSegEvaluator; kept as-is for parity.
        iou[acc_valid] = tp[acc_valid] / union[acc_valid]
        macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
        miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
        fiou = np.sum(iou[acc_valid] * class_weights[acc_valid])
        pacc = np.sum(tp) / np.sum(pos_gt)

        res = {}
        res["mIoU"] = 100 * miou
        res["fwIoU"] = 100 * fiou
        for i, name in enumerate(self._class_names):
            res["IoU-{}".format(name)] = 100 * iou[i]
        res["mACC"] = 100 * macc
        res["pACC"] = 100 * pacc
        for i, name in enumerate(self._class_names):
            res["ACC-{}".format(name)] = 100 * acc[i]
        if self._evaluation_set is not None:
            # Report metrics separately for each named subset and its
            # complement ("un<set>"), plus their harmonic mean (hIoU).
            for set_name, set_inds in self._evaluation_set.items():
                iou_list = []
                set_inds = np.array(set_inds, dtype=int)
                mask = np.zeros((len(iou),)).astype(bool)
                mask[set_inds] = 1
                miou = np.sum(iou[mask][acc_valid[mask]]) / np.sum(iou_valid[mask])
                pacc = np.sum(tp[mask]) / np.sum(pos_gt[mask])
                res["mIoU-{}".format(set_name)] = 100 * miou
                res["pAcc-{}".format(set_name)] = 100 * pacc
                iou_list.append(miou)
                miou = np.sum(iou[~mask][acc_valid[~mask]]) / np.sum(iou_valid[~mask])
                pacc = np.sum(tp[~mask]) / np.sum(pos_gt[~mask])
                res["mIoU-un{}".format(set_name)] = 100 * miou
                res["pAcc-un{}".format(set_name)] = 100 * pacc
                iou_list.append(miou)
                res["hIoU-{}".format(set_name)] = (
                    100 * len(iou_list) / sum([1 / iou for iou in iou_list])
                )
        if self._output_dir:
            file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(res, f)
        results = OrderedDict({"sem_seg": res})
        self._logger.info(results)
        return results
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

from typing import Tuple

import torch
from torch import nn
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
from detectron2.modeling.backbone import Backbone
from detectron2.modeling.postprocessing import sem_seg_postprocess
from detectron2.structures import ImageList

from .modeling.criterion import SetCriterion
from .modeling.matcher import HungarianMatcher


@META_ARCH_REGISTRY.register()
class MaskFormer(nn.Module):
    """
    Main class for mask classification semantic segmentation architectures.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        criterion: nn.Module,
        num_queries: int,
        panoptic_on: bool,
        object_mask_threshold: float,
        overlap_threshold: float,
        metadata,
        size_divisibility: int,
        sem_seg_postprocess_before_inference: bool,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            criterion: a module that defines the loss
            num_queries: int, number of queries
            panoptic_on: bool, whether to output panoptic segmentation prediction
            object_mask_threshold: float, threshold to filter query based on classification score
                for panoptic segmentation inference
            overlap_threshold: overlap threshold used in general inference for panoptic segmentation
            metadata: dataset meta, get `thing` and `stuff` category names for panoptic
                segmentation inference
            size_divisibility: Some backbones require the input height and width to be divisible by a
                specific integer. We can use this to override such requirement.
            sem_seg_postprocess_before_inference: whether to resize the prediction back
                to original input size before semantic segmentation inference or after.
                For high-resolution dataset like Mapillary, resizing predictions before
                inference will cause OOM error.
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__()
        self.backbone = backbone
        self.sem_seg_head = sem_seg_head
        self.criterion = criterion
        self.num_queries = num_queries
        self.overlap_threshold = overlap_threshold
        self.panoptic_on = panoptic_on
        self.object_mask_threshold = object_mask_threshold
        self.metadata = metadata
        if size_divisibility < 0:
            # use backbone size_divisibility if not set
            size_divisibility = self.backbone.size_divisibility
        self.size_divisibility = size_divisibility
        self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
        # Registered as (non-persistent) buffers so they follow the module's
        # device/dtype moves without being saved into checkpoints.
        self.register_buffer(
            "pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False
        )
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

    @classmethod
    def from_config(cls, cfg):
        """Translate a detectron2 config node into constructor kwargs."""
        backbone = build_backbone(cfg)
        sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())

        # Loss parameters:
        deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT
        dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT
        mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT

        # building criterion
        matcher = HungarianMatcher(
            cost_class=1,
            cost_mask=mask_weight,
            cost_dice=dice_weight,
        )

        weight_dict = {"loss_ce": 1, "loss_mask": mask_weight, "loss_dice": dice_weight}
        if deep_supervision:
            # Replicate each loss weight once per auxiliary decoder layer
            # (keys are suffixed "_0", "_1", ... to match the criterion).
            dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS
            aux_weight_dict = {}
            for i in range(dec_layers - 1):
                aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
            weight_dict.update(aux_weight_dict)

        losses = ["labels", "masks"]

        criterion = SetCriterion(
            sem_seg_head.num_classes,
            matcher=matcher,
            weight_dict=weight_dict,
            eos_coef=no_object_weight,
            losses=losses,
        )

        return {
            "backbone": backbone,
            "sem_seg_head": sem_seg_head,
            "criterion": criterion,
            "num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES,
            "panoptic_on": cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON,
            "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD,
            "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD,
            "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
            "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY,
            "sem_seg_postprocess_before_inference": (
                cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE
                or cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON
            ),
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        """Device the model currently lives on (tracked via the mean buffer)."""
        return self.pixel_mean.device

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:
                   * "image": Tensor, image in (C, H, W) format.
                   * "instances": per-region ground truth
                   * Other information that's included in the original dicts, such as:
                     "height", "width" (int): the output resolution of the model (may be different
                     from input resolution), used in inference.
        Returns:
            list[dict]:
                each dict has the results for one image. The dict contains the following keys:

                * "sem_seg":
                    A Tensor that represents the
                    per-pixel segmentation prediced by the head.
                    The prediction has shape KxHxW that represents the logits of
                    each class for each pixel.
                * "panoptic_seg":
                    A tuple that represent panoptic output
                    panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
                    segments_info (list[dict]): Describe each segment in `panoptic_seg`.
                    Each dict contains keys "id", "category_id", "isthing".
        """
        # Normalize each image, then pad to a common size respecting the
        # backbone's size divisibility.
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.size_divisibility)

        features = self.backbone(images.tensor)
        outputs = self.sem_seg_head(features)

        if self.training:
            # mask classification target
            if "instances" in batched_inputs[0]:
                gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
                targets = self.prepare_targets(gt_instances, images)
            else:
                targets = None

            # bipartite matching-based loss
            losses = self.criterion(outputs, targets)

            for k in list(losses.keys()):
                if k in self.criterion.weight_dict:
                    losses[k] *= self.criterion.weight_dict[k]
                else:
                    # remove this loss if not specified in `weight_dict`
                    losses.pop(k)

            return losses
        else:
            mask_cls_results = outputs["pred_logits"]
            mask_pred_results = outputs["pred_masks"]
            # upsample masks
            mask_pred_results = F.interpolate(
                mask_pred_results,
                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
                mode="bilinear",
                align_corners=False,
            )

            processed_results = []
            for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
                mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
            ):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])

                if self.sem_seg_postprocess_before_inference:
                    mask_pred_result = sem_seg_postprocess(
                        mask_pred_result, image_size, height, width
                    )

                # semantic segmentation inference
                r = self.semantic_inference(mask_cls_result, mask_pred_result)
                if not self.sem_seg_postprocess_before_inference:
                    r = sem_seg_postprocess(r, image_size, height, width)
                processed_results.append({"sem_seg": r})

                # panoptic segmentation inference
                if self.panoptic_on:
                    panoptic_r = self.panoptic_inference(
                        mask_cls_result, mask_pred_result
                    )
                    processed_results[-1]["panoptic_seg"] = panoptic_r

            return processed_results

    def prepare_targets(self, targets, images):
        """Pad per-image ground-truth masks to the padded batch resolution.

        Returns one dict per image with "labels" (class ids) and "masks"
        (masks zero-padded to the batch's padded H x W).
        """
        h, w = images.tensor.shape[-2:]
        new_targets = []
        for targets_per_image in targets:
            # pad gt
            gt_masks = targets_per_image.gt_masks
            padded_masks = torch.zeros(
                (gt_masks.shape[0], h, w), dtype=gt_masks.dtype, device=gt_masks.device
            )
            padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
            new_targets.append(
                {
                    "labels": targets_per_image.gt_classes,
                    "masks": padded_masks,
                }
            )
        return new_targets

    def semantic_inference(self, mask_cls, mask_pred):
        """Combine per-query class scores and masks into per-pixel class logits.

        Drops the last ("no object") class column, then marginalizes over the
        queries: semseg[c, h, w] = sum_q cls[q, c] * mask[q, h, w].
        """
        mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
        mask_pred = mask_pred.sigmoid()
        semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
        return semseg
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

from collections import OrderedDict
import torch
import torch.nn as nn
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec


class Bottleneck(nn.Module):
    """CLIP-style anti-aliased ResNet bottleneck block."""

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1):
        """
        Args:
            inplanes (int): number of input channels.
            planes (int): bottleneck width; the block outputs
                ``planes * expansion`` channels.
            stride (int): spatial stride, realized via an AvgPool2d
                (anti-aliasing) rather than a strided convolution.
            dilation (int): dilation of the 3x3 convolution.
        """
        super().__init__()

        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(
            planes, planes, 3, padding=1 * dilation, bias=False, dilation=dilation
        )
        self.bn2 = nn.BatchNorm2d(planes)

        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
            self.downsample = nn.Sequential(
                OrderedDict(
                    [
                        ("-1", nn.AvgPool2d(stride)),
                        (
                            "0",
                            nn.Conv2d(
                                inplanes,
                                planes * self.expansion,
                                1,
                                stride=1,
                                bias=False,
                            ),
                        ),
                        ("1", nn.BatchNorm2d(planes * self.expansion)),
                    ]
                )
            )

    def forward(self, x: torch.Tensor):
        identity = x

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)
        return out


class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    # NOTE: default arguments are tuples (immutable) instead of lists to avoid
    # the shared-mutable-default pitfall; they are only read, never mutated.
    def __init__(self, layers, width=64, strides=(2, 1, 2, 2, 2), multi_grid=(1, 1, 1)):
        """
        Args:
            layers (sequence[int]): number of blocks in each of the 4 stages.
            width (int): base channel width of the network.
            strides (sequence[int]): strides of [stem, layer1..layer4].
            multi_grid (sequence[int]): per-block dilations for layer4.
        """
        super().__init__()

        # the 3-layer stem
        self.conv1 = nn.Conv2d(
            3, width // 2, kernel_size=3, stride=2, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.conv2 = nn.Conv2d(
            width // 2, width // 2, kernel_size=3, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(strides[0]) if strides[0] > 1 else nn.Identity()
        self.relu = nn.ReLU(inplace=True)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0], stride=strides[1])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=strides[2])
        self.layer3 = self._make_layer(width * 4, layers[2], stride=strides[3])
        self.layer4 = self._make_layer(
            width * 8, layers[3], stride=strides[4], dilations=multi_grid
        )
        self.num_features = [width * 4, width * 8, width * 16, width * 32]

    def _make_layer(self, planes, blocks, stride=1, dilations=None):
        """Stack `blocks` Bottlenecks; only the first one carries the stride."""
        if dilations is None:
            dilations = [1] * blocks
        layers = [Bottleneck(self._inplanes, planes, stride, dilation=dilations[0])]
        self._inplanes = planes * Bottleneck.expansion

        for i in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes, dilation=dilations[i]))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return a dict of multi-scale features {"res2": ..., ..., "res5": ...}."""

        def stem(x):
            for conv, bn in [
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
            ]:
                x = self.relu(bn(conv(x)))
            x = self.avgpool(x)
            return x

        output = {}
        # Match the stem weights' dtype (e.g. fp16 backbones).
        x = x.type(self.conv1.weight.dtype)
        x = stem(x)  # 1/4,1/4
        x = self.layer1(x)
        output["res2"] = x
        x = self.layer2(x)  # 1/8,1/8
        output["res3"] = x
        x = self.layer3(x)  # 1/16,1/16
        output["res4"] = x
        x = self.layer4(x)  # 1/32,1/32
        output["res5"] = x
        return output


@BACKBONE_REGISTRY.register()
class D2ModifiedResNet(ModifiedResNet, Backbone):
    """detectron2 Backbone wrapper around ModifiedResNet, built from a config."""

    def __init__(self, cfg, input_shape):
        depth = cfg.MODEL.RESNETS.DEPTH
        num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
        width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
        bottleneck_channels = num_groups * width_per_group
        num_blocks_per_stage = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }[depth]
        strides = [2, 1, 2, 2, 2]
        multi_grid = cfg.MODEL.RESNETS.RES5_MULTI_GRID
        if cfg.MODEL.RESNETS.STEM_TYPE == "deeplab":
            # DeepLab-style stem keeps full resolution after the stem.
            strides = [1, 1, 2, 2, 2]
        super().__init__(
            num_blocks_per_stage,
            bottleneck_channels,
            strides=strides,
            multi_grid=multi_grid,
        )
        self._out_features = cfg.MODEL.RESNETS.OUT_FEATURES

        self._out_feature_strides = {
            "res2": 4,
            "res3": 8,
            "res4": 16,
            "res5": 32,
        }
        self._out_feature_channels = {
            "res2": self.num_features[0],
            "res3": self.num_features[1],
            "res4": self.num_features[2],
            "res5": self.num_features[3],
        }

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        outputs = {}
        y = super().forward(x)
        for k in y.keys():
            if k in self._out_features:
                outputs[k] = y[k]
        return outputs

    def output_shape(self):
        """Describe channel count and stride of each requested output feature."""
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
            for name in self._out_features
        }

    @property
    def size_divisibility(self):
        # The deepest stride produced by the network.
        return 32
All Rights Reserved

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec


class Mlp(nn.Module):
    """Two-layer feed-forward network (Linear -> act -> Linear).

    Used as the FFN inside each Swin Transformer block; the same dropout
    module is applied after both projections.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        # hidden/output widths default to the input width when not given
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """Split a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C); H and W must be divisible by ``window_size``
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = (
        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    )
    return windows


def window_reverse(windows, window_size, H, W):
    """Inverse of :func:`window_partition`: stitch windows back into a map.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    # recover the batch size from the total number of windows
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(
        B, H // window_size, W // window_size, window_size, window_size, -1
    )
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class WindowAttention(nn.Module):
    """Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.
    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias:
        # one learnable bias per (relative offset, head). Offsets range over
        # [-(Wh-1), Wh-1] x [-(Ww-1), Ww-1], hence (2*Wh-1)*(2*Ww-1) rows.
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (
            coords_flatten[:, :, None] - coords_flatten[:, None, :]
        )  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(
            1, 2, 0
        ).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        # fold the 2-D offset into a single flat index into the bias table
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        # buffer, not a parameter: fixed lookup indices saved with the module
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=0.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        # project to q,k,v in one matmul, then split heads
        qkv = (
            self.qkv(x)
            .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = (
            qkv[0],
            qkv[1],
            qkv[2],
        )  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = q @ k.transpose(-2, -1)

        # gather the learned bias for every token pair in the window
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)
        ].view(
            self.window_size[0] * self.window_size[1],
            self.window_size[0] * self.window_size[1],
            -1,
        )  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1
        ).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # broadcast the per-window shift mask over batch and heads
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
                1
            ).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class SwinTransformerBlock(nn.Module):
    """Swin Transformer Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate.
Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert ( + 0 <= self.shift_size < self.window_size + ), "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. 
+ """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll( + x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2) + ) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask + ) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2) + ) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. 
+ Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) + else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. 
Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(nn.Module): + """Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. 
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + norm_indices=None, + frozen_stages=-1, + use_checkpoint=False, + projection=False, + project_dim=256, + ): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.norm_indices = norm_indices if norm_indices is not None else out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) + ) + trunc_normal_(self.absolute_pos_embed, 
std=0.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2 ** i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + ) + self.layers.append(layer) + + num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in self.norm_indices: + if i_layer >= len(self.num_features): + continue + layer = norm_layer(num_features[i_layer]) + layer_name = f"norm{i_layer}" + self.add_module(layer_name, layer) + # add projector head + self.projection = projection + if projection: + self.project_dim = project_dim + self.norm = norm_layer(self.num_features[-1]) + self.projector = nn.Linear(self.num_features[-1], project_dim, bias=False) + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = {} + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + if i in self.norm_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + out = ( + x_out.view(-1, H, W, self.num_features[i]) + .permute(0, 3, 1, 2) + .contiguous() + ) + outs["res{}".format(i + 2)] = out + if self.projection: + x_out = self.norm(x_out) + x_out = x_out.view(-1, H, W, self.num_features[-1]).contiguous() + outs["fc"] = self.projector(x_out).permute(0, 3, 1, 2) + + return outs + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() + + +@BACKBONE_REGISTRY.register() +class D2SwinTransformer(SwinTransformer, Backbone): + def __init__(self, cfg, input_shape): + + pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE + patch_size = cfg.MODEL.SWIN.PATCH_SIZE + in_chans = 3 + embed_dim = cfg.MODEL.SWIN.EMBED_DIM + depths = cfg.MODEL.SWIN.DEPTHS + num_heads = cfg.MODEL.SWIN.NUM_HEADS + window_size = cfg.MODEL.SWIN.WINDOW_SIZE + mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO + qkv_bias = cfg.MODEL.SWIN.QKV_BIAS + qk_scale = cfg.MODEL.SWIN.QK_SCALE + drop_rate = cfg.MODEL.SWIN.DROP_RATE 
+ attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE + drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE + norm_layer = nn.LayerNorm + ape = cfg.MODEL.SWIN.APE + patch_norm = cfg.MODEL.SWIN.PATCH_NORM + norm_indices = cfg.MODEL.SWIN.NORM_INDICES + projection = cfg.MODEL.SWIN.PROJECTION + project_dim = cfg.MODEL.SWIN.PROJECT_DIM + super().__init__( + pretrain_img_size, + patch_size, + in_chans, + embed_dim, + depths, + num_heads, + window_size, + mlp_ratio, + qkv_bias, + qk_scale, + drop_rate, + attn_drop_rate, + drop_path_rate, + norm_layer, + ape, + patch_norm, + norm_indices=norm_indices, + projection=projection, + project_dim=project_dim, + ) + + self._out_features = cfg.MODEL.SWIN.OUT_FEATURES + + self._out_feature_strides = { + "res2": 4, + "res3": 8, + "res4": 16, + "res5": 32, + "fc": 32, + } + self._out_feature_channels = { + "res2": self.num_features[0], + "res3": self.num_features[1], + "res4": self.num_features[2], + "res5": self.num_features[3], + "fc": self.num_features[3], + } + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert ( + x.dim() == 4 + ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
+ outputs = {} + y = super().forward(x) + for k in y.keys(): + if k in self._out_features: + outputs[k] = y[k] + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], + stride=self._out_feature_strides[name], + ) + for name in self._out_features + } + + @property + def size_divisibility(self): + return 32 diff --git a/open_vocab_seg/modeling/clip_adapter/__init__.py b/open_vocab_seg/modeling/clip_adapter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c880f121e329e0fc2bb31de5aa8240b44b4a25a --- /dev/null +++ b/open_vocab_seg/modeling/clip_adapter/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from .text_template import ( + PredefinedPromptExtractor, + ImageNetPromptExtractor, + VILDPromptExtractor, +) +from .adapter import ClipAdapter, MaskFormerClipAdapter + + +def build_text_prompt(cfg): + if cfg.TEXT_TEMPLATES == "predefined": + text_templates = PredefinedPromptExtractor(cfg.PREDEFINED_PROMPT_TEMPLATES) + elif cfg.TEXT_TEMPLATES == "imagenet": + text_templates = ImageNetPromptExtractor() + elif cfg.TEXT_TEMPLATES == "vild": + text_templates = VILDPromptExtractor() + else: + raise NotImplementedError( + "Prompt learner {} is not supported".format(cfg.TEXT_TEMPLATES) + ) + return text_templates diff --git a/open_vocab_seg/modeling/clip_adapter/adapter.py b/open_vocab_seg/modeling/clip_adapter/adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..864d20b160714865b4130fab8714f323aaae2572 --- /dev/null +++ b/open_vocab_seg/modeling/clip_adapter/adapter.py @@ -0,0 +1,206 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved
# Modified by Feng Liang from
# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/modeling/clip_adapter/adapter.py

from typing import List
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.structures import BitMasks
from .utils import build_clip_model, crop_with_mask
from .text_template import PromptExtractor


# CLIP's image normalization statistics (RGB, on the [0, 1] scale).
PIXEL_MEAN = (0.48145466, 0.4578275, 0.40821073)
PIXEL_STD = (0.26862954, 0.26130258, 0.27577711)


class ClipAdapter(nn.Module):
    """Wraps a CLIP model to score images against class names.

    Class names are encoded through prompt templates into text features,
    images through CLIP's visual encoder; both are L2-normalized so their
    dot product is a cosine similarity.
    """

    def __init__(self, clip_model_name: str, mask_prompt_depth: int, text_templates: PromptExtractor):
        super().__init__()
        self.clip_model = build_clip_model(clip_model_name, mask_prompt_depth)
        self.text_templates = text_templates
        self.text_templates.init_buffer(self.clip_model)
        # cache: class name -> text embedding, so each name is encoded once
        self.text_feature_buffer = {}

    def forward(self, image: torch.Tensor, text: List[str], **kwargs):
        """Return similarity logits between ``image`` and each name in ``text``."""
        # NOTE(review): the base _preprocess_image takes no extra kwargs;
        # **kwargs appears to exist for subclass overrides — confirm callers.
        image = self._preprocess_image(image, **kwargs)
        text_feature = self.get_text_features(text)  # k,feat_dim
        image_features = self.get_image_features(image)
        return self.get_sim_logits(text_feature, image_features)

    def _preprocess_image(self, image: torch.Tensor):
        # base adapter performs no preprocessing; subclasses override this
        return image

    def _get_text_features(self, noun_list: List[str]):
        # encode only names not cached yet; everything else is reused
        left_noun_list = [
            noun for noun in noun_list if noun not in self.text_feature_buffer
        ]
        if len(left_noun_list) > 0:
            left_text_features = self.text_templates(
                left_noun_list, self.clip_model
            )
            self.text_feature_buffer.update(
                {
                    noun: text_feature
                    for noun, text_feature in zip(
                        left_noun_list, left_text_features
                    )
                }
            )
        # return features in the caller's requested order
        return torch.stack([self.text_feature_buffer[noun] for noun in noun_list])


    def get_text_features(self, noun_list: List[str]):
        return self._get_text_features(noun_list)

    def get_image_features(self, image: torch.Tensor):
        # L2-normalize so downstream dot products are cosine similarities
        image_features = self.clip_model.visual(image)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features

+ def get_sim_logits( + self, + text_features: torch.Tensor, + image_features: torch.Tensor, + temperature: float = 100, + ): + return temperature * image_features @ text_features.T + + def normalize_feature(self, feat: torch.Tensor): + return feat / feat.norm(dim=-1, keepdim=True) + + +class MaskFormerClipAdapter(ClipAdapter): + def __init__( + self, + clip_model_name: str, + text_templates: PromptExtractor, + mask_fill: str = "mean", + mask_expand_ratio: float = 1.0, + mask_thr: float = 0.5, + mask_matting: bool = False, + region_resized: bool = True, + mask_prompt_depth: int = 0, + mask_prompt_fwd: bool = False, + ): + super().__init__(clip_model_name, mask_prompt_depth, text_templates) + self.non_object_embedding = nn.Parameter( + torch.empty(1, self.clip_model.text_projection.shape[-1]) + ) + nn.init.normal_( + self.non_object_embedding.data, + std=self.clip_model.transformer.width ** -0.5, + ) + # for test + self.mask_fill = mask_fill + if self.mask_fill == "zero": + self.mask_fill = (0.0, 0.0, 0.0) + elif self.mask_fill == "mean": + self.mask_fill = [255.0 * c for c in PIXEL_MEAN] + else: + raise NotImplementedError( + "Unknown mask_fill method: {}".format(self.mask_fill) + ) + self.mask_expand_ratio = mask_expand_ratio + self.mask_thr = mask_thr + self.mask_matting = mask_matting + self.region_resized = region_resized + self.mask_prompt_fwd = mask_prompt_fwd + self.register_buffer( + "pixel_mean", torch.Tensor(PIXEL_MEAN).reshape(1, 3, 1, 1) * 255.0 + ) + self.register_buffer( + "pixel_std", torch.Tensor(PIXEL_STD).reshape(1, 3, 1, 1) * 255.0 + ) + + def forward( + self, + image: torch.Tensor, + text: List[str], + mask: torch.Tensor, + normalize: bool = True, + fwd_w_region_mask: bool = False, + ): + (regions, unnorm_regions), region_masks, valid_flag = self._preprocess_image(image, mask, normalize=normalize) + if regions is None: + return None, valid_flag + if isinstance(regions, list): + assert NotImplementedError + image_features = torch.cat( + 
[self.get_image_features(image_i) for image_i in regions], dim=0 + ) + else: + if self.mask_prompt_fwd: + image_features = self.get_image_features(regions, region_masks) + else: + image_features = self.get_image_features(regions) + text_feature = self.get_text_features(text) # k,feat_dim + return self.get_sim_logits(text_feature, image_features), unnorm_regions, valid_flag + + def get_image_features(self, image, region_masks=None): + image_features = self.clip_model.visual(image, region_masks) + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + return image_features + + def _preprocess_image( + self, image: torch.Tensor, mask: torch.Tensor, normalize: bool = True + ): + """crop, mask and normalize the image + + Args: + image ([type]): [C,H,W] + mask ([type]): [K,H,W + normalize (bool, optional): [description]. Defaults to True. + """ + dtype = mask.dtype + bin_mask = mask > self.mask_thr + valid = bin_mask.sum(dim=(-1, -2)) > 0 + bin_mask = bin_mask[valid] + mask = mask[valid] + if not self.mask_matting: + mask = bin_mask + bin_mask = BitMasks(bin_mask) + bboxes = bin_mask.get_bounding_boxes() + # crop,mask + regions = [] + region_masks = [] + for bbox, single_mask in zip(bboxes, mask): + region, region_mask = crop_with_mask( + image.type(dtype), + single_mask.type(dtype), + bbox, + fill=self.mask_fill, + expand_ratio=self.mask_expand_ratio, + ) + regions.append(region.unsqueeze(0)) + region_masks.append(region_mask.unsqueeze(0)) + if len(regions) == 0: + return None, valid + unnorm_regions = regions + if normalize: + regions = [(r - self.pixel_mean) / self.pixel_std for r in regions] + # resize + if self.region_resized: + regions = [ + F.interpolate(r, size=(224, 224), mode="bicubic") for r in regions + ] + regions = torch.cat(regions) + region_masks = [ + F.interpolate(r, size=(224, 224), mode="nearest") for r in region_masks + ] + region_masks = torch.cat(region_masks) + unnorm_regions = [ + F.interpolate(r, size=(224, 224), 
mode="bicubic") for r in unnorm_regions + ] + unnorm_regions = torch.cat(unnorm_regions) + return (regions, unnorm_regions), region_masks, valid + + def get_text_features(self, noun_list: List[str]): + object_text_features = self._get_text_features(noun_list) + non_object_text_features = ( + self.non_object_embedding + / self.non_object_embedding.norm(dim=-1, keepdim=True) + ) + return torch.cat([object_text_features, non_object_text_features], dim=0) diff --git a/open_vocab_seg/modeling/clip_adapter/text_template.py b/open_vocab_seg/modeling/clip_adapter/text_template.py new file mode 100644 index 0000000000000000000000000000000000000000..1dd085f9435650bbd982c81a1cf0d9899ce7feb2 --- /dev/null +++ b/open_vocab_seg/modeling/clip_adapter/text_template.py @@ -0,0 +1,155 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved +# Modified by Feng Liang from +# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/modeling/clip_adapter/text_prompt.py +# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/modeling/clip_adapter/utils.py + +from typing import List + +import clip +import torch +from torch import nn + +IMAGENET_PROMPT = [ + "a bad photo of a {}.", + "a photo of many {}.", + "a sculpture of a {}.", + "a photo of the hard to see {}.", + "a low resolution photo of the {}.", + "a rendering of a {}.", + "graffiti of a {}.", + "a bad photo of the {}.", + "a cropped photo of the {}.", + "a tattoo of a {}.", + "the embroidered {}.", + "a photo of a hard to see {}.", + "a bright photo of a {}.", + "a photo of a clean {}.", + "a photo of a dirty {}.", + "a dark photo of the {}.", + "a drawing of a {}.", + "a photo of my {}.", + "the plastic {}.", + "a photo of the cool {}.", + "a close-up photo of a {}.", + "a black and white photo of the {}.", + "a painting of the {}.", + "a painting of a {}.", + "a pixelated photo of the {}.", + "a sculpture of the {}.", + "a bright photo of the 
{}.", + "a cropped photo of a {}.", + "a plastic {}.", + "a photo of the dirty {}.", + "a jpeg corrupted photo of a {}.", + "a blurry photo of the {}.", + "a photo of the {}.", + "a good photo of the {}.", + "a rendering of the {}.", + "a {} in a video game.", + "a photo of one {}.", + "a doodle of a {}.", + "a close-up photo of the {}.", + "a photo of a {}.", + "the origami {}.", + "the {} in a video game.", + "a sketch of a {}.", + "a doodle of the {}.", + "a origami {}.", + "a low resolution photo of a {}.", + "the toy {}.", + "a rendition of the {}.", + "a photo of the clean {}.", + "a photo of a large {}.", + "a rendition of a {}.", + "a photo of a nice {}.", + "a photo of a weird {}.", + "a blurry photo of a {}.", + "a cartoon {}.", + "art of a {}.", + "a sketch of the {}.", + "a embroidered {}.", + "a pixelated photo of a {}.", + "itap of the {}.", + "a jpeg corrupted photo of the {}.", + "a good photo of a {}.", + "a plushie {}.", + "a photo of the nice {}.", + "a photo of the small {}.", + "a photo of the weird {}.", + "the cartoon {}.", + "art of the {}.", + "a drawing of the {}.", + "a photo of the large {}.", + "a black and white photo of a {}.", + "the plushie {}.", + "a dark photo of a {}.", + "itap of a {}.", + "graffiti of the {}.", + "a toy {}.", + "itap of my {}.", + "a photo of a cool {}.", + "a photo of a small {}.", + "a tattoo of the {}.", +] + +VILD_PROMPT = [ + "a photo of a {}.", + "This is a photo of a {}", + "There is a {} in the scene", + "There is the {} in the scene", + "a photo of a {} in the scene", + "a photo of a small {}.", + "a photo of a medium {}.", + "a photo of a large {}.", + "This is a photo of a small {}.", + "This is a photo of a medium {}.", + "This is a photo of a large {}.", + "There is a small {} in the scene.", + "There is a medium {} in the scene.", + "There is a large {} in the scene.", +] + +class PromptExtractor(nn.Module): + def __init__(self): + super().__init__() + self._buffer_init = False + + def 
init_buffer(self, clip_model): + self._buffer_init = True + + def forward(self, noun_list: List[str], clip_model: nn.Module): + raise NotImplementedError() + + +class PredefinedPromptExtractor(PromptExtractor): + def __init__(self, templates: List[str]): + super().__init__() + self.templates = templates + + def forward(self, noun_list: List[str], clip_model: nn.Module): + text_features_bucket = [] + for template in self.templates: + noun_tokens = [clip.tokenize(template.format(noun)) for noun in noun_list] + text_inputs = torch.cat(noun_tokens).to( + clip_model.text_projection.data.device + ) + text_features = clip_model.encode_text(text_inputs) + text_features /= text_features.norm(dim=-1, keepdim=True) + text_features_bucket.append(text_features) + del text_inputs + # ensemble by averaging + text_features = torch.stack(text_features_bucket).mean(dim=0) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + return text_features + + +class ImageNetPromptExtractor(PredefinedPromptExtractor): + def __init__(self): + super().__init__(IMAGENET_PROMPT) + + +class VILDPromptExtractor(PredefinedPromptExtractor): + def __init__(self): + super().__init__(VILD_PROMPT) diff --git a/open_vocab_seg/modeling/clip_adapter/utils.py b/open_vocab_seg/modeling/clip_adapter/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..276c1ed9feca77d9a37067d312aca97d132515d3 --- /dev/null +++ b/open_vocab_seg/modeling/clip_adapter/utils.py @@ -0,0 +1,81 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
# All Rights Reserved

from typing import Tuple
import numpy as np
import torch
import clip
from detectron2.utils.comm import get_local_rank, synchronize


def expand_box(
    x1: float,
    y1: float,
    x2: float,
    y2: float,
    expand_ratio: float = 1.0,
    max_h: int = None,
    max_w: int = None,
):
    """Scale the box (x1, y1, x2, y2) around its center by ``expand_ratio``.

    When ``max_h``/``max_w`` are given the result is clamped to
    [0, max_h - 1] / [0, max_w - 1]. Returns the box as a list of ints.
    """
    cx = 0.5 * (x1 + x2)
    cy = 0.5 * (y1 + y2)
    w = x2 - x1
    h = y2 - y1
    w = w * expand_ratio
    h = h * expand_ratio
    box = [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h]
    if max_h is not None:
        box[1] = max(0, box[1])
        box[3] = min(max_h - 1, box[3])
    if max_w is not None:
        box[0] = max(0, box[0])
        box[2] = min(max_w - 1, box[2])
    return [int(b) for b in box]


def mask2box(mask: torch.Tensor):
    """Tight box (x1, y1, x2, y2) around the nonzero area of a [H, W] mask.

    Returns None for an all-zero mask. x2/y2 are exclusive (one past the last
    nonzero column/row).
    """
    # use naive way
    row = torch.nonzero(mask.sum(dim=0))[:, 0]
    if len(row) == 0:
        return None
    x1 = row.min()
    x2 = row.max()
    # BUG FIX: the original called np.nonzero on a torch.Tensor; np.nonzero
    # returns a tuple of arrays, so the following [:, 0] indexing raised a
    # TypeError. Use torch.nonzero, consistent with the row branch above.
    col = torch.nonzero(mask.sum(dim=1))[:, 0]
    y1 = col.min()
    y2 = col.max()
    return x1, y1, x2 + 1, y2 + 1


def crop_with_mask(
    image: torch.Tensor,
    mask: torch.Tensor,
    bbox: torch.Tensor,
    fill: Tuple[float, float, float] = (0, 0, 0),
    expand_ratio: float = 1.0,
):
    """Crop ``image`` to the (expanded, clamped) ``bbox`` and composite the
    ``fill`` color over pixels outside ``mask``.

    Returns the [C, h, w] composited crop and the matching [1, h, w] mask crop.
    """
    l, t, r, b = expand_box(*bbox, expand_ratio)
    _, h, w = image.shape
    l = max(l, 0)
    t = max(t, 0)
    r = min(r, w)
    b = min(b, h)
    # Per-channel constant image used for the masked-out (background) pixels.
    new_image = torch.cat(
        [image.new_full((1, b - t, r - l), fill_value=val) for val in fill]
    )
    return (
        image[:, t:b, l:r] * mask[None, t:b, l:r]
        + (1 - mask[None, t:b, l:r]) * new_image,
        mask[None, t:b, l:r],
    )


def build_clip_model(model: str, mask_prompt_depth: int = 0, frozen: bool = True):
    """Load the named CLIP model on CPU, downloading on local rank 0 first.

    The synchronize() barriers let non-zero ranks wait for rank 0's download;
    with ``frozen`` all parameters have requires_grad disabled.
    """
    rank = get_local_rank()
    if rank == 0:
        # download on rank 0 only
        model, _ = clip.load(model, mask_prompt_depth=mask_prompt_depth, device="cpu")
    synchronize()
    if rank != 0:
        model, _ = clip.load(model, mask_prompt_depth=mask_prompt_depth, device="cpu")
    synchronize()
    if frozen:
        for param in model.parameters():
            param.requires_grad = False
    return model
model diff --git a/open_vocab_seg/modeling/criterion.py b/open_vocab_seg/modeling/criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..f4d5b71242f87c6f67463f9c31f873a742f3e5c7 --- /dev/null +++ b/open_vocab_seg/modeling/criterion.py @@ -0,0 +1,229 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +""" +MaskFormer criterion. +""" +import torch +import torch.nn.functional as F +from torch import nn + +from detectron2.utils.comm import get_world_size + +from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list + + +def dice_loss(inputs, targets, num_masks): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(-1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_masks + + +def sigmoid_focal_loss( + inputs, targets, num_masks, alpha: float = 0.25, gamma: float = 2 +): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). 
+ gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_masks + + +class SetCriterion(nn.Module): + """This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. 
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + if eos_coef > 0: + + empty_weight = torch.ones(self.num_classes + 1) + + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + self.use_ignore_idx = False + else: + self.use_ignore_idx = True + self.cur_target = [] + + def loss_labels(self, outputs, targets, indices, num_masks): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert "pred_logits" in outputs + src_logits = outputs["pred_logits"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat( + [t["labels"][J] for t, (_, J) in zip(targets, indices)] + ) + target_classes = torch.full( + src_logits.shape[:2], + self.num_classes, + dtype=torch.int64, + device=src_logits.device, + ) + target_classes[idx] = target_classes_o + if self.use_ignore_idx: + loss_ce = F.cross_entropy( + src_logits.transpose(1, 2), + target_classes, + ignore_index=self.num_classes, + ) + else: + if "empty_weight" in outputs: + empty_weight = torch.cat( + [outputs["empty_weight"], self.empty_weight[-1:]] + ).detach() + else: + empty_weight = self.empty_weight + loss_ce = F.cross_entropy( + src_logits.transpose(1, 2), target_classes, empty_weight + ) + losses = {"loss_ce": loss_ce} + return losses + + def loss_masks(self, outputs, targets, indices, num_masks): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = F.interpolate( + src_masks[:, None], + size=target_masks.shape[-2:], + mode="bilinear", + align_corners=False, + ) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_masks), + "loss_dice": dice_loss(src_masks, target_masks, num_masks), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat( + [torch.full_like(src, i) for i, (src, _) in enumerate(indices)] + ) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat( + [torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)] + ) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_masks): + loss_map = {"labels": self.loss_labels, "masks": self.loss_masks} + assert loss in loss_map, f"do you really want to compute {loss} loss?" + return loss_map[loss](outputs, targets, indices, num_masks) + + def forward(self, outputs, targets): + """This performs the loss computation. 
+ Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_masks = sum(len(t["labels"]) for t in targets) + num_masks = torch.as_tensor( + [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device + ) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_masks) + num_masks = torch.clamp(num_masks / get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_masks)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss( + loss, aux_outputs, targets, indices, num_masks + ) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + def clean_buffer(self): + self.cur_target = [] diff --git a/open_vocab_seg/modeling/heads/__init__.py b/open_vocab_seg/modeling/heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52db7cce67b1686f7cab3698f15b8f309c897918 --- /dev/null +++ b/open_vocab_seg/modeling/heads/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved \ No newline at end of file diff --git a/open_vocab_seg/modeling/heads/mask_former_head.py b/open_vocab_seg/modeling/heads/mask_former_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5f592662f92d1b0862a3ef76304e7b28b46ecf80 --- /dev/null +++ b/open_vocab_seg/modeling/heads/mask_former_head.py @@ -0,0 +1,135 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import logging +from copy import deepcopy +from typing import Callable, Dict, List, Optional, Tuple, Union + +import fvcore.nn.weight_init as weight_init +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from ..transformer.transformer_predictor import TransformerPredictor +from .pixel_decoder import build_pixel_decoder + + +@SEM_SEG_HEADS_REGISTRY.register() +class MaskFormerHead(nn.Module): + + _version = 2 + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + if version is None or version < 2: + # Do not warn if train from scratch + scratch = True + logger = logging.getLogger(__name__) + for k in list(state_dict.keys()): + newk = k + if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): + newk = k.replace(prefix, prefix + "pixel_decoder.") + # logger.debug(f"{k} ==> {newk}") + if newk != k: + state_dict[newk] = state_dict[k] + del state_dict[k] + scratch = False + + if not scratch: + logger.warning( + f"Weight format of {self.__class__.__name__} have changed! " + "Please upgrade your models. Applying automatic conversion now ..." 
+ ) + + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + num_classes: int, + pixel_decoder: nn.Module, + loss_weight: float = 1.0, + ignore_value: int = -1, + # extra parameters + transformer_predictor: nn.Module, + transformer_in_feature: str, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + num_classes: number of classes to predict + pixel_decoder: the pixel decoder module + loss_weight: loss weight + ignore_value: category id to be ignored during training. + transformer_predictor: the transformer decoder that makes prediction + transformer_in_feature: input feature name to the transformer_predictor + """ + super().__init__() + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] + feature_strides = [v.stride for k, v in input_shape] + feature_channels = [v.channels for k, v in input_shape] + + self.ignore_value = ignore_value + self.common_stride = 4 + self.loss_weight = loss_weight + + self.pixel_decoder = pixel_decoder + self.predictor = transformer_predictor + self.transformer_in_feature = transformer_in_feature + + self.num_classes = num_classes + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + return { + "input_shape": { + k: v + for k, v in input_shape.items() + if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES + }, + "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + "pixel_decoder": build_pixel_decoder(cfg, input_shape), + "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, + "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, + "transformer_predictor": TransformerPredictor( + cfg, + cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM + if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder" + else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels, + 
mask_classification=True, + ), + } + + def forward(self, features): + return self.layers(features) + + def layers(self, features): + ( + mask_features, + transformer_encoder_features, + ) = self.pixel_decoder.forward_features(features) + if self.transformer_in_feature == "transformer_encoder": + assert ( + transformer_encoder_features is not None + ), "Please use the TransformerEncoderPixelDecoder." + predictions = self.predictor(transformer_encoder_features, mask_features) + else: + predictions = self.predictor( + features[self.transformer_in_feature], mask_features + ) + return predictions diff --git a/open_vocab_seg/modeling/heads/open_vocab_mask_former_head.py b/open_vocab_seg/modeling/heads/open_vocab_mask_former_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8ed84f9a44d24415b3334fdf2ea8e1188de32de6 --- /dev/null +++ b/open_vocab_seg/modeling/heads/open_vocab_mask_former_head.py @@ -0,0 +1,145 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved +# Modified by Feng Liang from +# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/modeling/heads/zero_shot_mask_former_head.py + +import logging +from copy import deepcopy +from typing import Callable, Dict, List, Optional, Tuple, Union + +import fvcore.nn.weight_init as weight_init +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from ..transformer.open_vocab_transformer_predictor import OpenVocabTransformerPredictor +from .pixel_decoder import build_pixel_decoder + + +@SEM_SEG_HEADS_REGISTRY.register() +class OpenVocabMaskFormerHead(nn.Module): + + _version = 2 + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + if version is None or version < 2: + # Do not warn if train from scratch + scratch = True + logger = logging.getLogger(__name__) + for k in list(state_dict.keys()): + newk = k + if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): + newk = k.replace(prefix, prefix + "pixel_decoder.") + # logger.debug(f"{k} ==> {newk}") + if newk != k: + state_dict[newk] = state_dict[k] + del state_dict[k] + scratch = False + + if not scratch: + logger.warning( + f"Weight format of {self.__class__.__name__} have changed! " + "Please upgrade your models. Applying automatic conversion now ..." + ) + + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + num_classes: int, + pixel_decoder: nn.Module, + loss_weight: float = 1.0, + ignore_value: int = -1, + # extra parameters + transformer_predictor: nn.Module, + transformer_in_feature: str, + ): + """ + NOTE: this interface is experimental. 
+ Args: + input_shape: shapes (channels and stride) of the input features + num_classes: number of classes to predict + pixel_decoder: the pixel decoder module + loss_weight: loss weight + ignore_value: category id to be ignored during training. + transformer_predictor: the transformer decoder that makes prediction + transformer_in_feature: input feature name to the transformer_predictor + """ + super().__init__() + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] + feature_strides = [v.stride for k, v in input_shape] + feature_channels = [v.channels for k, v in input_shape] + + self.ignore_value = ignore_value + self.common_stride = 4 + self.loss_weight = loss_weight + + self.pixel_decoder = pixel_decoder + self.predictor = transformer_predictor + self.transformer_in_feature = transformer_in_feature + + self.num_classes = num_classes + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + return { + "input_shape": { + k: v + for k, v in input_shape.items() + if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES + }, + "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + "pixel_decoder": build_pixel_decoder(cfg, input_shape), + "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, + "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, + "transformer_predictor": OpenVocabTransformerPredictor( + cfg, + cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM + if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder" + else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels, + mask_classification=True, + ), + } + + def forward(self, features): + return self.layers(features) + + def layers(self, features): + ( + mask_features, + transformer_encoder_features, + ) = self.pixel_decoder.forward_features(features) + if self.transformer_in_feature == "transformer_encoder": + assert ( + transformer_encoder_features 
is not None + ), "Please use the TransformerEncoderPixelDecoder." + predictions = self.predictor(transformer_encoder_features, mask_features) + else: + predictions = self.predictor( + features[self.transformer_in_feature], mask_features + ) + return predictions + + def freeze_pretrained(self): + for name, module in self.named_children(): + if name not in ["predictor"]: + for param in module.parameters(): + param.requires_grad = False + else: + module.freeze_pretrained() diff --git a/open_vocab_seg/modeling/heads/pixel_decoder.py b/open_vocab_seg/modeling/heads/pixel_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..6b10089331785e937b79cf82af6d8fba55519082 --- /dev/null +++ b/open_vocab_seg/modeling/heads/pixel_decoder.py @@ -0,0 +1,308 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import logging +from typing import Callable, Dict, List, Optional, Tuple, Union + +import fvcore.nn.weight_init as weight_init +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from ..transformer.position_encoding import PositionEmbeddingSine +from ..transformer.transformer import TransformerEncoder, TransformerEncoderLayer + + +def build_pixel_decoder(cfg, input_shape): + """ + Build a pixel decoder from `cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME`. + """ + name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME + model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) + forward_features = getattr(model, "forward_features", None) + if not callable(forward_features): + raise ValueError( + "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. " + f"Please implement forward_features for {name} to only return mask features." 
+ ) + return model + + +@SEM_SEG_HEADS_REGISTRY.register() +class BasePixelDecoder(nn.Module): + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + conv_dim: int, + mask_dim: int, + norm: Optional[Union[str, Callable]] = None, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + conv_dims: number of output channels for the intermediate conv layers. + mask_dim: number of output channels for the final conv layer. + norm (str or callable): normalization for all conv layers + """ + super().__init__() + + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" + feature_channels = [v.channels for k, v in input_shape] + + lateral_convs = [] + output_convs = [] + + use_bias = norm == "" + for idx, in_channels in enumerate(feature_channels): + if idx == len(self.in_features) - 1: + output_norm = get_norm(norm, conv_dim) + output_conv = Conv2d( + in_channels, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + weight_init.c2_xavier_fill(output_conv) + self.add_module("layer_{}".format(idx + 1), output_conv) + + lateral_convs.append(None) + output_convs.append(output_conv) + else: + lateral_norm = get_norm(norm, conv_dim) + output_norm = get_norm(norm, conv_dim) + + lateral_conv = Conv2d( + in_channels, + conv_dim, + kernel_size=1, + bias=use_bias, + norm=lateral_norm, + ) + output_conv = Conv2d( + conv_dim, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + weight_init.c2_xavier_fill(lateral_conv) + weight_init.c2_xavier_fill(output_conv) + self.add_module("adapter_{}".format(idx + 1), lateral_conv) + self.add_module("layer_{}".format(idx + 1), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs 
        # into top-down order (from low to high resolution)
        # to make the top-down computation in forward clearer.
        self.lateral_convs = lateral_convs[::-1]
        self.output_convs = output_convs[::-1]

        self.mask_dim = mask_dim
        # Final 3x3 conv projecting the finest pyramid level into the mask-embedding space.
        self.mask_features = Conv2d(
            conv_dim,
            mask_dim,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        weight_init.c2_xavier_fill(self.mask_features)

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        # Translate the detectron2 config node into constructor kwargs,
        # keeping only the backbone features this head actually consumes.
        ret = {}
        ret["input_shape"] = {
            k: v
            for k, v in input_shape.items()
            if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
        }
        ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
        ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM
        return ret

    def forward_features(self, features):
        """FPN-style top-down pass over the backbone features.

        Args:
            features: dict mapping feature name (e.g. "res2".."res5") to tensor.

        Returns:
            Tuple of (mask features, None). The second slot exists so the
            signature matches decoders that also return transformer features.
        """
        # Reverse feature maps into top-down order (from low to high resolution)
        for idx, f in enumerate(self.in_features[::-1]):
            x = features[f]
            lateral_conv = self.lateral_convs[idx]
            output_conv = self.output_convs[idx]
            if lateral_conv is None:
                # Coarsest level: no lateral connection, output conv only.
                y = output_conv(x)
            else:
                cur_fpn = lateral_conv(x)
                # Following FPN implementation, we use nearest upsampling here
                y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
                y = output_conv(y)
        return self.mask_features(y), None

    def forward(self, features, targets=None):
        # Deprecated entry point; callers should use forward_features() directly.
        logger = logging.getLogger(__name__)
        logger.warning(
            "Calling forward() may cause unpredicted behavior of PixelDecoder module."
        )
        return self.forward_features(features)


class TransformerEncoderOnly(nn.Module):
    """A DETR-style transformer reduced to its encoder stack (no decoder)."""

    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        # A final LayerNorm is only needed for the pre-norm variant.
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(
            encoder_layer, num_encoder_layers, encoder_norm
        )

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        # Xavier-initialize every weight matrix; 1-D params (biases, norms) keep defaults.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, mask, pos_embed):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        if mask is not None:
            mask = mask.flatten(1)

        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        # Restore the NxCxHxW layout expected by the convolutional decoder.
        return memory.permute(1, 2, 0).view(bs, c, h, w)


@SEM_SEG_HEADS_REGISTRY.register()
class TransformerEncoderPixelDecoder(BasePixelDecoder):
    """Pixel decoder that runs a transformer encoder on the coarsest feature map."""

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        transformer_dropout: float,
        transformer_nheads: int,
        transformer_dim_feedforward: int,
        transformer_enc_layers: int,
        transformer_pre_norm: bool,
        conv_dim: int,
        mask_dim: int,
        norm: Optional[Union[str, Callable]] = None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            transformer_dropout: dropout probability in transformer
            transformer_nheads: number of heads in transformer
            transformer_dim_feedforward: dimension of feedforward network
            transformer_enc_layers: number of transformer encoder layers
            transformer_pre_norm: whether to use pre-layernorm or not
            conv_dim: number of output channels for the intermediate conv layers.
            mask_dim: number of output channels for the final conv layer.
            norm (str or callable): normalization for all conv layers
        """
        super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm)

        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
        self.in_features = [k for k, v in input_shape]  # starting from "res2" to "res5"
        feature_strides = [v.stride for k, v in input_shape]
        feature_channels = [v.channels for k, v in input_shape]

        # The transformer consumes the coarsest (last, highest-stride) feature map.
        in_channels = feature_channels[len(self.in_features) - 1]
        self.input_proj = Conv2d(in_channels, conv_dim, kernel_size=1)
        weight_init.c2_xavier_fill(self.input_proj)
        self.transformer = TransformerEncoderOnly(
            d_model=conv_dim,
            dropout=transformer_dropout,
            nhead=transformer_nheads,
            dim_feedforward=transformer_dim_feedforward,
            num_encoder_layers=transformer_enc_layers,
            normalize_before=transformer_pre_norm,
        )
        N_steps = conv_dim // 2
        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)

        # update layer
        use_bias = norm == ""
        output_norm = get_norm(norm, conv_dim)
        output_conv = Conv2d(
            conv_dim,
            conv_dim,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias,
            norm=output_norm,
            activation=F.relu,
        )
        weight_init.c2_xavier_fill(output_conv)
        # Replace the base class's coarsest-level output conv with this one,
        # re-registering it under the same module name.
        delattr(self, "layer_{}".format(len(self.in_features)))
        self.add_module("layer_{}".format(len(self.in_features)), output_conv)
        self.output_convs[0] = output_conv

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        # Reuse the base decoder's config mapping and add the transformer knobs.
        ret = super().from_config(cfg, input_shape)
        ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
        ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
        ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
        ret[
            "transformer_enc_layers"
        ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS  # a separate config
        ret["transformer_pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
        return ret

    def forward_features(self, features):
        """Top-down pass; the coarsest level is first refined by the transformer.

        Returns:
            Tuple of (mask features, transformer encoder output); the latter is
            reused as memory by the Transformer decoder.
        """
        # Reverse feature maps into top-down order (from low to high resolution)
        for idx, f in enumerate(self.in_features[::-1]):
            x = features[f]
            lateral_conv = self.lateral_convs[idx]
            output_conv = self.output_convs[idx]
            if lateral_conv is None:
                # Coarsest level (first iteration): project and run the encoder.
                transformer = self.input_proj(x)
                pos = self.pe_layer(x)
                transformer = self.transformer(transformer, None, pos)
                y = output_conv(transformer)
                # save intermediate feature as input to Transformer decoder
                transformer_encoder_features = transformer
            else:
                cur_fpn = lateral_conv(x)
                # Following FPN implementation, we use nearest upsampling here
                y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
                y = output_conv(y)
        return self.mask_features(y), transformer_encoder_features

    def forward(self, features, targets=None):
        # Deprecated entry point; callers should use forward_features() directly.
        logger = logging.getLogger(__name__)
        logger.warning(
            "Calling forward() may cause unpredicted behavior of PixelDecoder module."
        )
        return self.forward_features(features)

# ---- open_vocab_seg/modeling/matcher.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
# Copyright (c) Meta Platforms, Inc.
# All Rights Reserved

"""
Modules to compute the matching cost and solve the corresponding LSAP.
"""
import torch
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from torch import nn


def batch_dice_loss(inputs, targets):
    """
    Compute the pairwise DICE loss, similar to generalized IOU for masks.
    Args:
        inputs: A float tensor of shape [num_preds, C] holding prediction
            logits; a sigmoid is applied internally.
        targets: A float tensor of shape [num_targets, C]. Stores the binary
            classification label for each element
            (0 for the negative class and 1 for the positive class).
    Returns:
        A [num_preds, num_targets] tensor of dice losses for every
        prediction/target pair.
    """
    inputs = inputs.sigmoid()
    inputs = inputs.flatten(1)
    # Pairwise intersection via einsum; the +1 smoothing keeps empty masks finite.
    numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
    denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss


def batch_sigmoid_focal_loss(inputs, targets, alpha: float = 0.25, gamma: float = 2):
    """
    Pairwise focal loss as used in RetinaNet: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of shape [num_preds, C] holding prediction logits.
        targets: A float tensor of shape [num_targets, C]. Stores the binary
            classification label for each element
            (0 for the negative class and 1 for the positive class).
        alpha: (optional) Weighting factor in range (0,1) to balance
            positive vs negative examples. Default = 0.25; pass a negative
            value to disable the weighting.
        gamma: Exponent of the modulating factor (1 - p_t) to
            balance easy vs hard examples.
    Returns:
        A [num_preds, num_targets] loss tensor, averaged over the C dimension.
    """
    hw = inputs.shape[1]

    prob = inputs.sigmoid()
    # Per-element focal terms for the "target is 1" and "target is 0" cases;
    # the einsum below mixes them according to each target mask.
    focal_pos = ((1 - prob) ** gamma) * F.binary_cross_entropy_with_logits(
        inputs, torch.ones_like(inputs), reduction="none"
    )
    focal_neg = (prob ** gamma) * F.binary_cross_entropy_with_logits(
        inputs, torch.zeros_like(inputs), reduction="none"
    )
    if alpha >= 0:
        focal_pos = focal_pos * alpha
        focal_neg = focal_neg * (1 - alpha)

    loss = torch.einsum("nc,mc->nm", focal_pos, targets) + torch.einsum(
        "nc,mc->nm", focal_neg, (1 - targets)
    )

    return loss / hw


class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network

    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(
        self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1
    ):
        """Creates the matcher

        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
            cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_mask = cost_mask
        self.cost_dice = cost_dice
        assert (
            cost_class != 0 or cost_mask != 0 or cost_dice != 0
        ), "all costs cant be 0"

    @torch.no_grad()
    def memory_efficient_forward(self, outputs, targets):
        """More memory-friendly matching: processes one image at a time.

        NOTE: the original version also computed a batch-wide mask padding
        size (h_max/w_max) that was never used; that dead code has been
        removed, which additionally makes an empty target list a no-op
        instead of a ValueError.
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        indices = []

        # Iterate through batch size
        for b in range(bs):

            out_prob = outputs["pred_logits"][b].softmax(
                -1
            )  # [num_queries, num_classes]
            out_mask = outputs["pred_masks"][b]  # [num_queries, H_pred, W_pred]

            tgt_ids = targets[b]["labels"]
            # gt masks are already padded when preparing target
            tgt_mask = targets[b]["masks"].to(out_mask)

            # Compute the classification cost. Contrary to the loss, we don't use the NLL,
            # but approximate it in 1 - proba[target class].
            # The 1 is a constant that doesn't change the matching, it can be omitted.
            cost_class = -out_prob[:, tgt_ids]

            # Downsample gt masks to save memory
            tgt_mask = F.interpolate(
                tgt_mask[:, None], size=out_mask.shape[-2:], mode="nearest"
            )

            # Flatten spatial dimension
            out_mask = out_mask.flatten(1)  # [num_queries, H*W]
            tgt_mask = tgt_mask[:, 0].flatten(1)  # [num_total_targets, H*W]

            # Compute the focal loss between masks
            cost_mask = batch_sigmoid_focal_loss(out_mask, tgt_mask)

            # Compute the dice loss between masks
            cost_dice = batch_dice_loss(out_mask, tgt_mask)

            # Final cost matrix
            C = (
                self.cost_mask * cost_mask
                + self.cost_class * cost_class
                + self.cost_dice * cost_dice
            )
            C = C.reshape(num_queries, -1).cpu()

            # Hungarian algorithm (LSAP) on the per-image cost matrix.
            indices.append(linear_sum_assignment(C))
        return [
            (
                torch.as_tensor(i, dtype=torch.int64),
                torch.as_tensor(j, dtype=torch.int64),
            )
            for i, j in indices
        ]

    @torch.no_grad()
    def forward(self, outputs, targets):
        """Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        return self.memory_efficient_forward(outputs, targets)

    def __repr__(self):
        head = "Matcher " + self.__class__.__name__
        body = [
            "cost_class: {}".format(self.cost_class),
            "cost_mask: {}".format(self.cost_mask),
            "cost_dice: {}".format(self.cost_dice),
        ]
        _repr_indent = 4
        lines = [head] + [" " * _repr_indent + line for line in body]
        return "\n".join(lines)

# ---- open_vocab_seg/modeling/transformer/__init__.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

# ---- open_vocab_seg/modeling/transformer/open_vocab_transformer_predictor.py ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
# Copyright (c) Meta Platforms, Inc.
# All Rights Reserved

from torch import nn
from detectron2.config import configurable
from .transformer_predictor import TransformerPredictor, MLP


class OpenVocabTransformerPredictor(TransformerPredictor):
    """Transformer predictor whose class head maps into a text-embedding space
    instead of a fixed closed-set classifier."""

    @configurable
    def __init__(
        self,
        in_channels,
        mask_classification=True,
        *,
        embedding_dim: int,
        embed_hidden_dim: int,
        embed_layers: int,
        hidden_dim: int,
        num_queries: int,
        nheads: int,
        dropout: float,
        dim_feedforward: int,
        enc_layers: int,
        dec_layers: int,
        pre_norm: bool,
        deep_supervision: bool,
        mask_dim: int,
        enforce_input_project: bool,
    ):
        # Pass mask_classification=False so the base class does NOT build its
        # fixed-class nn.Linear head; we attach an MLP embedding head below.
        super().__init__(
            in_channels,
            False,
            num_classes=embedding_dim,
            hidden_dim=hidden_dim,
            num_queries=num_queries,
            nheads=nheads,
            dropout=dropout,
            dim_feedforward=dim_feedforward,
            enc_layers=enc_layers,
            dec_layers=dec_layers,
            pre_norm=pre_norm,
            deep_supervision=deep_supervision,
            mask_dim=mask_dim,
            enforce_input_project=enforce_input_project,
        )
        self.mask_classification = mask_classification
        # output FFNs: project query features to the shared embedding space.
        if self.mask_classification:
            self.class_embed = MLP(
                hidden_dim, embed_hidden_dim, embedding_dim, embed_layers
            )

    def freeze_pretrained(self):
        # Freeze every submodule except the class-embedding head, which is
        # the only part trained for open-vocabulary classification.
        for name, module in self.named_children():
            if name not in ["class_embed"]:
                for param in module.parameters():
                    param.requires_grad = False

    @classmethod
    def from_config(cls, cfg, in_channels, mask_classification):
        # Map the detectron2 config node onto constructor kwargs.
        ret = {}
        ret["in_channels"] = in_channels
        ret["mask_classification"] = mask_classification

        ret["embedding_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM
        ret["embed_hidden_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM
        ret["embed_layers"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS
        ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
        ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
        # Transformer parameters:
        ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
        ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
        ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
        ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
        ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
        ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ

        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM

        return ret

# ---- open_vocab_seg/modeling/transformer/position_encoding.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

"""
Various positional encodings for the transformer.
"""
import math

import torch
from torch import nn


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """

    def __init__(
        self, num_pos_feats=64, temperature=10000, normalize=False, scale=None
    ):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, x, mask=None):
        # mask marks padded pixels; with no mask, every position is valid.
        if mask is None:
            mask = torch.zeros(
                (x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool
            )
        not_mask = ~mask
        # Cumulative sums give 1-based row/column coordinates over valid pixels.
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            # Scale coordinates to [0, scale] using the last valid index.
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # Geometric frequency ladder; // 2 pairs each sin with its cos channel.
        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        # Interleave sin/cos per frequency, then flatten the channel pairs.
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        # Output layout: [N, 2*num_pos_feats, H, W] (y-features first).
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos

# ---- open_vocab_seg/modeling/transformer/transformer.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
# Module docstring begins: "Transformer class." (continues in the next chunk).
+ +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +import copy +from typing import List, Optional + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + + +class Transformer(nn.Module): + def __init__( + self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + ): + super().__init__() + + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder( + encoder_layer, num_encoder_layers, encoder_norm + ) + + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + ) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, src, mask, query_embed, pos_embed): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + if mask is not None: + mask = mask.flatten(1) + + tgt = torch.zeros_like(query_embed) + memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) + hs = self.decoder( + tgt, + memory, + memory_key_padding_mask=mask, + pos=pos_embed, + query_pos=query_embed, + ) + return hs.transpose(1, 2), memory.permute(1, 2, 
0).view(bs, c, h, w) + + +class TransformerEncoder(nn.Module): + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + output = src + + for layer in self.layers: + output = layer( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + pos=pos, + ) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, + query_pos=query_pos, + ) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + 
normalize_before=False, + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn( + q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask + )[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn( + q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask + )[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, 
src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn( + q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask + )[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + )[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = 
None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn( + q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask + )[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + )[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + if self.normalize_before: + return self.forward_pre( + tgt, + memory, + tgt_mask, + memory_mask, + tgt_key_padding_mask, + memory_key_padding_mask, + pos, + query_pos, + ) + return self.forward_post( + tgt, + memory, + tgt_mask, + memory_mask, + tgt_key_padding_mask, + memory_key_padding_mask, + pos, + query_pos, + ) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") diff --git a/open_vocab_seg/modeling/transformer/transformer_predictor.py b/open_vocab_seg/modeling/transformer/transformer_predictor.py 
# new file mode 100644, index 0000000..72378ab
# --- /dev/null
# +++ b/open_vocab_seg/modeling/transformer/transformer_predictor.py
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.layers import Conv2d

from .position_encoding import PositionEmbeddingSine
from .transformer import Transformer


class TransformerPredictor(nn.Module):
    """MaskFormer head: a DETR-style transformer that turns per-image features
    into per-query class logits and mask embeddings."""

    @configurable
    def __init__(
        self,
        in_channels,
        mask_classification=True,
        *,
        num_classes: int,
        hidden_dim: int,
        num_queries: int,
        nheads: int,
        dropout: float,
        dim_feedforward: int,
        enc_layers: int,
        dec_layers: int,
        pre_norm: bool,
        deep_supervision: bool,
        mask_dim: int,
        enforce_input_project: bool,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            in_channels: channels of the input features
            mask_classification: whether to add mask classifier or not
            num_classes: number of classes
            hidden_dim: Transformer feature dimension
            num_queries: number of queries
            nheads: number of heads
            dropout: dropout in Transformer
            dim_feedforward: feature dimension in feedforward network
            enc_layers: number of Transformer encoder layers
            dec_layers: number of Transformer decoder layers
            pre_norm: whether to use pre-LayerNorm or not
            deep_supervision: whether to add supervision to every decoder layers
            mask_dim: mask feature dimension
            enforce_input_project: add input project 1x1 conv even if input
                channels and hidden dim is identical
        """
        super().__init__()

        self.mask_classification = mask_classification

        # positional encoding
        N_steps = hidden_dim // 2
        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)

        transformer = Transformer(
            d_model=hidden_dim,
            dropout=dropout,
            nhead=nheads,
            dim_feedforward=dim_feedforward,
            num_encoder_layers=enc_layers,
            num_decoder_layers=dec_layers,
            normalize_before=pre_norm,
            return_intermediate_dec=deep_supervision,
        )

        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model

        # Learned object queries.
        self.query_embed = nn.Embedding(num_queries, hidden_dim)

        # 1x1 projection is only needed when channel counts differ (or forced).
        if in_channels != hidden_dim or enforce_input_project:
            self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
            weight_init.c2_xavier_fill(self.input_proj)
        else:
            self.input_proj = nn.Sequential()
        self.aux_loss = deep_supervision

        # output FFNs
        if self.mask_classification:
            # +1 output for the "no object" class.
            self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)

    @classmethod
    def from_config(cls, cfg, in_channels, mask_classification):
        # Map the detectron2 config node onto constructor kwargs.
        ret = {}
        ret["in_channels"] = in_channels
        ret["mask_classification"] = mask_classification

        ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
        ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
        ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
        # Transformer parameters:
        ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
        ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
        ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
        ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
        ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
        ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ

        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM

        return ret

    def forward(self, x, mask_features):
        """
        Args:
            x: feature map fed to the transformer (e.g. transformer encoder output).
            mask_features: per-pixel embeddings used to decode masks.
        Returns:
            dict with "pred_logits" (if classifying), "pred_masks", and
            "aux_outputs" when deep supervision is on.
        """
        pos = self.pe_layer(x)

        src = x
        # No padding mask is used here; all positions are attended.
        mask = None
        hs, memory = self.transformer(
            self.input_proj(src), mask, self.query_embed.weight, pos
        )

        if self.mask_classification:
            outputs_class = self.class_embed(hs)
            out = {"pred_logits": outputs_class[-1]}
        else:
            out = {}

        if self.aux_loss:
            # [l, bs, queries, embed]
            mask_embed = self.mask_embed(hs)
            # Dot-product every query embedding with every pixel embedding.
            outputs_seg_masks = torch.einsum(
                "lbqc,bchw->lbqhw", mask_embed, mask_features
            )
            out["pred_masks"] = outputs_seg_masks[-1]
            out["aux_outputs"] = self._set_aux_loss(
                outputs_class if self.mask_classification else None, outputs_seg_masks
            )
        else:
            # FIXME h_boxes takes the last one computed, keep this in mind
            # [bs, queries, embed]
            mask_embed = self.mask_embed(hs[-1])
            outputs_seg_masks = torch.einsum(
                "bqc,bchw->bqhw", mask_embed, mask_features
            )
            out["pred_masks"] = outputs_seg_masks
        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_seg_masks):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        if self.mask_classification:
            return [
                {"pred_logits": a, "pred_masks": b}
                for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])
            ]
        else:
            return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]


class MLP(nn.Module):
    """Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer widths: input_dim -> hidden_dim * (num_layers-1) -> output_dim.
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
        )

    def forward(self, x):
        # ReLU between layers; the final layer is linear.
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x

# ---- open_vocab_seg/ovseg_model.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc.
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
# Modified by Feng Liang from
# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/zero_shot_mask_former_model.py

import logging
from typing import Tuple

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.modeling import META_ARCH_REGISTRY
from detectron2.modeling.backbone import Backbone
from detectron2.modeling.postprocessing import sem_seg_postprocess
from detectron2.structures import ImageList
from detectron2.utils.logger import log_first_n
from .modeling.clip_adapter import (
    ClipAdapter,
    MaskFormerClipAdapter,
    build_text_prompt,
)
from .mask_former_model import MaskFormer
from .utils.misc import get_gt_binary_masks


@META_ARCH_REGISTRY.register()
class OVSeg(MaskFormer):
    """
    Main class for zero-shot mask-classification semantic segmentation.

    Extends MaskFormer with a CLIP adapter: query logits are replaced by
    similarities between CLIP text embeddings of the class names and the
    (normalized) query embeddings.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        clip_adapter: nn.Module,
        criterion: nn.Module,
        num_queries: int,
        panoptic_on: bool,
        object_mask_threshold: float,
        overlap_threshold: float,
        metadata,
        size_divisibility: int,
        sem_seg_postprocess_before_inference: bool,
        clip_ensemble: bool,
        clip_ensemble_weight: float,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            criterion: a module that defines the loss
            clip_adapter: adapter for clip-based mask classification
            num_queries: int, number of queries
            panoptic_on: bool, whether to output panoptic segmentation prediction
            object_mask_threshold: float, threshold to filter query based on classification
                score for panoptic segmentation inference
            overlap_threshold: overlap threshold used in general inference for panoptic
                segmentation
            metadata: dataset meta, get `thing` and `stuff` category names for panoptic
                segmentation inference
            size_divisibility: Some backbones require the input height and width to be
                divisible by a specific integer. We can use this to override such requirement.
            sem_seg_postprocess_before_inference: whether to resize the prediction back
                to original input size before semantic segmentation inference or after.
                For high-resolution dataset like Mapillary, resizing predictions before
                inference will cause OOM error.
            clip_ensemble: whether to fuse CLIP region classification with the head's logits
            clip_ensemble_weight: geometric-mean weight given to the CLIP scores
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__(
            backbone=backbone,
            sem_seg_head=sem_seg_head,
            criterion=criterion,
            num_queries=num_queries,
            panoptic_on=panoptic_on,
            object_mask_threshold=object_mask_threshold,
            overlap_threshold=overlap_threshold,
            metadata=metadata,
            size_divisibility=size_divisibility,
            sem_seg_postprocess_before_inference=sem_seg_postprocess_before_inference,
            pixel_mean=pixel_mean,
            pixel_std=pixel_std,
        )
        self.clip_adapter: ClipAdapter = clip_adapter

        self.clip_ensemble: bool = clip_ensemble
        self.clip_ensemble_weight: float = clip_ensemble_weight

    @classmethod
    def from_config(cls, cfg):
        # Start from the plain MaskFormer kwargs and add the CLIP pieces.
        init_kwargs = MaskFormer.from_config(cfg)
        text_templates = build_text_prompt(cfg.MODEL.CLIP_ADAPTER)

        clip_adapter = MaskFormerClipAdapter(
            cfg.MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME,
            text_templates,
            mask_fill=cfg.MODEL.CLIP_ADAPTER.MASK_FILL,
            mask_expand_ratio=cfg.MODEL.CLIP_ADAPTER.MASK_EXPAND_RATIO,
            mask_thr=cfg.MODEL.CLIP_ADAPTER.MASK_THR,
            mask_matting=cfg.MODEL.CLIP_ADAPTER.MASK_MATTING,
            region_resized=cfg.MODEL.CLIP_ADAPTER.REGION_RESIZED,
            mask_prompt_depth=cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_DEPTH,
            mask_prompt_fwd=cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD,
        )
        init_kwargs["clip_adapter"] = clip_adapter
        init_kwargs["clip_ensemble"] = cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE
        init_kwargs[
            "clip_ensemble_weight"
        ] = cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT

        return init_kwargs

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item contains "image" (CHW tensor), optionally "instances"
                (per-region ground truth), and "height"/"width" (desired output
                resolution, may differ from the input resolution).

        Returns:
            In training: a dict of weighted losses.
            In inference: list[dict] with a "sem_seg" KxHxW logits tensor per
            image (plus "panoptic_seg" when panoptic inference is enabled).
        """
        # All images in the batch must come from the same dataset so that a
        # single class-name list (and text-feature matrix) applies.
        dataset_name = [x["meta"]["dataset_name"] for x in batched_inputs]
        assert len(set(dataset_name)) == 1
        dataset_name = dataset_name[0]

        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.size_divisibility)

        features = self.backbone(images.tensor)
        outputs = self.sem_seg_head(features)
        class_names = self.get_class_name_list(dataset_name)
        text_features = self.clip_adapter.get_text_features(class_names)
        # Replace the head's raw logits with CLIP text-vs-query similarities.
        outputs["pred_logits"] = self.clip_adapter.get_sim_logits(
            text_features, self.clip_adapter.normalize_feature(outputs["pred_logits"])
        )
        if self.training:
            if "aux_outputs" in outputs.keys():
                # Deep-supervision branches get the same CLIP re-scoring.
                for i in range(len(outputs["aux_outputs"])):
                    outputs["aux_outputs"][i][
                        "pred_logits"
                    ] = self.clip_adapter.get_sim_logits(
                        text_features,
                        self.clip_adapter.normalize_feature(
                            outputs["aux_outputs"][i]["pred_logits"]
                        ),
                    )
            # mask classification target
            if "instances" in batched_inputs[0]:
                gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
                targets = self.prepare_targets(gt_instances, images)
            else:
                targets = None

            # bipartite matching-based loss
            losses = self.criterion(outputs, targets)

            for k in list(losses.keys()):
                if k in self.criterion.weight_dict:
                    losses[k] *= self.criterion.weight_dict[k]
                else:
                    # remove this loss if not specified in `weight_dict`
                    losses.pop(k)

            return losses
        else:
            mask_cls_results = outputs["pred_logits"]
            mask_pred_results = outputs["pred_masks"]
            # upsample masks to the padded input resolution
            mask_pred_results = F.interpolate(
                mask_pred_results,
                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
                mode="bilinear",
                align_corners=False,
            )

            processed_results = []
            for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
                mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
            ):
                # First crop away the padding at the un-padded image size ...
                height = image_size[0]
                width = image_size[1]
                mask_pred_result = sem_seg_postprocess(
                    mask_pred_result, image_size, height, width
                )
                image = input_per_image["image"].to(self.device)

                r, regions = self.semantic_inference(
                    mask_cls_result, mask_pred_result, image, class_names
                )

                # ... then resize to the caller-requested output resolution.
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = sem_seg_postprocess(r, image_size, height, width)
                processed_results.append({"sem_seg": r})

                # panoptic segmentation inference
                if self.panoptic_on:
                    panoptic_r = self.panoptic_inference(
                        mask_cls_result, mask_pred_result
                    )
                    processed_results[-1]["panoptic_seg"] = panoptic_r

            return processed_results
width = image_size[1] + mask_pred_result = sem_seg_postprocess( + mask_pred_result, image_size, height, width + ) + image = input_per_image["image"].to(self.device) + + r, regions = self.semantic_inference( + mask_cls_result, mask_pred_result, image, class_names + ) + + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = sem_seg_postprocess(r, image_size, height, width) + processed_results.append({"sem_seg": r}) + + # panoptic segmentation inference + if self.panoptic_on: + panoptic_r = self.panoptic_inference( + mask_cls_result, mask_pred_result + ) + processed_results[-1]["panoptic_seg"] = panoptic_r + + return processed_results + + + def semantic_inference(self, mask_cls, mask_pred, image, class_names): + mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + + regions = None + if self.clip_ensemble: + clip_cls, regions, valid_flag = self.clip_adapter( + image, class_names, mask_pred, normalize=True + ) + if clip_cls is None: + clip_cls = torch.empty(0, mask_cls.shape[-1] + 1, device=self.device) + # softmax before index or after? + clip_cls = F.softmax(clip_cls[:, :-1], dim=-1) + if self.clip_ensemble_weight > 0: + map_back_clip_cls = mask_cls.new_ones(mask_cls.shape) + map_back_clip_cls[valid_flag] = clip_cls + mask_cls = torch.pow(mask_cls, 1 - self.clip_ensemble_weight) * \ + torch.pow(map_back_clip_cls, self.clip_ensemble_weight) + + + else: + # only clip model predictions are used + mask_cls = clip_cls + mask_pred = mask_pred[valid_flag] + semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) + return semseg, regions + + def get_class_name_list(self, dataset_name): + class_names = [ + c.strip() for c in MetadataCatalog.get(dataset_name).stuff_classes + ] + return class_names + + +@META_ARCH_REGISTRY.register() +class OVSegDEMO(MaskFormer): + """ + Main class for zero shot mask classification semantic segmentation architectures. 
    """
    Demo variant of OVSeg: classifies against a user-supplied class-name list
    instead of a registered dataset's categories. Inference only.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        clip_adapter: nn.Module,
        criterion: nn.Module,
        num_queries: int,
        panoptic_on: bool,
        object_mask_threshold: float,
        overlap_threshold: float,
        metadata,
        size_divisibility: int,
        sem_seg_postprocess_before_inference: bool,
        clip_ensemble: bool,
        clip_ensemble_weight: float,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            criterion: a module that defines the loss
            clip_adapter: adapter for clip-based mask classification
            num_queries: int, number of queries
            panoptic_on: bool, whether to output panoptic segmentation prediction
            object_mask_threshold: float, threshold to filter query based on classification
                score for panoptic segmentation inference
            overlap_threshold: overlap threshold used in general inference for panoptic
                segmentation
            metadata: dataset meta, get `thing` and `stuff` category names for panoptic
                segmentation inference
            size_divisibility: Some backbones require the input height and width to be
                divisible by a specific integer. We can use this to override such requirement.
            sem_seg_postprocess_before_inference: whether to resize the prediction back
                to original input size before semantic segmentation inference or after.
                For high-resolution dataset like Mapillary, resizing predictions before
                inference will cause OOM error.
            clip_ensemble: whether to fuse CLIP region classification with the head's logits
            clip_ensemble_weight: geometric-mean weight given to the CLIP scores
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__(
            backbone=backbone,
            sem_seg_head=sem_seg_head,
            criterion=criterion,
            num_queries=num_queries,
            panoptic_on=panoptic_on,
            object_mask_threshold=object_mask_threshold,
            overlap_threshold=overlap_threshold,
            metadata=metadata,
            size_divisibility=size_divisibility,
            sem_seg_postprocess_before_inference=sem_seg_postprocess_before_inference,
            pixel_mean=pixel_mean,
            pixel_std=pixel_std,
        )
        self.clip_adapter: ClipAdapter = clip_adapter

        self.clip_ensemble: bool = clip_ensemble
        self.clip_ensemble_weight: float = clip_ensemble_weight

    @classmethod
    def from_config(cls, cfg):
        # Same wiring as OVSeg.from_config: MaskFormer kwargs + CLIP adapter.
        init_kwargs = MaskFormer.from_config(cfg)
        text_templates = build_text_prompt(cfg.MODEL.CLIP_ADAPTER)

        clip_adapter = MaskFormerClipAdapter(
            cfg.MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME,
            text_templates,
            mask_fill=cfg.MODEL.CLIP_ADAPTER.MASK_FILL,
            mask_expand_ratio=cfg.MODEL.CLIP_ADAPTER.MASK_EXPAND_RATIO,
            mask_thr=cfg.MODEL.CLIP_ADAPTER.MASK_THR,
            mask_matting=cfg.MODEL.CLIP_ADAPTER.MASK_MATTING,
            region_resized=cfg.MODEL.CLIP_ADAPTER.REGION_RESIZED,
            mask_prompt_depth=cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_DEPTH,
            mask_prompt_fwd=cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD,
        )
        init_kwargs["clip_adapter"] = clip_adapter
        init_kwargs["clip_ensemble"] = cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE
        init_kwargs[
            "clip_ensemble_weight"
        ] = cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT

        return init_kwargs

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: list of dataset dicts; each must contain "image"
                (CHW tensor) and "class_names" (open-vocabulary class list),
                optionally "height"/"width" for the output resolution.

        Returns:
            list[dict] with a "sem_seg" KxHxW logits tensor per image.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.size_divisibility)

        features = self.backbone(images.tensor)
        outputs = self.sem_seg_head(features)
        class_names = batched_inputs[0]["class_names"]
        if len(class_names) == 1:
            # Because classification is performed in a 'contrastive' manner,
            # adding others to represent other concepts
            class_names.append('others')
        text_features = self.clip_adapter.get_text_features(class_names)
        outputs["pred_logits"] = self.clip_adapter.get_sim_logits(
            text_features, self.clip_adapter.normalize_feature(outputs["pred_logits"])
        )
        mask_cls_results = outputs["pred_logits"]
        mask_pred_results = outputs["pred_masks"]
        # upsample masks to the padded input resolution
        mask_pred_results = F.interpolate(
            mask_pred_results,
            size=(images.tensor.shape[-2], images.tensor.shape[-1]),
            mode="bilinear",
            align_corners=False,
        )

        processed_results = []
        for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
            mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
        ):
            # Crop padding first, at the un-padded image size ...
            height = image_size[0]
            width = image_size[1]
            mask_pred_result = sem_seg_postprocess(
                mask_pred_result, image_size, height, width
            )
            image = input_per_image["image"].to(self.device)

            r, regions = self.demo_inference(
                mask_cls_result, mask_pred_result, image, class_names
            )

            # ... then resize to the caller-requested resolution.
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = sem_seg_postprocess(r, image_size, height, width)
            processed_results.append({"sem_seg": r})

        return processed_results
image_size[0] + width = image_size[1] + mask_pred_result = sem_seg_postprocess( + mask_pred_result, image_size, height, width + ) + image = input_per_image["image"].to(self.device) + + r, regions = self.demo_inference(mask_cls_result, mask_pred_result, image, class_names) + + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = sem_seg_postprocess(r, image_size, height, width) + processed_results.append({"sem_seg": r}) + + return processed_results + + + + + def demo_inference(self, mask_cls, mask_pred, image, class_names): + mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + + regions = None + if self.clip_ensemble: + clip_cls, regions, valid_flag = self.clip_adapter( + image, class_names, mask_pred, normalize=True + ) + if clip_cls is None: + clip_cls = torch.empty(0, mask_cls.shape[-1] + 1, device=self.device) + # softmax before index or after? + clip_cls = F.softmax(clip_cls[:, :-1], dim=-1) + if self.clip_ensemble_weight > 0: + map_back_clip_cls = mask_cls.new_ones(mask_cls.shape) + map_back_clip_cls[valid_flag] = clip_cls + mask_cls = torch.pow(mask_cls, 1 - self.clip_ensemble_weight) * \ + torch.pow(map_back_clip_cls, self.clip_ensemble_weight) + + else: + # only clip model predictions are used + mask_cls = clip_cls + mask_pred = mask_pred[valid_flag] + # bin_mask = mask_pred > self.clip_adapter.mask_thr + # select_cls = torch.zeros(sum(valid_flag), mask_cls.shape[-1], device=self.device) + # select_mask = torch.argmax(mask_cls, dim=0) + # if len(class_names) == 2 and class_names[-1] == 'others': + # select_mask = select_mask[:-1] + # for idx in select_mask: + # select_cls[idx] = mask_cls[idx] + # semseg = torch.einsum("qc,qhw->chw", select_cls, bin_mask.float()) + semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) + return semseg, regions diff --git a/open_vocab_seg/test_time_augmentation.py b/open_vocab_seg/test_time_augmentation.py new file mode 100644 
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

import copy
from itertools import count
import math
import numpy as np
import torch
from fvcore.transforms import HFlipTransform
from torch import nn
from torch.nn.parallel import DistributedDataParallel

from detectron2.data.detection_utils import read_image
from detectron2.modeling import DatasetMapperTTA
from detectron2.modeling.postprocessing import sem_seg_postprocess
import logging
from detectron2.utils.logger import log_every_n, log_first_n

__all__ = [
    "SemanticSegmentorWithTTA",
]


class SemanticSegmentorWithTTA(nn.Module):
    """
    A SemanticSegmentor with test-time augmentation enabled.
    Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
    """

    def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
        """
        Args:
            cfg (CfgNode):
            model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
            tta_mapper (callable): takes a dataset dict and returns a list of
                augmented versions of the dataset dict. Defaults to
                `DatasetMapperTTA(cfg)`.
            batch_size (int): batch the augmented images into this batch size for inference.
        """
        super().__init__()
        if isinstance(model, DistributedDataParallel):
            # Unwrap DDP so we can call the bare model directly.
            model = model.module
        self.cfg = cfg.clone()

        self.model = model

        if tta_mapper is None:
            tta_mapper = DatasetMapperTTA(cfg)
        self.tta_mapper = tta_mapper
        self.batch_size = batch_size

    def _inference_with_model(self, inputs):
        """Run the wrapped model, optionally tiling each image into overlapping
        sliding windows (cfg.TEST.SLIDING_WINDOW) and averaging the per-tile
        predictions in the overlap regions."""
        if self.cfg.TEST.SLIDING_WINDOW:
            log_first_n(logging.INFO, "Using sliding window to test")

            outputs = []

            for input in inputs:
                image_size = input["image"].shape[1:]  # h,w
                if self.cfg.TEST.SLIDING_TILE_SIZE > 0:
                    tile_size = (
                        self.cfg.TEST.SLIDING_TILE_SIZE,
                        self.cfg.TEST.SLIDING_TILE_SIZE,
                    )
                else:
                    # Heuristic tile size keyed on the shorter image side.
                    # NOTE(review): KeyError for image sizes outside this
                    # mapping — confirm expected input resolutions.
                    selected_mapping = {256: 224, 512: 256, 768: 512, 896: 512}
                    tile_size = min(image_size)
                    tile_size = selected_mapping[tile_size]
                    tile_size = (tile_size, tile_size)
                extra_info = {
                    k: v
                    for k, v in input.items()
                    if k not in ["image", "height", "width"]
                }
                log_every_n(
                    logging.INFO, "split {} to {}".format(image_size, tile_size)
                )
                overlap = self.cfg.TEST.SLIDING_OVERLAP
                stride = math.ceil(tile_size[0] * (1 - overlap))
                tile_rows = int(
                    math.ceil((image_size[0] - tile_size[0]) / stride) + 1
                )  # strided convolution formula
                tile_cols = int(math.ceil((image_size[1] - tile_size[1]) / stride) + 1)
                full_probs = None
                count_predictions = None
                tile_counter = 0

                for row in range(tile_rows):
                    for col in range(tile_cols):
                        x1 = int(col * stride)
                        y1 = int(row * stride)
                        x2 = min(x1 + tile_size[1], image_size[1])
                        y2 = min(y1 + tile_size[0], image_size[0])
                        x1 = max(
                            int(x2 - tile_size[1]), 0
                        )  # for portrait images the x1 underflows sometimes
                        y1 = max(
                            int(y2 - tile_size[0]), 0
                        )  # for very few rows y1 underflows

                        img = input["image"][:, y1:y2, x1:x2]
                        # Pad border tiles up to the full tile size before
                        # running the model.
                        padded_img = nn.functional.pad(
                            img,
                            (
                                0,
                                tile_size[1] - img.shape[-1],
                                0,
                                tile_size[0] - img.shape[-2],
                            ),
                        )
                        tile_counter += 1
                        padded_input = {"image": padded_img}
                        padded_input.update(extra_info)
                        padded_prediction = self.model([padded_input])[0]["sem_seg"]
                        # Crop the padding back off the prediction.
                        prediction = padded_prediction[
                            :, 0 : img.shape[1], 0 : img.shape[2]
                        ]
                        if full_probs is None:
                            full_probs = prediction.new_zeros(
                                prediction.shape[0], image_size[0], image_size[1]
                            )
                        if count_predictions is None:
                            count_predictions = prediction.new_zeros(
                                prediction.shape[0], image_size[0], image_size[1]
                            )
                        count_predictions[:, y1:y2, x1:x2] += 1
                        full_probs[
                            :, y1:y2, x1:x2
                        ] += prediction  # accumulate the predictions also in the overlapping regions

                # Average where tiles overlapped.
                full_probs /= count_predictions
                full_probs = sem_seg_postprocess(
                    full_probs,
                    image_size,
                    input.get("height", image_size[0]),
                    input.get("width", image_size[1]),
                )
                outputs.append({"sem_seg": full_probs})

            return outputs
        else:
            log_first_n(logging.INFO, "Using whole image to test")
            return self.model(inputs)
+ Inputs & outputs have the same format as :meth:`SemanticSegmentor.forward` + """ + outputs = [] + inputs = [] + for idx, input in zip(count(), batched_inputs): + inputs.append(input) + if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1: + with torch.no_grad(): + outputs.extend(self._inference_with_model(inputs)) + inputs = [] + return outputs + + def __call__(self, batched_inputs): + """ + Same input/output format as :meth:`SemanticSegmentor.forward` + """ + + def _maybe_read_image(dataset_dict): + ret = copy.copy(dataset_dict) + if "image" not in ret: + image = read_image(ret.pop("file_name"), self.model.input_format) + image = torch.from_numpy( + np.ascontiguousarray(image.transpose(2, 0, 1)) + ) # CHW + ret["image"] = image + if "height" not in ret and "width" not in ret: + ret["height"] = image.shape[1] + ret["width"] = image.shape[2] + return ret + + return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs] + + def _inference_one_image(self, input): + """ + Args: + input (dict): one dataset dict with "image" field being a CHW tensor + Returns: + dict: one output dict + """ + augmented_inputs, tfms = self._get_augmented_inputs(input) + # 1: forward with all augmented images + outputs = self._batch_inference(augmented_inputs) + # Delete now useless variables to avoid being out of memory + del augmented_inputs + # 2: merge the results + # handle flip specially + # outputs = [output.detach() for output in outputs] + return self._merge_auged_output(outputs, tfms) + + def _merge_auged_output(self, outputs, tfms): + new_outputs = [] + for output, tfm in zip(outputs, tfms): + if any(isinstance(t, HFlipTransform) for t in tfm.transforms): + new_outputs.append(output["sem_seg"].flip(dims=[2])) + else: + new_outputs.append(output["sem_seg"]) + del outputs + # to avoid OOM with torch.stack + final_predictions = new_outputs[0] + for i in range(1, len(new_outputs)): + final_predictions += new_outputs[i] + final_predictions = 
final_predictions / len(new_outputs) + del new_outputs + return {"sem_seg": final_predictions} + + def _get_augmented_inputs(self, input): + augmented_inputs = self.tta_mapper(input) + tfms = [x.pop("transforms") for x in augmented_inputs] + return augmented_inputs, tfms diff --git a/open_vocab_seg/utils/__init__.py b/open_vocab_seg/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7b02184067b3a370e2815d5dec39b9d1cdad42f --- /dev/null +++ b/open_vocab_seg/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from .events import setup_wandb, WandbWriter +from .predictor import VisualizationDemo, VisualizationDemoIndoor \ No newline at end of file diff --git a/open_vocab_seg/utils/events.py b/open_vocab_seg/utils/events.py new file mode 100644 index 0000000000000000000000000000000000000000..cbe82ce80a7110a1018167763ba3adc90f58faa0 --- /dev/null +++ b/open_vocab_seg/utils/events.py @@ -0,0 +1,121 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import os +import wandb +from detectron2.utils import comm +from detectron2.utils.events import EventWriter, get_event_storage + + +def setup_wandb(cfg, args): + if comm.is_main_process(): + init_args = { + k.lower(): v + for k, v in cfg.WANDB.items() + if isinstance(k, str) and k not in ["config", "name"] + } + # only include most related part to avoid too big table + # TODO: add configurable params to select which part of `cfg` should be saved in config + if "config_exclude_keys" in init_args: + init_args["config"] = cfg + init_args["config"]["cfg_file"] = args.config_file + else: + init_args["config"] = { + "model": cfg.MODEL, + "solver": cfg.SOLVER, + "cfg_file": args.config_file, + } + if ("name" not in init_args) or (init_args["name"] is None): + init_args["name"] = os.path.basename(args.config_file) + wandb.init(**init_args) + + +class BaseRule(object): + def __call__(self, target): + return target + + +class IsIn(BaseRule): + def __init__(self, keyword: str): + self.keyword = keyword + + def __call__(self, target): + return self.keyword in target + + +class Prefix(BaseRule): + def __init__(self, keyword: str): + self.keyword = keyword + + def __call__(self, target): + return "/".join([self.keyword, target]) + + +class WandbWriter(EventWriter): + """ + Write all scalars to a tensorboard file. 
    """
    Event writer that forwards scalars, images, and histograms to wandb.
    """

    def __init__(self):
        """
        Set up the grouping rules that map a raw scalar name to a wandb
        panel group (e.g. bare "loss" scalars are logged under "train/").
        """
        # Iteration of the most recent successful write; scalars older than
        # this are skipped on the next write().
        self._last_write = -1
        self._group_rules = [
            (IsIn("/"), BaseRule()),        # already namespaced: keep as-is
            (IsIn("loss"), Prefix("train")),  # bare losses go under "train/"
        ]

    def write(self):

        storage = get_event_storage()

        def _group_name(scalar_name):
            # First matching rule decides the logged name.
            for (rule, op) in self._group_rules:
                if rule(scalar_name):
                    return op(scalar_name)
            return scalar_name

        # latest() maps name -> (value, iteration); only log what is newer
        # than the previous write.
        stats = {
            _group_name(name): scalars[0]
            for name, scalars in storage.latest().items()
            if scalars[1] > self._last_write
        }
        if len(stats) > 0:
            self._last_write = max([v[1] for k, v in storage.latest().items()])

        # storage.put_{image,histogram} is only meant to be used by
        # tensorboard writer. So we access its internal fields directly from here.
        if len(storage._vis_data) >= 1:
            stats["image"] = [
                wandb.Image(img, caption=img_name)
                for img_name, img, step_num in storage._vis_data
            ]
            # Storage stores all image data and rely on this writer to clear them.
            # As a result it assumes only one writer will use its image data.
            # An alternative design is to let storage store limited recent
            # data (e.g. only the most recent image) that all writers can access.
            # In that case a writer may not see all image data if its period is long.
            storage.clear_images()

        if len(storage._histograms) >= 1:

            def create_bar(tag, bucket_limits, bucket_counts, **kwargs):
                # Convert a tensorboard-style histogram into a wandb bar chart.
                data = [
                    [label, val] for (label, val) in zip(bucket_limits, bucket_counts)
                ]
                table = wandb.Table(data=data, columns=["label", "value"])
                return wandb.plot.bar(table, "label", "value", title=tag)

            stats["hist"] = [create_bar(**params) for params in storage._histograms]

            storage.clear_histograms()

        if len(stats) == 0:
            return
        wandb.log(stats, step=storage.iter)

    def close(self):
        # Flush and finish the wandb run.
        wandb.finish()
+""" +from typing import List, Optional + +import torch +import torch.distributed as dist +import torchvision +from torch import Tensor + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("not supported") + return NestedTensor(tensor, mask) + + +# _onnx_nested_tensor_from_tensor_list() is an 
implementation of +# nested_tensor_from_tensor_list() that is supported by ONNX tracing. +@torch.jit.unused +def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: + max_size = [] + for i in range(tensor_list[0].dim()): + max_size_i = torch.max( + torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) + ).to(torch.int64) + max_size.append(max_size_i) + max_size = tuple(max_size) + + # work around for + # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # m[: img.shape[1], :img.shape[2]] = False + # which is not yet supported in onnx + padded_imgs = [] + padded_masks = [] + for img in tensor_list: + padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] + padded_img = torch.nn.functional.pad( + img, (0, padding[2], 0, padding[1], 0, padding[0]) + ) + padded_imgs.append(padded_img) + + m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) + padded_mask = torch.nn.functional.pad( + m, (0, padding[2], 0, padding[1]), "constant", 1 + ) + padded_masks.append(padded_mask.to(torch.bool)) + + tensor = torch.stack(padded_imgs) + mask = torch.stack(padded_masks) + + return NestedTensor(tensor, mask=mask) + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + +def get_gt_binary_masks(gt_semseg): + mask_ids = torch.unique(gt_semseg) + gt_masks = [] + for id in mask_ids: + if id != 255: + gt_masks.append(gt_semseg == id) + gt_masks = torch.stack(gt_masks).float() + return gt_masks diff --git a/open_vocab_seg/utils/pcd_rendering.py b/open_vocab_seg/utils/pcd_rendering.py new file mode 100644 index 0000000000000000000000000000000000000000..74c9787d5c55834b417a25227a98b4fa0ea0993e --- /dev/null +++ b/open_vocab_seg/utils/pcd_rendering.py @@ -0,0 +1,114 @@ +import torch +import torch.nn as nn + +from pytorch3d.renderer import ( + PerspectiveCameras, + PointsRasterizationSettings, + PointsRasterizer, + 
AlphaCompositor, +) + + +def homogenize_pt(coord): + return torch.cat([coord, torch.ones_like(coord[..., :1])], dim=-1) + + +def unproject_pts_pt(intrinsics, coords, depth): + if coords.shape[-1] == 2: + coords = homogenize_pt(coords) + intrinsics = intrinsics.squeeze()[:3, :3] + coords = torch.inverse(intrinsics).mm(coords.T) * depth.reshape(1, -1) + return coords.T # [n, 3] + + +def get_coord_grids_pt(h, w, device, homogeneous=False): + """ + create pxiel coordinate grid + :param h: height + :param w: weight + :param device: device + :param homogeneous: if homogeneous coordinate + :return: coordinates [h, w, 2] + """ + y = torch.arange(0, h).to(device) + x = torch.arange(0, w).to(device) + grid_y, grid_x = torch.meshgrid(y, x) + if homogeneous: + return torch.stack([grid_x, grid_y, torch.ones_like(grid_x)], dim=-1) + return torch.stack([grid_x, grid_y], dim=-1) # [h, w, 2] + + +class PointsRenderer(nn.Module): + """ + A class for rendering a batch of points. The class should + be initialized with a rasterizer and compositor class which each have a forward + function. 
+ """ + + def __init__(self, rasterizer, compositor) -> None: + super().__init__() + self.rasterizer = rasterizer + self.compositor = compositor + + def to(self, device): + self.rasterizer = self.rasterizer.to(device) + self.compositor = self.compositor.to(device) + return self + + def forward(self, point_clouds, **kwargs) -> torch.Tensor: + fragments = self.rasterizer(point_clouds, **kwargs) + + r = self.rasterizer.raster_settings.radius + + if type(r) == torch.Tensor: + if r.shape[-1] > 1: + idx = fragments.idx.clone() + idx[idx == -1] = 0 + r = r[:, idx.squeeze().long()] + r = r.permute(0, 3, 1, 2) + + dists2 = fragments.dists.permute(0, 3, 1, 2) + weights = 1 - dists2 / (r * r) + images = self.compositor( + fragments.idx.long().permute(0, 3, 1, 2), + weights, + point_clouds.features_packed().permute(1, 0), + **kwargs, + ) + + # permute so image comes at the end + images = images.permute(0, 2, 3, 1) + + return images + + +def create_pcd_renderer(h, w, intrinsics, R=None, T=None, radius=None, device="cuda"): + fx = intrinsics[0, 0] + fy = intrinsics[1, 1] + if R is None: + R = torch.eye(3)[None] # (1, 3, 3) + if T is None: + T = torch.zeros(1, 3) # (1, 3) + cameras = PerspectiveCameras(R=R, T=T, + device=device, + focal_length=((-fx, -fy),), + principal_point=(tuple(intrinsics[:2, -1]),), + image_size=((h, w),), + in_ndc=False, + ) + + if radius is None: + radius = 1.5 / min(h, w) * 2.0 + + raster_settings = PointsRasterizationSettings( + image_size=(h, w), + radius=radius, + points_per_pixel=8, + ) + + rasterizer = PointsRasterizer(cameras=cameras, raster_settings=raster_settings) + renderer = PointsRenderer( + rasterizer=rasterizer, + compositor=AlphaCompositor(background_color=(1, 1, 1)) + ) + return renderer diff --git a/open_vocab_seg/utils/post_process_utils.py b/open_vocab_seg/utils/post_process_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ed214319d90ceba0b47ef835072102b9ffec5179 --- /dev/null +++ 
b/open_vocab_seg/utils/post_process_utils.py @@ -0,0 +1,74 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import torch +from torch.nn import functional as F +import numpy as np + +try: + import pydensecrf.densecrf as dcrf + from pydensecrf.utils import ( + unary_from_softmax, + unary_from_labels, + create_pairwise_bilateral, + create_pairwise_gaussian, + ) +except: + dcrf = None + + +def dense_crf_post_process( + logits, + image, + n_labels=None, + max_iters=5, + pos_xy_std=(3, 3), + pos_w=3, + bi_xy_std=(80, 80), + bi_rgb_std=(13, 13, 13), + bi_w=10, +): + """ + logits : [C,H,W] + image : [3,H,W] + """ + if dcrf is None: + raise FileNotFoundError( + "pydensecrf is required to perform dense crf inference." + ) + if isinstance(logits, torch.Tensor): + logits = F.softmax(logits, dim=0).detach().cpu().numpy() + U = unary_from_softmax(logits) + n_labels = logits.shape[0] + elif logits.ndim == 3: + U = unary_from_softmax(logits) + n_labels = logits.shape[0] + else: + assert n_labels is not None + U = unary_from_labels(logits, n_labels, zero_unsure=False) + + d = dcrf.DenseCRF2D(image.shape[1], image.shape[0], n_labels) + + d.setUnaryEnergy(U) + + # This adds the color-independent term, features are the locations only. + d.addPairwiseGaussian( + sxy=pos_xy_std, + compat=pos_w, + kernel=dcrf.DIAG_KERNEL, + normalization=dcrf.NORMALIZE_SYMMETRIC, + ) + + # This adds the color-dependent term, i.e. features are (x,y,r,g,b). + d.addPairwiseBilateral( + sxy=bi_xy_std, + srgb=bi_rgb_std, + rgbim=image, + compat=bi_w, + kernel=dcrf.DIAG_KERNEL, + normalization=dcrf.NORMALIZE_SYMMETRIC, + ) + # Run five inference steps. 
    logits = d.inference(max_iters)
    logits = np.asarray(logits).reshape((n_labels, image.shape[0], image.shape[1]))
    return torch.from_numpy(logits)
diff --git a/open_vocab_seg/utils/predictor.py b/open_vocab_seg/utils/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..59f5744d31f7422389c6994aa6fb01f71b298d21
--- /dev/null
+++ b/open_vocab_seg/utils/predictor.py
@@ -0,0 +1,793 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

import numpy as np
import torch
import torchvision
import imageio
from tqdm import tqdm
import os
import cv2

from pytorch3d.structures import Pointclouds
from pytorch3d.renderer import look_at_view_transform

from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data.detection_utils import read_image
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
import matplotlib.pyplot as plt
import matplotlib as mpl
from .pcd_rendering import unproject_pts_pt, get_coord_grids_pt, create_pcd_renderer


# DefaultPredictor variant whose __call__ additionally forwards `class_names`
# in the model's input dict (open-vocabulary inference).
class OVSegPredictor(DefaultPredictor):
    def __init__(self, cfg):
        super().__init__(cfg)

    def __call__(self, original_image, class_names):
        """
        Args:
            original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).

        Returns:
            predictions (dict):
                the output of the model for one image only.
                See :doc:`/tutorials/models` for details about the format.
        """
        with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
            # Apply pre-processing to image.
            if self.input_format == "RGB":
                # whether the model expects BGR inputs or RGB
                original_image = original_image[:, :, ::-1]
            height, width = original_image.shape[:2]
            image = self.aug.get_transform(original_image).apply_image(original_image)
            image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))

            # The class-name list rides along so the model can score those classes.
            inputs = {"image": image, "height": height, "width": width, "class_names": class_names}
            predictions = self.model([inputs])[0]
            return predictions

# Visualizer that labels segments with a user-supplied class-name list rather
# than (only) the metadata's stuff_classes.
class OVSegVisualizer(Visualizer):
    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE, class_names=None):
        super().__init__(img_rgb, metadata, scale, instance_mode)
        self.class_names = class_names

    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
        """
        Draw semantic segmentation predictions/labels.

        Args:
            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
                Each value is the integer label of the pixel.
            area_threshold (int): segments with less than `area_threshold` are not drawn.
            alpha (float): the larger it is, the more opaque the segmentations are.

        Returns:
            output (VisImage): image object with visualizations.
+ """ + if isinstance(sem_seg, torch.Tensor): + sem_seg = sem_seg.numpy() + labels, areas = np.unique(sem_seg, return_counts=True) + sorted_idxs = np.argsort(-areas).tolist() + labels = labels[sorted_idxs] + class_names = self.class_names if self.class_names is not None else self.metadata.stuff_classes + + for label in filter(lambda l: l < len(class_names), labels): + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] + except (AttributeError, IndexError): + mask_color = None + mask_color = np.random.random((1, 3)).tolist()[0] + + binary_mask = (sem_seg == label).astype(np.uint8) + text = class_names[label] + self.draw_binary_mask( + binary_mask, + color=mask_color, + edge_color=(1.0, 1.0, 240.0 / 255), + text=text, + alpha=alpha, + area_threshold=area_threshold, + ) + return self.output + + def draw_sam_seg(self, masks, area_threshold=None, alpha=0.5): + """ + Draw semantic segmentation predictions/labels. + + Args: + sem_seg (Tensor or ndarray): the segmentation of shape (H, W). + Each value is the integer label of the pixel. + area_threshold (int): segments with less than `area_threshold` are not drawn. + alpha (float): the larger it is, the more opaque the segmentations are. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + plt.figure() + if len(masks) == 0: + return + sorted_anns = sorted(masks, key=(lambda x: x['area']), reverse=True) + img = np.ones((sorted_anns[0]['segmentation'].shape[0], sorted_anns[0]['segmentation'].shape[1], 3)) + class_names = self.class_names if self.class_names is not None else self.metadata.stuff_classes + for ann in sorted_anns: + m = ann['segmentation'] + mask_color = np.random.random((1, 3)).tolist()[0] + + self.draw_binary_mask( + m, + color=mask_color, + edge_color=(1.0, 1.0, 240.0 / 255), + text=class_names[ann['class']], + alpha=alpha, + area_threshold=area_threshold, + ) + return self.output + + + +class VisualizationDemo(object): + def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): + """ + Args: + cfg (CfgNode): + instance_mode (ColorMode): + parallel (bool): whether to run the model in different processes from visualization. + Useful since the visualization logic can be slow. + """ + self.metadata = MetadataCatalog.get( + cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" + ) + + self.cpu_device = torch.device("cpu") + self.instance_mode = instance_mode + + self.parallel = parallel + if parallel: + raise NotImplementedError + else: + self.predictor = OVSegPredictor(cfg) + + def run_on_image(self, image, class_names): + """ + Args: + image (np.ndarray): an image of shape (H, W, C) (in BGR order). + This is the format used by OpenCV. + Returns: + predictions (dict): the output of the model. + vis_output (VisImage): the visualized image output. + """ + predictions = self.predictor(image, class_names) + # Convert image from OpenCV BGR format to Matplotlib RGB format. 
+ image = image[:, :, ::-1] + visualizer = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names) + # if "sem_seg" in predictions: + # r = predictions["sem_seg"] + # blank_area = (r[0] == 0) + # pred_mask = r.argmax(dim=0).to('cpu') + # pred_mask[blank_area] = 255 + # pred_mask = np.array(pred_mask, dtype=np.int) + + # vis_output = visualizer.draw_sem_seg( + # pred_mask + # ) + # else: + # raise NotImplementedError + + if "sem_seg" in predictions: + r = predictions["sem_seg"] + pred_mask = r.argmax(dim=0).to('cpu') + pred_mask = np.array(pred_mask, dtype=int) + + vis_output = visualizer.draw_sem_seg( + pred_mask + ) + else: + raise NotImplementedError + + return predictions, vis_output + + def run_on_image_sam(self, path, class_names, depth_map_path, rage_matrices_path): + """ + Args: + path (str): the path of the image + Returns: + predictions (dict): the output of the model. + vis_output (VisImage): the visualized image output. + """ + image = read_image(path, format="BGR") + predictions = self.predictor(image, class_names) + # Convert image from OpenCV BGR format to Matplotlib RGB format. 
        image = image[:, :, ::-1]
        visualizer_rgb = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_rgb_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)

        # NOTE(review): checkpoint path, model type and device are hard-coded;
        # the SAM weights must sit in the working directory and CUDA must exist.
        sam_checkpoint = "sam_vit_h_4b8939.pth"
        model_type = "vit_h"
        device = "cuda"
        sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
        sam.to(device=device)

        mask_generator_2 = SamAutomaticMaskGenerator(
            model=sam,
            points_per_side=64,
            pred_iou_thresh=0.8,
            stability_score_thresh=0.8,
            crop_n_layers=0,
            crop_n_points_downscale_factor=0,
            min_mask_region_area=100,  # Requires open-cv to run post-processing
        )
        print('Using SAM to generate segments for the RGB image')
        masks_rgb = mask_generator_2.generate(image)
        masks_rgb = sorted(masks_rgb, key=(lambda x: x['area']), reverse=True)

        print('Using SAM to generate segments for the Depth map')
        # Colormap the (normalized) depth so SAM can segment it like an RGB image.
        d, world_coord = self.project_2d_to_3d(depth_map_path, rage_matrices_path)
        d = (d - np.min(d)) / (np.max(d) - np.min(d))
        image_depth = mpl.colormaps['plasma'](d)*255
        plt.figure()
        plt.imshow(image_depth.astype(np.uint8))
        plt.axis('off')
        plt.savefig('outputs/Depth_rendered.png', bbox_inches='tight', pad_inches=0.0)
        # [:,:,:-1] drops the colormap's alpha channel before feeding SAM.
        masks_depth = mask_generator_2.generate(image_depth.astype(np.uint8)[:,:,:-1])
        masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=True)

        if "sem_seg" in predictions:
            r = predictions["sem_seg"]
            pred_mask = r.argmax(dim=0).to('cpu')
            pred_mask = np.array(pred_mask, dtype=int)

            # Majority vote: each SAM segment takes the most frequent predicted
            # class inside it; the chosen class is also stored on the mask record.
            pred_mask_sam_rgb = pred_mask.copy()
            for mask in masks_rgb:
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                pred_mask_sam_rgb[mask['segmentation']] = cls_tmp[np.argmax(cls_num)]
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            vis_output_rgb = visualizer_rgb.draw_sem_seg(
                pred_mask_sam_rgb
            )
            # vis_output_rgb = visualizer_rgb.draw_sem_seg(
            #     pred_mask, alpha=1
            # )

            pred_mask_sam_depth = pred_mask.copy()
            for mask in masks_depth:
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                pred_mask_sam_depth[mask['segmentation']] = cls_tmp[np.argmax(cls_num)]
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            vis_output_depth = visualizer_depth.draw_sem_seg(
                pred_mask_sam_depth
            )

            vis_output_rgb_sam = visualizer_rgb_sam.draw_sam_seg(masks_rgb)
            vis_output_depth_sam = visualizer_depth_sam.draw_sam_seg(masks_depth)

        else:
            raise NotImplementedError

        return predictions, vis_output_rgb, vis_output_depth, vis_output_rgb_sam, vis_output_depth_sam

    def project_2d_to_3d(self, depth_map_path, rage_matrices_path):
        # Unprojects a depth map into world coordinates via the stored (View)Projection
        # matrices. NOTE(review): the fixed 1280x800 size and the "rage_matrices"
        # naming suggest a game-engine capture dataset — confirm against the data.

        H = 800
        W = 1280
        IMAGE_SIZE = (H, W)

        def pixels_to_ndcs(xx, yy, size=IMAGE_SIZE):
            # Map pixel indices into [-1, 1] NDC (y axis flipped).
            s_y, s_x = size
            s_x -= 1  # so 1 is being mapped into (n-1)th pixel
            s_y -= 1  # so 1 is being mapped into (n-1)th pixel
            x = (2 / s_x) * xx - 1
            y = (-2 / s_y) * yy + 1
            return x, y

        rage_matrices = np.load(rage_matrices_path)


        # get the (ViewProj) matrix that transform points from the world coordinate to NDC
        # (points in world coordinate) @ VP = (points in NDC)
        VP = rage_matrices['VP']
        VP_inverse = rage_matrices['VP_inv'] # NDC to world coordinate

        # get the (Proj) matrix that transform points from the camera coordinate to NDC
        # (points in camera coordinate) @ P = (points in NDC)
        P = rage_matrices['P']
        P_inverse = rage_matrices['P_inv'] # NDC to camera coordinate
        # print(VP, VP_inverse, P, P_inverse)

        d = np.load(depth_map_path)
        d = d/6.0 - 4e-5 # convert to NDC coordinate

        px = np.arange(0, W)
        py = np.arange(0, H)
        px, py = np.meshgrid(px, py, sparse=False)
        px = px.reshape(-1)
        py = py.reshape(-1)

        ndcz = d[py, px] # get the depth in NDC
        ndcx, ndcy = pixels_to_ndcs(px, py)
        ndc_coord = np.stack([ndcx, ndcy, ndcz, np.ones_like(ndcz)], axis=1)

        # Perspective divide after multiplying by the inverse matrices.
        camera_coord = ndc_coord @ P_inverse
        camera_coord = camera_coord/camera_coord[:,-1:]

        world_coord = ndc_coord @ VP_inverse
        world_coord = world_coord/world_coord[:,-1:]

        return d, world_coord

    def get_xyzrgb(self, rgb_path, depth_path, rage_matrices_path):
        # NOTE(review): duplicates project_2d_to_3d() almost verbatim and then
        # appends per-pixel RGB — consider extracting the shared unprojection.

        H = 800
        W = 1280
        IMAGE_SIZE = (H, W)

        def pixels_to_ndcs(xx, yy, size=IMAGE_SIZE):
            s_y, s_x = size
            s_x -= 1  # so 1 is being mapped into (n-1)th pixel
            s_y -= 1  # so 1 is being mapped into (n-1)th pixel
            x = (2 / s_x) * xx - 1
            y = (-2 / s_y) * yy + 1
            return x, y

        rage_matrices = np.load(rage_matrices_path)


        # get the (ViewProj) matrix that transform points from the world coordinate to NDC
        # (points in world coordinate) @ VP = (points in NDC)
        VP = rage_matrices['VP']
        VP_inverse = rage_matrices['VP_inv'] # NDC to world coordinate

        # get the (Proj) matrix that transform points from the camera coordinate to NDC
        # (points in camera coordinate) @ P = (points in NDC)
        P = rage_matrices['P']
        P_inverse = rage_matrices['P_inv'] # NDC to camera coordinate
        # print(VP, VP_inverse, P, P_inverse)

        d = np.load(depth_path)
        d = d/6.0 - 4e-5 # convert to NDC coordinate

        px = np.arange(0, W)
        py = np.arange(0, H)
        px, py = np.meshgrid(px, py, sparse=False)
        px = px.reshape(-1)
        py = py.reshape(-1)

        ndcz = d[py, px] # get the depth in NDC
        ndcx, ndcy = pixels_to_ndcs(px, py)
        ndc_coord = np.stack([ndcx, ndcy, ndcz, np.ones_like(ndcz)], axis=1)

        camera_coord = ndc_coord @ P_inverse
        camera_coord = camera_coord/camera_coord[:,-1:]

        world_coord = ndc_coord @ VP_inverse
        world_coord = world_coord/world_coord[:,-1:]

        rgb = read_image(rgb_path, format="BGR")
        rgb = rgb[:, :, ::-1]
        rgb = rgb[py, px, :]

        # Per-pixel [x, y, z, r, g, b] rows (homogeneous w dropped).
        xyzrgb = np.concatenate((world_coord[:,:-1], rgb), axis=1)

        return xyzrgb

    def render_3d_video(self, xyzrgb_path, depth_path):
        # Renders the colored point cloud from a sweep of viewpoints and writes
        # the frames out as mp4 videos under outputs/.

        device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

        xyzrgb = np.load(xyzrgb_path)
        depth = np.load(depth_path)
        depth = torch.tensor(depth).to(device)
        depth = 1 / depth

        H = 800
        W = 1280
        radius = 1.5 / min(H, W) * 2.0
        intrinsic = np.array([[max(H, W), 0, W // 2],
                              [0, max(H, W), H // 2],
                              [0, 0, 1]])

        intrinsic = torch.from_numpy(intrinsic).float()[None].to(device)
        coord = get_coord_grids_pt(H, W, device=device).float()[None]
        pts = unproject_pts_pt(intrinsic, coord.reshape(-1, 2), depth)
        # Normalize each axis into roughly [-1, 1] (y is shifted by -0.7).
        pts[:, 0] = ((pts[:, 0] - pts[:, 0].min()) / (pts[:, 0].max() - pts[:, 0].min()) - 0.5) * 2
        pts[:, 1] = ((pts[:, 1] - pts[:, 1].min()) / (pts[:, 1].max() - pts[:, 1].min()) - 0.7) * 2
        pts[:, 2] = ((pts[:, 2] - pts[:, 2].min()) / (pts[:, 2].max() - pts[:, 2].min()) - 0.5) * 2

        num_frames = 45
        degrees = np.linspace(120, 220, num_frames)

        total = ['rgb_3d_sam', 'depth_3d_sam', 'rgb_3d_sam_mask', 'depth_3d_sam_mask']
        frames_all = {}

        for j, name in enumerate(total):
            img = torch.from_numpy(xyzrgb[name][:, 3:] / 255.).to(device).float()
            pcd = Pointclouds(points=[pts], features=[img.squeeze().reshape(-1, 3)])
            frames = []
            for i in tqdm(range(num_frames)):
                R, t = look_at_view_transform(3., -10, degrees[i])
                renderer = create_pcd_renderer(H, W, intrinsic.squeeze()[:3, :3],
                                               R=R, T=t,
                                               radius=radius, device=device)
                result = renderer(pcd)
                result = result.permute(0, 3, 1, 2)
                frame = (255.
* result.detach().cpu().squeeze().permute(1, 2, 0).numpy()).astype(np.uint8)
                frames.append(frame)

            frames_all[name] = frames

            # video_out_file = '{}.gif'.format(name)
            # imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25)

            video_out_file = '{}.mp4'.format(name)
            imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25, quality=8)

        # Concatenated mask+raw sweeps for the combined videos.
        video_out_file = '{}.mp4'.format('RGB_3D_All')
        imageio.mimwrite(os.path.join('outputs', video_out_file), frames_all['rgb_3d_sam_mask']+frames_all['rgb_3d_sam'], fps=25, quality=8)

        video_out_file = '{}.mp4'.format('Depth_3D_All')
        imageio.mimwrite(os.path.join('outputs', video_out_file), frames_all['depth_3d_sam_mask']+frames_all['depth_3d_sam'], fps=25, quality=8)

# Indoor (RGB-D, e.g. 640x480 with a depth PNG) variant of VisualizationDemo.
class VisualizationDemoIndoor(VisualizationDemo):
    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        super().__init__(cfg, instance_mode, parallel)

    def build_pcd(self, depth_mask, coords, colors, masks, sem_map):
        # Assign each pixel a SAM group id (later masks win via reversed order),
        # keep only pixels with valid depth, and compact ids with np.unique.
        group_ids = np.full(masks[0]["segmentation"].shape, -1, dtype=int)
        num_masks = len(masks)
        group_counter = 0
        for i in reversed(range(num_masks)):
            # print(masks[i]["predicted_iou"])
            group_ids[masks[i]["segmentation"]] = group_counter
            group_counter += 1
        group_ids = np.unique(group_ids[depth_mask], return_inverse=True)[1]
        return dict(coord=coords, color=colors, group=group_ids, sem_map=sem_map)


    def run_on_pcd_ui(self, rgb_path, depth_path, class_names):
        depth = depth_path
        color = rgb_path
        #semantic_map = join(rgb_path, scene_name, 'semantic_label', color_name[0:-4] + '.pth')

        depth_img = cv2.imread(depth, -1)  # read 16bit grayscale image
        depth_mask = (depth_img != 0)
        color_image = cv2.imread(color)
        color_image = cv2.resize(color_image, (640, 480))
        predictions = self.predictor(color_image, class_names)
        # Convert image from OpenCV BGR format to Matplotlib RGB format.
        image = color_image[:, :, ::-1]
        visualizer_rgb = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_rgb_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)

        # NOTE(review): SAM checkpoint/type/device hard-coded, as in run_on_image_sam.
        sam_checkpoint = "sam_vit_h_4b8939.pth"
        model_type = "vit_h"
        device = "cuda"
        sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
        sam.to(device=device)

        mask_generator_2 = SamAutomaticMaskGenerator(
            model=sam,
            points_per_side=64,
            pred_iou_thresh=0.5,
            stability_score_thresh=0.8,
            crop_n_layers=0,
            crop_n_points_downscale_factor=0,
            min_mask_region_area=100,  # Requires open-cv to run post-processing
        )
        print('Using SAM to generate segments for the RGB image')
        masks_rgb = mask_generator_2.generate(image)
        masks_rgb = sorted(masks_rgb, key=(lambda x: x['area']), reverse=True)

        print('Using SAM to generate segments for the Depth map')
        # Inverse depth (valid pixels only), colormapped so SAM can segment it.
        d = np.full(depth_img.shape, 0, dtype=float)
        d[depth_mask] = (1 / (depth_img+1e-6))[depth_mask]
        colored_depth = (d - np.min(d)) / (np.max(d) - np.min(d))
        colored_depth = mpl.colormaps['inferno'](colored_depth)*255
        plt.figure()
        plt.imshow(colored_depth.astype(np.uint8)[:,:,:-1])
        plt.axis('off')
        plt.savefig('outputs/Depth_rendered.png')
        masks_depth = mask_generator_2.generate(colored_depth.astype(np.uint8)[:,:,:-1])
        masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=True)

        if "sem_seg" in predictions:
            r = predictions["sem_seg"]
            pred_mask = r.argmax(dim=0).to('cpu')
            pred_mask = np.array(pred_mask, dtype=int)

            output2D = {}
            # Depth fusion: smallest masks first and only unpainted (-1) pixels
            # are filled, so small segments are not clobbered by large ones.
            pred_mask_sam_depth = np.full(pred_mask.shape, -1)
            masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=False)
            for mask in masks_depth:
                to_paint = pred_mask_sam_depth == -1
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                #print(cls_tmp, cls_num)
                pred_mask_sam_depth[mask['segmentation'] & to_paint] = cls_tmp[np.argmax(cls_num)]
                #print(class_names[cls_tmp[np.argmax(cls_num)]])
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            output2D['sem_seg_on_depth'] = visualizer_depth.draw_sem_seg(
                pred_mask_sam_depth
            )

            # RGB fusion: each SAM segment takes the majority predicted class.
            pred_mask_sam_rgb = pred_mask.copy()
            for mask in masks_rgb:
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                #print(mask['segmentation'].sum(), cls_tmp, cls_num)
                pred_mask_sam_rgb[mask['segmentation']] = cls_tmp[np.argmax(cls_num)]
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            output2D['sem_seg_on_rgb'] = visualizer_rgb.draw_sem_seg(
                pred_mask_sam_rgb
            )

            output2D['sam_seg_on_rgb'] = visualizer_rgb_sam.draw_sam_seg(masks_rgb)
            output2D['sam_seg_on_depth'] = visualizer_depth_sam.draw_sam_seg(masks_depth)

        else:
            raise NotImplementedError

        color_image = np.reshape(color_image[depth_mask], [-1,3])
        #group_ids = group_ids[depth_mask]

        sem_map_color = pred_mask_sam_rgb[depth_mask]
        sem_map_depth = pred_mask_sam_depth[depth_mask]

        # BGR -> RGB channel swap for the point-cloud colors.
        colors = np.zeros_like(color_image)
        colors[:,0] = color_image[:,2]
        colors[:,1] = color_image[:,1]
        colors[:,2] = color_image[:,0]

        depth_shift = 1000.0
        x,y = np.meshgrid(np.linspace(0,depth_img.shape[1]-1,depth_img.shape[1]), np.linspace(0,depth_img.shape[0]-1,depth_img.shape[0]))
        uv_depth = np.zeros((depth_img.shape[0], depth_img.shape[1], 3))
        uv_depth[:,:,0] = x
        uv_depth[:,:,1] = y
        uv_depth[:,:,2] = depth_img/depth_shift

        output3D = {}
        # Interleave (u, v, depth) with the rendered overlays into [H, W, 6].
        output3D['rgb_3d_sem'] = np.stack((uv_depth, output2D['sem_seg_on_rgb'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['depth_3d_sem'] = np.stack((uv_depth, output2D['sem_seg_on_depth'].get_image()),
axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['rgb_3d_sam'] = np.stack((uv_depth, output2D['sam_seg_on_rgb'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['depth_3d_sam'] = np.stack((uv_depth, output2D['sam_seg_on_depth'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))

        return predictions, output2D, output3D

    def run_on_pcd(self, rgb_path, scene_name, color_name, class_names):
        # ScanNet-style folder layout: intrinsics/, pose/, depth/, color/ per scene.
        # NOTE(review): largely duplicates run_on_pcd_ui plus pose/intrinsics
        # handling — consider sharing the common body.
        intrinsic_path = os.path.join(rgb_path, scene_name, 'intrinsics', 'intrinsic_depth.txt')
        depth_intrinsic = np.loadtxt(intrinsic_path)

        pose = os.path.join(rgb_path, scene_name, 'pose', color_name[0:-4] + '.txt')
        depth = os.path.join(rgb_path, scene_name, 'depth', color_name[0:-4] + '.png')
        color = os.path.join(rgb_path, scene_name, 'color', color_name)
        #semantic_map = join(rgb_path, scene_name, 'semantic_label', color_name[0:-4] + '.pth')

        depth_img = cv2.imread(depth, -1)  # read 16bit grayscale image
        depth_mask = (depth_img != 0)
        color_image = cv2.imread(color)
        color_image = cv2.resize(color_image, (640, 480))
        predictions = self.predictor(color_image, class_names)
        # Convert image from OpenCV BGR format to Matplotlib RGB format.
        image = color_image[:, :, ::-1]
        visualizer_rgb = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_rgb_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)

        # NOTE(review): SAM checkpoint/type/device hard-coded, as elsewhere.
        sam_checkpoint = "sam_vit_h_4b8939.pth"
        model_type = "vit_h"
        device = "cuda"
        sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
        sam.to(device=device)

        mask_generator_2 = SamAutomaticMaskGenerator(
            model=sam,
            points_per_side=64,
            pred_iou_thresh=0.5,
            stability_score_thresh=0.8,
            crop_n_layers=0,
            crop_n_points_downscale_factor=0,
            min_mask_region_area=100,  # Requires open-cv to run post-processing
        )
        print('Using SAM to generate segments for the RGB image')
        masks_rgb = mask_generator_2.generate(image)
        masks_rgb = sorted(masks_rgb, key=(lambda x: x['area']), reverse=True)

        print('Using SAM to generate segments for the Depth map')
        # Inverse depth (valid pixels only), colormapped so SAM can segment it.
        d = np.full(depth_img.shape, 0, dtype=float)
        d[depth_mask] = (1 / (depth_img+1e-6))[depth_mask]
        colored_depth = (d - np.min(d)) / (np.max(d) - np.min(d))
        colored_depth = mpl.colormaps['inferno'](colored_depth)*255
        plt.figure()
        plt.imshow(colored_depth.astype(np.uint8)[:,:,:-1])
        plt.axis('off')
        plt.savefig('outputs/Depth_rendered.png')
        masks_depth = mask_generator_2.generate(colored_depth.astype(np.uint8)[:,:,:-1])
        masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=True)

        if "sem_seg" in predictions:
            r = predictions["sem_seg"]
            pred_mask = r.argmax(dim=0).to('cpu')
            pred_mask = np.array(pred_mask, dtype=int)

            output2D = {}
            # Depth fusion: smallest masks first, filling only unpainted pixels.
            pred_mask_sam_depth = np.full(pred_mask.shape, -1)
            masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=False)
            for mask in masks_depth:
                to_paint = pred_mask_sam_depth == -1
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                #print(cls_tmp, cls_num)
                pred_mask_sam_depth[mask['segmentation'] & to_paint] = cls_tmp[np.argmax(cls_num)]
                #print(class_names[cls_tmp[np.argmax(cls_num)]])
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            output2D['sem_seg_on_depth'] = visualizer_depth.draw_sem_seg(
                pred_mask_sam_depth
            )

            # RGB fusion: majority predicted class per SAM segment.
            pred_mask_sam_rgb = pred_mask.copy()
            for mask in masks_rgb:
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                #print(mask['segmentation'].sum(), cls_tmp, cls_num)
                pred_mask_sam_rgb[mask['segmentation']] = cls_tmp[np.argmax(cls_num)]
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            output2D['sem_seg_on_rgb'] = visualizer_rgb.draw_sem_seg(
                pred_mask_sam_rgb
            )

            output2D['sam_seg_on_rgb'] = visualizer_rgb_sam.draw_sam_seg(masks_rgb)
            output2D['sam_seg_on_depth'] = visualizer_depth_sam.draw_sam_seg(masks_depth)

        else:
            raise NotImplementedError

        color_image = np.reshape(color_image[depth_mask], [-1,3])
        #group_ids = group_ids[depth_mask]

        sem_map_color = pred_mask_sam_rgb[depth_mask]
        sem_map_depth = pred_mask_sam_depth[depth_mask]

        # BGR -> RGB channel swap for the point-cloud colors.
        colors = np.zeros_like(color_image)
        colors[:,0] = color_image[:,2]
        colors[:,1] = color_image[:,1]
        colors[:,2] = color_image[:,0]

        pose = np.loadtxt(pose)

        # Depth is stored in millimetres; shift to metres.
        depth_shift = 1000.0
        x,y = np.meshgrid(np.linspace(0,depth_img.shape[1]-1,depth_img.shape[1]), np.linspace(0,depth_img.shape[0]-1,depth_img.shape[0]))
        uv_depth = np.zeros((depth_img.shape[0], depth_img.shape[1], 3))
        uv_depth[:,:,0] = x
        uv_depth[:,:,1] = y
        uv_depth[:,:,2] = depth_img/depth_shift

        output3D = {}
        output3D['rgb_3d_sem'] = np.stack((uv_depth, output2D['sem_seg_on_rgb'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['depth_3d_sem'] = np.stack((uv_depth, output2D['sem_seg_on_depth'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['rgb_3d_sam'] = np.stack((uv_depth, output2D['sam_seg_on_rgb'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['depth_3d_sam'] = np.stack((uv_depth, output2D['sam_seg_on_depth'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))

        # Keep only pixels with non-zero depth, then unproject with the
        # pinhole intrinsics and transform into world space with the pose.
        uv_depth = np.reshape(uv_depth, [-1,3])
        uv_depth = uv_depth[np.where(uv_depth[:,2]!=0),:].squeeze()

        intrinsic_inv = np.linalg.inv(depth_intrinsic)
        fx = depth_intrinsic[0,0]
        fy = depth_intrinsic[1,1]
        cx = depth_intrinsic[0,2]
        cy = depth_intrinsic[1,2]
        bx = depth_intrinsic[0,3]
        by = depth_intrinsic[1,3]
        n = uv_depth.shape[0]
        points = np.ones((n,4))
        X = (uv_depth[:,0]-cx)*uv_depth[:,2]/fx + bx
        Y = (uv_depth[:,1]-cy)*uv_depth[:,2]/fy + by
        points[:,0] = X
        points[:,1] = Y
        points[:,2] = uv_depth[:,2]
        points_world = np.dot(points, np.transpose(pose))

        output3D['pcd_color'] = self.build_pcd(depth_mask, coords=points_world[:,:3], colors=colors, masks=masks_rgb, sem_map=sem_map_color)
        output3D['pcd_depth'] = self.build_pcd(depth_mask, coords=points_world[:,:3], colors=colors, masks=masks_depth, sem_map=sem_map_depth)

        return predictions, output2D, output3D


    def merge_pcd(self, pcd_list, data_path, save_path, scene_path, voxel_size, th):
        # NOTE(review): `pairwise_indices`, `cal_2_scenes`, `voxelize`,
        # `remove_small_group`, `num_to_natural` and `pointops` are not defined
        # or imported anywhere in this file — this method cannot run as-is.
        while len(pcd_list) != 1:
            print(len(pcd_list), flush=True)
            new_pcd_list = []
            for indice in pairwise_indices(len(pcd_list)):
                # print(indice)
                pcd_frame = cal_2_scenes(pcd_list, indice, voxel_size=voxel_size, voxelize=voxelize)
                if pcd_frame is not None:
                    new_pcd_list.append(pcd_frame)
            pcd_list = new_pcd_list
        seg_dict = pcd_list[0]
        seg_dict["group"] = num_to_natural(remove_small_group(seg_dict["group"], th))

        data_dict = torch.load(scene_path)
        scene_coord = torch.tensor(data_dict["coord"]).cuda().contiguous()
        new_offset = torch.tensor(scene_coord.shape[0]).cuda()
gen_coord = torch.tensor(seg_dict["coord"]).cuda().contiguous().float() + offset = torch.tensor(gen_coord.shape[0]).cuda() + gen_group = seg_dict["group"] + gen_sem = seg_dict['sem_map'] + indices, dis = pointops.knn_query(1, gen_coord, offset, scene_coord, new_offset) + indices = indices.cpu().numpy() + sem_map = gen_sem[indices.reshape(-1)].astype(np.int16) + group = gen_group[indices.reshape(-1)].astype(np.int16) + mask_dis = dis.reshape(-1).cpu().numpy() > 0.6 + group[mask_dis] = -1 + sem_map[mask_dis] = -1 + group = group.astype(np.int16) + sem_map = sem_map.astype(np.int16) + torch.save((sem_map, num_to_natural(group)), os.path.join(save_path, scene_name + ".pth")) + + def render_3d_video(self, xyzrgb_path): + xyzrgb = np.load(xyzrgb_path) + device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu') + + depth = xyzrgb['rgb_3d_sam'][:, :, 2] + depth = torch.tensor(depth).to(device).float() + + num_frames = [60, 60, 60, 90] + + h = 480 + w = 640 + + intrinsic = np.array([[max(h, w), 0, w // 2], + [0, max(h, w), h // 2], + [0, 0, 1]]) + intrinsic = torch.from_numpy(intrinsic).float()[None].to(device) + + coord = get_coord_grids_pt(h, w, device=device).float()[None] + pts = unproject_pts_pt(intrinsic, coord.reshape(-1, 2), depth) + pts[:, 0] = ((pts[:, 0] - pts[:, 0].min()) / (pts[:, 0].max() - pts[:, 0].min()) - 0.5) * 2 + pts[:, 1] = ((pts[:, 1] - pts[:, 1].min()) / (pts[:, 1].max() - pts[:, 1].min()) - 0.5) * 2 + # pts[:, 1] = ((pts[:, 1] - pts[:, 1].min()) / (pts[:, 1].max() - pts[:, 1].min()) - 0.7) * 2 + pts[:, 2] = ((pts[:, 2] - pts[:, 2].min()) / (pts[:, 2].max() - pts[:, 2].min()) - 0.5) * 2 + + radius = 1.5 / min(h, w) * 2.0 + + + total = ['rgb_3d_sam', 'depth_3d_sam', 'rgb_3d_sam_mask', 'depth_3d_sam_mask'] + num_frames = 45 + degrees = np.linspace(120, 220, num_frames) + frames_all = {} + for j, name in enumerate(total): + img = torch.from_numpy(xyzrgb[name][:, :, 3:] / 255.).to(device).float() + pcd = 
Pointclouds(points=[pts], features=[img.squeeze().reshape(-1, 3)]) + time_steps = np.linspace(0, 1, num_frames) + frames = [] + for i, t_step in tqdm(enumerate(time_steps), total=len(time_steps)): + R, t = look_at_view_transform(3., -10, degrees[i]) + renderer = create_pcd_renderer(h, w, intrinsic.squeeze()[:3, :3], + R=R, T=t, + radius=radius, device=device) + + result = renderer(pcd) + result = result.permute(0, 3, 1, 2) + frame = (255. * result.detach().cpu().squeeze().permute(1, 2, 0).numpy()).astype(np.uint8) + frames.append(frame) + + frames_all[name] = frames + + # video_out_file = '{}.mp4'.format(name) + # imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25) + + video_out_file = '{}.mp4'.format(name) + imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25, quality=8) + + video_out_file = '{}.mp4'.format('RGB_3D_All') + imageio.mimwrite(os.path.join('outputs', video_out_file), frames_all['rgb_3d_sam_mask']+frames_all['rgb_3d_sam'], fps=25, quality=8) + + video_out_file = '{}.mp4'.format('Depth_3D_All') + imageio.mimwrite(os.path.join('outputs', video_out_file), frames_all['depth_3d_sam_mask']+frames_all['depth_3d_sam'], fps=25, quality=8) diff --git a/outputs/holder.py b/outputs/holder.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ovseg_swinbase_vitL14_ft_mpt.pth b/ovseg_swinbase_vitL14_ft_mpt.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d2dcc4c4e721b187574f4c3829c58236713037a --- /dev/null +++ b/ovseg_swinbase_vitL14_ft_mpt.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd3731dde48d96654aba63e5a93753dc837d6889162a18ddf0877f5463d94c90 +size 2129343629 diff --git a/read_video.py b/read_video.py new file mode 100644 index 0000000000000000000000000000000000000000..1da1027914daa99d1da9308f0d967ac7e012b49e --- /dev/null +++ b/read_video.py @@ -0,0 +1,7 @@ +import cv2 + 
+Depth_Semantic_SAM_Mask_gif = cv2.VideoCapture('outputs/depth_3d_sam_mask.mp4') + +while(Depth_Semantic_SAM_Mask_gif .isOpened()): + ret, frame = Depth_Semantic_SAM_Mask_gif.read() + print(ret, frame.shape) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e139481f15c6045034fe33f6d0a9e1aa9a87ecaa --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +cython +scipy +shapely +timm +h5py +wandb +fire +opencv-python +pandas +imageio +fvcore +iopath +imageio[ffmpeg] +imageio[pyav] \ No newline at end of file diff --git a/sam_vit_h_4b8939.pth b/sam_vit_h_4b8939.pth new file mode 100644 index 0000000000000000000000000000000000000000..8523acce9ddab1cf7e355628a08b1aab8ce08a72 --- /dev/null +++ b/sam_vit_h_4b8939.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e +size 2564550879 diff --git a/third_party/CLIP/.gitignore b/third_party/CLIP/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..fe3b563dec4e2d55b3824ef1bc7c31aed07848f0 --- /dev/null +++ b/third_party/CLIP/.gitignore @@ -0,0 +1,18 @@ +__pycache__/ +*.py[cod] +*$py.class +*.egg-info +.pytest_cache +.ipynb_checkpoints + +thumbs.db +.DS_Store +.idea +data/ +*.pkl +.theia +tmp +*/tmp +wandb/ +*/wadb +.history \ No newline at end of file diff --git a/third_party/CLIP/CLIP.png b/third_party/CLIP/CLIP.png new file mode 100644 index 0000000000000000000000000000000000000000..a1b5ec9171fd7a51e36e845a02304eb837142ba1 Binary files /dev/null and b/third_party/CLIP/CLIP.png differ diff --git a/third_party/CLIP/LICENSE b/third_party/CLIP/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..4e97f0b45803b7c04ae89548934af4f257a97501 --- /dev/null +++ b/third_party/CLIP/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2021 OpenAI + +Permission is hereby granted, free of charge, to any person 
obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/third_party/CLIP/MANIFEST.in b/third_party/CLIP/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..effd8d995ff1842a48c69d2a0f7c8dce4423d7a2 --- /dev/null +++ b/third_party/CLIP/MANIFEST.in @@ -0,0 +1 @@ +include clip/bpe_simple_vocab_16e6.txt.gz diff --git a/third_party/CLIP/README.md b/third_party/CLIP/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d2d20cd9e1cafcdf8bd8dfd83a0a9c47a884a39 --- /dev/null +++ b/third_party/CLIP/README.md @@ -0,0 +1,193 @@ +# CLIP + +[[Blog]](https://openai.com/blog/clip/) [[Paper]](https://arxiv.org/abs/2103.00020) [[Model Card]](model-card.md) [[Colab]](https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb) + +CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. 
It can be instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing for the task, similarly to the zero-shot capabilities of GPT-2 and 3. We found CLIP matches the performance of the original ResNet50 on ImageNet “zero-shot” without using any of the original 1.28M labeled examples, overcoming several major challenges in computer vision. + + + +## Approach + +![CLIP](CLIP.png) + + + +## Usage + +First, [install PyTorch 1.7.1](https://pytorch.org/get-started/locally/) and torchvision, as well as small additional dependencies, and then install this repo as a Python package. On a CUDA GPU machine, the following will do the trick: + +```bash +$ conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0 +$ pip install ftfy regex tqdm +$ pip install git+https://github.com/openai/CLIP.git +``` + +Replace `cudatoolkit=11.0` above with the appropriate CUDA version on your machine or `cpuonly` when installing on a machine without a GPU. + +```python +import torch +import clip +from PIL import Image + +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = clip.load("ViT-B/32", device=device) + +image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device) +text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) + +with torch.no_grad(): + image_features = model.encode_image(image) + text_features = model.encode_text(text) + + logits_per_image, logits_per_text = model(image, text) + probs = logits_per_image.softmax(dim=-1).cpu().numpy() + +print("Label probs:", probs) # prints: [[0.9927937 0.00421068 0.00299572]] +``` + + +## API + +The CLIP module `clip` provides the following methods: + +#### `clip.available_models()` + +Returns the names of the available CLIP models. + +#### `clip.load(name, device=..., jit=False)` + +Returns the model and the TorchVision transform needed by the model, specified by the model name returned by `clip.available_models()`. 
It will download the model as necessary. The `name` argument can also be a path to a local checkpoint. + +The device to run the model can be optionally specified, and the default is to use the first CUDA device if there is any, otherwise the CPU. When `jit` is `False`, a non-JIT version of the model will be loaded. + +#### `clip.tokenize(text: Union[str, List[str]], context_length=77)` + +Returns a LongTensor containing tokenized sequences of given text input(s). This can be used as the input to the model + +--- + +The model returned by `clip.load()` supports the following methods: + +#### `model.encode_image(image: Tensor)` + +Given a batch of images, returns the image features encoded by the vision portion of the CLIP model. + +#### `model.encode_text(text: Tensor)` + +Given a batch of text tokens, returns the text features encoded by the language portion of the CLIP model. + +#### `model(image: Tensor, text: Tensor)` + +Given a batch of images and a batch of text tokens, returns two Tensors, containing the logit scores corresponding to each image and text input. The values are cosine similarities between the corresponding image and text features, times 100. + + + +## More Examples + +### Zero-Shot Prediction + +The code below performs zero-shot prediction using CLIP, as shown in Appendix B in the paper. This example takes an image from the [CIFAR-100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html), and predicts the most likely labels among the 100 textual labels from the dataset. 
+ +```python +import os +import clip +import torch +from torchvision.datasets import CIFAR100 + +# Load the model +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = clip.load('ViT-B/32', device) + +# Download the dataset +cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False) + +# Prepare the inputs +image, class_id = cifar100[3637] +image_input = preprocess(image).unsqueeze(0).to(device) +text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device) + +# Calculate features +with torch.no_grad(): + image_features = model.encode_image(image_input) + text_features = model.encode_text(text_inputs) + +# Pick the top 5 most similar labels for the image +image_features /= image_features.norm(dim=-1, keepdim=True) +text_features /= text_features.norm(dim=-1, keepdim=True) +similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1) +values, indices = similarity[0].topk(5) + +# Print the result +print("\nTop predictions:\n") +for value, index in zip(values, indices): + print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%") +``` + +The output will look like the following (the exact numbers may be slightly different depending on the compute device): + +``` +Top predictions: + + snake: 65.31% + turtle: 12.29% + sweet_pepper: 3.83% + lizard: 1.88% + crocodile: 1.75% +``` + +Note that this example uses the `encode_image()` and `encode_text()` methods that return the encoded features of given inputs. + + +### Linear-probe evaluation + +The example below uses [scikit-learn](https://scikit-learn.org/) to perform logistic regression on image features. 
+ +```python +import os +import clip +import torch + +import numpy as np +from sklearn.linear_model import LogisticRegression +from torch.utils.data import DataLoader +from torchvision.datasets import CIFAR100 +from tqdm import tqdm + +# Load the model +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = clip.load('ViT-B/32', device) + +# Load the dataset +root = os.path.expanduser("~/.cache") +train = CIFAR100(root, download=True, train=True, transform=preprocess) +test = CIFAR100(root, download=True, train=False, transform=preprocess) + + +def get_features(dataset): + all_features = [] + all_labels = [] + + with torch.no_grad(): + for images, labels in tqdm(DataLoader(dataset, batch_size=100)): + features = model.encode_image(images.to(device)) + + all_features.append(features) + all_labels.append(labels) + + return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy() + +# Calculate the image features +train_features, train_labels = get_features(train) +test_features, test_labels = get_features(test) + +# Perform logistic regression +classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1) +classifier.fit(train_features, train_labels) + +# Evaluate using the logistic regression classifier +predictions = classifier.predict(test_features) +accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100. +print(f"Accuracy = {accuracy:.3f}") +``` + +Note that the `C` value should be determined via a hyperparameter sweep using a validation split. 
diff --git a/third_party/CLIP/clip/__init__.py b/third_party/CLIP/clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dcc5619538c0f7c782508bdbd9587259d805e0d9 --- /dev/null +++ b/third_party/CLIP/clip/__init__.py @@ -0,0 +1 @@ +from .clip import * diff --git a/third_party/CLIP/clip/bpe_simple_vocab_16e6.txt.gz b/third_party/CLIP/clip/bpe_simple_vocab_16e6.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..36a15856e00a06a9fbed8cdd34d2393fea4a3113 --- /dev/null +++ b/third_party/CLIP/clip/bpe_simple_vocab_16e6.txt.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a +size 1356917 diff --git a/third_party/CLIP/clip/clip.py b/third_party/CLIP/clip/clip.py new file mode 100644 index 0000000000000000000000000000000000000000..6d733edfac02d81ba3e402eb7e702764728bdaa2 --- /dev/null +++ b/third_party/CLIP/clip/clip.py @@ -0,0 +1,285 @@ +import hashlib +import os +import urllib +import warnings +from collections import OrderedDict +from typing import Union, List + +import torch +from PIL import Image +from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize +from tqdm import tqdm + +from .model import build_model +from .simple_tokenizer import SimpleTokenizer as _Tokenizer + +try: + from torchvision.transforms import InterpolationMode + + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + + +if torch.__version__.split(".") < ["1", "7", "1"]: + warnings.warn("PyTorch version 1.7.1 or higher is recommended") + + +__all__ = ["available_models", "load", "tokenize"] +_tokenizer = _Tokenizer() + +_MODELS = { + "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", + "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", + 
"RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt", + "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt", + "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", + "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt", + "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt", + "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt", +} + + +def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")): + os.makedirs(root, exist_ok=True) + filename = os.path.basename(url) + + expected_sha256 = url.split("/")[-2] + download_target = os.path.join(root, filename) + + if os.path.exists(download_target) and not os.path.isfile(download_target): + raise RuntimeError(f"{download_target} exists and is not a regular file") + + if os.path.isfile(download_target): + if ( + hashlib.sha256(open(download_target, "rb").read()).hexdigest() + == expected_sha256 + ): + return download_target + else: + warnings.warn( + f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file" + ) + + with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: + with tqdm( + total=int(source.info().get("Content-Length")), + ncols=80, + unit="iB", + unit_scale=True, + ) as loop: + while True: + buffer = source.read(8192) + if not buffer: + break + + output.write(buffer) + loop.update(len(buffer)) + + if ( + hashlib.sha256(open(download_target, "rb").read()).hexdigest() + != expected_sha256 + ): + raise RuntimeError( + f"Model has 
been downloaded but the SHA256 checksum does not not match" + ) + + return download_target + + +def _transform(n_px): + return Compose( + [ + Resize(n_px, interpolation=BICUBIC), + CenterCrop(n_px), + lambda image: image.convert("RGB"), + ToTensor(), + Normalize( + (0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711), + ), + ] + ) + + +def available_models() -> List[str]: + """Returns the names of available CLIP models""" + return list(_MODELS.keys()) + + +def load( + name: str, + mask_prompt_depth: int = 0, + device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", + jit=False, +): + """Load a CLIP model + + Parameters + ---------- + name : str + A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict + + device : Union[str, torch.device] + The device to put the loaded model + + jit : bool + Whether to load the optimized JIT model or more hackable non-JIT model (default). + + Returns + ------- + model : torch.nn.Module + The CLIP model + + preprocess : Callable[[PIL.Image], torch.Tensor] + A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input + """ + if name in _MODELS: + model_path = _download(_MODELS[name]) + elif os.path.isfile(name): + model_path = name + else: + raise RuntimeError( + f"Model {name} not found; available models = {available_models()}" + ) + + try: + # loading JIT archive + model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() + state_dict = None + except RuntimeError: + # loading saved state dict + if jit: + warnings.warn( + f"File {model_path} is not a JIT archive. 
Loading as a state dict instead" + ) + jit = False + state_dict = torch.load(model_path, map_location="cpu") + if 'state_dict' in state_dict: + new_state_dict = OrderedDict() + for k, v in state_dict['state_dict'].items(): + if k.startswith('module.'): + name = k[7:] # remove `module.` + new_state_dict[name] = v + state_dict = new_state_dict + + if not jit: + model = build_model(state_dict or model.state_dict(), mask_prompt_depth).to(device) + if str(device) == "cpu": + model.float() + return model, _transform(model.visual.input_resolution) + + # patch the device names + device_holder = torch.jit.trace( + lambda: torch.ones([]).to(torch.device(device)), example_inputs=[] + ) + device_node = [ + n + for n in device_holder.graph.findAllNodes("prim::Constant") + if "Device" in repr(n) + ][-1] + + def patch_device(module): + try: + graphs = [module.graph] if hasattr(module, "graph") else [] + except RuntimeError: + graphs = [] + + if hasattr(module, "forward1"): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes("prim::Constant"): + if "value" in node.attributeNames() and str(node["value"]).startswith( + "cuda" + ): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to float32 on CPU + if str(device) == "cpu": + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[] + ) + float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] + float_node = float_input.node() + + def patch_float(module): + try: + graphs = [module.graph] if hasattr(module, "graph") else [] + except RuntimeError: + graphs = [] + + if hasattr(module, "forward1"): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes("aten::to"): + inputs = list(node.inputs()) + for i in [ + 1, + 2, + ]: # dtype can be the second or third argument to aten::to() + if inputs[i].node()["value"] 
== 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, _transform(model.input_resolution.item()) + + +def tokenize( + texts: Union[str, List[str]], + context_length: int = 77, + truncate: bool = False, + return_length: bool = False, +) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + + context_length : int + The context length to use; all CLIP models use 77 as the context length + + truncate: bool + Whether to truncate the text in case its encoding is longer than the context length + + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = _tokenizer.encoder["<|startoftext|>"] + eot_token = _tokenizer.encoder["<|endoftext|>"] + all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + length = [] + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + if truncate: + tokens = tokens[:context_length] + tokens[-1] = eot_token + length.append(context_length) + else: + raise RuntimeError( + f"Input {texts[i]} is too long for context length {context_length}" + ) + else: + length.append(len(tokens)) + result[i, : len(tokens)] = torch.tensor(tokens) + if return_length: + return result, length + return result diff --git a/third_party/CLIP/clip/model.py b/third_party/CLIP/clip/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8ea730a2cc8a992f9180428bd1fec7fc96aa89dd --- /dev/null +++ b/third_party/CLIP/clip/model.py @@ -0,0 +1,613 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Copyright (c) Meta Platforms, Inc. All Rights Reserved +# Modified by Feng Liang from https://github.com/openai/CLIP/blob/main/clip/model.py + +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict( + [ + ("-1", nn.AvgPool2d(stride)), + ( + "0", + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False, + ), + ), + ("1", nn.BatchNorm2d(planes * self.expansion)), + ] + ) + ) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + def __init__( + self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None + ): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim ** 
2 + 1, embed_dim) / embed_dim ** 0.5 + ) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + self.grid_size = spacial_dim + + def forward(self, x, mask=None, return_cls=True): + b, c, gh, gw = x.shape + # remove irrelated feature + if mask is not None: + mask = F.interpolate(mask[:, None, ...], size=(gh, gw)).squeeze( + 1 + ) # [N,H,W] -> [N,grid,grid] + mask = (mask > 0.5).reshape(mask.shape[0], -1) + mask = torch.cat([mask, mask.new_ones(mask.shape[0], 1)], dim=1) + if x.size()[0] == 1: + x = x.expand(mask.shape[0], c, gh, gw) + + x = x.reshape(x.shape[0], c, gh * gw).permute(2, 0, 1) # NCHW -> (HW)NC + + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + positional_embedding = self.positional_embedding + if not (self.positional_embedding.shape[0] == x.shape[0]): + cls_pos = positional_embedding[0:1, :] + per_pos_embedding = ( + F.interpolate( + positional_embedding[1:, :] + .permute(1, 0) + .view(1, -1, self.grid_size, self.grid_size), + size=(gh, gw), + mode="bicubic", + ) + .reshape(-1, gh * gw) + .permute(1, 0) + ) + positional_embedding = torch.cat([cls_pos, per_pos_embedding]) + + x = x + positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x, + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias] + ), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False, + key_padding_mask=mask, + ) + + if return_cls: + 
return x[0] + else: + return x + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. + - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, layers, output_dim, heads, input_resolution=224, width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False + ) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d( + input_resolution // 32, embed_dim, heads, output_dim + ) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x, mask: torch.Tensor = None, return_cls=True): + def stem(x): + for conv, bn in [ + 
(self.conv1, self.bn1), + (self.conv2, self.bn2), + (self.conv3, self.bn3), + ]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) # 1/4,1/4 + x = self.layer1(x) + x = self.layer2(x) # 1/8,1/8 + x = self.layer3(x) # 1/16,1/16 + x = self.layer4(x) # 1/32,1/32 + b, c, gh, gw = x.shape + x = self.attnpool(x, mask, return_cls) + if not return_cls: + return x[1:].permute(1, 0, 2).reshape(b, gh, gw, x.shape[-1]) # N,L,C + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict( + [ + ("c_fc", nn.Linear(d_model, d_model * 4)), + ("gelu", QuickGELU()), + ("c_proj", nn.Linear(d_model * 4, d_model)), + ] + ) + ) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor, **kwargs): + self.attn_mask = ( + self.attn_mask.to(dtype=x.dtype, device=x.device) + if self.attn_mask is not None + else None + ) + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask, **kwargs + )[0] + + def forward(self, x: torch.Tensor, **kwargs): + x = x + self.attention(self.ln_1(x), **kwargs) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__( + self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None + ): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential( + *[ResidualAttentionBlock(width, heads, attn_mask) for 
_ in range(layers)] + ) + + def forward(self, x: torch.Tensor, **kwargs): + for block in self.resblocks: + x = block(x, **kwargs) + return x + + +class VisionTransformer(nn.Module): + def __init__( + self, + input_resolution: int, + patch_size: int, + mask_prompt_depth: int, + width: int, + layers: int, + heads: int, + output_dim: int, + ): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False, + ) + + scale = width ** -0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter( + scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width) + ) + self.grid_size = input_resolution // patch_size + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + self.mask_pool = nn.AvgPool2d(patch_size, stride=patch_size) + self.mask_prompt_depth = mask_prompt_depth + self.mask_embedding = nn.Parameter(torch.zeros(self.mask_prompt_depth, self.grid_size * self.grid_size, width)) + + def forward(self, x: torch.Tensor, m: torch.Tensor = None): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + if m is not None: + m = self.mask_pool(m.to(torch.float).squeeze()).reshape(m.shape[0], -1).unsqueeze(-1) + m = torch.ceil(m) + if self.mask_embedding.shape[1] == 1: + mask_embedding = self.mask_embedding.to(x.dtype).repeat(1, x.shape[1], 1) + else: + mask_embedding = self.mask_embedding.to(x.dtype) + x = x * m + mask_embedding[0].unsqueeze(0) * (1 - m) + + x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # 
shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + if m is not None: + for i, blk in enumerate(self.transformer.resblocks): + d = i + 1 + x = blk(x) + if d < self.mask_prompt_depth: + masked_x = x[1:, :, :] * m.permute(1, 0, 2) + \ + mask_embedding[d].unsqueeze(0).permute(1, 0, 2) * (1 - m.permute(1, 0, 2)) + x = torch.cat([x[:1, :, :], masked_x], dim=0) + else: + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + + +class CLIP(nn.Module): + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + mask_prompt_depth: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int, + ): + super().__init__() + + self.context_length = context_length + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width, + ) + else: + vision_heads = vision_width // 64 + self.visual = VisionTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + mask_prompt_depth=mask_prompt_depth, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim, + ) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask(), + ) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width) + ) + self.ln_final = 
LayerNorm(transformer_width) + + self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features ** -0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [ + self.visual.layer1, + self.visual.layer2, + self.visual.layer3, + self.visual.layer4, + ]: + for name, param in resnet_block.named_parameters(): + if name.endswith("bn3.weight"): + nn.init.zeros_(param) + + proj_std = (self.transformer.width ** -0.5) * ( + (2 * self.transformer.layers) ** -0.5 + ) + attn_std = self.transformer.width ** -0.5 + fc_std = (2 * self.transformer.width) ** -0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection is not None: + nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, 
image, **kwargs): + return self.visual(image.type(self.dtype), **kwargs) + + def encode_text(self, text): + x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model] + + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logit_scale * text_features @ image_features.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(l): + if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)): + l.weight.data = l.weight.data.half() + if l.bias is not None: + l.bias.data = l.bias.data.half() + + if isinstance(l, nn.MultiheadAttention): + for attr in [ + *[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], + "in_proj_bias", + "bias_k", + "bias_v", + ]: + tensor = getattr(l, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ["text_projection", "proj"]: + if hasattr(l, name): + attr = getattr(l, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(state_dict: dict, 
mask_prompt_depth: int = 0): + vit = "visual.proj" in state_dict + + if vit: + vision_width = state_dict["visual.conv1.weight"].shape[0] + vision_layers = len( + [ + k + for k in state_dict.keys() + if k.startswith("visual.") and k.endswith(".attn.in_proj_weight") + ] + ) + vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] + grid_size = round( + (state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5 + ) + image_resolution = vision_patch_size * grid_size + else: + assert mask_prompt_depth == 0, 'ResNets do not support mask prompt tuning' + counts: list = [ + len( + set( + k.split(".")[2] + for k in state_dict + if k.startswith(f"visual.layer{b}") + ) + ) + for b in [1, 2, 3, 4] + ] + vision_layers = tuple(counts) + vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0] + output_width = round( + (state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5 + ) + vision_patch_size = None + assert ( + output_width ** 2 + 1 + == state_dict["visual.attnpool.positional_embedding"].shape[0] + ) + image_resolution = output_width * 32 + + embed_dim = state_dict["text_projection"].shape[1] + context_length = state_dict["positional_embedding"].shape[0] + vocab_size = state_dict["token_embedding.weight"].shape[0] + transformer_width = state_dict["ln_final.weight"].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len( + set( + k.split(".")[2] + for k in state_dict + if k.startswith(f"transformer.resblocks") + ) + ) + + model = CLIP( + embed_dim, + image_resolution, + vision_layers, + vision_width, + vision_patch_size, + mask_prompt_depth, + context_length, + vocab_size, + transformer_width, + transformer_heads, + transformer_layers, + ) + + for key in ["input_resolution", "context_length", "vocab_size"]: + if key in state_dict: + del state_dict[key] + + convert_weights(model) + model.load_state_dict(state_dict, strict=False) + return model.eval() diff --git a/third_party/CLIP/clip/simple_tokenizer.py 
b/third_party/CLIP/clip/simple_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..56d17512b06afb700e7834e4f3f6515c315ebb74 --- /dev/null +++ b/third_party/CLIP/clip/simple_tokenizer.py @@ -0,0 +1,150 @@ +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz" + ) + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2 ** 8): + if b not in bs: + bs.append(b) + cs.append(2 ** 8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r"\s+", " ", text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") + merges = merges[1 : 49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + "" for v in vocab] + for merge in merges: + vocab.append("".join(merge)) + vocab.extend(["<|startoftext|>", "<|endoftext|>"]) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + "<|startoftext|>": "<|startoftext|>", + "<|endoftext|>": "<|endoftext|>", + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE, + ) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + "",) + pairs = get_pairs(word) + + if not pairs: + return token + "" + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 
1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend( + self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ") + ) + return bpe_tokens + + def decode(self, tokens): + text = "".join([self.decoder[token] for token in tokens]) + text = ( + bytearray([self.byte_decoder[c] for c in text]) + .decode("utf-8", errors="replace") + .replace("", " ") + ) + return text diff --git a/third_party/CLIP/model-card.md b/third_party/CLIP/model-card.md new file mode 100644 index 0000000000000000000000000000000000000000..2d22e25bea89fdbccdaa2809fbeb83e0a7cfaa07 --- /dev/null +++ b/third_party/CLIP/model-card.md @@ -0,0 +1,120 @@ +# Model Card: CLIP + +Inspired by [Model Cards for Model Reporting (Mitchell et al.)](https://arxiv.org/abs/1810.03993) and [Lessons from Archives (Jo & Gebru)](https://arxiv.org/pdf/1912.10389.pdf), we’re providing some accompanying information about the multimodal model. + +## Model Details + +The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. + +### Model Date + +January 2021 + +### Model Type + +The base model uses a ResNet50 with several modifications as an image encoder and uses a masked self-attention Transformer as a text encoder. 
These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. There is also a variant of the model where the ResNet image encoder is replaced with a Vision Transformer. + +### Model Versions + +Initially, we’ve released one CLIP model based on the Vision Transformer architecture equivalent to ViT-B/32, along with the RN50 model, using the architecture equivalent to ResNet-50. + +As part of the staged release process, we have also released the RN101 model, as well as RN50x4, a RN50 scaled up 4x according to the [EfficientNet](https://arxiv.org/abs/1905.11946) scaling rule. In July 2021, we additionally released the RN50x16 and ViT-B/16 models. + +Please see the paper linked below for further details about their specification. + +### Documents + +- [Blog Post](https://openai.com/blog/clip/) +- [CLIP Paper](https://arxiv.org/abs/2103.00020) + + + +## Model Use + +### Intended Use + +The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. + +#### Primary intended uses + +The primary intended users of these models are AI researchers. + +We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. + +### Out-of-Scope Use Cases + +**Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. 
This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. + +Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. + +Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. + + + +## Data + +The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. + +### Data Mission Statement + +Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. 
+ + + +## Performance and Limitations + +### Performance + +We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets such as OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets: + +- Food101 +- CIFAR10 +- CIFAR100 +- Birdsnap +- SUN397 +- Stanford Cars +- FGVC Aircraft +- VOC2007 +- DTD +- Oxford-IIIT Pet dataset +- Caltech101 +- Flowers102 +- MNIST +- SVHN +- IIIT5K +- Hateful Memes +- SST-2 +- UCF101 +- Kinetics700 +- Country211 +- CLEVR Counting +- KITTI Distance +- STL-10 +- RareAct +- Flickr30 +- MSCOCO +- ImageNet +- ImageNet-A +- ImageNet-R +- ImageNet Sketch +- ObjectNet (ImageNet Overlap) +- Youtube-BB +- ImageNet-Vid + +## Limitations + +CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects. CLIP also poses issues with regards to fairness and bias which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation- in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. + +### Bias and Fairness + +We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). 
+ +We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. + + + +## Feedback + +### Where to send questions or comments about the model + +Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) diff --git a/third_party/CLIP/requirements.txt b/third_party/CLIP/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b98c33f3a0e09ddf982606430472de3061c6e9f --- /dev/null +++ b/third_party/CLIP/requirements.txt @@ -0,0 +1,5 @@ +ftfy +regex +tqdm +torch +torchvision diff --git a/third_party/CLIP/setup.py b/third_party/CLIP/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..1026ae8a1c4d99f7107cd2eaffb0b391e87a121f --- /dev/null +++ b/third_party/CLIP/setup.py @@ -0,0 +1,21 @@ +import os + +import pkg_resources +from setuptools import setup, find_packages + +setup( + name="clip", + py_modules=["clip"], + version="1.0", + description="", + author="OpenAI", + packages=find_packages(exclude=["tests*"]), + install_requires=[ + str(r) + for r in pkg_resources.parse_requirements( + open(os.path.join(os.path.dirname(__file__), "requirements.txt")) + ) + ], + include_package_data=True, + extras_require={"dev": ["pytest"]}, +) diff --git a/third_party/CLIP/tests/test_consistency.py 
b/third_party/CLIP/tests/test_consistency.py new file mode 100644 index 0000000000000000000000000000000000000000..27d49eaae8721b7ad82d4949f2ab2606c8875d9f --- /dev/null +++ b/third_party/CLIP/tests/test_consistency.py @@ -0,0 +1,25 @@ +import numpy as np +import pytest +import torch +from PIL import Image + +import clip + + +@pytest.mark.parametrize("model_name", clip.available_models()) +def test_consistency(model_name): + device = "cpu" + jit_model, transform = clip.load(model_name, device=device, jit=True) + py_model, _ = clip.load(model_name, device=device, jit=False) + + image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) + text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) + + with torch.no_grad(): + logits_per_image, _ = jit_model(image, text) + jit_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + + logits_per_image, _ = py_model(image, text) + py_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + + assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1) diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tools/convert-pretrained-clip-model-to-d2.py b/tools/convert-pretrained-clip-model-to-d2.py new file mode 100644 index 0000000000000000000000000000000000000000..8e0fc1a37805727d625ba40cbbf07e9426e87ad7 --- /dev/null +++ b/tools/convert-pretrained-clip-model-to-d2.py @@ -0,0 +1,69 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import pickle as pkl +import sys + +import torch + +""" +Usage: + # download pretrained swin model: + wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth + # run the conversion + ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl + # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: +MODEL: + WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" +INPUT: + FORMAT: "RGB" +""" + + +def transform(path): + model = torch.load(path, map_location="cpu") + print(f"loading {path}......") + state_dict = model["model"] + state_dict = { + k.replace("visual_model.", ""): v + for k, v in state_dict.items() + if k.startswith("visual_model") + } + source_keys = [k for k in state_dict.keys() if "relative_coords" in k] + for k in source_keys: + state_dict[ + k.replace("relative_coords", "relative_position_index") + ] = state_dict[k] + del state_dict[k] + + source_keys = [k for k in state_dict.keys() if "atten_mask_matrix" in k] + for k in source_keys: + state_dict[k.replace("atten_mask_matrix", "attn_mask")] = state_dict[k] + del state_dict[k] + + source_keys = [k for k in state_dict.keys() if "rel_pos_embed_table" in k] + for k in source_keys: + state_dict[ + k.replace("rel_pos_embed_table", "relative_position_bias_table") + ] = state_dict[k] + del state_dict[k] + + source_keys = [k for k in state_dict.keys() if "channel_reduction" in k] + for k in source_keys: + state_dict[k.replace("channel_reduction", "reduction")] = state_dict[k] + del state_dict[k] + return { + k if k.startswith("backbone.") else "backbone." 
+ k: v + for k, v in state_dict.items() + } + + +if __name__ == "__main__": + input = sys.argv[1] + res = { + "model": transform(input), + "__author__": "third_party", + "matching_heuristics": True, + } + with open(sys.argv[2], "wb") as f: + pkl.dump(res, f) diff --git a/tools/convert-pretrained-swin-model-to-d2.py b/tools/convert-pretrained-swin-model-to-d2.py new file mode 100644 index 0000000000000000000000000000000000000000..4cc9939c781a4d04dc6070a7fcac8d6c09afc8a1 --- /dev/null +++ b/tools/convert-pretrained-swin-model-to-d2.py @@ -0,0 +1,30 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import pickle as pkl +import sys + +import torch + +""" +Usage: + # download pretrained swin model: + wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth + # run the conversion + ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl + # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: +MODEL: + WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" +INPUT: + FORMAT: "RGB" +""" + +if __name__ == "__main__": + input = sys.argv[1] + + obj = torch.load(input, map_location="cpu")["model"] + + res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} + + with open(sys.argv[2], "wb") as f: + pkl.dump(res, f) diff --git a/tools/convert-torchvision-to-d2.py b/tools/convert-torchvision-to-d2.py new file mode 100644 index 0000000000000000000000000000000000000000..60b9fb88693350c75f0b69350807503c87192724 --- /dev/null +++ b/tools/convert-torchvision-to-d2.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import pickle as pkl +import sys + +import torch + +""" +Usage: + # download one of the ResNet{18,34,50,101,152} models from torchvision: + wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth + # run the conversion + ./convert-torchvision-to-d2.py r50.pth r50.pkl + # Then, use r50.pkl with the following changes in config: +MODEL: + WEIGHTS: "/path/to/r50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False +INPUT: + FORMAT: "RGB" + These models typically produce slightly worse results than the + pre-trained ResNets we use in official configs, which are the + original ResNet models released by MSRA. +""" + +if __name__ == "__main__": + input = sys.argv[1] + + obj = torch.load(input, map_location="cpu") + + newmodel = {} + for k in list(obj.keys()): + old_k = k + if "layer" not in k: + k = "stem." + k + for t in [1, 2, 3, 4]: + k = k.replace("layer{}".format(t), "res{}".format(t + 1)) + for t in [1, 2, 3]: + k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) + k = k.replace("downsample.0", "shortcut") + k = k.replace("downsample.1", "shortcut.norm") + print(old_k, "->", k) + newmodel[k] = obj.pop(old_k).detach().numpy() + + res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} + + with open(sys.argv[2], "wb") as f: + pkl.dump(res, f) + if obj: + print("Unconverted keys:", obj.keys()) diff --git a/tools/ovseg_replace_clip.py b/tools/ovseg_replace_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..0781c5910d8bd7dec25aeb468514849dfe68e9e4 --- /dev/null +++ b/tools/ovseg_replace_clip.py @@ -0,0 +1,30 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import torch +from collections import OrderedDict + + +# PATH to new clip model +clip_ckpt = torch.load('xx/open_clip/src/logs/2022_xx/checkpoints/epoch_x.pt') + +new_model = OrderedDict() +state_dict = clip_ckpt['state_dict'] + +for k, v in state_dict.items(): + new_key = k.replace('module.','') + new_model[new_key] = v + +# PATH to trained ovseg model +ovseg_model = torch.load('xx/ovseg/output/model_final.pth', 'cpu') + +for k, v in new_model.items(): + new_k = 'clip_adapter.clip_model.' + k + if new_k in ovseg_model['model'].keys(): + ovseg_model['model'][new_k] = v + else: + print(f'{new_k} does not exist in ckpt') + +# ovseg_model['model']['clip_adapter.clip_model.visual.mask_embedding'] = new_model['visual.mask_embedding'] + +torch.save(ovseg_model, 'xx/ovseg/output/ovseg_ft_mpt.pth') diff --git a/tools/search_thr_ensemble_w.sh b/tools/search_thr_ensemble_w.sh new file mode 100644 index 0000000000000000000000000000000000000000..efdbd72dd1a6a9da96868688b0fd5530e956498a --- /dev/null +++ b/tools/search_thr_ensemble_w.sh @@ -0,0 +1,11 @@ +or MASK_THR in 0.35 0.4 0.45 +o + for ENSEMBLE_WEIGHT in 0.6 0.65 0.7 0.75 0.8 + do + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml \ + MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\"\) \ + MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT $ENSEMBLE_WEIGHT MODEL.CLIP_ADAPTER.MASK_THR $MASK_THR + done +one + + diff --git a/tools/util.py b/tools/util.py new file mode 100644 index 0000000000000000000000000000000000000000..e9d6caf994cf4bf79156e0d95ae6df7fd0142bf2 --- /dev/null +++ b/tools/util.py @@ -0,0 +1,296 @@ +import numpy as np +import torch +import os +import copy +from PIL import Image +import json +import imageio +# import clip + + +SCANNET_COLOR_MAP_20 = {-1: (0., 0., 0.), 0: (174., 199., 232.), 1: (152., 223., 138.), 2: (31., 119., 180.), 3: (255., 187., 120.), 4: (188., 189., 34.), 5: (140., 86., 75.), + 6: 
(255., 152., 150.), 7: (214., 39., 40.), 8: (197., 176., 213.), 9: (148., 103., 189.), 10: (196., 156., 148.), 11: (23., 190., 207.), 12: (247., 182., 210.),
+                        13: (219., 219., 141.), 14: (255., 127., 14.), 15: (158., 218., 229.), 16: (44., 160., 44.), 17: (112., 128., 144.), 18: (227., 119., 194.), 19: (82., 84., 163.)}
+
+class Voxelize(object):
+    def __init__(self,
+                 voxel_size=0.05,
+                 hash_type="fnv",
+                 mode='train',
+                 keys=("coord", "normal", "color", "label"),
+                 return_discrete_coord=False,
+                 return_min_coord=False):
+        self.voxel_size = voxel_size
+        self.hash = self.fnv_hash_vec if hash_type == "fnv" else self.ravel_hash_vec
+        assert mode in ["train", "test"]
+        self.mode = mode
+        self.keys = keys
+        self.return_discrete_coord = return_discrete_coord
+        self.return_min_coord = return_min_coord
+
+    def __call__(self, data_dict):
+        assert "coord" in data_dict.keys()
+        # np.int was removed in NumPy 1.24; the documented replacement is the builtin int.
+        discrete_coord = np.floor(data_dict["coord"] / np.array(self.voxel_size)).astype(int)
+        min_coord = discrete_coord.min(0) * np.array(self.voxel_size)
+        discrete_coord -= discrete_coord.min(0)
+        key = self.hash(discrete_coord)
+        idx_sort = np.argsort(key)
+        key_sort = key[idx_sort]
+        _, inverse, count = np.unique(key_sort, return_inverse=True, return_counts=True)
+        if self.mode == 'train':  # train mode
+            # idx_select = np.cumsum(np.insert(count, 0, 0)[0:-1]) + np.random.randint(0, count.max(), count.size) % count
+            idx_select = np.cumsum(np.insert(count, 0, 0)[0:-1])
+            idx_unique = idx_sort[idx_select]
+            if self.return_discrete_coord:
+                data_dict["discrete_coord"] = discrete_coord[idx_unique]
+            if self.return_min_coord:
+                data_dict["min_coord"] = min_coord.reshape([1, 3])
+            for key in self.keys:
+                data_dict[key] = data_dict[key][idx_unique]
+            return data_dict
+
+        elif self.mode == 'test':  # test mode
+            data_part_list = []
+            for i in range(count.max()):
+                idx_select = np.cumsum(np.insert(count, 0, 0)[0:-1]) + i % count
+                idx_part = idx_sort[idx_select]
+                data_part = dict(index=idx_part)
+                for key in
data_dict.keys():
+                    if key in self.keys:
+                        data_part[key] = data_dict[key][idx_part]
+                    else:
+                        data_part[key] = data_dict[key]
+                if self.return_discrete_coord:
+                    data_part["discrete_coord"] = discrete_coord[idx_part]
+                if self.return_min_coord:
+                    data_part["min_coord"] = min_coord.reshape([1, 3])
+                data_part_list.append(data_part)
+            return data_part_list
+        else:
+            raise NotImplementedError
+
+    @staticmethod
+    def ravel_hash_vec(arr):
+        """
+        Ravel the coordinates after subtracting the min coordinates.
+        """
+        assert arr.ndim == 2
+        arr = arr.copy()
+        arr -= arr.min(0)
+        arr = arr.astype(np.uint64, copy=False)
+        arr_max = arr.max(0).astype(np.uint64) + 1
+
+        keys = np.zeros(arr.shape[0], dtype=np.uint64)
+        # Fortran style indexing
+        for j in range(arr.shape[1] - 1):
+            keys += arr[:, j]
+            keys *= arr_max[j + 1]
+        keys += arr[:, -1]
+        return keys
+
+    @staticmethod
+    def fnv_hash_vec(arr):
+        """
+        FNV64-1A
+        """
+        assert arr.ndim == 2
+        # Floor first for negative coordinates
+        arr = arr.copy()
+        arr = arr.astype(np.uint64, copy=False)
+        hashed_arr = np.uint64(14695981039346656037) * np.ones(arr.shape[0], dtype=np.uint64)
+        for j in range(arr.shape[1]):
+            hashed_arr *= np.uint64(1099511628211)
+            hashed_arr = np.bitwise_xor(hashed_arr, arr[:, j])
+        return hashed_arr
+
+
+def overlap_percentage(mask1, mask2):
+    # Intersection area divided by the *smaller* mask's area (not IoU).
+    intersection = np.logical_and(mask1, mask2)
+    area_intersection = np.sum(intersection)
+
+    area_mask1 = np.sum(mask1)
+    area_mask2 = np.sum(mask2)
+
+    smaller_area = min(area_mask1, area_mask2)
+
+    return area_intersection / smaller_area
+
+
+def remove_samll_masks(masks, ratio=0.8):
+    # NOTE(review): name typo ("samll") kept as-is to avoid breaking callers.
+    # Keeps the larger of any two masks whose overlap (relative to the
+    # smaller one) exceeds `ratio`.
+    filtered_masks = []
+    skip_masks = set()
+
+    for i, mask1_dict in enumerate(masks):
+        if i in skip_masks:
+            continue
+
+        should_keep = True
+        for j, mask2_dict in enumerate(masks):
+            if i == j or j in skip_masks:
+                continue
+            mask1 = mask1_dict["segmentation"]
+            mask2 = mask2_dict["segmentation"]
+            overlap = overlap_percentage(mask1, mask2)
+            if overlap > ratio:
+                if np.sum(mask1) <
np.sum(mask2):
+                        should_keep = False
+                        break
+                    else:
+                        skip_masks.add(j)
+
+        if should_keep:
+            filtered_masks.append(mask1)
+
+    return filtered_masks
+
+
+def to_numpy(x):
+    """Convert a torch.Tensor to a numpy array; pass numpy arrays through."""
+    if isinstance(x, torch.Tensor):
+        x = x.clone().detach().cpu().numpy()
+    assert isinstance(x, np.ndarray)
+    return x
+
+
+def save_point_cloud(coord, color=None, file_path="pc.ply", logger=None):
+    """Write an (N, 3) point cloud (optionally colored) to `file_path`."""
+    # Bug fix: o3d was referenced but never imported anywhere in this module,
+    # so every call raised NameError. Import open3d locally at the use site.
+    import open3d as o3d
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    coord = to_numpy(coord)
+    if color is not None:
+        color = to_numpy(color)
+    pcd = o3d.geometry.PointCloud()
+    pcd.points = o3d.utility.Vector3dVector(coord)
+    pcd.colors = o3d.utility.Vector3dVector(np.ones_like(coord) if color is None else color)
+    o3d.io.write_point_cloud(file_path, pcd)
+    if logger is not None:
+        logger.info(f"Save Point Cloud to: {file_path}")
+
+
+def remove_small_group(group_ids, th):
+    # Relabel any group with fewer than `th` members as -1 (unassigned).
+    unique_elements, counts = np.unique(group_ids, return_counts=True)
+    result = group_ids.copy()
+    for i, count in enumerate(counts):
+        if count < th:
+            result[group_ids == unique_elements[i]] = -1
+
+    return result
+
+
+def pairwise_indices(length):
+    return [[i, i + 1] if i + 1 < length else [i] for i in range(0, length, 2)]
+
+
+def num_to_natural(group_ids):
+    '''
+    Change the group number to natural number arrangement
+    '''
+    if np.all(group_ids == -1):
+        return group_ids
+    array = copy.deepcopy(group_ids)
+    unique_values = np.unique(array[array != -1])
+    mapping = np.full(np.max(unique_values) + 2, -1)
+    mapping[unique_values + 1] = np.arange(len(unique_values))
+    array = mapping[array + 1]
+    return array
+
+
+def get_matching_indices(source, pcd_tree, search_voxel_size, K=None):
+    match_inds = []
+    for i, point in enumerate(source.points):
+        [_, idx, _] = pcd_tree.search_radius_vector_3d(point, search_voxel_size)
+        if K is not None:
+            idx = idx[:K]
+        for j in idx:
+            # match_inds[i, j] = 1
+            match_inds.append((i, j))
+    return match_inds
+
+
+def visualize_3d(data_dict, text_feat_path, save_path):
+    text_feat =
torch.load(text_feat_path)
+    # Classify each group feature against the text features, then color points.
+    group_logits = np.einsum('nc,mc->nm', data_dict["group_feat"], text_feat)
+    group_labels = np.argmax(group_logits, axis=-1)
+    labels = group_labels[data_dict["group"]]
+    labels[data_dict["group"] == -1] = -1
+    visualize_pcd(data_dict["coord"], data_dict["color"], labels, save_path)
+
+
+def visualize_pcd(coord, pcd_color, labels, save_path):
+    # alpha = 0.5
+    label_color = np.array([SCANNET_COLOR_MAP_20[label] for label in labels])
+    # overlay = (pcd_color * (1-alpha) + label_color * alpha).astype(np.uint8) / 255
+    label_color = label_color / 255
+    save_point_cloud(coord, label_color, save_path)
+
+
+def visualize_2d(img_color, labels, img_size, save_path):
+    import matplotlib.pyplot as plt
+    # from skimage.segmentation import mark_boundaries
+    # from skimage.color import label2rgb
+    label_names = ["wall", "floor", "cabinet", "bed", "chair",
+                   "sofa", "table", "door", "window", "bookshelf",
+                   "picture", "counter", "desk", "curtain", "refridgerator",
+                   "shower curtain", "toilet", "sink", "bathtub", "other"]
+    colors = np.array(list(SCANNET_COLOR_MAP_20.values()))[1:]
+    segmentation_color = np.zeros((img_size[0], img_size[1], 3))
+    for i, color in enumerate(colors):
+        segmentation_color[labels == i] = color
+    # alpha = 1 means the saved image shows only the label colors, not the photo.
+    alpha = 1
+    overlay = (img_color * (1-alpha) + segmentation_color * alpha).astype(np.uint8)
+    fig, ax = plt.subplots()
+    ax.imshow(overlay)
+    patches = [plt.plot([], [], 's', color=np.array(color)/255, label=label)[0] for label, color in zip(label_names, colors)]
+    plt.legend(handles=patches, bbox_to_anchor=(0.5, -0.1), loc='upper center', ncol=4, fontsize='small')
+    plt.savefig(save_path, bbox_inches='tight')
+    plt.show()
+
+
+def visualize_partition(coord, group_id, save_path):
+    # One random color per group; index -1 wraps to the appended black row.
+    group_id = group_id.reshape(-1)
+    num_groups = group_id.max() + 1
+    group_colors = np.random.rand(num_groups, 3)
+    group_colors = np.vstack((group_colors, np.array([0,0,0])))
+    color = group_colors[group_id]
+    save_point_cloud(coord, color, save_path)
+
+
+def delete_invalid_group(group, group_feat):
+    # Drop features of groups no longer present, then renumber groups densely.
+    indices = np.unique(group[group != -1])
+    group = num_to_natural(group)
+    group_feat = group_feat[indices]
+    return group, group_feat
+
+def group_sem_voting(semantic_label, seg_result, instance_num=0):
+    # Majority vote: assign each segment the most frequent semantic label it covers.
+    if instance_num == 0:
+        instance_num = seg_result.max() + 1
+    seg_labels = []
+    sem_map = -1 * torch.ones_like(semantic_label)
+    for n in range(instance_num):
+        mask = (seg_result == n)
+        if mask.sum() == 0:
+            sem_map[mask] = -1
+            seg_labels.append(-1)
+            continue
+        seg_label_n_cover, seg_label_n_nums = torch.unique(semantic_label[mask], return_counts=True)
+        seg_label_n = seg_label_n_cover[seg_label_n_nums.max(-1)[1]]
+        seg_labels.append(seg_label_n)
+        sem_map[mask] = seg_label_n
+
+    return sem_map
+
+def two_image_to_gif(image_1, image_2, name):
+    # Hold image_1, linearly cross-fade to image_2, hold image_2; saved as mp4.
+    num_begin = 30
+    num_frames = 30
+    num_end = 30
+    frames = []
+    for i in range(num_begin):
+        frames.append(image_1)
+    for i in range(num_frames):
+        image_tmp = image_1 + (image_2 - image_1) * (i / (num_frames - 1))
+        frames.append(image_tmp.astype(np.uint8))
+    for i in range(num_end):
+        frames.append(image_2)
+
+    # video_out_file = '{}.gif'.format(name)
+    # imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25)
+
+    video_out_file = '{}.mp4'.format(name)
+    imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25, quality=8)
\ No newline at end of file
diff --git a/tools/web_demo.py b/tools/web_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..027b8ca4d656e3d94379c014ae505d2dc57c9225
--- /dev/null
+++ b/tools/web_demo.py
@@ -0,0 +1,76 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) Meta Platforms, Inc.
All Rights Reserved
+
+import multiprocessing as mp
+
+import numpy as np
+from PIL import Image
+
+from detectron2.config import get_cfg
+
+from detectron2.projects.deeplab import add_deeplab_config
+from detectron2.data.detection_utils import read_image
+from open_vocab_seg import add_ovseg_config
+from open_vocab_seg.utils import VisualizationDemo
+
+import gradio as gr
+
+def setup_cfg(config_file):
+    # load config from file and command-line arguments
+    cfg = get_cfg()
+    add_deeplab_config(cfg)
+    add_ovseg_config(cfg)
+    cfg.merge_from_file(config_file)
+    cfg.freeze()
+    return cfg
+
+
+def inference(class_names, input_img):
+    # Run OVSeg on one image with a user-supplied comma-separated class list.
+    mp.set_start_method("spawn", force=True)
+    config_file = './configs/ovseg_swinB_vitL_demo.yaml'
+    cfg = setup_cfg(config_file)
+
+    demo = VisualizationDemo(cfg)
+
+    class_names = class_names.split(',')
+    img = read_image(input_img, format="BGR")
+    _, visualized_output = demo.run_on_image(img, class_names)
+
+    return Image.fromarray(np.uint8(visualized_output.get_image())).convert('RGB')
+
+# demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+# demo.launch()
+
+
+examples = [['Oculus, Ukulele', './resources/demo_samples/sample_03.jpeg'],]
+output_labels = ['segmentation map']
+
+title = 'OVSeg'
+
+description = """
+Gradio Demo for Open-Vocabulary Semantic Segmentation with Mask-adapted CLIP \n
+You may click one of the examples or upload your own image. \n
+OVSeg could perform open vocabulary segmentation, you may input more classes (separate by comma).
+"""
+
+article = """
+

+ +Open-Vocabulary Semantic Segmentation with Mask-adapted CLIP + +| +Github Repo

+""" + +gr.Interface( + inference, + inputs=[ + gr.inputs.Textbox( + lines=1, placeholder=None, default='', label='class names'), + gr.inputs.Image(type='filepath') + ], + outputs=gr.outputs.Image(label='segmentation map'), + title=title, + description=description, + article=article, + examples=examples).launch(enable_queue=True) diff --git a/train_net.py b/train_net.py new file mode 100644 index 0000000000000000000000000000000000000000..8f544a17aa30b99ef64f783d5e55e6b786fe18c7 --- /dev/null +++ b/train_net.py @@ -0,0 +1,309 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved +# Modified by Feng Liang from https://github.com/MendelXu/zsseg.baseline/blob/master/train_net.py + +""" +OVSeg Training Script. + +This script is a simplified version of the training script in detectron2/tools. +""" +import copy +import itertools +import logging +import os +from collections import OrderedDict +from typing import Any, Dict, List, Set + +import detectron2.utils.comm as comm +import torch +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import MetadataCatalog +from detectron2.engine import ( + DefaultTrainer, + default_argument_parser, + default_setup, + launch, +) +from detectron2.evaluation import ( + DatasetEvaluator, + CityscapesSemSegEvaluator, + COCOEvaluator, + DatasetEvaluators, + verify_results, +) +from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler +from detectron2.solver.build import maybe_add_gradient_clipping +from detectron2.utils.logger import setup_logger +from detectron2.utils.events import CommonMetricPrinter, JSONWriter + +# OVSeg +from open_vocab_seg import SemanticSegmentorWithTTA, add_ovseg_config +from open_vocab_seg.data import ( + MaskFormerSemanticDatasetMapper, +) + +from open_vocab_seg.data import ( + build_detection_test_loader, + build_detection_train_loader, +) +from open_vocab_seg.evaluation 
import (
+    GeneralizedSemSegEvaluator,
+)
+from open_vocab_seg.utils.events import WandbWriter, setup_wandb
+from open_vocab_seg.utils.post_process_utils import dense_crf_post_process
+
+
+class Trainer(DefaultTrainer):
+    """
+    Extension of the Trainer class adapted to DETR.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each
+        builtin dataset. For your own dataset, you can simply create an
+        evaluator manually in your script and do not have to worry about the
+        hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        if evaluator_type in ["sem_seg"]:
+            evaluator = GeneralizedSemSegEvaluator
+            evaluator_list.append(
+                evaluator(
+                    dataset_name,
+                    distributed=True,
+                    output_dir=output_folder,
+                    post_process_func=dense_crf_post_process
+                    if cfg.TEST.DENSE_CRF
+                    else None,
+                )
+            )
+
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        elif len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+    @classmethod
+    def build_train_loader(cls, cfg):
+        dataset = None
+        # Semantic segmentation dataset mapper
+        if cfg.INPUT.DATASET_MAPPER_NAME == "mask_former_semantic":
+            mapper = MaskFormerSemanticDatasetMapper(cfg, True)
+        else:
+            raise NotImplementedError
+        return build_detection_train_loader(cfg, mapper=mapper, dataset=dataset)
+
+    @classmethod
+    def build_test_loader(cls, cfg, dataset_name):
+        """
+        Returns:
+            iterable
+        It now calls :func:`detectron2.data.build_detection_test_loader`.
+        Overwrite it if you'd like a different data loader.
+ """ + return build_detection_test_loader(cfg, dataset_name, mapper=None) + + def build_writers(self): + """ + Build a list of writers to be used. By default it contains + writers that write metrics to the screen, + a json file, and a tensorboard event file respectively. + If you'd like a different list of writers, you can overwrite it in + your trainer. + + Returns: + list[EventWriter]: a list of :class:`EventWriter` objects. + + It is now implemented by: + :: + return [ + CommonMetricPrinter(self.max_iter), + JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), + TensorboardXWriter(self.cfg.OUTPUT_DIR), + ] + + """ + # Here the default print/log frequency of each writer is used. + return [ + # It may not always print what you want to see, since it prints "common" metrics only. + CommonMetricPrinter(self.max_iter), + JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), + WandbWriter(), + ] + + @classmethod + def build_lr_scheduler(cls, cfg, optimizer): + """ + It now calls :func:`detectron2.solver.build_lr_scheduler`. + Overwrite it if you'd like a different scheduler. 
+ """ + return build_lr_scheduler(cfg, optimizer) + + @classmethod + def build_optimizer(cls, cfg, model): + weight_decay_norm = cfg.SOLVER.WEIGHT_DECAY_NORM + weight_decay_embed = cfg.SOLVER.WEIGHT_DECAY_EMBED + + defaults = {} + defaults["lr"] = cfg.SOLVER.BASE_LR + defaults["weight_decay"] = cfg.SOLVER.WEIGHT_DECAY + + norm_module_types = ( + torch.nn.BatchNorm1d, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.SyncBatchNorm, + # NaiveSyncBatchNorm inherits from BatchNorm2d + torch.nn.GroupNorm, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, + torch.nn.LayerNorm, + torch.nn.LocalResponseNorm, + ) + + params: List[Dict[str, Any]] = [] + memo: Set[torch.nn.parameter.Parameter] = set() + for module_name, module in model.named_modules(): + for module_param_name, value in module.named_parameters(recurse=False): + if not value.requires_grad: + continue + # Avoid duplicating parameters + if value in memo: + continue + memo.add(value) + + hyperparams = copy.copy(defaults) + if "backbone" in module_name: + hyperparams["lr"] = ( + hyperparams["lr"] * cfg.SOLVER.BACKBONE_MULTIPLIER + ) + if ( + "relative_position_bias_table" in module_param_name + or "absolute_pos_embed" in module_param_name + ): + print(module_param_name) + hyperparams["weight_decay"] = 0.0 + if isinstance(module, norm_module_types): + hyperparams["weight_decay"] = weight_decay_norm + if isinstance(module, torch.nn.Embedding): + hyperparams["weight_decay"] = weight_decay_embed + params.append({"params": [value], **hyperparams}) + + def maybe_add_full_model_gradient_clipping(optim): + # detectron2 doesn't have full model gradient clipping now + clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE + enable = ( + cfg.SOLVER.CLIP_GRADIENTS.ENABLED + and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" + and clip_norm_val > 0.0 + ) + + class FullModelGradientClippingOptimizer(optim): + def step(self, closure=None): + all_params = itertools.chain( + *[x["params"] 
for x in self.param_groups]
+                    )
+                    torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
+                    super().step(closure=closure)
+
+            return FullModelGradientClippingOptimizer if enable else optim
+
+        optimizer_type = cfg.SOLVER.OPTIMIZER
+        if optimizer_type == "SGD":
+            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
+                params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
+            )
+        elif optimizer_type == "ADAMW":
+            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
+                params, cfg.SOLVER.BASE_LR
+            )
+        else:
+            raise NotImplementedError(f"no optimizer type {optimizer_type}")
+        if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
+            optimizer = maybe_add_gradient_clipping(cfg, optimizer)
+        return optimizer
+
+    @classmethod
+    def test_with_TTA(cls, cfg, model):
+        logger = logging.getLogger("detectron2.trainer")
+        # In the end of training, run an evaluation with TTA.
+        logger.info("Running inference with test-time augmentation ...")
+        model = SemanticSegmentorWithTTA(cfg, model)
+        evaluators = [
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+        return res
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+ """ + cfg = get_cfg() + # for poly lr schedule + add_deeplab_config(cfg) + add_ovseg_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + # Setup logger for "ovseg" module + if not args.eval_only: + setup_wandb(cfg, args) + setup_logger( + output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="ovseg" + ) + return cfg + + +def main(args): + cfg = setup(args) + + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + + if cfg.TEST.AUG.ENABLED: + res = Trainer.test_with_TTA(cfg, model) + else: + res = Trainer.test(cfg, model) + if comm.is_main_process(): + verify_results(cfg, res) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + )