diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..fc4c07276075879524bda37a4ca3e76c5aea9529 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +*.bmp filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b10d85b2e054b8ffb4d9dbe32cbd4ac6e71c7b35 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +*.zip +*.pyc \ No newline at end of file 
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..83f431e8feeb7e80d571f39c9f6c1b96857b5f85 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,80 @@ +# Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to make participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or +advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic +address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a +professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies within all project spaces, and it also applies when +an individual is representing the project or its community in public spaces. +Examples of representing a project or community include using an official +project e-mail address, posting via an official social media account, or acting +as an appointed representative at an online or offline event. Representation of +a project may be further defined and clarified by project maintainers. + +This Code of Conduct also applies outside the project spaces when there is a +reasonable belief that an individual's behavior may have a negative impact on +the project or its community. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at <opensource-conduct@fb.com>. All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. 
+ +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..7141f8d55f5d491525cf73b4958ff560f65e7a1a --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,32 @@ +# Contributing to OVSeg +We want to make contributing to this project as easy and transparent as +possible. + +## Pull Requests +We actively welcome your pull requests. + +1. Fork the repo and create your branch from `main`. +2. If you've added code that should be tested, add tests. +3. If you've changed APIs, update the documentation. +4. Ensure the test suite passes. +5. Make sure your code lints. +6. If you haven't already, complete the Contributor License Agreement ("CLA"). + +## Contributor License Agreement ("CLA") +In order to accept your pull request, we need you to submit a CLA. You only need +to do this once to work on any of Meta's open source projects. + +Complete your CLA here: <https://code.facebook.com/cla> + +## Issues +We use GitHub issues to track public bugs. Please ensure your description is +clear and has sufficient instructions to be able to reproduce the issue. + +Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe +disclosure of security bugs. In those cases, please go through the process +outlined on that page and do not file a public issue. + + +## License +By contributing to OVSeg, you agree that your contributions will be licensed +under the LICENSE file in the root directory of this source tree. 
diff --git a/GETTING_STARTED.md b/GETTING_STARTED.md new file mode 100644 index 0000000000000000000000000000000000000000..847ddda04c47cf234c3593da2504184011e165fc --- /dev/null +++ b/GETTING_STARTED.md @@ -0,0 +1,99 @@ +## Getting started with OVSeg + + +### Try demo + +We release our largest model (Swin-Base + CLIP-ViT-L/14) [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) (md5: 526080). + +- Test on sample image + ```bash + python demo.py --config-file configs/ovseg_swinB_vitL_demo.yaml --class-names 'Oculus' 'Ukulele' --input ./resources/demo_samples/sample_03.jpeg --output ./pred --opts MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth + ``` + +### Evaluation with pre-trained weights + +We release our largest model (Swin-Base + CLIP-ViT-L/14) [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) (md5: 526080). + +- Test on ADE20K-150 and ADE-847 + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\) + ``` + +- Test on PascalContext-59 and PascalContext-459 + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT 0.6 DATASETS.TEST \(\"pascal_context_59_sem_seg_val\",\"pascal_context_459_sem_seg_val\",\) + ``` + +- Test on PascalVOC-20 + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT 0.45 DATASETS.TEST \(\"pascalvoc20_sem_seg_val\",\) + ``` + +#### Performance benchmark + +| method | backbone | training dataset | A-847 | PC-459 | A-150 | PC-59 | 
PAS-20 | +|------------------------------------|----------|------------------|:-----:|:------:|:-----:|:-----:|:------:| +| Open-vocabulary generalist models. | | | | | | | | +| SPNet | R-101 | PASCAL-15 | - | - | - | 24.3 | 18.3 | +| ZS3Net | R-101 | PASCAL-15 | - | - | - | 19.4 | 38.3 | +| LSeg | R-101 | PASCAL-15 | - | - | - | - | 47.4 | +| LSeg+ | R-101 | COCO Panoptic | 2.5 | 5.2 | 13.0 | 36.0 | 59.0 | +| SimBaseline | R-101c | COCO-Stuff-156 | - | - | 15.3 | - | 74.5 | +| ZegFormer | R-50 | COCO-Stuff-156 | - | - | 16.4 | - | 80.7 | +| OpenSeg | R-101 | COCO Panoptic | 4.0 | 6.5 | 15.3 | 36.9 | 60.0 | +| OVSeg (Ours) | R-101c | COCO-Stuff-171 | 7.1 | 11.0 | 24.8 | 53.3 | 92.6 | +| LSeg+ | Eff-B7 | COCO Panoptic | 3.8 | 7.8 | 18.0 | 46.5 | - | +| OpenSeg | Eff-B7 | COCO Panoptic | 6.3 | 9.0 | 21.1 | 42.1 | - | +| OVSeg (Ours) | Swin-B | COCO-Stuff-171 | 9.0 | 12.4 | 29.6 | 55.7 | 94.5 | +| Supervised specialist models. | | | | | | | | +| FCN | FCN-8s | Same as test | - | - | 29.4 | 37.8 | - | +| Deeplab | R-101 | Same as test | - | - | - | 45.7 | 77.7 | +| SelfTrain | Eff-L2 | Same as test | - | - | - | - | 90.0 | + +#### Ablation study + +- Mask prompt tuning can bring significant improvement without changing CLIP weights (Table 3 in [paper](https://arxiv.org/pdf/2210.04150.pdf)) + +Download the checkpoint with mpt only [ovseg_swinbase_vitL14_mpt_only.pt](https://drive.google.com/file/d/1LJGWFjHw76OGDNy9r9KQIaACfIm9KMhQ/view?usp=sharing) (md5: 2dd495). 
+ + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_mpt_only.pt DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\) + ``` + +- Mask prompt tuning can improve over fully finetuned model (Table 3 in [paper](https://arxiv.org/pdf/2210.04150.pdf)) + +With the same [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) checkpoint, set `MASK_PROMPT_FWD` as `False` + + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD False MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\) + ``` + +- The effects of class prediction ensemble (Table 6 in [paper](https://arxiv.org/pdf/2210.04150.pdf)) + +With the same [ovseg_swinbase_vitL14_ft_mpt.pth](https://drive.google.com/file/d/1cn-ohxgXDrDfkzC1QdO-fi8IjbjXmgKy/view?usp=sharing) checkpoint, set `CLIP_ENSEMBLE` as `False`. 
+ + ```bash + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE False MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\",\"ade20k_full_sem_seg_val\"\) + ``` + +### Training Segmentation model + + Our model is trained on COCO-Stuff + +- Training baseline w/ original CLIP + ``` + python train_net.py --num-gpu 8 --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD False + ``` + +To reproduce our final results, you may want to use our mask-adapted CLIP + +- Training ovseg w/ mask-adapted CLIP + ``` + python train_net.py --num-gpu 8 --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME #PATH_TO_MASKADAPTED_CLIP + ``` + +CAUTION: The final results are sensitive to the ensemble (appendix A.5 in [paper](https://arxiv.org/pdf/2210.04150.pdf)). Thus, you may want to use the ```tools/search_thr_ensemble_w.sh``` to find the best ensemble hyper-parameters. + +### Fine-tuning CLIP with collected mask-category pairs + +We are still working on this part, stay tuned! \ No newline at end of file diff --git a/INSTALL.md b/INSTALL.md new file mode 100644 index 0000000000000000000000000000000000000000..59ee72f5a078de9cf7a4d66aea7e6099b7345f02 --- /dev/null +++ b/INSTALL.md @@ -0,0 +1,50 @@ +## Installation + +### Requirements +- Linux with Python ≥ 3.8 +- PyTorch ≥ 1.8 and [torchvision](https://github.com/pytorch/vision/) that matches the PyTorch installation. + Install them together at [pytorch.org](https://pytorch.org) to make sure of this. Note: please check that the PyTorch version matches the one required by Detectron2. +- PyTorch3d: follow [Pytorch3d installation instructions](https://github.com/facebookresearch/pytorch3d/blob/main/INSTALL.md). +- Detectron2: follow [Detectron2 installation instructions](https://detectron2.readthedocs.io/tutorials/install.html). 
+- Segment Anything Model: follow [SAM](https://github.com/facebookresearch/segment-anything). + +### Usage + +Install required packages. + +```bash +conda create --name ovseg python=3.8 +conda activate ovseg +conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge +conda install -c fvcore -c iopath -c conda-forge fvcore iopath +conda install pytorch3d -c pytorch3d +pip install -r requirements.txt +``` + +You need to download `detectron2==0.6` following [instructions](https://detectron2.readthedocs.io/en/latest/tutorials/install.html) + +```bash +python -m pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html +``` + +If you cannot successfully install `pycocotools`, try this from [here](https://github.com/cocodataset/cocoapi/issues/351): +```bash +conda install -c conda-forge pycocotools +``` + +Install the SAM with: +```bash +pip install git+https://github.com/facebookresearch/segment-anything.git +``` +To fully support the SAM, install these packages: +```bash +pip install opencv-python pycocotools matplotlib onnxruntime onnx +``` + +Furthermore, install the modified clip package. + +```bash +cd third_party/CLIP +python -m pip install -Ue . +``` \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..50f2e656c8e006d68fce3c9ddd02d9069072214a --- /dev/null +++ b/LICENSE @@ -0,0 +1,399 @@ +Attribution-NonCommercial 4.0 International + +======================================================================= + +Creative Commons Corporation ("Creative Commons") is not a law firm and +does not provide legal services or legal advice. Distribution of +Creative Commons public licenses does not create a lawyer-client or +other relationship. Creative Commons makes its licenses and related +information available on an "as-is" basis. 
Creative Commons gives no +warranties regarding its licenses, any material licensed under their +terms and conditions, or any related information. Creative Commons +disclaims all liability for damages resulting from their use to the +fullest extent possible. + +Using Creative Commons Public Licenses + +Creative Commons public licenses provide a standard set of terms and +conditions that creators and other rights holders may use to share +original works of authorship and other material subject to copyright +and certain other rights specified in the public license below. The +following considerations are for informational purposes only, are not +exhaustive, and do not form part of our licenses. + + Considerations for licensors: Our public licenses are + intended for use by those authorized to give the public + permission to use material in ways otherwise restricted by + copyright and certain other rights. Our licenses are + irrevocable. Licensors should read and understand the terms + and conditions of the license they choose before applying it. + Licensors should also secure all rights necessary before + applying our licenses so that the public can reuse the + material as expected. Licensors should clearly mark any + material not subject to the license. This includes other CC- + licensed material, or material used under an exception or + limitation to copyright. More considerations for licensors: + wiki.creativecommons.org/Considerations_for_licensors + + Considerations for the public: By using one of our public + licenses, a licensor grants the public permission to use the + licensed material under specified terms and conditions. If + the licensor's permission is not necessary for any reason--for + example, because of any applicable exception or limitation to + copyright--then that use is not regulated by the license. Our + licenses grant only permissions under copyright and certain + other rights that a licensor has authority to grant. 
Use of + the licensed material may still be restricted for other + reasons, including because others have copyright or other + rights in the material. A licensor may make special requests, + such as asking that all changes be marked or described. + Although not required by our licenses, you are encouraged to + respect those requests where reasonable. More_considerations + for the public: + wiki.creativecommons.org/Considerations_for_licensees + +======================================================================= + +Creative Commons Attribution-NonCommercial 4.0 International Public +License + +By exercising the Licensed Rights (defined below), You accept and agree +to be bound by the terms and conditions of this Creative Commons +Attribution-NonCommercial 4.0 International Public License ("Public +License"). To the extent this Public License may be interpreted as a +contract, You are granted the Licensed Rights in consideration of Your +acceptance of these terms and conditions, and the Licensor grants You +such rights in consideration of benefits the Licensor receives from +making the Licensed Material available under these terms and +conditions. + +Section 1 -- Definitions. + + a. Adapted Material means material subject to Copyright and Similar + Rights that is derived from or based upon the Licensed Material + and in which the Licensed Material is translated, altered, + arranged, transformed, or otherwise modified in a manner requiring + permission under the Copyright and Similar Rights held by the + Licensor. For purposes of this Public License, where the Licensed + Material is a musical work, performance, or sound recording, + Adapted Material is always produced where the Licensed Material is + synched in timed relation with a moving image. + + b. Adapter's License means the license You apply to Your Copyright + and Similar Rights in Your contributions to Adapted Material in + accordance with the terms and conditions of this Public License. + + c. 
Copyright and Similar Rights means copyright and/or similar rights + closely related to copyright including, without limitation, + performance, broadcast, sound recording, and Sui Generis Database + Rights, without regard to how the rights are labeled or + categorized. For purposes of this Public License, the rights + specified in Section 2(b)(1)-(2) are not Copyright and Similar + Rights. + d. Effective Technological Measures means those measures that, in the + absence of proper authority, may not be circumvented under laws + fulfilling obligations under Article 11 of the WIPO Copyright + Treaty adopted on December 20, 1996, and/or similar international + agreements. + + e. Exceptions and Limitations means fair use, fair dealing, and/or + any other exception or limitation to Copyright and Similar Rights + that applies to Your use of the Licensed Material. + + f. Licensed Material means the artistic or literary work, database, + or other material to which the Licensor applied this Public + License. + + g. Licensed Rights means the rights granted to You subject to the + terms and conditions of this Public License, which are limited to + all Copyright and Similar Rights that apply to Your use of the + Licensed Material and that the Licensor has authority to license. + + h. Licensor means the individual(s) or entity(ies) granting rights + under this Public License. + + i. NonCommercial means not primarily intended for or directed towards + commercial advantage or monetary compensation. For purposes of + this Public License, the exchange of the Licensed Material for + other material subject to Copyright and Similar Rights by digital + file-sharing or similar means is NonCommercial provided there is + no payment of monetary compensation in connection with the + exchange. + + j. 
Share means to provide material to the public by any means or + process that requires permission under the Licensed Rights, such + as reproduction, public display, public performance, distribution, + dissemination, communication, or importation, and to make material + available to the public including in ways that members of the + public may access the material from a place and at a time + individually chosen by them. + + k. Sui Generis Database Rights means rights other than copyright + resulting from Directive 96/9/EC of the European Parliament and of + the Council of 11 March 1996 on the legal protection of databases, + as amended and/or succeeded, as well as other essentially + equivalent rights anywhere in the world. + + l. You means the individual or entity exercising the Licensed Rights + under this Public License. Your has a corresponding meaning. + +Section 2 -- Scope. + + a. License grant. + + 1. Subject to the terms and conditions of this Public License, + the Licensor hereby grants You a worldwide, royalty-free, + non-sublicensable, non-exclusive, irrevocable license to + exercise the Licensed Rights in the Licensed Material to: + + a. reproduce and Share the Licensed Material, in whole or + in part, for NonCommercial purposes only; and + + b. produce, reproduce, and Share Adapted Material for + NonCommercial purposes only. + + 2. Exceptions and Limitations. For the avoidance of doubt, where + Exceptions and Limitations apply to Your use, this Public + License does not apply, and You do not need to comply with + its terms and conditions. + + 3. Term. The term of this Public License is specified in Section + 6(a). + + 4. Media and formats; technical modifications allowed. The + Licensor authorizes You to exercise the Licensed Rights in + all media and formats whether now known or hereafter created, + and to make technical modifications necessary to do so. 
The + Licensor waives and/or agrees not to assert any right or + authority to forbid You from making technical modifications + necessary to exercise the Licensed Rights, including + technical modifications necessary to circumvent Effective + Technological Measures. For purposes of this Public License, + simply making modifications authorized by this Section 2(a) + (4) never produces Adapted Material. + + 5. Downstream recipients. + + a. Offer from the Licensor -- Licensed Material. Every + recipient of the Licensed Material automatically + receives an offer from the Licensor to exercise the + Licensed Rights under the terms and conditions of this + Public License. + + b. No downstream restrictions. You may not offer or impose + any additional or different terms or conditions on, or + apply any Effective Technological Measures to, the + Licensed Material if doing so restricts exercise of the + Licensed Rights by any recipient of the Licensed + Material. + + 6. No endorsement. Nothing in this Public License constitutes or + may be construed as permission to assert or imply that You + are, or that Your use of the Licensed Material is, connected + with, or sponsored, endorsed, or granted official status by, + the Licensor or others designated to receive attribution as + provided in Section 3(a)(1)(A)(i). + + b. Other rights. + + 1. Moral rights, such as the right of integrity, are not + licensed under this Public License, nor are publicity, + privacy, and/or other similar personality rights; however, to + the extent possible, the Licensor waives and/or agrees not to + assert any such rights held by the Licensor to the limited + extent necessary to allow You to exercise the Licensed + Rights, but not otherwise. + + 2. Patent and trademark rights are not licensed under this + Public License. + + 3. 
To the extent possible, the Licensor waives any right to + collect royalties from You for the exercise of the Licensed + Rights, whether directly or through a collecting society + under any voluntary or waivable statutory or compulsory + licensing scheme. In all other cases the Licensor expressly + reserves any right to collect such royalties, including when + the Licensed Material is used other than for NonCommercial + purposes. + +Section 3 -- License Conditions. + +Your exercise of the Licensed Rights is expressly made subject to the +following conditions. + + a. Attribution. + + 1. If You Share the Licensed Material (including in modified + form), You must: + + a. retain the following if it is supplied by the Licensor + with the Licensed Material: + + i. identification of the creator(s) of the Licensed + Material and any others designated to receive + attribution, in any reasonable manner requested by + the Licensor (including by pseudonym if + designated); + + ii. a copyright notice; + + iii. a notice that refers to this Public License; + + iv. a notice that refers to the disclaimer of + warranties; + + v. a URI or hyperlink to the Licensed Material to the + extent reasonably practicable; + + b. indicate if You modified the Licensed Material and + retain an indication of any previous modifications; and + + c. indicate the Licensed Material is licensed under this + Public License, and include the text of, or the URI or + hyperlink to, this Public License. + + 2. You may satisfy the conditions in Section 3(a)(1) in any + reasonable manner based on the medium, means, and context in + which You Share the Licensed Material. For example, it may be + reasonable to satisfy the conditions by providing a URI or + hyperlink to a resource that includes the required + information. + + 3. If requested by the Licensor, You must remove any of the + information required by Section 3(a)(1)(A) to the extent + reasonably practicable. + + 4. 
If You Share Adapted Material You produce, the Adapter's + License You apply must not prevent recipients of the Adapted + Material from complying with this Public License. + +Section 4 -- Sui Generis Database Rights. + +Where the Licensed Rights include Sui Generis Database Rights that +apply to Your use of the Licensed Material: + + a. for the avoidance of doubt, Section 2(a)(1) grants You the right + to extract, reuse, reproduce, and Share all or a substantial + portion of the contents of the database for NonCommercial purposes + only; + + b. if You include all or a substantial portion of the database + contents in a database in which You have Sui Generis Database + Rights, then the database in which You have Sui Generis Database + Rights (but not its individual contents) is Adapted Material; and + + c. You must comply with the conditions in Section 3(a) if You Share + all or a substantial portion of the contents of the database. + +For the avoidance of doubt, this Section 4 supplements and does not +replace Your obligations under this Public License where the Licensed +Rights include other Copyright and Similar Rights. + +Section 5 -- Disclaimer of Warranties and Limitation of Liability. + + a. UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE + EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS + AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF + ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, + IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, + WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR + PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, + ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT + KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT + ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. + + b. 
TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE + TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, + NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, + INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, + COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR + USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN + ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR + DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR + IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. + + c. The disclaimer of warranties and limitation of liability provided + above shall be interpreted in a manner that, to the extent + possible, most closely approximates an absolute disclaimer and + waiver of all liability. + +Section 6 -- Term and Termination. + + a. This Public License applies for the term of the Copyright and + Similar Rights licensed here. However, if You fail to comply with + this Public License, then Your rights under this Public License + terminate automatically. + + b. Where Your right to use the Licensed Material has terminated under + Section 6(a), it reinstates: + + 1. automatically as of the date the violation is cured, provided + it is cured within 30 days of Your discovery of the + violation; or + + 2. upon express reinstatement by the Licensor. + + For the avoidance of doubt, this Section 6(b) does not affect any + right the Licensor may have to seek remedies for Your violations + of this Public License. + + c. For the avoidance of doubt, the Licensor may also offer the + Licensed Material under separate terms or conditions or stop + distributing the Licensed Material at any time; however, doing so + will not terminate this Public License. + + d. Sections 1, 5, 6, 7, and 8 survive termination of this Public + License. + +Section 7 -- Other Terms and Conditions. + + a. 
The Licensor shall not be bound by any additional or different + terms or conditions communicated by You unless expressly agreed. + + b. Any arrangements, understandings, or agreements regarding the + Licensed Material not stated herein are separate from and + independent of the terms and conditions of this Public License. + +Section 8 -- Interpretation. + + a. For the avoidance of doubt, this Public License does not, and + shall not be interpreted to, reduce, limit, restrict, or impose + conditions on any use of the Licensed Material that could lawfully + be made without permission under this Public License. + + b. To the extent possible, if any provision of this Public License is + deemed unenforceable, it shall be automatically reformed to the + minimum extent necessary to make it enforceable. If the provision + cannot be reformed, it shall be severed from this Public License + without affecting the enforceability of the remaining terms and + conditions. + + c. No term or condition of this Public License will be waived and no + failure to comply consented to unless expressly agreed to by the + Licensor. + + d. Nothing in this Public License constitutes or may be interpreted + as a limitation upon, or waiver of, any privileges and immunities + that apply to the Licensor or You, including from the legal + processes of any jurisdiction or authority. + +======================================================================= + +Creative Commons is not a party to its public +licenses. Notwithstanding, Creative Commons may elect to apply one of +its public licenses to material it publishes and in those instances +will be considered the “Licensor.” The text of the Creative Commons +public licenses is dedicated to the public domain under the CC0 Public +Domain Dedication. 
Except for the limited purpose of indicating that +material is shared under a Creative Commons public license or as +otherwise permitted by the Creative Commons policies published at +creativecommons.org/policies, Creative Commons does not authorize the +use of the trademark "Creative Commons" or any other trademark or logo +of Creative Commons without its prior written consent including, +without limitation, in connection with any unauthorized modifications +to any of its public licenses or any other arrangements, +understandings, or agreements concerning use of licensed material. For +the avoidance of doubt, this paragraph does not form part of the +public licenses. + +Creative Commons may be contacted at creativecommons.org. \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8f2b6472fa2e15646df3bc9821f8bde16f788fe6 --- /dev/null +++ b/README.md @@ -0,0 +1,13 @@ +--- +title: Semantic Segment AnyRGBD +emoji: ⚡ +colorFrom: yellow +colorTo: green +sdk: gradio +sdk_version: 3.27.0 +app_file: app.py +pinned: false +license: mit +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/UI/sailvos3d/ex1/inputs/depth_000160.npy b/UI/sailvos3d/ex1/inputs/depth_000160.npy new file mode 100644 index 0000000000000000000000000000000000000000..8ab775b91010849a4a98e2d0e4a595cf1cec76df --- /dev/null +++ b/UI/sailvos3d/ex1/inputs/depth_000160.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96d4969b8b33250785d1996b1536bb9026536f420391c68255e326990138598e +size 4096128 diff --git a/UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz b/UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz new file mode 100644 index 0000000000000000000000000000000000000000..08b4ae3bb99971542db3661cf049c0d993d48710 --- /dev/null +++ b/UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 
+oid sha256:5afc2fdf1faa9b7b5d7808bb703c82aa5ccbb3154e2f62b3cc4989a2dcc92fe5 +size 1234 diff --git a/UI/sailvos3d/ex1/inputs/rgb_000160.bmp b/UI/sailvos3d/ex1/inputs/rgb_000160.bmp new file mode 100644 index 0000000000000000000000000000000000000000..03a72ef7da9ae9186efa66119cb287bfef4185e5 --- /dev/null +++ b/UI/sailvos3d/ex1/inputs/rgb_000160.bmp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c461e0c0cf6049bd9984ccaedb8b8fb07a1df06462931d38fdcd952bb38805c +size 3072054 diff --git a/UI/sailvos3d/ex2/inputs/depth_000540.npy b/UI/sailvos3d/ex2/inputs/depth_000540.npy new file mode 100644 index 0000000000000000000000000000000000000000..f952ea79b409c521ead8a8fea61f64a271079c9d --- /dev/null +++ b/UI/sailvos3d/ex2/inputs/depth_000540.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24fcabda3f7fd17856c4105279f2842b631cc18579d273b87dd8f2cb39e7df6 +size 4096128 diff --git a/UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz b/UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz new file mode 100644 index 0000000000000000000000000000000000000000..ed8f4184e0f67faaa1f448e8046d2f7f280b4244 --- /dev/null +++ b/UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb66f05ce4cdb6d6410bd3e34b70eeb07724810e70786249c30de0f50404fd64 +size 1234 diff --git a/UI/sailvos3d/ex2/inputs/rgb_000540.bmp b/UI/sailvos3d/ex2/inputs/rgb_000540.bmp new file mode 100644 index 0000000000000000000000000000000000000000..115b5ca1897d7d3d0b80fc411b0834fe106ee4bb --- /dev/null +++ b/UI/sailvos3d/ex2/inputs/rgb_000540.bmp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aa08869030d51751983bdab733f4f26342dc239abedb3195d3f4771d93701cf +size 3072054 diff --git a/UI/scannetv2/examples/scene0000_00/color/1660.jpg b/UI/scannetv2/examples/scene0000_00/color/1660.jpg new file mode 100644 index 0000000000000000000000000000000000000000..dbbd168d04d7a13533a81b2d051ff8888cb8400a 
Binary files /dev/null and b/UI/scannetv2/examples/scene0000_00/color/1660.jpg differ diff --git a/UI/scannetv2/examples/scene0000_00/color/5560.jpg b/UI/scannetv2/examples/scene0000_00/color/5560.jpg new file mode 100644 index 0000000000000000000000000000000000000000..cf72818630d852e9dba470b2fe2f8a517530a549 Binary files /dev/null and b/UI/scannetv2/examples/scene0000_00/color/5560.jpg differ diff --git a/UI/scannetv2/examples/scene0000_00/depth/1660.png b/UI/scannetv2/examples/scene0000_00/depth/1660.png new file mode 100644 index 0000000000000000000000000000000000000000..701312992acf73df575741d6ac237ab6d0d531db Binary files /dev/null and b/UI/scannetv2/examples/scene0000_00/depth/1660.png differ diff --git a/UI/scannetv2/examples/scene0000_00/depth/5560.png b/UI/scannetv2/examples/scene0000_00/depth/5560.png new file mode 100644 index 0000000000000000000000000000000000000000..cfc72a3d8d0f707be0f5b57a52044e767f4ba4a9 Binary files /dev/null and b/UI/scannetv2/examples/scene0000_00/depth/5560.png differ diff --git a/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_color.txt b/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_color.txt new file mode 100644 index 0000000000000000000000000000000000000000..50a318656a6eb9597e65cd0811bbd671e26b9c24 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_color.txt @@ -0,0 +1,4 @@ +1.000000 0.000000 0.000000 0.000000 +0.000000 1.000000 0.000000 0.000000 +0.000000 0.000000 1.000000 0.000000 +0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_depth.txt b/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_depth.txt new file mode 100644 index 0000000000000000000000000000000000000000..50a318656a6eb9597e65cd0811bbd671e26b9c24 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/intrinsics/extrinsic_depth.txt @@ -0,0 +1,4 @@ +1.000000 0.000000 0.000000 0.000000 +0.000000 1.000000 0.000000 0.000000 +0.000000 0.000000 1.000000 0.000000 
+0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_color.txt b/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_color.txt new file mode 100644 index 0000000000000000000000000000000000000000..b59f1c74414f34ea206c923e54863debd7b135c1 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_color.txt @@ -0,0 +1,4 @@ +1169.621094 0.000000 646.295044 0.000000 +0.000000 1167.105103 489.927032 0.000000 +0.000000 0.000000 1.000000 0.000000 +0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_depth.txt b/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_depth.txt new file mode 100644 index 0000000000000000000000000000000000000000..757719fe85cf22cb752a9f10f49ac100428e400c --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/intrinsics/intrinsic_depth.txt @@ -0,0 +1,4 @@ +577.590698 0.000000 318.905426 0.000000 +0.000000 578.729797 242.683609 0.000000 +0.000000 0.000000 1.000000 0.000000 +0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/pose/1660.txt b/UI/scannetv2/examples/scene0000_00/pose/1660.txt new file mode 100644 index 0000000000000000000000000000000000000000..df62592207edaee68d2a93452d80de21e535f347 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/pose/1660.txt @@ -0,0 +1,4 @@ +0.470083 -0.286393 0.834866 4.877258 +-0.882320 -0.127731 0.452986 4.841086 +-0.023094 -0.949560 -0.312735 1.390592 +0.000000 0.000000 0.000000 1.000000 diff --git a/UI/scannetv2/examples/scene0000_00/pose/5560.txt b/UI/scannetv2/examples/scene0000_00/pose/5560.txt new file mode 100644 index 0000000000000000000000000000000000000000..02ba667129dff485eebbc333b3ea5aae7030a410 --- /dev/null +++ b/UI/scannetv2/examples/scene0000_00/pose/5560.txt @@ -0,0 +1,4 @@ +-0.994579 -0.050921 0.090665 2.842624 +-0.101826 0.300126 -0.948449 3.131151 +0.021085 -0.952539 -0.303684 1.467106 +0.000000 0.000000 0.000000 1.000000 diff 
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

"""Gradio demo for zero-shot semantic segmentation of RGBD inputs.

Two dataset tabs are exposed (SAIL-VOS 3D and ScanNet v2).  Each tab takes an
RGB image, a depth map, and -- for SAIL-VOS -- camera projection matrices,
runs open-vocabulary segmentation plus SAM, and returns 2D overlay videos, a
rendered depth map, and 3D point-cloud fly-through videos.  All intermediate
artifacts are written under ``outputs/``.
"""

import os

# Pin torch to the CUDA 11.3 wheels expected by the detectron2 / pytorch3d
# source builds installed below.
os.system(
    "pip install torch==1.10.1+cu113 torchvision==0.11.2+cu113 "
    "torchaudio==0.10.1+cu113 -f https://download.pytorch.org/whl/torch_stable.html"
)

try:
    import detectron2  # noqa: F401  (probe only: is the stack installed?)
except ImportError:
    # Fresh environment (e.g. a newly created HF Space): install the source
    # dependencies once.  Subsequent imports below will then succeed.
    os.system("pip install git+https://github.com/Jun-CEN/CLIP.git")
    os.system("pip install git+https://github.com/facebookresearch/detectron2.git")
    os.system("pip install git+https://github.com/facebookresearch/pytorch3d.git")
    os.system("pip install git+https://github.com/facebookresearch/segment-anything.git")

import argparse
import glob
import multiprocessing as mp
import time

import cv2
import numpy as np
import tqdm
import gradio as gr

from tools.util import *  # provides two_image_to_gif, among others

from detectron2.config import get_cfg
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.data.detection_utils import read_image
from detectron2.utils.logger import setup_logger
from open_vocab_seg import add_ovseg_config
from open_vocab_seg.utils import VisualizationDemo, VisualizationDemoIndoor

# constants
WINDOW_NAME = "Open vocabulary segmentation"


def setup_cfg(args):
    """Build a frozen detectron2 config from ``args.config_file`` + ``args.opts``."""
    cfg = get_cfg()
    # for poly lr schedule
    add_deeplab_config(cfg)
    add_ovseg_config(cfg)
    cfg.merge_from_file(args.config_file)
    cfg.merge_from_list(args.opts)
    cfg.freeze()
    return cfg


def get_parser():
    """Return the CLI argument parser shared by both demo pipelines."""
    parser = argparse.ArgumentParser(
        description="Detectron2 demo for open vocabulary segmentation"
    )
    parser.add_argument(
        "--config-file",
        default="configs/ovseg_swinB_vitL_demo.yaml",
        metavar="FILE",
        help="path to config file",
    )
    parser.add_argument(
        "--input",
        default=["/mnt/lustre/jkyang/PSG4D/sailvos3d/downloads/sailvos3d/trevor_1_int/images/000160.bmp"],
        nargs="+",
        help="A list of space separated input images; "
        "or a single glob pattern such as 'directory/*.jpg'",
    )
    parser.add_argument(
        "--class-names",
        default=["person", "car", "motorcycle", "truck", "bird", "dog", "handbag", "suitcase", "bottle", "cup", "bowl", "chair", "potted plant", "bed", "dining table", "tv", "laptop", "cell phone", "bag", "bin", "box", "door", "road barrier", "stick", "lamp", "floor", "wall"],
        nargs="+",
        help="A list of user-defined class_names",
    )
    parser.add_argument(
        "--output",
        default="./pred",
        help="A file or directory to save output visualizations. "
        "If not given, will show output in an OpenCV window.",
    )
    parser.add_argument(
        "--opts",
        help="Modify config options using the command-line 'KEY VALUE' pairs",
        default=["MODEL.WEIGHTS", "ovseg_swinbase_vitL14_ft_mpt.pth"],
        nargs=argparse.REMAINDER,
    )
    return parser


# Parsed once at import; the greet_* callbacks mutate .input / .class_names
# per request (single-worker Gradio app, so this shared state is acceptable).
args = get_parser().parse_args()


def greet_sailvos3d(rgb_input, depth_map_input, rage_matrices_input, class_candidates):
    """Run the SAIL-VOS 3D pipeline on one RGB/depth/projection triple.

    Args:
        rgb_input: filepath to the RGB image (from a gr.Image filepath input).
        depth_map_input: uploaded depth .npy file (gr.File object with .name).
        rage_matrices_input: uploaded .npz projection matrices (gr.File).
        class_candidates: comma-separated class names, e.g. "person, car".

    Returns:
        Tuple of five output paths/images wired to the Gradio result panel:
        (RGB 2D video, RGB 3D video, rendered depth map, depth 2D video,
        depth 3D video).
    """
    args.input = [rgb_input]
    args.class_names = class_candidates.split(', ')
    depth_map_path = depth_map_input.name
    rage_matrices_path = rage_matrices_input.name
    print(args.input, args.class_names, depth_map_path, rage_matrices_path)
    mp.set_start_method("spawn", force=True)
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemo(cfg)
    class_names = args.class_names
    if not args.input:
        raise NotImplementedError
    if len(args.input) == 1:
        # Expand a single glob pattern (or ~) into concrete paths.
        args.input = glob.glob(os.path.expanduser(args.input[0]))
        assert args.input, "The input path(s) was not found"
    os.makedirs('outputs', exist_ok=True)
    for path in tqdm.tqdm(args.input, disable=not args.output):
        start_time = time.time()
        (predictions, visualized_output_rgb, visualized_output_depth,
         visualized_output_rgb_sam, visualized_output_depth_sam) = demo.run_on_image_sam(
            path, class_names, depth_map_path, rage_matrices_path)
        logger.info(
            "{}: {} in {:.2f}s".format(
                path,
                "detected {} instances".format(len(predictions["instances"]))
                if "instances" in predictions
                else "finished",
                time.time() - start_time,
            )
        )

        if args.output:
            # Results are always written to fixed paths under outputs/;
            # the videos below are assembled from them.
            visualized_output_rgb.save('outputs/RGB_Semantic_SAM.png')
            visualized_output_depth.save('outputs/Depth_Semantic_SAM.png')
            visualized_output_rgb_sam.save('outputs/RGB_Semantic_SAM_Mask.png')
            visualized_output_depth_sam.save('outputs/Depth_Semantic_SAM_Mask.png')
            # Back-project each 2D visualization into a colored point cloud.
            rgb_3d_sam = demo.get_xyzrgb('outputs/RGB_Semantic_SAM.png', depth_map_path, rage_matrices_path)
            depth_3d_sam = demo.get_xyzrgb('outputs/Depth_Semantic_SAM.png', depth_map_path, rage_matrices_path)
            rgb_3d_sam_mask = demo.get_xyzrgb('outputs/RGB_Semantic_SAM_Mask.png', depth_map_path, rage_matrices_path)
            depth_3d_sam_mask = demo.get_xyzrgb('outputs/Depth_Semantic_SAM_Mask.png', depth_map_path, rage_matrices_path)
            np.savez('outputs/xyzrgb.npz',
                     rgb_3d_sam=rgb_3d_sam,
                     depth_3d_sam=depth_3d_sam,
                     rgb_3d_sam_mask=rgb_3d_sam_mask,
                     depth_3d_sam_mask=depth_3d_sam_mask)
            demo.render_3d_video('outputs/xyzrgb.npz', depth_map_path)
        else:
            # Local (non-Gradio) debugging path: show the RGB overlay.
            cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
            cv2.imshow(WINDOW_NAME, visualized_output_rgb.get_image()[:, :, ::-1])
            if cv2.waitKey(0) == 27:
                break  # esc to quit

    Depth_Semantic_SAM_Mask = read_image('outputs/Depth_Semantic_SAM_Mask.png')
    RGB_Semantic_SAM_Mask = read_image('outputs/RGB_Semantic_SAM_Mask.png')
    Depth_Semantic_SAM = read_image('outputs/Depth_Semantic_SAM.png')
    RGB_Semantic_SAM = read_image('outputs/RGB_Semantic_SAM.png')
    two_image_to_gif(Depth_Semantic_SAM_Mask, Depth_Semantic_SAM, 'Depth_Semantic_SAM_2D')
    two_image_to_gif(RGB_Semantic_SAM_Mask, RGB_Semantic_SAM, 'RGB_Semantic_SAM_2D')
    Depth_Semantic_SAM_2D = 'outputs/Depth_Semantic_SAM_2D.mp4'
    RGB_Semantic_SAM_2D = 'outputs/RGB_Semantic_SAM_2D.mp4'
    Depth_map = read_image('outputs/Depth_rendered.png')
    Depth_Semantic_SAM_Mask_gif = 'outputs/Depth_3D_All.mp4'
    RGB_Semantic_SAM_Mask_gif = 'outputs/RGB_3D_All.mp4'
    return RGB_Semantic_SAM_2D, RGB_Semantic_SAM_Mask_gif, Depth_map, Depth_Semantic_SAM_2D, Depth_Semantic_SAM_Mask_gif


def greet_scannet(rgb_input, depth_map_input, class_candidates):
    """Run the ScanNet v2 pipeline on one RGB/depth pair.

    Args:
        rgb_input: filepath to the RGB image.
        depth_map_input: uploaded depth .png file (gr.File object with .name).
        class_candidates: comma-separated class names.

    Returns:
        Same five-tuple as :func:`greet_sailvos3d`.
    """
    depth_map_path = depth_map_input.name
    class_names = class_candidates.split(', ')
    print(rgb_input, depth_map_path, class_names)
    mp.set_start_method("spawn", force=True)
    setup_logger(name="fvcore")
    logger = setup_logger()
    logger.info("Arguments: " + str(args))

    cfg = setup_cfg(args)

    demo = VisualizationDemoIndoor(cfg)
    start_time = time.time()
    predictions, output2D, output3D = demo.run_on_pcd_ui(rgb_input, depth_map_path, class_names)
    logger.info("finished in {:.2f}s".format(time.time() - start_time))

    os.makedirs('outputs', exist_ok=True)
    output2D['sem_seg_on_rgb'].save('outputs/RGB_Semantic_SAM.png')
    output2D['sem_seg_on_depth'].save('outputs/Depth_Semantic_SAM.png')
    output2D['sam_seg_on_rgb'].save('outputs/RGB_Semantic_SAM_Mask.png')
    output2D['sam_seg_on_depth'].save('outputs/Depth_Semantic_SAM_Mask.png')
    rgb_3d_sem = output3D['rgb_3d_sem']
    depth_3d_sem = output3D['depth_3d_sem']
    rgb_3d_sam = output3D['rgb_3d_sam']
    depth_3d_sam = output3D['depth_3d_sam']

    # NOTE(review): the archive keys deliberately map the *semantic* clouds
    # onto the *_sam names (and SAM clouds onto *_mask) because
    # render_3d_video expects those key names -- confirm against renderer.
    np.savez('outputs/xyzrgb.npz',
             rgb_3d_sam=rgb_3d_sem,
             depth_3d_sam=depth_3d_sem,
             rgb_3d_sam_mask=rgb_3d_sam,
             depth_3d_sam_mask=depth_3d_sam)
    demo.render_3d_video('outputs/xyzrgb.npz')

    Depth_Semantic_SAM_Mask = read_image('outputs/Depth_Semantic_SAM_Mask.png')
    RGB_Semantic_SAM_Mask = read_image('outputs/RGB_Semantic_SAM_Mask.png')
    Depth_Semantic_SAM = read_image('outputs/Depth_Semantic_SAM.png')
    RGB_Semantic_SAM = read_image('outputs/RGB_Semantic_SAM.png')
    two_image_to_gif(Depth_Semantic_SAM_Mask, Depth_Semantic_SAM, 'Depth_Semantic_SAM_2D')
    two_image_to_gif(RGB_Semantic_SAM_Mask, RGB_Semantic_SAM, 'RGB_Semantic_SAM_2D')
    Depth_Semantic_SAM_2D = 'outputs/Depth_Semantic_SAM_2D.mp4'
    RGB_Semantic_SAM_2D = 'outputs/RGB_Semantic_SAM_2D.mp4'
    Depth_map = read_image('outputs/Depth_rendered.png')
    Depth_Semantic_SAM_Mask_gif = 'outputs/Depth_3D_All.mp4'
    RGB_Semantic_SAM_Mask_gif = 'outputs/RGB_3D_All.mp4'
    return RGB_Semantic_SAM_2D, RGB_Semantic_SAM_Mask_gif, Depth_map, Depth_Semantic_SAM_2D, Depth_Semantic_SAM_Mask_gif


SAILVOS_CLASSES = (
    'person, car, motorcycle, truck, bird, dog, handbag, suitcase, bottle, '
    'cup, bowl, chair, potted plant, bed, dining table, tv, laptop, '
    'cell phone, bag, bin, box, door, road barrier, stick, lamp, floor, wall'
)
SCANNET_CLASSES = (
    'wall, floor, cabinet, bed, chair, sofa, table, door, window, bookshelf, '
    'picture, counter, desk, curtain, refrigerator, shower curtain, toilet, '
    'sink, bathtub, other furniture'
)

with gr.Blocks(analytics_enabled=False) as segrgbd_iface:
    # NOTE(review): the original header HTML was garbled in the dump; this is
    # a minimal reconstruction (title + Github link) -- confirm against repo.
    gr.Markdown(
        "<div align='center'><h1>Segment Any RGBD</h1>"
        "<a href='https://github.com/Jun-CEN/SegmentAnyRGBD'>Github</a></div>"
    )
    gr.Markdown(
        "Note that you need a GPU for this project. You may duplicate the "
        "space and upgrade to GPU in settings for better performance and "
        "faster inference without waiting in the queue."
    )

    with gr.Tab(label="Dataset: Sailvos3D"):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        Input_RGB_Component = gr.Image(label='RGB_Input', type='filepath').style(width=320, height=200)
                        Depth_Map_Output_Component = gr.Image(label="Vis_Depth_Map").style(width=320, height=200)
                    with gr.Row():
                        Depth_Map_Input_Component = gr.File(label='input_Depth_map')
                        Component_2D_to_3D_Projection_Parameters = gr.File(label='2D_to_3D_Projection_Parameters')
                    with gr.Row():
                        Class_Candidates_Component = gr.Text(label='Class_Candidates')
                    vc_end_btn = gr.Button("Send")
                with gr.Tab(label='Result'):
                    with gr.Row():
                        RGB_Semantic_SAM_Mask_Component = gr.Video(label="RGB_Semantic_SAM_Mask").style(width=320, height=200)
                        RGB_Semantic_SAM_Mask_3D_Component = gr.Video(label="Video_3D_RGB_Semantic_SAM_Mask").style(width=320, height=200)
                    with gr.Row():
                        Depth_Semantic_SAM_Mask_Component = gr.Video(label="Depth_Semantic_SAM_Mask").style(width=320, height=200)
                        Depth_Semantic_SAM_Mask_3D_Component = gr.Video(label="Video_3D_Depth_Semantic_SAM_Mask").style(width=320, height=200)
            with gr.Row():
                gr.Markdown(
                    "It takes around 2 to 5 minutes to get the final results. "
                    "The framework initialization, SAM segmentation, zero-shot "
                    "semantic segmentation and 3D results rendering take long time."
                )
            gr.Examples(
                examples=[
                    [
                        'UI/sailvos3d/ex1/inputs/rgb_000160.bmp',
                        'UI/sailvos3d/ex1/inputs/depth_000160.npy',
                        'UI/sailvos3d/ex1/inputs/rage_matrices_000160.npz',
                        SAILVOS_CLASSES,
                    ],
                    [
                        'UI/sailvos3d/ex2/inputs/rgb_000540.bmp',
                        'UI/sailvos3d/ex2/inputs/depth_000540.npy',
                        'UI/sailvos3d/ex2/inputs/rage_matrices_000540.npz',
                        SAILVOS_CLASSES,
                    ],
                ],
                inputs=[Input_RGB_Component, Depth_Map_Input_Component, Component_2D_to_3D_Projection_Parameters, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet_sailvos3d,
            )
            vc_end_btn.click(
                inputs=[Input_RGB_Component, Depth_Map_Input_Component, Component_2D_to_3D_Projection_Parameters, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet_sailvos3d,
            )

    with gr.Tab(label="Dataset: Scannet"):
        with gr.Column():
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        Input_RGB_Component = gr.Image(label='RGB_Input', type='filepath').style(width=320, height=200)
                        Depth_Map_Output_Component = gr.Image(label="Vis_Depth_Map").style(width=320, height=200)
                    with gr.Row():
                        Depth_Map_Input_Component = gr.File(label="Input_Depth_Map")
                        Class_Candidates_Component = gr.Text(label='Class_Candidates')
                    vc_end_btn = gr.Button("Send")
                with gr.Tab(label='Result'):
                    with gr.Row():
                        RGB_Semantic_SAM_Mask_Component = gr.Video(label="RGB_Semantic_SAM_Mask").style(width=320, height=200)
                        RGB_Semantic_SAM_Mask_3D_Component = gr.Video(label="Video_3D_RGB_Semantic_SAM_Mask").style(width=320, height=200)
                    with gr.Row():
                        Depth_Semantic_SAM_Mask_Component = gr.Video(label="Depth_Semantic_SAM_Mask").style(width=320, height=200)
                        Depth_Semantic_SAM_Mask_3D_Component = gr.Video(label="Video_3D_Depth_Semantic_SAM_Mask").style(width=320, height=200)
            with gr.Row():
                gr.Markdown(
                    "It takes around 2 to 5 minutes to get the final results. "
                    "The framework initialization, SAM segmentation, zero-shot "
                    "semantic segmentation and 3D results rendering take long time."
                )
            gr.Examples(
                examples=[
                    [
                        'UI/scannetv2/examples/scene0000_00/color/1660.jpg',
                        'UI/scannetv2/examples/scene0000_00/depth/1660.png',
                        SCANNET_CLASSES,
                    ],
                    [
                        'UI/scannetv2/examples/scene0000_00/color/5560.jpg',
                        'UI/scannetv2/examples/scene0000_00/depth/5560.png',
                        SCANNET_CLASSES,
                    ],
                ],
                inputs=[Input_RGB_Component, Depth_Map_Input_Component, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet_scannet,
            )
            vc_end_btn.click(
                inputs=[Input_RGB_Component, Depth_Map_Input_Component, Class_Candidates_Component],
                outputs=[RGB_Semantic_SAM_Mask_Component, RGB_Semantic_SAM_Mask_3D_Component, Depth_Map_Output_Component, Depth_Semantic_SAM_Mask_Component, Depth_Semantic_SAM_Mask_3D_Component],
                fn=greet_scannet,
            )

demo = segrgbd_iface
demo.launch()
120000 + WARMUP_FACTOR: 1e-6 + WARMUP_ITERS: 1500 + LR_SCHEDULER_NAME: "WarmupPolyLR" + WEIGHT_DECAY: 0.01 + WEIGHT_DECAY_NORM: 0.0 + WEIGHT_DECAY_EMBED: 0.0 + BACKBONE_MULTIPLIER: 1.0 + TEST_IMS_PER_BATCH: 1 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.01 + NORM_TYPE: 2.0 +INPUT: + MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] + MIN_SIZE_TRAIN_SAMPLING: "choice" + MIN_SIZE_TEST: 640 + MAX_SIZE_TRAIN: 2560 + MAX_SIZE_TEST: 2560 + CROP: + ENABLED: True + TYPE: "absolute" + SIZE: (640, 640) + SINGLE_CATEGORY_MAX_AREA: 1.0 + COLOR_AUG_SSD: True + SIZE_DIVISIBILITY: 640 # used in dataset mapper + FORMAT: "RGB" +TEST: + EVAL_PERIOD: 5000 + AUG: + ENABLED: False + MIN_SIZES: [256, 384, 512, 640, 768, 896] + MAX_SIZE: 3584 + FLIP: True +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: True + NUM_WORKERS: 4 +VERSION: 2 \ No newline at end of file diff --git a/configs/ovseg_swinB_vitL_demo.yaml b/configs/ovseg_swinB_vitL_demo.yaml new file mode 100644 index 0000000000000000000000000000000000000000..20a3be9304b898ebf7cc2535d43833900b6233f2 --- /dev/null +++ b/configs/ovseg_swinB_vitL_demo.yaml @@ -0,0 +1,99 @@ +MODEL: + META_ARCHITECTURE: "OVSegDEMO" + BACKBONE: + FREEZE_AT: 0 + NAME: "D2SwinTransformer" + SWIN: + EMBED_DIM: 128 + DEPTHS: [2, 2, 18, 2] + NUM_HEADS: [4, 8, 16, 32] + WINDOW_SIZE: 12 + APE: False + DROP_PATH_RATE: 0.3 + PATCH_NORM: True + PRETRAIN_IMG_SIZE: 384 + WEIGHTS: "swin_base_patch4_window12_384_22k.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + SEM_SEG_HEAD: + NAME: "OpenVocabMaskFormerHead" + IN_FEATURES: ["res2", "res3", "res4", "res5"] + IGNORE_VALUE: 255 + NUM_CLASSES: 171 # number of categories in training set + EMBEDDING_DIM: 768 + EMBED_LAYERS: 2 + COMMON_STRIDE: 4 # not used, hard-coded + LOSS_WEIGHT: 1.0 + CONVS_DIM: 256 + MASK_DIM: 256 + NORM: "GN" + MASK_FORMER: + TRANSFORMER_IN_FEATURE: "res5" + DEEP_SUPERVISION: True + NO_OBJECT_WEIGHT: 
0.1 + DICE_WEIGHT: 1.0 + MASK_WEIGHT: 20.0 + HIDDEN_DIM: 256 + NUM_OBJECT_QUERIES: 100 + NHEADS: 8 + DROPOUT: 0.1 + DIM_FEEDFORWARD: 2048 + ENC_LAYERS: 0 + DEC_LAYERS: 6 + PRE_NORM: False + CLIP_ADAPTER: + TEXT_TEMPLATES: "vild" + CLIP_MODEL_NAME: "ViT-L/14" + MASK_FILL: "mean" + MASK_EXPAND_RATIO: 1.0 + MASK_THR: 0.1 # choose the foreground objects + MASK_MATTING: False # use soft background, default not used + MASK_PROMPT_DEPTH: 3 + MASK_PROMPT_FWD: True # use mask prompt during forward + REGION_RESIZED: True # resize to the input of clip, e.g., 224 + CLIP_ENSEMBLE: True # use ensemble of two classification branches + CLIP_ENSEMBLE_WEIGHT: 0.0 +DATASETS: + TRAIN: ("coco_2017_train_stuff_sem_seg",) + TEST: ("ade20k_sem_seg_val",) +SOLVER: + IMS_PER_BATCH: 32 + BASE_LR: 0.00006 + MAX_ITER: 120000 + WARMUP_FACTOR: 1e-6 + WARMUP_ITERS: 1500 + WEIGHT_DECAY: 0.01 + WEIGHT_DECAY_NORM: 0.0 + WEIGHT_DECAY_EMBED: 0.0 + BACKBONE_MULTIPLIER: 1.0 + TEST_IMS_PER_BATCH: 1 + CLIP_GRADIENTS: + ENABLED: True + CLIP_TYPE: "full_model" + CLIP_VALUE: 0.01 + NORM_TYPE: 2.0 +INPUT: + MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"] + MIN_SIZE_TRAIN_SAMPLING: "choice" + MIN_SIZE_TEST: 640 + MAX_SIZE_TRAIN: 2560 + MAX_SIZE_TEST: 2560 + CROP: + ENABLED: True + TYPE: "absolute" + SIZE: (640, 640) + SINGLE_CATEGORY_MAX_AREA: 1.0 + COLOR_AUG_SSD: True + SIZE_DIVISIBILITY: 640 # used in dataset mapper + FORMAT: "RGB" +TEST: + EVAL_PERIOD: 5000 + AUG: + ENABLED: False + MIN_SIZES: [256, 384, 512, 640, 768, 896] + MAX_SIZE: 3584 + FLIP: True +DATALOADER: + FILTER_EMPTY_ANNOTATIONS: True + NUM_WORKERS: 4 +VERSION: 2 \ No newline at end of file diff --git a/datasets/DATASETS.md b/datasets/DATASETS.md new file mode 100644 index 0000000000000000000000000000000000000000..30d30ba314c9842098c5c38d0a47ce780283d9d9 --- /dev/null +++ b/datasets/DATASETS.md @@ -0,0 +1,122 @@ +## Prepare Datasets for OVSeg + +This doc is a modification/extension of 
[MaskFormer](https://github.com/facebookresearch/MaskFormer/blob/main/datasets/README.md) following [Detectron2 fromat](https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html). + +A dataset can be used by accessing [DatasetCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.DatasetCatalog) +for its data, or [MetadataCatalog](https://detectron2.readthedocs.io/modules/data.html#detectron2.data.MetadataCatalog) for its metadata (class names, etc). +This document explains how to setup the builtin datasets so they can be used by the above APIs. +[Use Custom Datasets](https://detectron2.readthedocs.io/tutorials/datasets.html) gives a deeper dive on how to use `DatasetCatalog` and `MetadataCatalog`, +and how to add new datasets to them. + +OVSeg has builtin support for a few datasets. +The datasets are assumed to exist in a directory specified by the environment variable +`DETECTRON2_DATASETS`. +Under this directory, detectron2 will look for datasets in the structure described below, if needed. +``` +$DETECTRON2_DATASETS/ + coco/ # COCOStuff-171 + ADEChallengeData2016/ # ADE20K-150 + ADE20K_2021_17_01/ # ADE20K-847 + VOCdevkit/ + VOC2012/ # PASCALVOC-20 + VOC2010/ # PASCALContext-59, PASCALContext-459 +``` + +You can set the location for builtin datasets by `export DETECTRON2_DATASETS=/path/to/datasets`. +If left unset, the default is `./datasets` relative to your current working directory. + +Without specific notifications, our model is trained on COCOStuff-171 and evlauted on ADE20K-150, ADE20K-847, PASCALVOC-20, PASCALContext-59 and PASCALContext-459. 
+ +| dataset | split | # images | # categories | +|:--------------:|:---------:|:--------:|:------------:| +| COCO Stuff | train2017 | 118K | 171 | +| ADE20K | val | 2K | 150/847 | +| Pascal VOC | val | 1.5K | 20 | +| Pascal Context | val | 5K | 59/459 | + + +### Expected dataset structure for [COCO Stuff](https://github.com/nightrome/cocostuff): +``` +coco/ + train2017/ # http://images.cocodataset.org/zips/train2017.zip + annotations/ # http://images.cocodataset.org/annotations/annotations_trainval2017.zip + stuffthingmaps/ + stuffthingmaps_trainval2017.zip # http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip + train2017/ + # below are generated + stuffthingmaps_detectron2/ + train2017/ +``` + +The directory `stuffthingmaps_detectron2` is generated by running `python datasets/prepare_coco_stuff_sem_seg.py`. + + + +### Expected dataset structure for [ADE20k Scene Parsing (ADE20K-150)](http://sceneparsing.csail.mit.edu/): +``` +ADEChallengeData2016/ + annotations/ + images/ + objectInfo150.txt + # below are generated + annotations_detectron2/ +``` +The directory `annotations_detectron2` is generated by running `python datasets/prepare_ade20k_sem_seg.py`. + + +### Expected dataset structure for [ADE20k-Full (ADE20K-847)](https://github.com/CSAILVision/ADE20K#download): +``` +ADE20K_2021_17_01/ + images/ + index_ade20k.pkl + objects.txt + # below are generated + images_detectron2/ + annotations_detectron2/ +``` +The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/prepare_ade20k_full_sem_seg.py`. 
+ +### Expected dataset structure for [Pascal VOC 2012 (PASCALVOC-20)](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/#devkit): +``` +VOCdevkit/VOC2012/ + Annotations/ + ImageSets/ + JPEGImages/ + SegmentationClass/ + SegmentationObject/ + SegmentationClassAug/ # https://github.com/kazuto1011/deeplab-pytorch/blob/master/data/datasets/voc12/README.md + # below are generated + images_detectron2/ + annotations_detectron2/ +``` + +It starts with a tar file `VOCtrainval_11-May-2012.tar`. + +We use SBD augmented training data as `SegmentationClassAug` following [Deeplab](https://github.com/kazuto1011/deeplab-pytorch/blob/master/data/datasets/voc12/README.md). + +The directories `images_detectron2` and `annotations_detectron2` are generated by running `python datasets/prepare_voc_sem_seg.py`. + + +### Expected dataset structure for [Pascal Context](https://www.cs.stanford.edu/~roozbeh/pascal-context/): + +``` +VOCdevkit/VOC2010/ + Annotations/ + ImageSets/ + JPEGImages/ + SegmentationClass/ + SegmentationObject/ + # below are from https://www.cs.stanford.edu/~roozbeh/pascal-context/trainval.tar.gz + trainval/ + labels.txt + 59_labels.txt # https://www.cs.stanford.edu/~roozbeh/pascal-context/59_labels.txt + pascalcontext_val.txt # https://drive.google.com/file/d/1BCbiOKtLvozjVnlTJX51koIveUZHCcUh/view?usp=sharing + # below are generated + annotations_detectron2/ + pc459_val + pc59_val +``` +It starts with a tar file `VOCtrainval_03-May-2010.tar`. You may want to download the 5K validation set [here](https://drive.google.com/file/d/1BCbiOKtLvozjVnlTJX51koIveUZHCcUh/view?usp=sharing). + +The directory `annotations_detectron2` is generated by running `python datasets/prepare_pascal_context.py`. 
+ diff --git a/datasets/prepare_ade20k_full_sem_seg.py b/datasets/prepare_ade20k_full_sem_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..4a55e039549ff0aaf928a4dddee7a94ea8d0f6bf --- /dev/null +++ b/datasets/prepare_ade20k_full_sem_seg.py @@ -0,0 +1,1011 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import os +import pickle as pkl +from pathlib import Path + +import cv2 +import numpy as np +import tqdm +from PIL import Image + +ADE20K_SEM_SEG_FULL_CATEGORIES = [ + {"name": "wall", "id": 2978, "trainId": 0}, + {"name": "building, edifice", "id": 312, "trainId": 1}, + {"name": "sky", "id": 2420, "trainId": 2}, + {"name": "tree", "id": 2855, "trainId": 3}, + {"name": "road, route", "id": 2131, "trainId": 4}, + {"name": "floor, flooring", "id": 976, "trainId": 5}, + {"name": "ceiling", "id": 447, "trainId": 6}, + {"name": "bed", "id": 165, "trainId": 7}, + {"name": "sidewalk, pavement", "id": 2377, "trainId": 8}, + {"name": "earth, ground", "id": 838, "trainId": 9}, + {"name": "cabinet", "id": 350, "trainId": 10}, + {"name": "person, individual, someone, somebody, mortal, soul", "id": 1831, "trainId": 11}, + {"name": "grass", "id": 1125, "trainId": 12}, + {"name": "windowpane, window", "id": 3055, "trainId": 13}, + {"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14}, + {"name": "mountain, mount", "id": 1610, "trainId": 15}, + {"name": "plant, flora, plant life", "id": 1910, "trainId": 16}, + {"name": "table", "id": 2684, "trainId": 17}, + {"name": "chair", "id": 471, "trainId": 18}, + {"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19}, + {"name": "door", "id": 774, "trainId": 20}, + {"name": "sofa, couch, lounge", "id": 2473, "trainId": 21}, + {"name": "sea", "id": 2264, "trainId": 22}, + {"name": "painting, picture", "id": 1735, "trainId": 23}, + {"name": "water", "id": 2994, "trainId": 24}, + {"name": "mirror", "id": 
1564, "trainId": 25}, + {"name": "house", "id": 1276, "trainId": 26}, + {"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27}, + {"name": "shelf", "id": 2329, "trainId": 28}, + {"name": "armchair", "id": 57, "trainId": 29}, + {"name": "fence, fencing", "id": 907, "trainId": 30}, + {"name": "field", "id": 913, "trainId": 31}, + {"name": "lamp", "id": 1395, "trainId": 32}, + {"name": "rock, stone", "id": 2138, "trainId": 33}, + {"name": "seat", "id": 2272, "trainId": 34}, + {"name": "river", "id": 2128, "trainId": 35}, + {"name": "desk", "id": 724, "trainId": 36}, + {"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37}, + {"name": "railing, rail", "id": 2053, "trainId": 38}, + {"name": "signboard, sign", "id": 2380, "trainId": 39}, + {"name": "cushion", "id": 689, "trainId": 40}, + {"name": "path", "id": 1788, "trainId": 41}, + {"name": "work surface", "id": 3087, "trainId": 42}, + {"name": "stairs, steps", "id": 2530, "trainId": 43}, + {"name": "column, pillar", "id": 581, "trainId": 44}, + {"name": "sink", "id": 2388, "trainId": 45}, + {"name": "wardrobe, closet, press", "id": 2985, "trainId": 46}, + {"name": "snow", "id": 2454, "trainId": 47}, + {"name": "refrigerator, icebox", "id": 2096, "trainId": 48}, + {"name": "base, pedestal, stand", "id": 137, "trainId": 49}, + {"name": "bridge, span", "id": 294, "trainId": 50}, + {"name": "blind, screen", "id": 212, "trainId": 51}, + {"name": "runway", "id": 2185, "trainId": 52}, + {"name": "cliff, drop, drop-off", "id": 524, "trainId": 53}, + {"name": "sand", "id": 2212, "trainId": 54}, + {"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55}, + {"name": "pillow", "id": 1869, "trainId": 56}, + {"name": "screen door, screen", "id": 2251, "trainId": 57}, + {"name": "toilet, can, commode, crapper, pot, potty, stool, throne", "id": 2793, "trainId": 58}, + {"name": "skyscraper", "id": 2423, "trainId": 59}, + {"name": "grandstand, covered stand", "id": 1121, "trainId": 60}, + {"name": 
"box", "id": 266, "trainId": 61}, + {"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62}, + {"name": "palm, palm tree", "id": 1744, "trainId": 63}, + {"name": "double door", "id": 783, "trainId": 64}, + {"name": "coffee table, cocktail table", "id": 571, "trainId": 65}, + {"name": "counter", "id": 627, "trainId": 66}, + {"name": "countertop", "id": 629, "trainId": 67}, + {"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68}, + {"name": "kitchen island", "id": 1374, "trainId": 69}, + {"name": "boat", "id": 223, "trainId": 70}, + {"name": "waterfall, falls", "id": 3016, "trainId": 71}, + { + "name": "stove, kitchen stove, range, kitchen range, cooking stove", + "id": 2598, + "trainId": 72, + }, + {"name": "flower", "id": 978, "trainId": 73}, + {"name": "bookcase", "id": 239, "trainId": 74}, + {"name": "controls", "id": 608, "trainId": 75}, + {"name": "book", "id": 236, "trainId": 76}, + {"name": "stairway, staircase", "id": 2531, "trainId": 77}, + {"name": "streetlight, street lamp", "id": 2616, "trainId": 78}, + { + "name": "computer, computing machine, computing device, data processor, electronic computer, information processing system", + "id": 591, + "trainId": 79, + }, + { + "name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle", + "id": 327, + "trainId": 80, + }, + {"name": "swivel chair", "id": 2679, "trainId": 81}, + {"name": "light, light source", "id": 1451, "trainId": 82}, + {"name": "bench", "id": 181, "trainId": 83}, + {"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84}, + {"name": "towel", "id": 2821, "trainId": 85}, + {"name": "fountain", "id": 1023, "trainId": 86}, + {"name": "embankment", "id": 855, "trainId": 87}, + { + "name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box", + "id": 2733, + "trainId": 88, + }, + {"name": "van", "id": 2928, "trainId": 89}, + 
{"name": "hill", "id": 1240, "trainId": 90}, + {"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91}, + {"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92}, + {"name": "truck, motortruck", "id": 2880, "trainId": 93}, + {"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94}, + {"name": "pole", "id": 1936, "trainId": 95}, + {"name": "tower", "id": 2828, "trainId": 96}, + {"name": "court", "id": 631, "trainId": 97}, + {"name": "ball", "id": 103, "trainId": 98}, + { + "name": "aircraft carrier, carrier, flattop, attack aircraft carrier", + "id": 3144, + "trainId": 99, + }, + {"name": "buffet, counter, sideboard", "id": 308, "trainId": 100}, + {"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101}, + {"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102}, + {"name": "minibike, motorbike", "id": 1563, "trainId": 103}, + {"name": "animal, animate being, beast, brute, creature, fauna", "id": 29, "trainId": 104}, + {"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105}, + {"name": "step, stair", "id": 2569, "trainId": 106}, + {"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107}, + {"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108}, + {"name": "doorframe, doorcase", "id": 778, "trainId": 109}, + {"name": "sconce", "id": 2243, "trainId": 110}, + {"name": "pond", "id": 1941, "trainId": 111}, + {"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112}, + {"name": "bannister, banister, balustrade, balusters, handrail", "id": 120, "trainId": 113}, + {"name": "bag", "id": 95, "trainId": 114}, + {"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115}, + {"name": "gazebo", "id": 1087, "trainId": 116}, + {"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117}, + {"name": "land, ground, soil", "id": 1401, "trainId": 118}, + {"name": "board, plank", "id": 220, "trainId": 119}, + 
{"name": "arcade machine", "id": 47, "trainId": 120}, + {"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121}, + {"name": "bar", "id": 123, "trainId": 122}, + {"name": "stall, stand, sales booth", "id": 2537, "trainId": 123}, + {"name": "playground", "id": 1927, "trainId": 124}, + {"name": "ship", "id": 2337, "trainId": 125}, + {"name": "ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126}, + { + "name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", + "id": 64, + "trainId": 127, + }, + {"name": "bottle", "id": 249, "trainId": 128}, + {"name": "cradle", "id": 642, "trainId": 129}, + {"name": "pot, flowerpot", "id": 1981, "trainId": 130}, + { + "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", + "id": 609, + "trainId": 131, + }, + {"name": "train, railroad train", "id": 2840, "trainId": 132}, + {"name": "stool", "id": 2586, "trainId": 133}, + {"name": "lake", "id": 1393, "trainId": 134}, + {"name": "tank, storage tank", "id": 2704, "trainId": 135}, + {"name": "ice, water ice", "id": 1304, "trainId": 136}, + {"name": "basket, handbasket", "id": 146, "trainId": 137}, + {"name": "manhole", "id": 1494, "trainId": 138}, + {"name": "tent, collapsible shelter", "id": 2739, "trainId": 139}, + {"name": "canopy", "id": 389, "trainId": 140}, + {"name": "microwave, microwave oven", "id": 1551, "trainId": 141}, + {"name": "barrel, cask", "id": 131, "trainId": 142}, + {"name": "dirt track", "id": 738, "trainId": 143}, + {"name": "beam", "id": 161, "trainId": 144}, + {"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145}, + {"name": "plate", "id": 1919, "trainId": 146}, + {"name": "screen, crt screen", "id": 3109, "trainId": 147}, + {"name": "ruins", "id": 2179, "trainId": 148}, + {"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149}, + {"name": "blanket, cover", "id": 206, "trainId": 150}, + {"name": 
"plaything, toy", "id": 1930, "trainId": 151}, + {"name": "food, solid food", "id": 1002, "trainId": 152}, + {"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153}, + {"name": "oven", "id": 1708, "trainId": 154}, + {"name": "stage", "id": 2526, "trainId": 155}, + {"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156}, + {"name": "umbrella", "id": 2901, "trainId": 157}, + {"name": "sculpture", "id": 2262, "trainId": 158}, + {"name": "aqueduct", "id": 44, "trainId": 159}, + {"name": "container", "id": 597, "trainId": 160}, + {"name": "scaffolding, staging", "id": 2235, "trainId": 161}, + {"name": "hood, exhaust hood", "id": 1260, "trainId": 162}, + {"name": "curb, curbing, kerb", "id": 682, "trainId": 163}, + {"name": "roller coaster", "id": 2151, "trainId": 164}, + {"name": "horse, equus caballus", "id": 3107, "trainId": 165}, + {"name": "catwalk", "id": 432, "trainId": 166}, + {"name": "glass, drinking glass", "id": 1098, "trainId": 167}, + {"name": "vase", "id": 2932, "trainId": 168}, + {"name": "central reservation", "id": 461, "trainId": 169}, + {"name": "carousel", "id": 410, "trainId": 170}, + {"name": "radiator", "id": 2046, "trainId": 171}, + {"name": "closet", "id": 533, "trainId": 172}, + {"name": "machine", "id": 1481, "trainId": 173}, + {"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174}, + {"name": "fan", "id": 894, "trainId": 175}, + {"name": "inflatable bounce game", "id": 1322, "trainId": 176}, + {"name": "pitch", "id": 1891, "trainId": 177}, + {"name": "paper", "id": 1756, "trainId": 178}, + {"name": "arcade, colonnade", "id": 49, "trainId": 179}, + {"name": "hot tub", "id": 1272, "trainId": 180}, + {"name": "helicopter", "id": 1229, "trainId": 181}, + {"name": "tray", "id": 2850, "trainId": 182}, + {"name": "partition, divider", "id": 1784, "trainId": 183}, + {"name": "vineyard", "id": 2962, "trainId": 184}, + {"name": "bowl", "id": 259, "trainId": 185}, + {"name": "bullring", "id": 
319, "trainId": 186}, + {"name": "flag", "id": 954, "trainId": 187}, + {"name": "pot", "id": 1974, "trainId": 188}, + {"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189}, + {"name": "shower", "id": 2356, "trainId": 190}, + {"name": "bag, traveling bag, travelling bag, grip, suitcase", "id": 97, "trainId": 191}, + {"name": "bulletin board, notice board", "id": 318, "trainId": 192}, + {"name": "confessional booth", "id": 592, "trainId": 193}, + {"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194}, + {"name": "forest", "id": 1017, "trainId": 195}, + {"name": "elevator door", "id": 851, "trainId": 196}, + {"name": "laptop, laptop computer", "id": 1407, "trainId": 197}, + {"name": "instrument panel", "id": 1332, "trainId": 198}, + {"name": "bucket, pail", "id": 303, "trainId": 199}, + {"name": "tapestry, tapis", "id": 2714, "trainId": 200}, + {"name": "platform", "id": 1924, "trainId": 201}, + {"name": "jacket", "id": 1346, "trainId": 202}, + {"name": "gate", "id": 1081, "trainId": 203}, + {"name": "monitor, monitoring device", "id": 1583, "trainId": 204}, + { + "name": "telephone booth, phone booth, call box, telephone box, telephone kiosk", + "id": 2727, + "trainId": 205, + }, + {"name": "spotlight, spot", "id": 2509, "trainId": 206}, + {"name": "ring", "id": 2123, "trainId": 207}, + {"name": "control panel", "id": 602, "trainId": 208}, + {"name": "blackboard, chalkboard", "id": 202, "trainId": 209}, + {"name": "air conditioner, air conditioning", "id": 10, "trainId": 210}, + {"name": "chest", "id": 490, "trainId": 211}, + {"name": "clock", "id": 530, "trainId": 212}, + {"name": "sand dune", "id": 2213, "trainId": 213}, + {"name": "pipe, pipage, piping", "id": 1884, "trainId": 214}, + {"name": "vault", "id": 2934, "trainId": 215}, + {"name": "table football", "id": 2687, "trainId": 216}, + {"name": "cannon", "id": 387, "trainId": 217}, + {"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218}, + 
{"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219}, + {"name": "statue", "id": 2547, "trainId": 220}, + { + "name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", + "id": 1474, + "trainId": 221, + }, + {"name": "exhibitor", "id": 877, "trainId": 222}, + {"name": "ladder", "id": 1391, "trainId": 223}, + {"name": "carport", "id": 414, "trainId": 224}, + {"name": "dam", "id": 698, "trainId": 225}, + {"name": "pulpit", "id": 2019, "trainId": 226}, + {"name": "skylight, fanlight", "id": 2422, "trainId": 227}, + {"name": "water tower", "id": 3010, "trainId": 228}, + {"name": "grill, grille, grillwork", "id": 1139, "trainId": 229}, + {"name": "display board", "id": 753, "trainId": 230}, + {"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231}, + {"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232}, + {"name": "ice rink", "id": 1301, "trainId": 233}, + {"name": "fruit", "id": 1033, "trainId": 234}, + {"name": "patio", "id": 1789, "trainId": 235}, + {"name": "vending machine", "id": 2939, "trainId": 236}, + {"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237}, + {"name": "net", "id": 1652, "trainId": 238}, + { + "name": "backpack, back pack, knapsack, packsack, rucksack, haversack", + "id": 90, + "trainId": 239, + }, + {"name": "jar", "id": 1349, "trainId": 240}, + {"name": "track", "id": 2830, "trainId": 241}, + {"name": "magazine", "id": 1485, "trainId": 242}, + {"name": "shutter", "id": 2370, "trainId": 243}, + {"name": "roof", "id": 2155, "trainId": 244}, + {"name": "banner, streamer", "id": 118, "trainId": 245}, + {"name": "landfill", "id": 1402, "trainId": 246}, + {"name": "post", "id": 1957, "trainId": 247}, + {"name": "altarpiece, reredos", "id": 3130, "trainId": 248}, + {"name": "hat, chapeau, lid", "id": 1197, "trainId": 249}, + {"name": "arch, archway", "id": 52, "trainId": 250}, + {"name": "table game", "id": 2688, "trainId": 251}, + {"name": "bag, handbag, pocketbook, 
purse", "id": 96, "trainId": 252}, + {"name": "document, written document, papers", "id": 762, "trainId": 253}, + {"name": "dome", "id": 772, "trainId": 254}, + {"name": "pier", "id": 1857, "trainId": 255}, + {"name": "shanties", "id": 2315, "trainId": 256}, + {"name": "forecourt", "id": 1016, "trainId": 257}, + {"name": "crane", "id": 643, "trainId": 258}, + {"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259}, + {"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260}, + {"name": "drawing", "id": 791, "trainId": 261}, + {"name": "cabin", "id": 349, "trainId": 262}, + { + "name": "ad, advertisement, advertizement, advertising, advertizing, advert", + "id": 6, + "trainId": 263, + }, + {"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264}, + {"name": "monument", "id": 1587, "trainId": 265}, + {"name": "henhouse", "id": 1233, "trainId": 266}, + {"name": "cockpit", "id": 559, "trainId": 267}, + {"name": "heater, warmer", "id": 1223, "trainId": 268}, + {"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269}, + {"name": "pool", "id": 1943, "trainId": 270}, + {"name": "elevator, lift", "id": 853, "trainId": 271}, + {"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272}, + {"name": "labyrinth", "id": 1390, "trainId": 273}, + {"name": "text, textual matter", "id": 2748, "trainId": 274}, + {"name": "printer", "id": 2007, "trainId": 275}, + {"name": "mezzanine, first balcony", "id": 1546, "trainId": 276}, + {"name": "mattress", "id": 1513, "trainId": 277}, + {"name": "straw", "id": 2600, "trainId": 278}, + {"name": "stalls", "id": 2538, "trainId": 279}, + {"name": "patio, terrace", "id": 1790, "trainId": 280}, + {"name": "billboard, hoarding", "id": 194, "trainId": 281}, + {"name": "bus stop", "id": 326, "trainId": 282}, + {"name": "trouser, pant", "id": 2877, "trainId": 283}, + {"name": "console table, console", "id": 594, "trainId": 284}, + {"name": "rack", "id": 2036, 
"trainId": 285}, + {"name": "notebook", "id": 1662, "trainId": 286}, + {"name": "shrine", "id": 2366, "trainId": 287}, + {"name": "pantry", "id": 1754, "trainId": 288}, + {"name": "cart", "id": 418, "trainId": 289}, + {"name": "steam shovel", "id": 2553, "trainId": 290}, + {"name": "porch", "id": 1951, "trainId": 291}, + {"name": "postbox, mailbox, letter box", "id": 1963, "trainId": 292}, + {"name": "figurine, statuette", "id": 918, "trainId": 293}, + {"name": "recycling bin", "id": 2086, "trainId": 294}, + {"name": "folding screen", "id": 997, "trainId": 295}, + {"name": "telescope", "id": 2731, "trainId": 296}, + {"name": "deck chair, beach chair", "id": 704, "trainId": 297}, + {"name": "kennel", "id": 1365, "trainId": 298}, + {"name": "coffee maker", "id": 569, "trainId": 299}, + {"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300}, + {"name": "fish", "id": 948, "trainId": 301}, + {"name": "easel", "id": 839, "trainId": 302}, + {"name": "artificial golf green", "id": 63, "trainId": 303}, + {"name": "iceberg", "id": 1305, "trainId": 304}, + {"name": "candlestick, candle holder", "id": 378, "trainId": 305}, + {"name": "shower stall, shower bath", "id": 2362, "trainId": 306}, + {"name": "television stand", "id": 2734, "trainId": 307}, + { + "name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle", + "id": 2982, + "trainId": 308, + }, + {"name": "skeleton", "id": 2398, "trainId": 309}, + {"name": "grand piano, grand", "id": 1119, "trainId": 310}, + {"name": "candy, confect", "id": 382, "trainId": 311}, + {"name": "grille door", "id": 1141, "trainId": 312}, + {"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313}, + {"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314}, + {"name": "shoe", "id": 2341, "trainId": 315}, + {"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316}, + {"name": "shanty", "id": 2316, "trainId": 317}, + {"name": "structure", "id": 2626, 
"trainId": 318}, + {"name": "rocking chair, rocker", "id": 3104, "trainId": 319}, + {"name": "bird", "id": 198, "trainId": 320}, + {"name": "place mat", "id": 1896, "trainId": 321}, + {"name": "tomb", "id": 2800, "trainId": 322}, + {"name": "big top", "id": 190, "trainId": 323}, + {"name": "gas pump, gasoline pump, petrol pump, island dispenser", "id": 3131, "trainId": 324}, + {"name": "lockers", "id": 1463, "trainId": 325}, + {"name": "cage", "id": 357, "trainId": 326}, + {"name": "finger", "id": 929, "trainId": 327}, + {"name": "bleachers", "id": 209, "trainId": 328}, + {"name": "ferris wheel", "id": 912, "trainId": 329}, + {"name": "hairdresser chair", "id": 1164, "trainId": 330}, + {"name": "mat", "id": 1509, "trainId": 331}, + {"name": "stands", "id": 2539, "trainId": 332}, + {"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333}, + {"name": "streetcar, tram, tramcar, trolley, trolley car", "id": 2615, "trainId": 334}, + {"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335}, + {"name": "dummy", "id": 818, "trainId": 336}, + {"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337}, + {"name": "sand trap", "id": 2217, "trainId": 338}, + {"name": "shop, store", "id": 2347, "trainId": 339}, + {"name": "table cloth", "id": 2686, "trainId": 340}, + {"name": "service station", "id": 2300, "trainId": 341}, + {"name": "coffin", "id": 572, "trainId": 342}, + {"name": "drawer", "id": 789, "trainId": 343}, + {"name": "cages", "id": 358, "trainId": 344}, + {"name": "slot machine, coin machine", "id": 2443, "trainId": 345}, + {"name": "balcony", "id": 101, "trainId": 346}, + {"name": "volleyball court", "id": 2969, "trainId": 347}, + {"name": "table tennis", "id": 2692, "trainId": 348}, + {"name": "control table", "id": 606, "trainId": 349}, + {"name": "shirt", "id": 2339, "trainId": 350}, + {"name": "merchandise, ware, product", "id": 1533, "trainId": 351}, + {"name": "railway", "id": 2060, "trainId": 352}, + 
{"name": "parterre", "id": 1782, "trainId": 353}, + {"name": "chimney", "id": 495, "trainId": 354}, + {"name": "can, tin, tin can", "id": 371, "trainId": 355}, + {"name": "tanks", "id": 2707, "trainId": 356}, + {"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357}, + {"name": "alga, algae", "id": 3156, "trainId": 358}, + {"name": "system", "id": 2683, "trainId": 359}, + {"name": "map", "id": 1499, "trainId": 360}, + {"name": "greenhouse", "id": 1135, "trainId": 361}, + {"name": "mug", "id": 1619, "trainId": 362}, + {"name": "barbecue", "id": 125, "trainId": 363}, + {"name": "trailer", "id": 2838, "trainId": 364}, + {"name": "toilet tissue, toilet paper, bathroom tissue", "id": 2792, "trainId": 365}, + {"name": "organ", "id": 1695, "trainId": 366}, + {"name": "dishrag, dishcloth", "id": 746, "trainId": 367}, + {"name": "island", "id": 1343, "trainId": 368}, + {"name": "keyboard", "id": 1370, "trainId": 369}, + {"name": "trench", "id": 2858, "trainId": 370}, + {"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371}, + {"name": "steering wheel, wheel", "id": 2565, "trainId": 372}, + {"name": "pitcher, ewer", "id": 1892, "trainId": 373}, + {"name": "goal", "id": 1103, "trainId": 374}, + {"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375}, + {"name": "beds", "id": 170, "trainId": 376}, + {"name": "wood", "id": 3073, "trainId": 377}, + {"name": "file cabinet", "id": 922, "trainId": 378}, + {"name": "newspaper, paper", "id": 1655, "trainId": 379}, + {"name": "motorboat", "id": 1602, "trainId": 380}, + {"name": "rope", "id": 2160, "trainId": 381}, + {"name": "guitar", "id": 1151, "trainId": 382}, + {"name": "rubble", "id": 2176, "trainId": 383}, + {"name": "scarf", "id": 2239, "trainId": 384}, + {"name": "barrels", "id": 132, "trainId": 385}, + {"name": "cap", "id": 394, "trainId": 386}, + {"name": "leaves", "id": 1424, "trainId": 387}, + {"name": "control tower", "id": 607, "trainId": 388}, + {"name": "dashboard", "id": 
700, "trainId": 389}, + {"name": "bandstand", "id": 116, "trainId": 390}, + {"name": "lectern", "id": 1425, "trainId": 391}, + {"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392}, + {"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393}, + {"name": "shower room", "id": 2360, "trainId": 394}, + {"name": "smoke", "id": 2449, "trainId": 395}, + {"name": "faucet, spigot", "id": 897, "trainId": 396}, + {"name": "bulldozer", "id": 317, "trainId": 397}, + {"name": "saucepan", "id": 2228, "trainId": 398}, + {"name": "shops", "id": 2351, "trainId": 399}, + {"name": "meter", "id": 1543, "trainId": 400}, + {"name": "crevasse", "id": 656, "trainId": 401}, + {"name": "gear", "id": 1088, "trainId": 402}, + {"name": "candelabrum, candelabra", "id": 373, "trainId": 403}, + {"name": "sofa bed", "id": 2472, "trainId": 404}, + {"name": "tunnel", "id": 2892, "trainId": 405}, + {"name": "pallet", "id": 1740, "trainId": 406}, + {"name": "wire, conducting wire", "id": 3067, "trainId": 407}, + {"name": "kettle, boiler", "id": 1367, "trainId": 408}, + {"name": "bidet", "id": 188, "trainId": 409}, + { + "name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher", + "id": 79, + "trainId": 410, + }, + {"name": "music stand", "id": 1633, "trainId": 411}, + {"name": "pipe, tube", "id": 1885, "trainId": 412}, + {"name": "cup", "id": 677, "trainId": 413}, + {"name": "parking meter", "id": 1779, "trainId": 414}, + {"name": "ice hockey rink", "id": 1297, "trainId": 415}, + {"name": "shelter", "id": 2334, "trainId": 416}, + {"name": "weeds", "id": 3027, "trainId": 417}, + {"name": "temple", "id": 2735, "trainId": 418}, + {"name": "patty, cake", "id": 1791, "trainId": 419}, + {"name": "ski slope", "id": 2405, "trainId": 420}, + {"name": "panel", "id": 1748, "trainId": 421}, + {"name": "wallet", "id": 2983, "trainId": 422}, + {"name": "wheel", "id": 3035, "trainId": 423}, + {"name": "towel rack, towel 
horse", "id": 2824, "trainId": 424}, + {"name": "roundabout", "id": 2168, "trainId": 425}, + {"name": "canister, cannister, tin", "id": 385, "trainId": 426}, + {"name": "rod", "id": 2148, "trainId": 427}, + {"name": "soap dispenser", "id": 2465, "trainId": 428}, + {"name": "bell", "id": 175, "trainId": 429}, + {"name": "canvas", "id": 390, "trainId": 430}, + {"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431}, + {"name": "teacup", "id": 2722, "trainId": 432}, + {"name": "trellis", "id": 2857, "trainId": 433}, + {"name": "workbench", "id": 3088, "trainId": 434}, + {"name": "valley, vale", "id": 2926, "trainId": 435}, + {"name": "toaster", "id": 2782, "trainId": 436}, + {"name": "knife", "id": 1378, "trainId": 437}, + {"name": "podium", "id": 1934, "trainId": 438}, + {"name": "ramp", "id": 2072, "trainId": 439}, + {"name": "tumble dryer", "id": 2889, "trainId": 440}, + {"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441}, + {"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442}, + {"name": "lab bench", "id": 1383, "trainId": 443}, + {"name": "equipment", "id": 867, "trainId": 444}, + {"name": "rocky formation", "id": 2145, "trainId": 445}, + {"name": "plastic", "id": 1915, "trainId": 446}, + {"name": "calendar", "id": 361, "trainId": 447}, + {"name": "caravan", "id": 402, "trainId": 448}, + {"name": "check-in-desk", "id": 482, "trainId": 449}, + {"name": "ticket counter", "id": 2761, "trainId": 450}, + {"name": "brush", "id": 300, "trainId": 451}, + {"name": "mill", "id": 1554, "trainId": 452}, + {"name": "covered bridge", "id": 636, "trainId": 453}, + {"name": "bowling alley", "id": 260, "trainId": 454}, + {"name": "hanger", "id": 1186, "trainId": 455}, + {"name": "excavator", "id": 871, "trainId": 456}, + {"name": "trestle", "id": 2859, "trainId": 457}, + {"name": "revolving door", "id": 2103, "trainId": 458}, + {"name": "blast furnace", "id": 208, "trainId": 459}, + {"name": "scale, weighing machine", "id": 
2236, "trainId": 460}, + {"name": "projector", "id": 2012, "trainId": 461}, + {"name": "soap", "id": 2462, "trainId": 462}, + {"name": "locker", "id": 1462, "trainId": 463}, + {"name": "tractor", "id": 2832, "trainId": 464}, + {"name": "stretcher", "id": 2617, "trainId": 465}, + {"name": "frame", "id": 1024, "trainId": 466}, + {"name": "grating", "id": 1129, "trainId": 467}, + {"name": "alembic", "id": 18, "trainId": 468}, + {"name": "candle, taper, wax light", "id": 376, "trainId": 469}, + {"name": "barrier", "id": 134, "trainId": 470}, + {"name": "cardboard", "id": 407, "trainId": 471}, + {"name": "cave", "id": 434, "trainId": 472}, + {"name": "puddle", "id": 2017, "trainId": 473}, + {"name": "tarp", "id": 2717, "trainId": 474}, + {"name": "price tag", "id": 2005, "trainId": 475}, + {"name": "watchtower", "id": 2993, "trainId": 476}, + {"name": "meters", "id": 1545, "trainId": 477}, + { + "name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb", + "id": 1445, + "trainId": 478, + }, + {"name": "tracks", "id": 2831, "trainId": 479}, + {"name": "hair dryer", "id": 1161, "trainId": 480}, + {"name": "skirt", "id": 2411, "trainId": 481}, + {"name": "viaduct", "id": 2949, "trainId": 482}, + {"name": "paper towel", "id": 1769, "trainId": 483}, + {"name": "coat", "id": 552, "trainId": 484}, + {"name": "sheet", "id": 2327, "trainId": 485}, + {"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486}, + {"name": "water wheel", "id": 3013, "trainId": 487}, + {"name": "pottery, clayware", "id": 1986, "trainId": 488}, + {"name": "magazine rack", "id": 1486, "trainId": 489}, + {"name": "teapot", "id": 2723, "trainId": 490}, + {"name": "microphone, mike", "id": 1549, "trainId": 491}, + {"name": "support", "id": 2649, "trainId": 492}, + {"name": "forklift", "id": 1020, "trainId": 493}, + {"name": "canyon", "id": 392, "trainId": 494}, + {"name": "cash register, register", "id": 422, "trainId": 495}, + {"name": "leaf, 
leafage, foliage", "id": 1419, "trainId": 496}, + {"name": "remote control, remote", "id": 2099, "trainId": 497}, + {"name": "soap dish", "id": 2464, "trainId": 498}, + {"name": "windshield, windscreen", "id": 3058, "trainId": 499}, + {"name": "cat", "id": 430, "trainId": 500}, + {"name": "cue, cue stick, pool cue, pool stick", "id": 675, "trainId": 501}, + {"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502}, + {"name": "videos", "id": 2955, "trainId": 503}, + {"name": "shovel", "id": 2355, "trainId": 504}, + {"name": "eaves", "id": 840, "trainId": 505}, + {"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506}, + {"name": "shipyard", "id": 2338, "trainId": 507}, + {"name": "hen, biddy", "id": 1232, "trainId": 508}, + {"name": "traffic cone", "id": 2834, "trainId": 509}, + {"name": "washing machines", "id": 2991, "trainId": 510}, + {"name": "truck crane", "id": 2879, "trainId": 511}, + {"name": "cds", "id": 444, "trainId": 512}, + {"name": "niche", "id": 1657, "trainId": 513}, + {"name": "scoreboard", "id": 2246, "trainId": 514}, + {"name": "briefcase", "id": 296, "trainId": 515}, + {"name": "boot", "id": 245, "trainId": 516}, + {"name": "sweater, jumper", "id": 2661, "trainId": 517}, + {"name": "hay", "id": 1202, "trainId": 518}, + {"name": "pack", "id": 1714, "trainId": 519}, + {"name": "bottle rack", "id": 251, "trainId": 520}, + {"name": "glacier", "id": 1095, "trainId": 521}, + {"name": "pergola", "id": 1828, "trainId": 522}, + {"name": "building materials", "id": 311, "trainId": 523}, + {"name": "television camera", "id": 2732, "trainId": 524}, + {"name": "first floor", "id": 947, "trainId": 525}, + {"name": "rifle", "id": 2115, "trainId": 526}, + {"name": "tennis table", "id": 2738, "trainId": 527}, + {"name": "stadium", "id": 2525, "trainId": 528}, + {"name": "safety belt", "id": 2194, "trainId": 529}, + {"name": "cover", "id": 634, "trainId": 530}, + {"name": "dish rack", "id": 740, "trainId": 531}, + {"name": 
"synthesizer", "id": 2682, "trainId": 532}, + {"name": "pumpkin", "id": 2020, "trainId": 533}, + {"name": "gutter", "id": 1156, "trainId": 534}, + {"name": "fruit stand", "id": 1036, "trainId": 535}, + {"name": "ice floe, floe", "id": 1295, "trainId": 536}, + {"name": "handle, grip, handgrip, hold", "id": 1181, "trainId": 537}, + {"name": "wheelchair", "id": 3037, "trainId": 538}, + {"name": "mousepad, mouse mat", "id": 1614, "trainId": 539}, + {"name": "diploma", "id": 736, "trainId": 540}, + {"name": "fairground ride", "id": 893, "trainId": 541}, + {"name": "radio", "id": 2047, "trainId": 542}, + {"name": "hotplate", "id": 1274, "trainId": 543}, + {"name": "junk", "id": 1361, "trainId": 544}, + {"name": "wheelbarrow", "id": 3036, "trainId": 545}, + {"name": "stream", "id": 2606, "trainId": 546}, + {"name": "toll plaza", "id": 2797, "trainId": 547}, + {"name": "punching bag", "id": 2022, "trainId": 548}, + {"name": "trough", "id": 2876, "trainId": 549}, + {"name": "throne", "id": 2758, "trainId": 550}, + {"name": "chair desk", "id": 472, "trainId": 551}, + {"name": "weighbridge", "id": 3028, "trainId": 552}, + {"name": "extractor fan", "id": 882, "trainId": 553}, + {"name": "hanging clothes", "id": 1189, "trainId": 554}, + {"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555}, + {"name": "alarm clock, alarm", "id": 3122, "trainId": 556}, + {"name": "ski lift", "id": 2401, "trainId": 557}, + {"name": "chain", "id": 468, "trainId": 558}, + {"name": "garage", "id": 1061, "trainId": 559}, + {"name": "mechanical shovel", "id": 1523, "trainId": 560}, + {"name": "wine rack", "id": 3059, "trainId": 561}, + {"name": "tramway", "id": 2843, "trainId": 562}, + {"name": "treadmill", "id": 2853, "trainId": 563}, + {"name": "menu", "id": 1529, "trainId": 564}, + {"name": "block", "id": 214, "trainId": 565}, + {"name": "well", "id": 3032, "trainId": 566}, + {"name": "witness stand", "id": 3071, "trainId": 567}, + {"name": "branch", "id": 277, "trainId": 
568}, + {"name": "duck", "id": 813, "trainId": 569}, + {"name": "casserole", "id": 426, "trainId": 570}, + {"name": "frying pan", "id": 1039, "trainId": 571}, + {"name": "desk organizer", "id": 727, "trainId": 572}, + {"name": "mast", "id": 1508, "trainId": 573}, + {"name": "spectacles, specs, eyeglasses, glasses", "id": 2490, "trainId": 574}, + {"name": "service elevator", "id": 2299, "trainId": 575}, + {"name": "dollhouse", "id": 768, "trainId": 576}, + {"name": "hammock", "id": 1172, "trainId": 577}, + {"name": "clothes hanging", "id": 537, "trainId": 578}, + {"name": "photocopier", "id": 1847, "trainId": 579}, + {"name": "notepad", "id": 1664, "trainId": 580}, + {"name": "golf cart", "id": 1110, "trainId": 581}, + {"name": "footpath", "id": 1014, "trainId": 582}, + {"name": "cross", "id": 662, "trainId": 583}, + {"name": "baptismal font", "id": 121, "trainId": 584}, + {"name": "boiler", "id": 227, "trainId": 585}, + {"name": "skip", "id": 2410, "trainId": 586}, + {"name": "rotisserie", "id": 2165, "trainId": 587}, + {"name": "tables", "id": 2696, "trainId": 588}, + {"name": "water mill", "id": 3005, "trainId": 589}, + {"name": "helmet", "id": 1231, "trainId": 590}, + {"name": "cover curtain", "id": 635, "trainId": 591}, + {"name": "brick", "id": 292, "trainId": 592}, + {"name": "table runner", "id": 2690, "trainId": 593}, + {"name": "ashtray", "id": 65, "trainId": 594}, + {"name": "street box", "id": 2607, "trainId": 595}, + {"name": "stick", "id": 2574, "trainId": 596}, + {"name": "hangers", "id": 1188, "trainId": 597}, + {"name": "cells", "id": 456, "trainId": 598}, + {"name": "urinal", "id": 2913, "trainId": 599}, + {"name": "centerpiece", "id": 459, "trainId": 600}, + {"name": "portable fridge", "id": 1955, "trainId": 601}, + {"name": "dvds", "id": 827, "trainId": 602}, + {"name": "golf club", "id": 1111, "trainId": 603}, + {"name": "skirting board", "id": 2412, "trainId": 604}, + {"name": "water cooler", "id": 2997, "trainId": 605}, + {"name": "clipboard", 
"id": 528, "trainId": 606}, + {"name": "camera, photographic camera", "id": 366, "trainId": 607}, + {"name": "pigeonhole", "id": 1863, "trainId": 608}, + {"name": "chips", "id": 500, "trainId": 609}, + {"name": "food processor", "id": 1001, "trainId": 610}, + {"name": "post box", "id": 1958, "trainId": 611}, + {"name": "lid", "id": 1441, "trainId": 612}, + {"name": "drum", "id": 809, "trainId": 613}, + {"name": "blender", "id": 210, "trainId": 614}, + {"name": "cave entrance", "id": 435, "trainId": 615}, + {"name": "dental chair", "id": 718, "trainId": 616}, + {"name": "obelisk", "id": 1674, "trainId": 617}, + {"name": "canoe", "id": 388, "trainId": 618}, + {"name": "mobile", "id": 1572, "trainId": 619}, + {"name": "monitors", "id": 1584, "trainId": 620}, + {"name": "pool ball", "id": 1944, "trainId": 621}, + {"name": "cue rack", "id": 674, "trainId": 622}, + {"name": "baggage carts", "id": 99, "trainId": 623}, + {"name": "shore", "id": 2352, "trainId": 624}, + {"name": "fork", "id": 1019, "trainId": 625}, + {"name": "paper filer", "id": 1763, "trainId": 626}, + {"name": "bicycle rack", "id": 185, "trainId": 627}, + {"name": "coat rack", "id": 554, "trainId": 628}, + {"name": "garland", "id": 1066, "trainId": 629}, + {"name": "sports bag", "id": 2508, "trainId": 630}, + {"name": "fish tank", "id": 951, "trainId": 631}, + {"name": "towel dispenser", "id": 2822, "trainId": 632}, + {"name": "carriage", "id": 415, "trainId": 633}, + {"name": "brochure", "id": 297, "trainId": 634}, + {"name": "plaque", "id": 1914, "trainId": 635}, + {"name": "stringer", "id": 2619, "trainId": 636}, + {"name": "iron", "id": 1338, "trainId": 637}, + {"name": "spoon", "id": 2505, "trainId": 638}, + {"name": "flag pole", "id": 955, "trainId": 639}, + {"name": "toilet brush", "id": 2786, "trainId": 640}, + {"name": "book stand", "id": 238, "trainId": 641}, + {"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642}, + {"name": "ticket office", "id": 2763, "trainId": 643}, 
+ {"name": "broom", "id": 299, "trainId": 644}, + {"name": "dvd", "id": 822, "trainId": 645}, + {"name": "ice bucket", "id": 1288, "trainId": 646}, + {"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647}, + {"name": "tureen", "id": 2894, "trainId": 648}, + {"name": "folders", "id": 992, "trainId": 649}, + {"name": "chess", "id": 489, "trainId": 650}, + {"name": "root", "id": 2157, "trainId": 651}, + {"name": "sewing machine", "id": 2309, "trainId": 652}, + {"name": "model", "id": 1576, "trainId": 653}, + {"name": "pen", "id": 1810, "trainId": 654}, + {"name": "violin", "id": 2964, "trainId": 655}, + {"name": "sweatshirt", "id": 2662, "trainId": 656}, + {"name": "recycling materials", "id": 2087, "trainId": 657}, + {"name": "mitten", "id": 1569, "trainId": 658}, + {"name": "chopping board, cutting board", "id": 503, "trainId": 659}, + {"name": "mask", "id": 1505, "trainId": 660}, + {"name": "log", "id": 1468, "trainId": 661}, + {"name": "mouse, computer mouse", "id": 1613, "trainId": 662}, + {"name": "grill", "id": 1138, "trainId": 663}, + {"name": "hole", "id": 1256, "trainId": 664}, + {"name": "target", "id": 2715, "trainId": 665}, + {"name": "trash bag", "id": 2846, "trainId": 666}, + {"name": "chalk", "id": 477, "trainId": 667}, + {"name": "sticks", "id": 2576, "trainId": 668}, + {"name": "balloon", "id": 108, "trainId": 669}, + {"name": "score", "id": 2245, "trainId": 670}, + {"name": "hair spray", "id": 1162, "trainId": 671}, + {"name": "roll", "id": 2149, "trainId": 672}, + {"name": "runner", "id": 2183, "trainId": 673}, + {"name": "engine", "id": 858, "trainId": 674}, + {"name": "inflatable glove", "id": 1324, "trainId": 675}, + {"name": "games", "id": 1055, "trainId": 676}, + {"name": "pallets", "id": 1741, "trainId": 677}, + {"name": "baskets", "id": 149, "trainId": 678}, + {"name": "coop", "id": 615, "trainId": 679}, + {"name": "dvd player", "id": 825, "trainId": 680}, + {"name": "rocking horse", "id": 2143, "trainId": 681}, + {"name": 
"buckets", "id": 304, "trainId": 682}, + {"name": "bread rolls", "id": 283, "trainId": 683}, + {"name": "shawl", "id": 2322, "trainId": 684}, + {"name": "watering can", "id": 3017, "trainId": 685}, + {"name": "spotlights", "id": 2510, "trainId": 686}, + {"name": "post-it", "id": 1960, "trainId": 687}, + {"name": "bowls", "id": 265, "trainId": 688}, + {"name": "security camera", "id": 2282, "trainId": 689}, + {"name": "runner cloth", "id": 2184, "trainId": 690}, + {"name": "lock", "id": 1461, "trainId": 691}, + {"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692}, + {"name": "side", "id": 2372, "trainId": 693}, + {"name": "roulette", "id": 2166, "trainId": 694}, + {"name": "bone", "id": 232, "trainId": 695}, + {"name": "cutlery", "id": 693, "trainId": 696}, + {"name": "pool balls", "id": 1945, "trainId": 697}, + {"name": "wheels", "id": 3039, "trainId": 698}, + {"name": "spice rack", "id": 2494, "trainId": 699}, + {"name": "plant pots", "id": 1908, "trainId": 700}, + {"name": "towel ring", "id": 2827, "trainId": 701}, + {"name": "bread box", "id": 280, "trainId": 702}, + {"name": "video", "id": 2950, "trainId": 703}, + {"name": "funfair", "id": 1044, "trainId": 704}, + {"name": "breads", "id": 288, "trainId": 705}, + {"name": "tripod", "id": 2863, "trainId": 706}, + {"name": "ironing board", "id": 1342, "trainId": 707}, + {"name": "skimmer", "id": 2409, "trainId": 708}, + {"name": "hollow", "id": 1258, "trainId": 709}, + {"name": "scratching post", "id": 2249, "trainId": 710}, + {"name": "tricycle", "id": 2862, "trainId": 711}, + {"name": "file box", "id": 920, "trainId": 712}, + {"name": "mountain pass", "id": 1607, "trainId": 713}, + {"name": "tombstones", "id": 2802, "trainId": 714}, + {"name": "cooker", "id": 610, "trainId": 715}, + {"name": "card game, cards", "id": 3129, "trainId": 716}, + {"name": "golf bag", "id": 1108, "trainId": 717}, + {"name": "towel paper", "id": 2823, "trainId": 718}, + {"name": "chaise lounge", "id": 476, 
"trainId": 719}, + {"name": "sun", "id": 2641, "trainId": 720}, + {"name": "toilet paper holder", "id": 2788, "trainId": 721}, + {"name": "rake", "id": 2070, "trainId": 722}, + {"name": "key", "id": 1368, "trainId": 723}, + {"name": "umbrella stand", "id": 2903, "trainId": 724}, + {"name": "dartboard", "id": 699, "trainId": 725}, + {"name": "transformer", "id": 2844, "trainId": 726}, + {"name": "fireplace utensils", "id": 942, "trainId": 727}, + {"name": "sweatshirts", "id": 2663, "trainId": 728}, + { + "name": "cellular telephone, cellular phone, cellphone, cell, mobile phone", + "id": 457, + "trainId": 729, + }, + {"name": "tallboy", "id": 2701, "trainId": 730}, + {"name": "stapler", "id": 2540, "trainId": 731}, + {"name": "sauna", "id": 2231, "trainId": 732}, + {"name": "test tube", "id": 2746, "trainId": 733}, + {"name": "palette", "id": 1738, "trainId": 734}, + {"name": "shopping carts", "id": 2350, "trainId": 735}, + {"name": "tools", "id": 2808, "trainId": 736}, + {"name": "push button, push, button", "id": 2025, "trainId": 737}, + {"name": "star", "id": 2541, "trainId": 738}, + {"name": "roof rack", "id": 2156, "trainId": 739}, + {"name": "barbed wire", "id": 126, "trainId": 740}, + {"name": "spray", "id": 2512, "trainId": 741}, + {"name": "ear", "id": 831, "trainId": 742}, + {"name": "sponge", "id": 2503, "trainId": 743}, + {"name": "racket", "id": 2039, "trainId": 744}, + {"name": "tins", "id": 2774, "trainId": 745}, + {"name": "eyeglasses", "id": 886, "trainId": 746}, + {"name": "file", "id": 919, "trainId": 747}, + {"name": "scarfs", "id": 2240, "trainId": 748}, + {"name": "sugar bowl", "id": 2636, "trainId": 749}, + {"name": "flip flop", "id": 963, "trainId": 750}, + {"name": "headstones", "id": 1218, "trainId": 751}, + {"name": "laptop bag", "id": 1406, "trainId": 752}, + {"name": "leash", "id": 1420, "trainId": 753}, + {"name": "climbing frame", "id": 526, "trainId": 754}, + {"name": "suit hanger", "id": 2639, "trainId": 755}, + {"name": "floor 
spotlight", "id": 975, "trainId": 756}, + {"name": "plate rack", "id": 1921, "trainId": 757}, + {"name": "sewer", "id": 2305, "trainId": 758}, + {"name": "hard drive", "id": 1193, "trainId": 759}, + {"name": "sprinkler", "id": 2517, "trainId": 760}, + {"name": "tools box", "id": 2809, "trainId": 761}, + {"name": "necklace", "id": 1647, "trainId": 762}, + {"name": "bulbs", "id": 314, "trainId": 763}, + {"name": "steel industry", "id": 2560, "trainId": 764}, + {"name": "club", "id": 545, "trainId": 765}, + {"name": "jack", "id": 1345, "trainId": 766}, + {"name": "door bars", "id": 775, "trainId": 767}, + { + "name": "control panel, instrument panel, control board, board, panel", + "id": 603, + "trainId": 768, + }, + {"name": "hairbrush", "id": 1163, "trainId": 769}, + {"name": "napkin holder", "id": 1641, "trainId": 770}, + {"name": "office", "id": 1678, "trainId": 771}, + {"name": "smoke detector", "id": 2450, "trainId": 772}, + {"name": "utensils", "id": 2915, "trainId": 773}, + {"name": "apron", "id": 42, "trainId": 774}, + {"name": "scissors", "id": 2242, "trainId": 775}, + {"name": "terminal", "id": 2741, "trainId": 776}, + {"name": "grinder", "id": 1143, "trainId": 777}, + {"name": "entry phone", "id": 862, "trainId": 778}, + {"name": "newspaper stand", "id": 1654, "trainId": 779}, + {"name": "pepper shaker", "id": 1826, "trainId": 780}, + {"name": "onions", "id": 1689, "trainId": 781}, + { + "name": "central processing unit, cpu, c p u , central processor, processor, mainframe", + "id": 3124, + "trainId": 782, + }, + {"name": "tape", "id": 2710, "trainId": 783}, + {"name": "bat", "id": 152, "trainId": 784}, + {"name": "coaster", "id": 549, "trainId": 785}, + {"name": "calculator", "id": 360, "trainId": 786}, + {"name": "potatoes", "id": 1982, "trainId": 787}, + {"name": "luggage rack", "id": 1478, "trainId": 788}, + {"name": "salt", "id": 2203, "trainId": 789}, + {"name": "street number", "id": 2612, "trainId": 790}, + {"name": "viewpoint", "id": 2956, 
"trainId": 791}, + {"name": "sword", "id": 2681, "trainId": 792}, + {"name": "cd", "id": 437, "trainId": 793}, + {"name": "rowing machine", "id": 2171, "trainId": 794}, + {"name": "plug", "id": 1933, "trainId": 795}, + {"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796}, + {"name": "pepper", "id": 1824, "trainId": 797}, + {"name": "tongs", "id": 2803, "trainId": 798}, + {"name": "bonfire", "id": 234, "trainId": 799}, + {"name": "dog dish", "id": 764, "trainId": 800}, + {"name": "belt", "id": 177, "trainId": 801}, + {"name": "dumbbells", "id": 817, "trainId": 802}, + {"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803}, + {"name": "hook", "id": 1262, "trainId": 804}, + {"name": "envelopes", "id": 864, "trainId": 805}, + {"name": "shower faucet", "id": 2359, "trainId": 806}, + {"name": "watch", "id": 2992, "trainId": 807}, + {"name": "padlock", "id": 1725, "trainId": 808}, + {"name": "swimming pool ladder", "id": 2667, "trainId": 809}, + {"name": "spanners", "id": 2484, "trainId": 810}, + {"name": "gravy boat", "id": 1133, "trainId": 811}, + {"name": "notice board", "id": 1667, "trainId": 812}, + {"name": "trash bags", "id": 2847, "trainId": 813}, + {"name": "fire alarm", "id": 932, "trainId": 814}, + {"name": "ladle", "id": 1392, "trainId": 815}, + {"name": "stethoscope", "id": 2573, "trainId": 816}, + {"name": "rocket", "id": 2140, "trainId": 817}, + {"name": "funnel", "id": 1046, "trainId": 818}, + {"name": "bowling pins", "id": 264, "trainId": 819}, + {"name": "valve", "id": 2927, "trainId": 820}, + {"name": "thermometer", "id": 2752, "trainId": 821}, + {"name": "cups", "id": 679, "trainId": 822}, + {"name": "spice jar", "id": 2493, "trainId": 823}, + {"name": "night light", "id": 1658, "trainId": 824}, + {"name": "soaps", "id": 2466, "trainId": 825}, + {"name": "games table", "id": 1057, "trainId": 826}, + {"name": "slotted spoon", "id": 2444, "trainId": 827}, + {"name": "reel", "id": 2093, "trainId": 828}, + {"name": 
def loadAde20K(file):
    """Load one ADE20K-full image/segmentation pair.

    Parameters
    ----------
    file : str
        Path to the ``*.jpg`` image; the segmentation is expected to live
        next to it as ``*_seg.png``.

    Returns
    -------
    dict
        ``img_name`` (the input path), ``segm_name`` (the seg PNG path) and
        ``class_mask``: the object class id per pixel, decoded from the RGB
        label image as ``(R / 10) * 256 + G``.
    """
    fileseg = file.replace(".jpg", "_seg.png")
    # NOTE: variable renamed from `io`, which shadowed the stdlib module name.
    with Image.open(fileseg) as seg_img:
        seg = np.array(seg_img)

    # ADE20K encodes the object class in the R and G channels.
    R = seg[:, :, 0]
    G = seg[:, :, 1]
    ObjectClassMasks = (R / 10).astype(np.int32) * 256 + (G.astype(np.int32))

    return {"img_name": file, "segm_name": fileseg, "class_mask": ObjectClassMasks}


if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    index_file = dataset_dir / "ADE20K_2021_17_01" / "index_ade20k.pkl"
    print('Caution: we only generate the validation set!')
    with open(index_file, "rb") as f:
        index_ade20k = pkl.load(f)

    # Map raw dataset ids -> contiguous train ids.
    id_map = {cat["id"]: cat["trainId"] for cat in ADE20K_SEM_SEG_FULL_CATEGORIES}

    # Make output dirs.
    for name in ["training", "validation"]:
        image_dir = dataset_dir / "ADE20K_2021_17_01" / "images_detectron2" / name
        image_dir.mkdir(parents=True, exist_ok=True)
        annotation_dir = dataset_dir / "ADE20K_2021_17_01" / "annotations_detectron2" / name
        annotation_dir.mkdir(parents=True, exist_ok=True)

    # Process image and gt (the unused enumerate index was dropped).
    for folder_name, file_name in tqdm.tqdm(
        zip(index_ade20k["folder"], index_ade20k["filename"]),
        total=len(index_ade20k["filename"]),
    ):
        split = "validation" if file_name.split("_")[1] == "val" else "training"
        if split == 'training':
            # FIXME: If you want to generate training set, delete this condition
            continue
        info = loadAde20K(str(dataset_dir / folder_name / file_name))

        # Resize image and label so the short side is at most 512 px.
        img = np.asarray(Image.open(info["img_name"]))
        lab = np.asarray(info["class_mask"])

        h, w = img.shape[0], img.shape[1]
        max_size = 512
        resize = True
        if w >= h > max_size:
            h_new, w_new = max_size, round(w / float(h) * max_size)
        elif h >= w > max_size:
            h_new, w_new = round(h / float(w) * max_size), max_size
        else:
            resize = False

        if resize:
            img = cv2.resize(img, (w_new, h_new), interpolation=cv2.INTER_LINEAR)
            lab = cv2.resize(lab, (w_new, h_new), interpolation=cv2.INTER_NEAREST)

        assert img.dtype == np.uint8
        assert lab.dtype == np.int32

        # Apply label conversion and save into uint16 images; unmapped ids
        # stay at the uint16 ignore value 65535.
        output = np.zeros_like(lab, dtype=np.uint16) + 65535
        for obj_id in np.unique(lab):
            if obj_id in id_map:
                output[lab == obj_id] = id_map[obj_id]

        output_img = dataset_dir / "ADE20K_2021_17_01" / "images_detectron2" / split / file_name
        output_lab = (
            dataset_dir
            / "ADE20K_2021_17_01"
            / "annotations_detectron2"
            / split
            / file_name.replace(".jpg", ".tif")
        )
        Image.fromarray(img).save(output_img)

        assert output.dtype == np.uint16
        Image.fromarray(output).save(output_lab)
def convert(input, output, index=None):
    """Convert an ADE20K annotation PNG to detectron2's train-id format.

    The stored labels use 0 for "ignore" and 1..N for classes; subtracting 1
    (with uint8 wrap-around) turns them into 255 (ignore) and 0..N-1.

    Parameters
    ----------
    input, output : path-like
        Source annotation PNG and destination path.
    index : sequence of int, optional
        When given, label value ``index[k]`` is remapped to ``k``; values not
        present in ``index`` become 255 (ignore).
    """
    img = np.asarray(Image.open(input))
    assert img.dtype == np.uint8
    img = img - 1  # 0 (ignore) becomes 255. others are shifted by 1
    if index is not None:
        mapping = {i: k for k, i in enumerate(index)}
        # BUGFIX: `np.float` was removed from NumPy (>=1.24), so the old
        # `img.astype(np.float)` raises AttributeError.  Labels are integers,
        # so look them up as int64 instead.
        img = np.vectorize(lambda x: mapping[x] if x in mapping else 255)(
            img.astype(np.int64)
        ).astype(np.uint8)
    Image.fromarray(img).save(output)


if __name__ == "__main__":
    dataset_dir = (
        Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
    )
    print('Caution: we only generate the validation set!')
    for name in ["validation"]:
        annotation_dir = dataset_dir / "annotations" / name
        output_dir = dataset_dir / "annotations_detectron2" / name
        output_dir.mkdir(parents=True, exist_ok=True)
        for file in tqdm.tqdm(list(annotation_dir.iterdir())):
            output_file = output_dir / file.name
            convert(file, output_file)
# COCO-Stuff-164k class ids are not contiguous: 11 of the 182 raw ids are
# unused.  Map the remaining ids to contiguous train ids 0..170 and keep the
# ignore label 255 fixed.  (This reproduces the original hand-written table.)
_UNUSED_CLS_IDS = {11, 25, 28, 29, 44, 65, 67, 68, 70, 82, 90}

full_clsID_to_trID = {
    clsID: trID
    for trID, clsID in enumerate(
        i for i in range(182) if i not in _UNUSED_CLS_IDS
    )
}
full_clsID_to_trID[255] = 255


def convert_to_trainID(
    maskpath, out_mask_dir, is_train, clsID_to_trID=full_clsID_to_trID, suffix=""
):
    """Rewrite one stuffthingmaps PNG with train ids instead of class ids.

    Pixels whose class id is not in ``clsID_to_trID`` become 255 (ignore).
    Masks that end up fully ignored are skipped (nothing is written).
    """
    mask = np.array(Image.open(maskpath))
    mask_copy = np.full_like(mask, 255, dtype=np.uint8)
    for clsID, trID in clsID_to_trID.items():
        mask_copy[mask == clsID] = trID
    seg_filename = (
        osp.join(out_mask_dir, "train2017" + suffix, osp.basename(maskpath))
        if is_train
        else osp.join(out_mask_dir, "val2017" + suffix, osp.basename(maskpath))
    )
    # Hoisted: compute the unique values once instead of twice.
    unique_vals = np.unique(mask_copy)
    if len(unique_vals) == 1 and unique_vals[0] == 255:
        return
    Image.fromarray(mask_copy).save(seg_filename, "PNG")


if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the training set!')
    coco_path = dataset_dir / "coco"
    mask_dir = coco_path / "stuffthingmaps"
    out_mask_dir = coco_path / "stuffthingmaps_detectron2"
    for name in ["train2017"]:
        os.makedirs((out_mask_dir / name), exist_ok=True)
    train_list = glob(osp.join(mask_dir, "train2017", "*.png"))
    for file in tqdm.tqdm(train_list):
        convert_to_trainID(file, out_mask_dir, is_train=True)
def convert_pc59(mask_path, new_mask_path, pc59_dict):
    """Write the 59-class Pascal-Context mask for one .mat annotation.

    ``pc59_dict`` maps train id -> raw class id; pixels whose raw label is
    not in the mapping become 255 (ignore).
    """
    mat = scipy.io.loadmat(mask_path)
    mask = mat['LabelMap']

    mask_copy = np.ones_like(mask, dtype=np.uint8) * 255
    for trID, clsID in pc59_dict.items():
        mask_copy[mask == clsID] = trID

    min_value = np.amin(mask_copy)
    # BUGFIX: the assert message used to be `print(min_value)`, which
    # evaluates to None and reads as "AssertionError: None" when it fires.
    assert min_value >= 0, f"negative label {min_value}"
    Image.fromarray(mask_copy).save(new_mask_path, "PNG")


def convert_pc459(mask_path, new_mask_path):
    """Write the 459-class Pascal-Context mask (raw 1-based labels shifted by -1)."""
    mat = scipy.io.loadmat(mask_path)
    mask = mat['LabelMap']
    mask = mask - 1
    min_value = np.amin(mask)
    assert min_value >= 0, f"negative label {min_value}"
    Image.fromarray(mask).save(new_mask_path, "TIFF")


if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the validation set!')
    pc_path = dataset_dir / "VOCdevkit/VOC2010"

    # BUGFIX: the three files were opened and never closed; use context
    # managers so the handles are released.
    with open(pc_path / "pascalcontext_val.txt", "r") as f:
        val_ids = [line.strip() for line in f]

    # labels.txt has one "<id>: <name>" entry per line -> name -> raw 459 id.
    pc459_dict = {}
    with open(pc_path / "labels.txt", "r") as pc459_labels:
        for line in pc459_labels:
            if ':' in line:
                idx, name = line.split(':')
                pc459_dict[name.strip()] = int(idx.strip())

    # 59_labels.txt: line number -> raw class id (via the name lookup).
    pc59_dict = {}
    with open(pc_path / "59_labels.txt", "r") as pc59_labels:
        for i, line in enumerate(pc59_labels):
            name = line.split(':')[-1].strip()
            # BUGFIX: was `name is not ''` — identity comparison with a
            # literal (SyntaxWarning, implementation-defined).
            if name != '':
                pc59_dict[i] = pc459_dict[name]

    pc459_dir = pc_path / "annotations_detectron2" / "pc459_val"
    pc459_dir.mkdir(parents=True, exist_ok=True)
    pc59_dir = pc_path / "annotations_detectron2" / "pc59_val"
    pc59_dir.mkdir(parents=True, exist_ok=True)

    for fileid in tqdm.tqdm(val_ids):
        ori_mask = f'{pc_path}/trainval/{fileid}.mat'
        pc459_dst = f'{pc459_dir}/{fileid}.tif'
        pc59_dst = f'{pc59_dir}/{fileid}.png'
        if osp.exists(ori_mask):
            convert_pc459(ori_mask, pc459_dst)
            convert_pc59(ori_mask, pc59_dst, pc59_dict)
# VOC class ids 1..20 map to contiguous train ids 0..19; the background
# label (0) and the existing ignore value (255) both map to ignore (255).
# (This reproduces the original hand-written table.)
clsID_to_trID = {0: 255, **{i: i - 1 for i in range(1, 21)}, 255: 255}


def convert_to_trainID(
    maskpath, out_mask_dir, is_train, clsID_to_trID=clsID_to_trID, suffix=""
):
    """Rewrite one VOC segmentation PNG with train ids instead of class ids.

    Pixels whose class id is not in ``clsID_to_trID`` become 255 (ignore).
    Masks that end up fully ignored are skipped (nothing is written).
    """
    mask = np.array(Image.open(maskpath))
    mask_copy = np.full_like(mask, 255, dtype=np.uint8)
    for clsID, trID in clsID_to_trID.items():
        mask_copy[mask == clsID] = trID
    seg_filename = (
        osp.join(out_mask_dir, "train" + suffix, osp.basename(maskpath))
        if is_train
        else osp.join(out_mask_dir, "val" + suffix, osp.basename(maskpath))
    )
    # Hoisted: compute the unique values once instead of twice.
    unique_vals = np.unique(mask_copy)
    if len(unique_vals) == 1 and unique_vals[0] == 255:
        return
    Image.fromarray(mask_copy).save(seg_filename, "PNG")


if __name__ == "__main__":
    dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
    print('Caution: we only generate the validation set!')
    voc_path = dataset_dir / "VOCdevkit" / "VOC2012"
    out_mask_dir = voc_path / "annotations_detectron2"
    out_image_dir = voc_path / "images_detectron2"
    for name in ["val"]:
        os.makedirs((out_mask_dir / name), exist_ok=True)
        os.makedirs((out_image_dir / name), exist_ok=True)
    val_list = [
        osp.join(voc_path, "SegmentationClassAug", f + ".png")
        # BUGFIX: `np.str` was removed from NumPy (>=1.24); the builtin
        # `str` is the documented drop-in replacement for the old alias.
        for f in np.loadtxt(
            osp.join(voc_path, "ImageSets/Segmentation/val.txt"), dtype=str
        ).tolist()
    ]
    for file in tqdm.tqdm(val_list):
        convert_to_trainID(file, out_mask_dir, is_train=False)
'counter', 'desk', 'curtain', 'refrigerator', + 'shower curtain', 'toilet', 'sink', 'bathtub', 'otherfurniture') + +SCANNET_COLOR_MAP_20 = { + 0: (0., 0., 0.), + 1: (174., 199., 232.), + 2: (152., 223., 138.), + 3: (31., 119., 180.), + 4: (255., 187., 120.), + 5: (188., 189., 34.), + 6: (140., 86., 75.), + 7: (255., 152., 150.), + 8: (214., 39., 40.), + 9: (197., 176., 213.), + 10: (148., 103., 189.), + 11: (196., 156., 148.), + 12: (23., 190., 207.), + 14: (247., 182., 210.), + 15: (66., 188., 102.), + 16: (219., 219., 141.), + 17: (140., 57., 197.), + 18: (202., 185., 52.), + 19: (51., 176., 203.), + 20: (200., 54., 131.), + 21: (92., 193., 61.), + 22: (78., 71., 183.), + 23: (172., 114., 82.), + 24: (255., 127., 14.), + 25: (91., 163., 138.), + 26: (153., 98., 156.), + 27: (140., 153., 101.), + 28: (158., 218., 229.), + 29: (100., 125., 154.), + 30: (178., 127., 135.), + 32: (146., 111., 194.), + 33: (44., 160., 44.), + 34: (112., 128., 144.), + 35: (96., 207., 209.), + 36: (227., 119., 194.), + 37: (213., 92., 176.), + 38: (94., 106., 211.), + 39: (82., 84., 163.), + 40: (100., 85., 144.), +} + +# ScanNet200 Benchmark constants +VALID_CLASS_IDS_200 = ( + 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, + 36, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, 66, 67, 68, 69, + 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 87, 88, 89, 90, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, + 104, 105, 106, 107, 110, 112, 115, 116, 118, 120, 121, 122, 125, 128, 130, 131, 132, 134, 136, 138, 139, 140, 141, + 145, 148, 154, 155, 156, 157, 159, 161, 163, 165, 166, 168, 169, 170, 177, 180, 185, 188, 191, 193, 195, 202, 208, + 213, 214, 221, 229, 230, 232, 233, 242, 250, 261, 264, 276, 283, 286, 300, 304, 312, 323, 325, 331, 342, 356, 370, + 392, 395, 399, 408, 417, 488, 540, 562, 570, 572, 581, 609, 748, 776, 1156, 1163, 1164, 1165, 1166, 1167, 1168, + 
1169, 1170, 1171, 1172, 1173, 1174, 1175, 1176, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, + 1189, 1190, 1191) + +CLASS_LABELS_200 = ( + 'wall', 'chair', 'floor', 'table', 'door', 'couch', 'cabinet', 'shelf', 'desk', 'office chair', 'bed', 'pillow', + 'sink', 'picture', 'window', 'toilet', 'bookshelf', 'monitor', 'curtain', 'book', 'armchair', 'coffee table', 'box', + 'refrigerator', 'lamp', 'kitchen cabinet', 'towel', 'clothes', 'tv', 'nightstand', 'counter', 'dresser', 'stool', + 'cushion', 'plant', 'ceiling', 'bathtub', 'end table', 'dining table', 'keyboard', 'bag', 'backpack', + 'toilet paper', 'printer', 'tv stand', 'whiteboard', 'blanket', 'shower curtain', 'trash can', 'closet', 'stairs', + 'microwave', 'stove', 'shoe', 'computer tower', 'bottle', 'bin', 'ottoman', 'bench', 'board', 'washing machine', + 'mirror', 'copier', 'basket', 'sofa chair', 'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person', + 'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', 'blackboard', 'piano', 'suitcase', 'rail', 'radiator', + 'recycling bin', 'container', 'wardrobe', 'soap dispenser', 'telephone', 'bucket', 'clock', 'stand', 'light', + 'laundry basket', 'pipe', 'clothes dryer', 'guitar', 'toilet paper holder', 'seat', 'speaker', 'column', 'bicycle', + 'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket', 'storage bin', 'coffee maker', 'dishwasher', + 'paper towel roll', 'machine', 'mat', 'windowsill', 'bar', 'toaster', 'bulletin board', 'ironing board', + 'fireplace', 'soap dish', 'kitchen counter', 'doorframe', 'toilet paper dispenser', 'mini fridge', + 'fire extinguisher', 'ball', 'hat', 'shower curtain rod', 'water cooler', 'paper cutter', 'tray', 'shower door', + 'pillar', 'ledge', 'toaster oven', 'mouse', 'toilet seat cover dispenser', 'furniture', 'cart', 'storage container', + 'scale', 'tissue box', 'light switch', 'crate', 'power outlet', 'decoration', 'sign', 'projector', 'closet door', + 'vacuum cleaner', 'candle', 
'plunger', 'stuffed animal', 'headphones', 'dish rack', 'broom', 'guitar case', + 'range hood', 'dustpan', 'hair dryer', 'water bottle', 'handicap bar', 'purse', 'vent', 'shower floor', + 'water pitcher', 'mailbox', 'bowl', 'paper bag', 'alarm clock', 'music stand', 'projector screen', 'divider', + 'laundry detergent', 'bathroom counter', 'object', 'bathroom vanity', 'closet wall', 'laundry hamper', + 'bathroom stall door', 'ceiling light', 'trash bin', 'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', + 'cd case', 'closet rod', 'coffee kettle', 'structure', 'shower head', 'keyboard piano', 'case of water bottles', + 'coat rack', 'storage organizer', 'folded chair', 'fire alarm', 'power strip', 'calendar', 'poster', 'potted plant', + 'luggage', 'mattress') + +SCANNET_COLOR_MAP_200 = { + 0: (0., 0., 0.), + 1: (174., 199., 232.), + 2: (188., 189., 34.), + 3: (152., 223., 138.), + 4: (255., 152., 150.), + 5: (214., 39., 40.), + 6: (91., 135., 229.), + 7: (31., 119., 180.), + 8: (229., 91., 104.), + 9: (247., 182., 210.), + 10: (91., 229., 110.), + 11: (255., 187., 120.), + 13: (141., 91., 229.), + 14: (112., 128., 144.), + 15: (196., 156., 148.), + 16: (197., 176., 213.), + 17: (44., 160., 44.), + 18: (148., 103., 189.), + 19: (229., 91., 223.), + 21: (219., 219., 141.), + 22: (192., 229., 91.), + 23: (88., 218., 137.), + 24: (58., 98., 137.), + 26: (177., 82., 239.), + 27: (255., 127., 14.), + 28: (237., 204., 37.), + 29: (41., 206., 32.), + 31: (62., 143., 148.), + 32: (34., 14., 130.), + 33: (143., 45., 115.), + 34: (137., 63., 14.), + 35: (23., 190., 207.), + 36: (16., 212., 139.), + 38: (90., 119., 201.), + 39: (125., 30., 141.), + 40: (150., 53., 56.), + 41: (186., 197., 62.), + 42: (227., 119., 194.), + 44: (38., 100., 128.), + 45: (120., 31., 243.), + 46: (154., 59., 103.), + 47: (169., 137., 78.), + 48: (143., 245., 111.), + 49: (37., 230., 205.), + 50: (14., 16., 155.), + 51: (196., 51., 182.), + 52: (237., 80., 38.), + 54: (138., 175., 62.), + 55: 
(158., 218., 229.), + 56: (38., 96., 167.), + 57: (190., 77., 246.), + 58: (208., 49., 84.), + 59: (208., 193., 72.), + 62: (55., 220., 57.), + 63: (10., 125., 140.), + 64: (76., 38., 202.), + 65: (191., 28., 135.), + 66: (211., 120., 42.), + 67: (118., 174., 76.), + 68: (17., 242., 171.), + 69: (20., 65., 247.), + 70: (208., 61., 222.), + 71: (162., 62., 60.), + 72: (210., 235., 62.), + 73: (45., 152., 72.), + 74: (35., 107., 149.), + 75: (160., 89., 237.), + 76: (227., 56., 125.), + 77: (169., 143., 81.), + 78: (42., 143., 20.), + 79: (25., 160., 151.), + 80: (82., 75., 227.), + 82: (253., 59., 222.), + 84: (240., 130., 89.), + 86: (123., 172., 47.), + 87: (71., 194., 133.), + 88: (24., 94., 205.), + 89: (134., 16., 179.), + 90: (159., 32., 52.), + 93: (213., 208., 88.), + 95: (64., 158., 70.), + 96: (18., 163., 194.), + 97: (65., 29., 153.), + 98: (177., 10., 109.), + 99: (152., 83., 7.), + 100: (83., 175., 30.), + 101: (18., 199., 153.), + 102: (61., 81., 208.), + 103: (213., 85., 216.), + 104: (170., 53., 42.), + 105: (161., 192., 38.), + 106: (23., 241., 91.), + 107: (12., 103., 170.), + 110: (151., 41., 245.), + 112: (133., 51., 80.), + 115: (184., 162., 91.), + 116: (50., 138., 38.), + 118: (31., 237., 236.), + 120: (39., 19., 208.), + 121: (223., 27., 180.), + 122: (254., 141., 85.), + 125: (97., 144., 39.), + 128: (106., 231., 176.), + 130: (12., 61., 162.), + 131: (124., 66., 140.), + 132: (137., 66., 73.), + 134: (250., 253., 26.), + 136: (55., 191., 73.), + 138: (60., 126., 146.), + 139: (153., 108., 234.), + 140: (184., 58., 125.), + 141: (135., 84., 14.), + 145: (139., 248., 91.), + 148: (53., 200., 172.), + 154: (63., 69., 134.), + 155: (190., 75., 186.), + 156: (127., 63., 52.), + 157: (141., 182., 25.), + 159: (56., 144., 89.), + 161: (64., 160., 250.), + 163: (182., 86., 245.), + 165: (139., 18., 53.), + 166: (134., 120., 54.), + 168: (49., 165., 42.), + 169: (51., 128., 133.), + 170: (44., 21., 163.), + 177: (232., 93., 193.), + 180: (176., 
102., 54.), + 185: (116., 217., 17.), + 188: (54., 209., 150.), + 191: (60., 99., 204.), + 193: (129., 43., 144.), + 195: (252., 100., 106.), + 202: (187., 196., 73.), + 208: (13., 158., 40.), + 213: (52., 122., 152.), + 214: (128., 76., 202.), + 221: (187., 50., 115.), + 229: (180., 141., 71.), + 230: (77., 208., 35.), + 232: (72., 183., 168.), + 233: (97., 99., 203.), + 242: (172., 22., 158.), + 250: (155., 64., 40.), + 261: (118., 159., 30.), + 264: (69., 252., 148.), + 276: (45., 103., 173.), + 283: (111., 38., 149.), + 286: (184., 9., 49.), + 300: (188., 174., 67.), + 304: (53., 206., 53.), + 312: (97., 235., 252.), + 323: (66., 32., 182.), + 325: (236., 114., 195.), + 331: (241., 154., 83.), + 342: (133., 240., 52.), + 356: (16., 205., 144.), + 370: (75., 101., 198.), + 392: (237., 95., 251.), + 395: (191., 52., 49.), + 399: (227., 254., 54.), + 408: (49., 206., 87.), + 417: (48., 113., 150.), + 488: (125., 73., 182.), + 540: (229., 32., 114.), + 562: (158., 119., 28.), + 570: (60., 205., 27.), + 572: (18., 215., 201.), + 581: (79., 76., 153.), + 609: (134., 13., 116.), + 748: (192., 97., 63.), + 776: (108., 163., 18.), + 1156: (95., 220., 156.), + 1163: (98., 141., 208.), + 1164: (144., 19., 193.), + 1165: (166., 36., 57.), + 1166: (212., 202., 34.), + 1167: (23., 206., 34.), + 1168: (91., 211., 236.), + 1169: (79., 55., 137.), + 1170: (182., 19., 117.), + 1171: (134., 76., 14.), + 1172: (87., 185., 28.), + 1173: (82., 224., 187.), + 1174: (92., 110., 214.), + 1175: (168., 80., 171.), + 1176: (197., 63., 51.), + 1178: (175., 199., 77.), + 1179: (62., 180., 98.), + 1180: (8., 91., 150.), + 1181: (77., 15., 130.), + 1182: (154., 65., 96.), + 1183: (197., 152., 11.), + 1184: (59., 155., 45.), + 1185: (12., 147., 145.), + 1186: (54., 35., 219.), + 1187: (210., 73., 181.), + 1188: (221., 124., 77.), + 1189: (149., 214., 66.), + 1190: (72., 185., 134.), + 1191: (42., 94., 198.), +} + +# For instance segmentation the non-object categories +VALID_PANOPTIC_IDS = (1, 
3) + +CLASS_LABELS_PANOPTIC = ('wall', 'floor') diff --git a/datasets/scannet_preprocess/meta_data/scannet200_splits.py b/datasets/scannet_preprocess/meta_data/scannet200_splits.py new file mode 100644 index 0000000000000000000000000000000000000000..9e66fc81f2c48e7df5ce328dc89f11ad3f4eb98a --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannet200_splits.py @@ -0,0 +1,18 @@ +# This file contains the HEAD - COMMON - TAIL split category ids for ScanNet 200 + +HEAD_CATS_SCANNET_200 = ['tv stand', 'curtain', 'blinds', 'shower curtain', 'bookshelf', 'tv', 'kitchen cabinet', 'pillow', 'lamp', 'dresser', 'monitor', 'object', 'ceiling', 'board', 'stove', 'closet wall', 'couch', 'office chair', 'kitchen counter', 'shower', 'closet', 'doorframe', 'sofa chair', 'mailbox', 'nightstand', 'washing machine', 'picture', 'book', 'sink', 'recycling bin', 'table', 'backpack', 'shower wall', 'toilet', 'copier', 'counter', 'stool', 'refrigerator', 'window', 'file cabinet', 'chair', 'wall', 'plant', 'coffee table', 'stairs', 'armchair', 'cabinet', 'bathroom vanity', 'bathroom stall', 'mirror', 'blackboard', 'trash can', 'stair rail', 'box', 'towel', 'door', 'clothes', 'whiteboard', 'bed', 'floor', 'bathtub', 'desk', 'wardrobe', 'clothes dryer', 'radiator', 'shelf'] +COMMON_CATS_SCANNET_200 = ["cushion", "end table", "dining table", "keyboard", "bag", "toilet paper", "printer", "blanket", "microwave", "shoe", "computer tower", "bottle", "bin", "ottoman", "bench", "basket", "fan", "laptop", "person", "paper towel dispenser", "oven", "rack", "piano", "suitcase", "rail", "container", "telephone", "stand", "light", "laundry basket", "pipe", "seat", "column", "bicycle", "ladder", "jacket", "storage bin", "coffee maker", "dishwasher", "machine", "mat", "windowsill", "bulletin board", "fireplace", "mini fridge", "water cooler", "shower door", "pillar", "ledge", "furniture", "cart", "decoration", "closet door", "vacuum cleaner", "dish rack", "range hood", "projector screen", "divider", 
"bathroom counter", "laundry hamper", "bathroom stall door", "ceiling light", "trash bin", "bathroom cabinet", "structure", "storage organizer", "potted plant", "mattress"] +TAIL_CATS_SCANNET_200 = ["paper", "plate", "soap dispenser", "bucket", "clock", "guitar", "toilet paper holder", "speaker", "cup", "paper towel roll", "bar", "toaster", "ironing board", "soap dish", "toilet paper dispenser", "fire extinguisher", "ball", "hat", "shower curtain rod", "paper cutter", "tray", "toaster oven", "mouse", "toilet seat cover dispenser", "storage container", "scale", "tissue box", "light switch", "crate", "power outlet", "sign", "projector", "candle", "plunger", "stuffed animal", "headphones", "broom", "guitar case", "dustpan", "hair dryer", "water bottle", "handicap bar", "purse", "vent", "shower floor", "water pitcher", "bowl", "paper bag", "alarm clock", "music stand", "laundry detergent", "dumbbell", "tube", "cd case", "closet rod", "coffee kettle", "shower head", "keyboard piano", "case of water bottles", "coat rack", "folded chair", "fire alarm", "power strip", "calendar", "poster", "luggage"] + + +# Given the different size of the official train and val sets, not all ScanNet200 categories are present in the validation set. 
+# Here we list the categories (labels and IDs) present in both the train and validation sets, and the remaining categories that are present in train but not in val +# We don't evaluate on unseen validation categories in this benchmark + +VALID_CLASS_IDS_200_VALIDATION = ('wall', 'chair', 'floor', 'table', 'door', 'couch', 'cabinet', 'shelf', 'desk', 'office chair', 'bed', 'pillow', 'sink', 'picture', 'window', 'toilet', 'bookshelf', 'monitor', 'curtain', 'book', 'armchair', 'coffee table', 'box', 'refrigerator', 'lamp', 'kitchen cabinet', 'towel', 'clothes', 'tv', 'nightstand', 'counter', 'dresser', 'stool', 'cushion', 'plant', 'ceiling', 'bathtub', 'end table', 'dining table', 'keyboard', 'bag', 'backpack', 'toilet paper', 'printer', 'tv stand', 'whiteboard', 'blanket', 'shower curtain', 'trash can', 'closet', 'stairs', 'microwave', 'stove', 'shoe', 'computer tower', 'bottle', 'bin', 'ottoman', 'bench', 'board', 'washing machine', 'mirror', 'copier', 'basket', 'sofa chair', 'file cabinet', 'fan', 'laptop', 'shower', 'paper', 'person', 'paper towel dispenser', 'oven', 'blinds', 'rack', 'plate', 'blackboard', 'piano', 'suitcase', 'rail', 'radiator', 'recycling bin', 'container', 'wardrobe', 'soap dispenser', 'telephone', 'bucket', 'clock', 'stand', 'light', 'laundry basket', 'pipe', 'clothes dryer', 'guitar', 'toilet paper holder', 'seat', 'speaker', 'column', 'ladder', 'bathroom stall', 'shower wall', 'cup', 'jacket', 'storage bin', 'coffee maker', 'dishwasher', 'paper towel roll', 'machine', 'mat', 'windowsill', 'bar', 'toaster', 'bulletin board', 'ironing board', 'fireplace', 'soap dish', 'kitchen counter', 'doorframe', 'toilet paper dispenser', 'mini fridge', 'fire extinguisher', 'ball', 'hat', 'shower curtain rod', 'water cooler', 'paper cutter', 'tray', 'shower door', 'pillar', 'ledge', 'toaster oven', 'mouse', 'toilet seat cover dispenser', 'furniture', 'cart', 'scale', 'tissue box', 'light switch', 'crate', 'power outlet', 'decoration', 'sign', 'projector', 
'closet door', 'vacuum cleaner', 'plunger', 'stuffed animal', 'headphones', 'dish rack', 'broom', 'range hood', 'dustpan', 'hair dryer', 'water bottle', 'handicap bar', 'vent', 'shower floor', 'water pitcher', 'mailbox', 'bowl', 'paper bag', 'projector screen', 'divider', 'laundry detergent', 'bathroom counter', 'object', 'bathroom vanity', 'closet wall', 'laundry hamper', 'bathroom stall door', 'ceiling light', 'trash bin', 'dumbbell', 'stair rail', 'tube', 'bathroom cabinet', 'closet rod', 'coffee kettle', 'shower head', 'keyboard piano', 'case of water bottles', 'coat rack', 'folded chair', 'fire alarm', 'power strip', 'calendar', 'poster', 'potted plant', 'mattress') + +CLASS_LABELS_200_VALIDATION = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 82, 84, 86, 87, 88, 89, 90, 93, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 110, 112, 115, 116, 118, 120, 122, 125, 128, 130, 131, 132, 134, 136, 138, 139, 140, 141, 145, 148, 154, 155, 156, 157, 159, 161, 163, 165, 166, 168, 169, 170, 177, 180, 185, 188, 191, 193, 195, 202, 208, 213, 214, 229, 230, 232, 233, 242, 250, 261, 264, 276, 283, 300, 304, 312, 323, 325, 342, 356, 370, 392, 395, 408, 417, 488, 540, 562, 570, 609, 748, 776, 1156, 1163, 1164, 1165, 1166, 1167, 1168, 1169, 1170, 1171, 1172, 1173, 1175, 1176, 1179, 1180, 1181, 1182, 1184, 1185, 1186, 1187, 1188, 1189, 1191) + +VALID_CLASS_IDS_200_TRAIN_ONLY = ('bicycle', 'storage container', 'candle', 'guitar case', 'purse', 'alarm clock', 'music stand', 'cd case', 'structure', 'storage organizer', 'luggage') + +CLASS_LABELS_200_TRAIN_ONLY = (121, 221, 286, 331, 399, 572, 581, 1174, 1178, 1183, 1190) \ No newline at end of file diff --git a/datasets/scannet_preprocess/meta_data/scannet_means.npz 
b/datasets/scannet_preprocess/meta_data/scannet_means.npz new file mode 100644 index 0000000000000000000000000000000000000000..d9bbb4f7c3b72dbe81fbeb86f594066b883fafaf --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannet_means.npz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df5c2bd40e8518e982c7d7b4b39020b07ac774695038bf49cb28b44e5760457e +size 676 diff --git a/datasets/scannet_preprocess/meta_data/scannetv1_test.txt b/datasets/scannet_preprocess/meta_data/scannetv1_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9e7d9205321e8ca047a527466f4b7100c9c9d2c --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv1_test.txt @@ -0,0 +1,312 @@ +scene0568_00 +scene0568_01 +scene0568_02 +scene0304_00 +scene0488_00 +scene0488_01 +scene0412_00 +scene0412_01 +scene0217_00 +scene0019_00 +scene0019_01 +scene0414_00 +scene0575_00 +scene0575_01 +scene0575_02 +scene0426_00 +scene0426_01 +scene0426_02 +scene0426_03 +scene0549_00 +scene0549_01 +scene0578_00 +scene0578_01 +scene0578_02 +scene0665_00 +scene0665_01 +scene0050_00 +scene0050_01 +scene0050_02 +scene0257_00 +scene0025_00 +scene0025_01 +scene0025_02 +scene0583_00 +scene0583_01 +scene0583_02 +scene0701_00 +scene0701_01 +scene0701_02 +scene0580_00 +scene0580_01 +scene0565_00 +scene0169_00 +scene0169_01 +scene0655_00 +scene0655_01 +scene0655_02 +scene0063_00 +scene0221_00 +scene0221_01 +scene0591_00 +scene0591_01 +scene0591_02 +scene0678_00 +scene0678_01 +scene0678_02 +scene0462_00 +scene0427_00 +scene0595_00 +scene0193_00 +scene0193_01 +scene0164_00 +scene0164_01 +scene0164_02 +scene0164_03 +scene0598_00 +scene0598_01 +scene0598_02 +scene0599_00 +scene0599_01 +scene0599_02 +scene0328_00 +scene0300_00 +scene0300_01 +scene0354_00 +scene0458_00 +scene0458_01 +scene0423_00 +scene0423_01 +scene0423_02 +scene0307_00 +scene0307_01 +scene0307_02 +scene0606_00 +scene0606_01 +scene0606_02 +scene0432_00 +scene0432_01 +scene0608_00 +scene0608_01 +scene0608_02 
+scene0651_00 +scene0651_01 +scene0651_02 +scene0430_00 +scene0430_01 +scene0689_00 +scene0357_00 +scene0357_01 +scene0574_00 +scene0574_01 +scene0574_02 +scene0329_00 +scene0329_01 +scene0329_02 +scene0153_00 +scene0153_01 +scene0616_00 +scene0616_01 +scene0671_00 +scene0671_01 +scene0618_00 +scene0382_00 +scene0382_01 +scene0490_00 +scene0621_00 +scene0607_00 +scene0607_01 +scene0149_00 +scene0695_00 +scene0695_01 +scene0695_02 +scene0695_03 +scene0389_00 +scene0377_00 +scene0377_01 +scene0377_02 +scene0342_00 +scene0139_00 +scene0629_00 +scene0629_01 +scene0629_02 +scene0496_00 +scene0633_00 +scene0633_01 +scene0518_00 +scene0652_00 +scene0406_00 +scene0406_01 +scene0406_02 +scene0144_00 +scene0144_01 +scene0494_00 +scene0278_00 +scene0278_01 +scene0316_00 +scene0609_00 +scene0609_01 +scene0609_02 +scene0609_03 +scene0084_00 +scene0084_01 +scene0084_02 +scene0696_00 +scene0696_01 +scene0696_02 +scene0351_00 +scene0351_01 +scene0643_00 +scene0644_00 +scene0645_00 +scene0645_01 +scene0645_02 +scene0081_00 +scene0081_01 +scene0081_02 +scene0647_00 +scene0647_01 +scene0535_00 +scene0353_00 +scene0353_01 +scene0353_02 +scene0559_00 +scene0559_01 +scene0559_02 +scene0593_00 +scene0593_01 +scene0246_00 +scene0653_00 +scene0653_01 +scene0064_00 +scene0064_01 +scene0356_00 +scene0356_01 +scene0356_02 +scene0030_00 +scene0030_01 +scene0030_02 +scene0222_00 +scene0222_01 +scene0338_00 +scene0338_01 +scene0338_02 +scene0378_00 +scene0378_01 +scene0378_02 +scene0660_00 +scene0553_00 +scene0553_01 +scene0553_02 +scene0527_00 +scene0663_00 +scene0663_01 +scene0663_02 +scene0664_00 +scene0664_01 +scene0664_02 +scene0334_00 +scene0334_01 +scene0334_02 +scene0046_00 +scene0046_01 +scene0046_02 +scene0203_00 +scene0203_01 +scene0203_02 +scene0088_00 +scene0088_01 +scene0088_02 +scene0088_03 +scene0086_00 +scene0086_01 +scene0086_02 +scene0670_00 +scene0670_01 +scene0256_00 +scene0256_01 +scene0256_02 +scene0249_00 +scene0441_00 +scene0658_00 +scene0704_00 +scene0704_01 
+scene0187_00 +scene0187_01 +scene0131_00 +scene0131_01 +scene0131_02 +scene0207_00 +scene0207_01 +scene0207_02 +scene0461_00 +scene0011_00 +scene0011_01 +scene0343_00 +scene0251_00 +scene0077_00 +scene0077_01 +scene0684_00 +scene0684_01 +scene0550_00 +scene0686_00 +scene0686_01 +scene0686_02 +scene0208_00 +scene0500_00 +scene0500_01 +scene0552_00 +scene0552_01 +scene0648_00 +scene0648_01 +scene0435_00 +scene0435_01 +scene0435_02 +scene0435_03 +scene0690_00 +scene0690_01 +scene0693_00 +scene0693_01 +scene0693_02 +scene0700_00 +scene0700_01 +scene0700_02 +scene0699_00 +scene0231_00 +scene0231_01 +scene0231_02 +scene0697_00 +scene0697_01 +scene0697_02 +scene0697_03 +scene0474_00 +scene0474_01 +scene0474_02 +scene0474_03 +scene0474_04 +scene0474_05 +scene0355_00 +scene0355_01 +scene0146_00 +scene0146_01 +scene0146_02 +scene0196_00 +scene0702_00 +scene0702_01 +scene0702_02 +scene0314_00 +scene0277_00 +scene0277_01 +scene0277_02 +scene0095_00 +scene0095_01 +scene0015_00 +scene0100_00 +scene0100_01 +scene0100_02 +scene0558_00 +scene0558_01 +scene0558_02 +scene0685_00 +scene0685_01 +scene0685_02 diff --git a/datasets/scannet_preprocess/meta_data/scannetv1_train.txt b/datasets/scannet_preprocess/meta_data/scannetv1_train.txt new file mode 100644 index 0000000000000000000000000000000000000000..7520948c8170df9ae1a9e8a40bc444fcc7cc0772 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv1_train.txt @@ -0,0 +1,1045 @@ +scene0191_00 +scene0191_01 +scene0191_02 +scene0119_00 +scene0230_00 +scene0528_00 +scene0528_01 +scene0705_00 +scene0705_01 +scene0705_02 +scene0415_00 +scene0415_01 +scene0415_02 +scene0007_00 +scene0141_00 +scene0141_01 +scene0141_02 +scene0515_00 +scene0515_01 +scene0515_02 +scene0447_00 +scene0447_01 +scene0447_02 +scene0531_00 +scene0503_00 +scene0285_00 +scene0069_00 +scene0584_00 +scene0584_01 +scene0584_02 +scene0581_00 +scene0581_01 +scene0581_02 +scene0620_00 +scene0620_01 +scene0263_00 +scene0263_01 +scene0481_00 +scene0481_01 
+scene0020_00 +scene0020_01 +scene0291_00 +scene0291_01 +scene0291_02 +scene0469_00 +scene0469_01 +scene0469_02 +scene0659_00 +scene0659_01 +scene0024_00 +scene0024_01 +scene0024_02 +scene0564_00 +scene0117_00 +scene0027_00 +scene0027_01 +scene0027_02 +scene0028_00 +scene0330_00 +scene0418_00 +scene0418_01 +scene0418_02 +scene0233_00 +scene0233_01 +scene0673_00 +scene0673_01 +scene0673_02 +scene0673_03 +scene0673_04 +scene0673_05 +scene0585_00 +scene0585_01 +scene0362_00 +scene0362_01 +scene0362_02 +scene0362_03 +scene0035_00 +scene0035_01 +scene0358_00 +scene0358_01 +scene0358_02 +scene0037_00 +scene0194_00 +scene0321_00 +scene0293_00 +scene0293_01 +scene0623_00 +scene0623_01 +scene0592_00 +scene0592_01 +scene0569_00 +scene0569_01 +scene0413_00 +scene0313_00 +scene0313_01 +scene0313_02 +scene0480_00 +scene0480_01 +scene0401_00 +scene0517_00 +scene0517_01 +scene0517_02 +scene0032_00 +scene0032_01 +scene0613_00 +scene0613_01 +scene0613_02 +scene0306_00 +scene0306_01 +scene0052_00 +scene0052_01 +scene0052_02 +scene0053_00 +scene0444_00 +scene0444_01 +scene0055_00 +scene0055_01 +scene0055_02 +scene0560_00 +scene0589_00 +scene0589_01 +scene0589_02 +scene0610_00 +scene0610_01 +scene0610_02 +scene0364_00 +scene0364_01 +scene0383_00 +scene0383_01 +scene0383_02 +scene0006_00 +scene0006_01 +scene0006_02 +scene0275_00 +scene0451_00 +scene0451_01 +scene0451_02 +scene0451_03 +scene0451_04 +scene0451_05 +scene0135_00 +scene0065_00 +scene0065_01 +scene0065_02 +scene0104_00 +scene0674_00 +scene0674_01 +scene0448_00 +scene0448_01 +scene0448_02 +scene0502_00 +scene0502_01 +scene0502_02 +scene0440_00 +scene0440_01 +scene0440_02 +scene0071_00 +scene0072_00 +scene0072_01 +scene0072_02 +scene0509_00 +scene0509_01 +scene0509_02 +scene0649_00 +scene0649_01 +scene0602_00 +scene0694_00 +scene0694_01 +scene0101_00 +scene0101_01 +scene0101_02 +scene0101_03 +scene0101_04 +scene0101_05 +scene0218_00 +scene0218_01 +scene0579_00 +scene0579_01 +scene0579_02 +scene0039_00 +scene0039_01 
+scene0493_00 +scene0493_01 +scene0242_00 +scene0242_01 +scene0242_02 +scene0083_00 +scene0083_01 +scene0127_00 +scene0127_01 +scene0662_00 +scene0662_01 +scene0662_02 +scene0018_00 +scene0087_00 +scene0087_01 +scene0087_02 +scene0332_00 +scene0332_01 +scene0332_02 +scene0628_00 +scene0628_01 +scene0628_02 +scene0134_00 +scene0134_01 +scene0134_02 +scene0238_00 +scene0238_01 +scene0092_00 +scene0092_01 +scene0092_02 +scene0092_03 +scene0092_04 +scene0022_00 +scene0022_01 +scene0467_00 +scene0392_00 +scene0392_01 +scene0392_02 +scene0424_00 +scene0424_01 +scene0424_02 +scene0646_00 +scene0646_01 +scene0646_02 +scene0098_00 +scene0098_01 +scene0044_00 +scene0044_01 +scene0044_02 +scene0510_00 +scene0510_01 +scene0510_02 +scene0571_00 +scene0571_01 +scene0166_00 +scene0166_01 +scene0166_02 +scene0563_00 +scene0172_00 +scene0172_01 +scene0388_00 +scene0388_01 +scene0215_00 +scene0215_01 +scene0252_00 +scene0287_00 +scene0668_00 +scene0572_00 +scene0572_01 +scene0572_02 +scene0026_00 +scene0224_00 +scene0113_00 +scene0113_01 +scene0551_00 +scene0381_00 +scene0381_01 +scene0381_02 +scene0371_00 +scene0371_01 +scene0460_00 +scene0118_00 +scene0118_01 +scene0118_02 +scene0417_00 +scene0008_00 +scene0634_00 +scene0521_00 +scene0123_00 +scene0123_01 +scene0123_02 +scene0045_00 +scene0045_01 +scene0511_00 +scene0511_01 +scene0114_00 +scene0114_01 +scene0114_02 +scene0070_00 +scene0029_00 +scene0029_01 +scene0029_02 +scene0129_00 +scene0103_00 +scene0103_01 +scene0002_00 +scene0002_01 +scene0132_00 +scene0132_01 +scene0132_02 +scene0124_00 +scene0124_01 +scene0143_00 +scene0143_01 +scene0143_02 +scene0604_00 +scene0604_01 +scene0604_02 +scene0507_00 +scene0105_00 +scene0105_01 +scene0105_02 +scene0428_00 +scene0428_01 +scene0311_00 +scene0140_00 +scene0140_01 +scene0182_00 +scene0182_01 +scene0182_02 +scene0142_00 +scene0142_01 +scene0399_00 +scene0399_01 +scene0012_00 +scene0012_01 +scene0012_02 +scene0060_00 +scene0060_01 +scene0370_00 +scene0370_01 +scene0370_02 
+scene0310_00 +scene0310_01 +scene0310_02 +scene0661_00 +scene0650_00 +scene0152_00 +scene0152_01 +scene0152_02 +scene0158_00 +scene0158_01 +scene0158_02 +scene0482_00 +scene0482_01 +scene0600_00 +scene0600_01 +scene0600_02 +scene0393_00 +scene0393_01 +scene0393_02 +scene0562_00 +scene0174_00 +scene0174_01 +scene0157_00 +scene0157_01 +scene0161_00 +scene0161_01 +scene0161_02 +scene0159_00 +scene0254_00 +scene0254_01 +scene0115_00 +scene0115_01 +scene0115_02 +scene0162_00 +scene0163_00 +scene0163_01 +scene0523_00 +scene0523_01 +scene0523_02 +scene0459_00 +scene0459_01 +scene0175_00 +scene0085_00 +scene0085_01 +scene0279_00 +scene0279_01 +scene0279_02 +scene0201_00 +scene0201_01 +scene0201_02 +scene0283_00 +scene0456_00 +scene0456_01 +scene0429_00 +scene0043_00 +scene0043_01 +scene0419_00 +scene0419_01 +scene0419_02 +scene0368_00 +scene0368_01 +scene0348_00 +scene0348_01 +scene0348_02 +scene0442_00 +scene0178_00 +scene0380_00 +scene0380_01 +scene0380_02 +scene0165_00 +scene0165_01 +scene0165_02 +scene0181_00 +scene0181_01 +scene0181_02 +scene0181_03 +scene0333_00 +scene0614_00 +scene0614_01 +scene0614_02 +scene0404_00 +scene0404_01 +scene0404_02 +scene0185_00 +scene0126_00 +scene0126_01 +scene0126_02 +scene0519_00 +scene0236_00 +scene0236_01 +scene0189_00 +scene0075_00 +scene0267_00 +scene0192_00 +scene0192_01 +scene0192_02 +scene0281_00 +scene0420_00 +scene0420_01 +scene0420_02 +scene0195_00 +scene0195_01 +scene0195_02 +scene0597_00 +scene0597_01 +scene0597_02 +scene0041_00 +scene0041_01 +scene0111_00 +scene0111_01 +scene0111_02 +scene0666_00 +scene0666_01 +scene0666_02 +scene0200_00 +scene0200_01 +scene0200_02 +scene0536_00 +scene0536_01 +scene0536_02 +scene0390_00 +scene0280_00 +scene0280_01 +scene0280_02 +scene0344_00 +scene0344_01 +scene0205_00 +scene0205_01 +scene0205_02 +scene0484_00 +scene0484_01 +scene0009_00 +scene0009_01 +scene0009_02 +scene0302_00 +scene0302_01 +scene0209_00 +scene0209_01 +scene0209_02 +scene0210_00 +scene0210_01 +scene0395_00 
+scene0395_01 +scene0395_02 +scene0683_00 +scene0601_00 +scene0601_01 +scene0214_00 +scene0214_01 +scene0214_02 +scene0477_00 +scene0477_01 +scene0439_00 +scene0439_01 +scene0468_00 +scene0468_01 +scene0468_02 +scene0546_00 +scene0466_00 +scene0466_01 +scene0220_00 +scene0220_01 +scene0220_02 +scene0122_00 +scene0122_01 +scene0130_00 +scene0110_00 +scene0110_01 +scene0110_02 +scene0327_00 +scene0156_00 +scene0266_00 +scene0266_01 +scene0001_00 +scene0001_01 +scene0228_00 +scene0199_00 +scene0219_00 +scene0464_00 +scene0232_00 +scene0232_01 +scene0232_02 +scene0299_00 +scene0299_01 +scene0530_00 +scene0363_00 +scene0453_00 +scene0453_01 +scene0570_00 +scene0570_01 +scene0570_02 +scene0183_00 +scene0239_00 +scene0239_01 +scene0239_02 +scene0373_00 +scene0373_01 +scene0241_00 +scene0241_01 +scene0241_02 +scene0188_00 +scene0622_00 +scene0622_01 +scene0244_00 +scene0244_01 +scene0691_00 +scene0691_01 +scene0206_00 +scene0206_01 +scene0206_02 +scene0247_00 +scene0247_01 +scene0061_00 +scene0061_01 +scene0082_00 +scene0250_00 +scene0250_01 +scene0250_02 +scene0501_00 +scene0501_01 +scene0501_02 +scene0320_00 +scene0320_01 +scene0320_02 +scene0320_03 +scene0631_00 +scene0631_01 +scene0631_02 +scene0255_00 +scene0255_01 +scene0255_02 +scene0047_00 +scene0265_00 +scene0265_01 +scene0265_02 +scene0004_00 +scene0336_00 +scene0336_01 +scene0058_00 +scene0058_01 +scene0260_00 +scene0260_01 +scene0260_02 +scene0243_00 +scene0603_00 +scene0603_01 +scene0093_00 +scene0093_01 +scene0093_02 +scene0109_00 +scene0109_01 +scene0434_00 +scene0434_01 +scene0434_02 +scene0290_00 +scene0627_00 +scene0627_01 +scene0470_00 +scene0470_01 +scene0137_00 +scene0137_01 +scene0137_02 +scene0270_00 +scene0270_01 +scene0270_02 +scene0271_00 +scene0271_01 +scene0504_00 +scene0274_00 +scene0274_01 +scene0274_02 +scene0036_00 +scene0036_01 +scene0276_00 +scene0276_01 +scene0272_00 +scene0272_01 +scene0499_00 +scene0698_00 +scene0698_01 +scene0051_00 +scene0051_01 +scene0051_02 +scene0051_03 
+scene0108_00 +scene0245_00 +scene0369_00 +scene0369_01 +scene0369_02 +scene0284_00 +scene0289_00 +scene0289_01 +scene0286_00 +scene0286_01 +scene0286_02 +scene0286_03 +scene0031_00 +scene0031_01 +scene0031_02 +scene0545_00 +scene0545_01 +scene0545_02 +scene0557_00 +scene0557_01 +scene0557_02 +scene0533_00 +scene0533_01 +scene0116_00 +scene0116_01 +scene0116_02 +scene0611_00 +scene0611_01 +scene0688_00 +scene0294_00 +scene0294_01 +scene0294_02 +scene0295_00 +scene0295_01 +scene0296_00 +scene0296_01 +scene0596_00 +scene0596_01 +scene0596_02 +scene0532_00 +scene0532_01 +scene0637_00 +scene0638_00 +scene0121_00 +scene0121_01 +scene0121_02 +scene0040_00 +scene0040_01 +scene0197_00 +scene0197_01 +scene0197_02 +scene0410_00 +scene0410_01 +scene0305_00 +scene0305_01 +scene0615_00 +scene0615_01 +scene0703_00 +scene0703_01 +scene0555_00 +scene0297_00 +scene0297_01 +scene0297_02 +scene0582_00 +scene0582_01 +scene0582_02 +scene0023_00 +scene0094_00 +scene0013_00 +scene0013_01 +scene0013_02 +scene0136_00 +scene0136_01 +scene0136_02 +scene0407_00 +scene0407_01 +scene0062_00 +scene0062_01 +scene0062_02 +scene0386_00 +scene0318_00 +scene0554_00 +scene0554_01 +scene0497_00 +scene0213_00 +scene0258_00 +scene0323_00 +scene0323_01 +scene0324_00 +scene0324_01 +scene0016_00 +scene0016_01 +scene0016_02 +scene0681_00 +scene0398_00 +scene0398_01 +scene0227_00 +scene0090_00 +scene0066_00 +scene0262_00 +scene0262_01 +scene0155_00 +scene0155_01 +scene0155_02 +scene0352_00 +scene0352_01 +scene0352_02 +scene0038_00 +scene0038_01 +scene0038_02 +scene0335_00 +scene0335_01 +scene0335_02 +scene0261_00 +scene0261_01 +scene0261_02 +scene0261_03 +scene0640_00 +scene0640_01 +scene0640_02 +scene0080_00 +scene0080_01 +scene0080_02 +scene0403_00 +scene0403_01 +scene0282_00 +scene0282_01 +scene0282_02 +scene0682_00 +scene0173_00 +scene0173_01 +scene0173_02 +scene0522_00 +scene0687_00 +scene0345_00 +scene0345_01 +scene0612_00 +scene0612_01 +scene0411_00 +scene0411_01 +scene0411_02 +scene0625_00 
+scene0625_01 +scene0211_00 +scene0211_01 +scene0211_02 +scene0211_03 +scene0676_00 +scene0676_01 +scene0179_00 +scene0498_00 +scene0498_01 +scene0498_02 +scene0547_00 +scene0547_01 +scene0547_02 +scene0269_00 +scene0269_01 +scene0269_02 +scene0366_00 +scene0680_00 +scene0680_01 +scene0588_00 +scene0588_01 +scene0588_02 +scene0588_03 +scene0346_00 +scene0346_01 +scene0359_00 +scene0359_01 +scene0014_00 +scene0120_00 +scene0120_01 +scene0212_00 +scene0212_01 +scene0212_02 +scene0176_00 +scene0049_00 +scene0259_00 +scene0259_01 +scene0586_00 +scene0586_01 +scene0586_02 +scene0309_00 +scene0309_01 +scene0125_00 +scene0455_00 +scene0177_00 +scene0177_01 +scene0177_02 +scene0326_00 +scene0372_00 +scene0171_00 +scene0171_01 +scene0374_00 +scene0654_00 +scene0654_01 +scene0445_00 +scene0445_01 +scene0475_00 +scene0475_01 +scene0475_02 +scene0349_00 +scene0349_01 +scene0234_00 +scene0669_00 +scene0669_01 +scene0375_00 +scene0375_01 +scene0375_02 +scene0387_00 +scene0387_01 +scene0387_02 +scene0312_00 +scene0312_01 +scene0312_02 +scene0384_00 +scene0385_00 +scene0385_01 +scene0385_02 +scene0000_00 +scene0000_01 +scene0000_02 +scene0376_00 +scene0376_01 +scene0376_02 +scene0301_00 +scene0301_01 +scene0301_02 +scene0322_00 +scene0542_00 +scene0079_00 +scene0079_01 +scene0099_00 +scene0099_01 +scene0476_00 +scene0476_01 +scene0476_02 +scene0394_00 +scene0394_01 +scene0147_00 +scene0147_01 +scene0067_00 +scene0067_01 +scene0067_02 +scene0397_00 +scene0397_01 +scene0337_00 +scene0337_01 +scene0337_02 +scene0431_00 +scene0223_00 +scene0223_01 +scene0223_02 +scene0010_00 +scene0010_01 +scene0402_00 +scene0268_00 +scene0268_01 +scene0268_02 +scene0679_00 +scene0679_01 +scene0405_00 +scene0128_00 +scene0408_00 +scene0408_01 +scene0190_00 +scene0107_00 +scene0076_00 +scene0167_00 +scene0361_00 +scene0361_01 +scene0361_02 +scene0216_00 +scene0202_00 +scene0303_00 +scene0303_01 +scene0303_02 +scene0446_00 +scene0446_01 +scene0089_00 +scene0089_01 +scene0089_02 +scene0360_00 
+scene0150_00 +scene0150_01 +scene0150_02 +scene0421_00 +scene0421_01 +scene0421_02 +scene0454_00 +scene0626_00 +scene0626_01 +scene0626_02 +scene0186_00 +scene0186_01 +scene0538_00 +scene0479_00 +scene0479_01 +scene0479_02 +scene0656_00 +scene0656_01 +scene0656_02 +scene0656_03 +scene0525_00 +scene0525_01 +scene0525_02 +scene0308_00 +scene0396_00 +scene0396_01 +scene0396_02 +scene0624_00 +scene0292_00 +scene0292_01 +scene0632_00 +scene0253_00 +scene0021_00 +scene0325_00 +scene0325_01 +scene0437_00 +scene0437_01 +scene0438_00 +scene0590_00 +scene0590_01 +scene0400_00 +scene0400_01 +scene0541_00 +scene0541_01 +scene0541_02 +scene0677_00 +scene0677_01 +scene0677_02 +scene0443_00 +scene0315_00 +scene0288_00 +scene0288_01 +scene0288_02 +scene0422_00 +scene0672_00 +scene0672_01 +scene0184_00 +scene0449_00 +scene0449_01 +scene0449_02 +scene0048_00 +scene0048_01 +scene0138_00 +scene0452_00 +scene0452_01 +scene0452_02 +scene0667_00 +scene0667_01 +scene0667_02 +scene0463_00 +scene0463_01 +scene0078_00 +scene0078_01 +scene0078_02 +scene0636_00 +scene0457_00 +scene0457_01 +scene0457_02 +scene0465_00 +scene0465_01 +scene0577_00 +scene0151_00 +scene0151_01 +scene0339_00 +scene0573_00 +scene0573_01 +scene0154_00 +scene0096_00 +scene0096_01 +scene0096_02 +scene0235_00 +scene0168_00 +scene0168_01 +scene0168_02 +scene0594_00 +scene0587_00 +scene0587_01 +scene0587_02 +scene0587_03 +scene0229_00 +scene0229_01 +scene0229_02 +scene0512_00 +scene0106_00 +scene0106_01 +scene0106_02 +scene0472_00 +scene0472_01 +scene0472_02 +scene0489_00 +scene0489_01 +scene0489_02 +scene0425_00 +scene0425_01 +scene0641_00 +scene0526_00 +scene0526_01 +scene0317_00 +scene0317_01 +scene0544_00 +scene0017_00 +scene0017_01 +scene0017_02 +scene0042_00 +scene0042_01 +scene0042_02 +scene0576_00 +scene0576_01 +scene0576_02 +scene0347_00 +scene0347_01 +scene0347_02 +scene0436_00 +scene0226_00 +scene0226_01 +scene0485_00 +scene0486_00 +scene0487_00 +scene0487_01 +scene0619_00 +scene0097_00 +scene0367_00 
+scene0367_01 +scene0491_00 +scene0492_00 +scene0492_01 +scene0005_00 +scene0005_01 +scene0543_00 +scene0543_01 +scene0543_02 +scene0657_00 +scene0341_00 +scene0341_01 diff --git a/datasets/scannet_preprocess/meta_data/scannetv1_val.txt b/datasets/scannet_preprocess/meta_data/scannetv1_val.txt new file mode 100644 index 0000000000000000000000000000000000000000..965ff258035f857446c30b10e9a6be49f71d3dc7 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv1_val.txt @@ -0,0 +1,156 @@ +scene0534_00 +scene0534_01 +scene0319_00 +scene0273_00 +scene0273_01 +scene0225_00 +scene0198_00 +scene0003_00 +scene0003_01 +scene0003_02 +scene0409_00 +scene0409_01 +scene0331_00 +scene0331_01 +scene0505_00 +scene0505_01 +scene0505_02 +scene0505_03 +scene0505_04 +scene0506_00 +scene0057_00 +scene0057_01 +scene0074_00 +scene0074_01 +scene0074_02 +scene0091_00 +scene0112_00 +scene0112_01 +scene0112_02 +scene0240_00 +scene0102_00 +scene0102_01 +scene0513_00 +scene0514_00 +scene0514_01 +scene0537_00 +scene0516_00 +scene0516_01 +scene0495_00 +scene0617_00 +scene0133_00 +scene0520_00 +scene0520_01 +scene0635_00 +scene0635_01 +scene0054_00 +scene0473_00 +scene0473_01 +scene0524_00 +scene0524_01 +scene0379_00 +scene0471_00 +scene0471_01 +scene0471_02 +scene0566_00 +scene0248_00 +scene0248_01 +scene0248_02 +scene0529_00 +scene0529_01 +scene0529_02 +scene0391_00 +scene0264_00 +scene0264_01 +scene0264_02 +scene0675_00 +scene0675_01 +scene0350_00 +scene0350_01 +scene0350_02 +scene0450_00 +scene0068_00 +scene0068_01 +scene0237_00 +scene0237_01 +scene0365_00 +scene0365_01 +scene0365_02 +scene0605_00 +scene0605_01 +scene0539_00 +scene0539_01 +scene0539_02 +scene0540_00 +scene0540_01 +scene0540_02 +scene0170_00 +scene0170_01 +scene0170_02 +scene0433_00 +scene0340_00 +scene0340_01 +scene0340_02 +scene0160_00 +scene0160_01 +scene0160_02 +scene0160_03 +scene0160_04 +scene0059_00 +scene0059_01 +scene0059_02 +scene0056_00 +scene0056_01 +scene0478_00 +scene0478_01 +scene0548_00 +scene0548_01 
+scene0548_02 +scene0204_00 +scene0204_01 +scene0204_02 +scene0033_00 +scene0145_00 +scene0483_00 +scene0508_00 +scene0508_01 +scene0508_02 +scene0180_00 +scene0148_00 +scene0556_00 +scene0556_01 +scene0416_00 +scene0416_01 +scene0416_02 +scene0416_03 +scene0416_04 +scene0073_00 +scene0073_01 +scene0073_02 +scene0073_03 +scene0034_00 +scene0034_01 +scene0034_02 +scene0639_00 +scene0561_00 +scene0561_01 +scene0298_00 +scene0692_00 +scene0692_01 +scene0692_02 +scene0692_03 +scene0692_04 +scene0642_00 +scene0642_01 +scene0642_02 +scene0642_03 +scene0630_00 +scene0630_01 +scene0630_02 +scene0630_03 +scene0630_04 +scene0630_05 +scene0630_06 +scene0706_00 +scene0567_00 +scene0567_01 diff --git a/datasets/scannet_preprocess/meta_data/scannetv2-labels-old.combined.tsv b/datasets/scannet_preprocess/meta_data/scannetv2-labels-old.combined.tsv new file mode 100644 index 0000000000000000000000000000000000000000..05c006e98066aa78d126bebcfb3654200d351b93 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2-labels-old.combined.tsv @@ -0,0 +1,608 @@ +id raw_category category count nyu40id eigen13id nyuClass nyu40class eigen13class ModelNet40 ModelNet10 ShapeNetCore55 synsetoffset wnsynsetid wnsynsetkey mpcat40 mpcat40index +1 wall wall 8277 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +2 chair chair 4646 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +22 books book 1678 23 2 book books Books n02870526 book.n.11 objects 39 +3 floor floor 1553 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +5 door door 1483 8 12 door door Wall door n03221720 door.n.01 door 4 +1163 object object 1313 40 7 otherprop Objects objects 39 +16 window window 1209 9 13 window window Window n04587648 window.n.01 window 9 +4 table table 1170 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +56 trash can trash can 1090 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +13 pillow pillow 937 
18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +15 picture picture 862 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +41 ceiling ceiling 806 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +26 box box 775 29 7 box box Objects n02883344 box.n.01 objects 39 +161 doorframe doorframe 768 8 12 door door Wall door doorframe.n.01 door 4 +19 monitor monitor 765 40 7 monitor otherprop Objects monitor monitor tv or monitor 3211117 n03782190 monitor.n.04 objects 39 +7 cabinet cabinet 731 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +9 desk desk 680 14 10 desk desk Table desk desk table 4379243 n03179701 desk.n.01 table 5 +8 shelf shelf 641 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +10 office chair office chair 595 5 4 chair chair Chair chair chair chair 3001627 n04373704 swivel_chair.n.01 chair 3 +31 towel towel 570 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +6 couch couch 502 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +14 sink sink 488 34 7 sink sink Objects sink n04223580 sink.n.01 sink 15 +48 backpack backpack 479 40 7 backpack otherprop Objects n02769748 backpack.n.01 objects 39 +28 lamp lamp 419 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +11 bed bed 370 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +18 bookshelf bookshelf 360 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +71 mirror mirror 349 19 7 mirror mirror Objects n03773035 mirror.n.01 mirror 21 +21 curtain curtain 347 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +40 plant plant 331 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +52 whiteboard whiteboard 327 30 7 whiteboard whiteboard Objects n03211616 display_panel.n.01 board_panel 35 +96 radiator radiator 322 39 6 radiator otherfurniture 
Furniture n04041069 radiator.n.02 misc 40 +22 book book 318 23 2 book books Books n02870526 book.n.11 objects 39 +29 kitchen cabinet kitchen cabinet 310 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7 +49 toilet paper toilet paper 291 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +29 kitchen cabinets kitchen cabinet 289 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +23 armchair armchair 281 5 4 chair chair Chair chair chair chair 3001627 n02738535 armchair.n.01 chair 3 +63 shoes shoe 272 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +24 coffee table coffee table 258 7 10 coffee table table Table table table table 4379243 n03063968 coffee_table.n.01 table 5 +17 toilet toilet 256 33 7 toilet toilet Objects toilet toilet n04446276 toilet.n.01 toilet 18 +47 bag bag 252 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +32 clothes clothes 248 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +46 keyboard keyboard 246 40 7 keyboard otherprop Objects keyboard computer keyboard 3085013 n03085013 computer_keyboard.n.01 objects 39 +65 bottle bottle 226 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +97 recycling bin recycling bin 225 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +34 nightstand nightstand 224 32 6 night stand night stand Furniture night_stand night_stand n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +38 stool stool 221 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +33 tv tv 219 25 11 television television TV tv or monitor 3211117 n03211117 display.n.06 tv_monitor 22 +75 file cabinet file cabinet 217 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +36 dresser dresser 213 17 6 dresser dresser Furniture dresser dresser n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +64 computer tower computer tower 
203 40 7 computer otherprop Objects n03082979 computer.n.01 objects 39 +32 clothing clothes 165 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +101 telephone telephone 164 40 7 telephone otherprop Objects telephone 4401088 n04401088 telephone.n.01 objects 39 +130 cup cup 157 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +27 refrigerator refrigerator 154 24 6 refridgerator refridgerator Furniture n04070727 refrigerator.n.01 appliances 37 +44 end table end table 147 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +131 jacket jacket 146 40 7 jacket otherprop Objects n03589791 jacket.n.01 clothes 38 +55 shower curtain shower curtain 144 28 7 shower curtain shower curtain Objects curtain n04209239 shower_curtain.n.01 curtain 12 +42 bathtub bathtub 144 36 7 bathtub bathtub Objects bathtub bathtub tub 2808440 n02808440 bathtub.n.01 bathtub 25 +59 microwave microwave 141 40 7 microwave otherprop Objects microwave 3761084 n03761084 microwave.n.02 appliances 37 +159 kitchen counter kitchen counter 140 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +74 sofa chair sofa chair 129 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +82 paper towel dispenser paper towel dispenser 129 40 7 paper towel dispenser otherprop Objects objects 39 +1164 bathroom vanity bathroom vanity 126 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 table 5 +93 suitcase suitcase 118 40 7 luggage otherprop Objects n02773838 bag.n.06 objects 39 +77 laptop laptop 111 40 7 laptop otherprop Objects laptop laptop 3642806 n03642806 laptop.n.01 objects 39 +67 ottoman ottoman 111 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +128 shower walls shower wall 109 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +50 printer printer 106 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 
+35 counter counter 104 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +69 board board 100 38 7 board otherstructure Objects board_panel 35 +100 soap dispenser soap dispenser 99 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +62 stove stove 95 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +105 light light 93 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +1165 closet wall closet wall 90 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +165 mini fridge mini fridge 87 24 6 refridgerator refridgerator Furniture n03273913 electric_refrigerator.n.01 appliances 37 +7 cabinets cabinet 79 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +5 doors door 76 8 12 door door Wall door n03221720 door.n.01 door 4 +76 fan fan 75 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +230 tissue box tissue box 73 40 7 tissue box otherprop Objects n02883344 box.n.01 objects 39 +54 blanket blanket 72 40 7 blanket otherprop Objects n02849154 blanket.n.01 objects 39 +125 bathroom stall bathroom stall 71 38 7 otherstructure Objects n02873839 booth.n.02 misc 40 +72 copier copier 70 40 7 otherprop Objects n03257586 duplicator.n.01 appliances 37 +68 bench bench 66 39 6 bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +145 bar bar 66 38 7 bar otherstructure Objects n02788689 bar.n.03 misc 40 +157 soap dish soap dish 65 40 7 soap dish otherprop Objects n04254009 soap_dish.n.01 objects 39 +1166 laundry hamper laundry hamper 65 40 7 laundry basket otherprop Objects objects 39 +132 storage bin storage bin 63 40 7 storage bin otherprop Objects objects 39 +1167 bathroom stall door bathroom stall door 62 8 12 door door Wall door n03221720 door.n.01 door 4 +232 light switch light switch 61 38 7 light switch otherstructure Objects n04372370 switch.n.01 misc 40 +134 coffee maker coffee maker 61 40 7 otherprop Objects n03063338 
coffee_maker.n.01 appliances 37 +51 tv stand tv stand 61 39 6 tv stand otherfurniture Furniture tv_stand n03290653 entertainment_center.n.01 furniture 36 +250 decoration decoration 60 40 7 otherprop Objects n03169390 decoration.n.01 misc 40 +1168 ceiling light ceiling light 59 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +342 range hood range hood 59 38 7 range hood otherstructure Objects range_hood n04053677 range_hood.n.01 misc 40 +89 blackboard blackboard 58 38 7 blackboard otherstructure Objects n02846511 blackboard.n.01 board_panel 35 +103 clock clock 58 40 7 clock otherprop Objects clock 3046257 n03046257 clock.n.01 objects 39 +99 wardrobe closet wardrobe 54 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +95 rail rail 53 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +154 bulletin board bulletin board 53 38 7 board otherstructure Objects n03211616 display_panel.n.01 board_panel 35 +140 mat mat 52 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1169 trash bin trash bin 52 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +193 ledge ledge 51 38 7 otherstructure Objects n09337253 ledge.n.01 misc 40 +116 seat seat 49 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +202 mouse mouse 49 40 7 mouse otherprop Objects n03793489 mouse.n.04 objects 39 +73 basket basket 48 40 7 basket otherprop Objects basket 2801938 n02801938 basket.n.01 objects 39 +78 shower shower 48 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +1170 dumbbell dumbbell 48 40 7 otherprop Objects n03255030 dumbbell.n.01 objects 39 +79 paper paper 46 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +80 person person 46 31 7 person person Objects person n05217688 person.n.02 misc 40 +141 windowsill windowsill 45 38 7 otherstructure Objects n04590263 windowsill.n.01 window 9 +57 closet closet 45 39 6 wardrobe otherfurniture 
Furniture wardrobe misc 40 +102 bucket bucket 45 40 7 bucket otherprop Objects n02909870 bucket.n.01 misc 40 +261 sign sign 44 40 7 sign otherprop Objects n04217882 signboard.n.01 objects 39 +118 speaker speaker 43 40 7 speaker otherprop Objects speaker 3691459 n03691459 loudspeaker.n.01 objects 39 +136 dishwasher dishwasher 43 38 7 dishwasher otherstructure Objects dishwasher 3207941 n03207941 dishwasher.n.01 appliances 37 +98 container container 43 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1171 stair rail stair rail 42 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +170 shower curtain rod shower curtain rod 42 40 7 otherprop Objects curtain 12 +1172 tube tube 41 40 7 otherprop Objects misc 40 +1173 bathroom cabinet bathroom cabinet 39 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +79 papers paper 39 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +221 storage container storage container 39 40 7 container otherprop Objects objects 39 +570 paper bag paper bag 39 37 7 bag bag Objects n04122825 sack.n.01 objects 39 +138 paper towel roll paper towel roll 39 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +168 ball ball 39 40 7 ball otherprop Objects objects 39 +276 closet doors closet door 38 8 12 door door Wall door n03221720 door.n.01 door 4 +106 laundry basket laundry basket 37 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +214 cart cart 37 40 7 cart otherprop Objects n03484083 handcart.n.01 shelving 31 +276 closet door closet door 35 8 12 door door Wall door n03221720 door.n.01 door 4 +323 dish rack dish rack 35 40 7 dish rack otherprop Objects n03207630 dish_rack.n.01 objects 39 +58 stairs stairs 35 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +86 blinds blinds 35 13 13 blinds blinds Window n02851099 blind.n.03 blinds 32 +2 stack of chairs chair 35 5 4 chair chair Chair chair 
chair chair 3001627 n03001627 chair.n.01 chair 3 +399 purse purse 34 40 7 purse otherprop Objects n02774152 bag.n.04 objects 39 +121 bicycle bicycle 33 40 7 bicycle otherprop Objects bicycle 2834778 n02834778 bicycle.n.01 objects 39 +185 tray tray 32 40 7 tray otherprop Objects n04476259 tray.n.01 objects 39 +300 plunger plunger 30 40 7 otherprop Objects n03970156 plunger.n.03 objects 39 +180 paper cutter paper cutter 30 40 7 paper cutter otherprop Objects n03886940 paper_cutter.n.01 objects 39 +163 toilet paper dispenser toilet paper dispenser 29 40 7 otherprop Objects objects 39 +26 boxes box 29 29 7 box box Objects n02883344 box.n.01 objects 39 +66 bin bin 28 40 7 bin otherprop Objects n02839910 bin.n.01 objects 39 +208 toilet seat cover dispenser toilet seat cover dispenser 28 40 7 otherprop Objects objects 39 +112 guitar guitar 28 40 7 guitar otherprop Objects guitar guitar 3467517 n03467517 guitar.n.01 objects 39 +540 mailboxes mailbox 28 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +395 handicap bar handicap bar 27 38 7 bar otherstructure Objects misc 40 +166 fire extinguisher fire extinguisher 27 40 7 fire extinguisher otherprop Objects n03345837 fire_extinguisher.n.01 misc 40 +122 ladder ladder 27 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 stairs 16 +120 column column 26 38 7 column otherstructure Objects n03074380 column.n.06 column 24 +107 pipe pipe 25 40 7 pipe otherprop Objects n03944672 pipe.n.02 misc 40 +283 vacuum cleaner vacuum cleaner 25 40 7 otherprop Objects n04517823 vacuum.n.04 objects 39 +88 plate plate 24 40 7 plate otherprop Objects n03959485 plate.n.04 objects 39 +90 piano piano 24 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +177 water cooler water cooler 24 39 6 water cooler otherfurniture Furniture n04559166 water_cooler.n.01 misc 40 +1174 cd case cd case 24 40 7 otherprop Objects objects 39 +562 bowl bowl 24 40 7 bowl otherprop Objects bowl bowl 
2880940 n02880940 bowl.n.03 objects 39 +1175 closet rod closet rod 24 40 7 otherprop Objects n04100174 rod.n.01 misc 40 +1156 bathroom counter bathroom counter 24 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +84 oven oven 23 38 7 oven otherstructure Objects n03862676 oven.n.01 appliances 37 +104 stand stand 23 39 6 stand otherfurniture Furniture table table table 4379243 n04301000 stand.n.04 table 5 +229 scale scale 23 40 7 scale otherprop Objects n04141975 scale.n.07 objects 39 +70 washing machine washing machine 23 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +325 broom broom 22 40 7 broom otherprop Objects n02906734 broom.n.01 objects 39 +169 hat hat 22 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +128 shower wall shower wall 22 1 12 wall wall Wall n04208936 shower.n.01 wall 1 +331 guitar case guitar case 21 40 7 guitar case otherprop Objects objects 39 +87 rack rack 21 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +488 water pitcher water pitcher 21 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +776 laundry detergent laundry detergent 21 40 7 otherprop Objects objects 39 +370 hair dryer hair dryer 21 40 7 hair dryer otherprop Objects n03483316 hand_blower.n.01 objects 39 +191 pillar pillar 21 38 7 column otherstructure Objects n03073977 column.n.07 column 24 +748 divider divider 20 40 7 otherprop Objects wall 1 +242 power outlet power outlet 19 40 7 otherprop Objects misc 40 +45 dining table dining table 19 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +417 shower floor shower floor 19 2 5 floor floor Floor n04208936 shower.n.01 floor 2 +70 washing machines washing machine 19 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +188 shower door shower door 19 8 12 door door Wall door n04208936 shower.n.01 door 4 +1176 
coffee kettle coffee kettle 18 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39 +1177 wardrobe cabinet wardrobe 18 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1178 structure structure 18 38 7 otherstructure Objects misc 40 +18 bookshelves bookshelf 17 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +110 clothes dryer clothes dryer 17 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +148 toaster toaster 17 40 7 toaster otherprop Objects n04442312 toaster.n.02 appliances 37 +63 shoe shoe 17 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +155 ironing board ironing board 16 39 6 ironing board otherfurniture Furniture n03586090 ironing_board.n.01 objects 39 +572 alarm clock alarm clock 16 40 7 alarm clock otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39 +1179 shower head shower head 15 38 7 otherstructure Objects shower 23 +28 lamp base lamp 15 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +392 water bottle water bottle 15 40 7 bottle otherprop Objects bottle bottle 2876657 n04557648 water_bottle.n.01 objects 39 +1180 keyboard piano keyboard piano 15 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +609 projector screen projector screen 15 38 7 projector screen otherstructure Objects misc 40 +1181 case of water bottles case of water bottles 15 40 7 otherprop Objects objects 39 +195 toaster oven toaster oven 14 40 7 toaster oven otherprop Objects n04442441 toaster_oven.n.01 appliances 37 +581 music stand music stand 14 39 6 music stand otherfurniture Furniture n03801760 music_stand.n.01 furniture 36 +58 staircase stairs 14 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +1182 coat rack coat rack 14 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 3 +1183 storage organizer storage organizer 14 40 7 otherprop Objects shelving 3 
+139 machine machine 14 40 7 machine otherprop Objects n03699975 machine.n.01 appliances 37 +1184 folded chair folded chair 14 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1185 fire alarm fire alarm 14 40 7 otherprop Objects n03343737 fire_alarm.n.02 misc 40 +156 fireplace fireplace 13 38 7 fireplace otherstructure Objects n03346455 fireplace.n.01 fireplace 27 +408 vent vent 13 40 7 otherprop Objects n04526241 vent.n.01 misc 40 +213 furniture furniture 13 39 6 furniture otherfurniture Furniture n03405725 furniture.n.01 furniture 36 +1186 power strip power strip 13 40 7 otherprop Objects objects 39 +1187 calendar calendar 13 40 7 otherprop Objects objects 39 +1188 poster poster 13 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +115 toilet paper holder toilet paper holder 13 40 7 toilet paper holder otherprop Objects objects 39 +1189 potted plant potted plant 12 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +304 stuffed animal stuffed animal 12 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +1190 luggage luggage 12 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39 +21 curtains curtain 12 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +312 headphones headphones 12 40 7 otherprop Objects n03261776 earphone.n.01 objects 39 +233 crate crate 12 39 6 crate otherfurniture Furniture n03127925 crate.n.01 objects 39 +286 candle candle 12 40 7 candle otherprop Objects lamp n02948072 candle.n.01 objects 39 +264 projector projector 12 40 7 projector otherprop Objects n04009552 projector.n.02 objects 39 +110 clothes dryers clothes dryer 12 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +1191 mattress mattress 12 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +356 dustpan dustpan 12 40 7 otherprop Objects n03259009 dustpan.n.02 objects 39 +25 drawer drawer 11 39 6 drawer otherfurniture Furniture n03233905 drawer.n.01 
furniture 36 +750 rod rod 11 40 7 otherprop Objects pistol 3948459 n03427202 gat.n.01 misc 40 +269 globe globe 11 40 7 globe otherprop Objects objects 39 +307 footrest footrest 11 39 6 foot rest otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +410 piano bench piano bench 11 39 6 piano bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +730 breakfast bar breakfast bar 11 38 7 bar otherstructure Objects counter 26 +216 step stool step stool 11 40 7 step stool otherprop Objects stool n04315713 step_stool.n.01 stool 19 +1192 hand rail hand rail 11 38 7 railing otherstructure Objects railing 30 +119 vending machine vending machine 11 40 7 machine otherprop Objects n04525305 vending_machine.n.01 appliances 37 +682 ceiling fan ceiling fan 11 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +434 swiffer swiffer 11 40 7 otherprop Objects objects 39 +126 foosball table foosball table 11 39 6 foosball table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +919 jar jar 11 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +85 footstool footstool 11 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +1193 folded table folded table 10 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +108 round table round table 10 7 10 table table Table table table table 4379243 n04114554 round_table.n.02 table 5 +135 hamper hamper 10 40 7 basket otherprop Objects basket 2801938 n03482405 hamper.n.02 objects 39 +1194 poster tube poster tube 10 40 7 otherprop Objects objects 39 +432 case case 10 40 7 case otherprop Objects objects 39 +53 carpet carpet 10 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +1195 thermostat thermostat 10 40 7 otherprop Objects n04422875 thermostat.n.01 misc 40 +111 coat coat 10 40 7 jacket otherprop Objects n03057021 coat.n.01 clothes 38 +305 water fountain water fountain 10 38 7 water fountain 
otherstructure Objects n03241335 drinking_fountain.n.01 misc 40 +1125 smoke detector smoke detector 10 40 7 otherprop Objects misc 40 +13 pillows pillow 9 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +1196 flip flops flip flops 9 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1197 cloth cloth 9 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1198 banner banner 9 40 7 otherprop Objects n02788021 banner.n.01 misc 40 +1199 clothes hanger clothes hanger 9 40 7 otherprop Objects n03057920 coat_hanger.n.01 objects 39 +1200 whiteboard eraser whiteboard eraser 9 40 7 otherprop Objects objects 39 +378 iron iron 9 40 7 otherprop Objects n03584829 iron.n.04 objects 39 +591 instrument case instrument case 9 40 7 case otherprop Objects objects 39 +49 toilet paper rolls toilet paper 9 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +92 soap soap 9 40 7 soap otherprop Objects n04253437 soap.n.01 objects 39 +1098 block block 9 40 7 otherprop Objects misc 40 +291 wall hanging wall hanging 8 40 7 otherprop Objects n03491178 hanging.n.01 picture 6 +1063 kitchen island kitchen island 8 38 7 kitchen island otherstructure Objects n03620600 kitchen_island.n.01 counter 26 +107 pipes pipe 8 38 7 otherstructure Objects misc 40 +1135 toothbrush toothbrush 8 40 7 toothbrush otherprop Objects n04453156 toothbrush.n.01 objects 39 +189 shirt shirt 8 40 7 otherprop Objects n04197391 shirt.n.01 clothes 38 +245 cutting board cutting board 8 40 7 cutting board otherprop Objects n03025513 chopping_board.n.01 objects 39 +194 vase vase 8 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39 +1201 shower control valve shower control valve 8 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +386 exercise machine exercise machine 8 40 7 machine otherprop Objects gym_equipment 33 +1202 compost bin compost bin 8 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 
ashcan.n.01 objects 39 +857 shorts shorts 8 40 7 shorts otherprop Objects clothes 38 +452 tire tire 8 40 7 otherprop Objects n04440749 tire.n.01 objects 39 +1203 teddy bear teddy bear 7 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +346 bathrobe bathrobe 7 40 7 otherprop Objects n02807616 bathrobe.n.01 clothes 38 +152 handrail handrail 7 38 7 railing otherstructure Objects n02788148 bannister.n.02 railing 30 +83 faucet faucet 7 40 7 faucet otherprop Objects faucet 3325088 n03325088 faucet.n.01 misc 40 +1204 pantry wall pantry wall 7 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +726 thermos thermos 7 40 7 flask otherprop Objects bottle bottle 2876657 n04422727 thermos.n.01 objects 39 +61 rug rug 7 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +39 couch cushions cushion 7 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1117 tripod tripod 7 39 6 stand otherfurniture Furniture n04485082 tripod.n.01 objects 39 +540 mailbox mailbox 7 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +1205 tupperware tupperware 7 40 7 otherprop Objects objects 39 +415 shoe rack shoe rack 7 40 7 shoe rack otherprop Objects shelving 31 +31 towels towel 6 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +1206 beer bottles beer bottle 6 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +153 treadmill treadmill 6 39 6 treadmill otherfurniture Furniture n04477387 treadmill.n.01 gym_equipment 33 +1207 salt salt 6 40 7 otherprop Objects objects 39 +129 chest chest 6 39 6 chest otherfurniture Furniture dresser dresser chest_of_drawers 13 +220 dispenser dispenser 6 40 7 otherprop Objects n03210683 dispenser.n.01 objects 39 +1208 mirror doors mirror door 6 8 12 door door Wall door n03221720 door.n.01 door 4 +231 remote remote 6 40 7 otherprop Objects remote_control 4074963 n04074963 remote_control.n.01 objects 39 +1209 folded ladder folded ladder 6 39 6 ladder otherfurniture Furniture stairs 
n03632277 ladder.n.01 misc 40 +39 cushion cushion 6 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1210 carton carton 6 40 7 otherprop Objects objects 39 +117 step step 6 38 7 otherstructure Objects n04314914 step.n.04 misc 40 +822 drying rack drying rack 6 39 6 drying rack otherfurniture Furniture shelving 31 +238 slippers slipper 6 40 7 shoe otherprop Objects n04241394 slipper.n.01 clothes 38 +143 pool table pool table 6 39 6 pool table otherfurniture Furniture table table table 4379243 n03982430 pool_table.n.01 table 5 +1211 soda stream soda stream 6 40 7 otherprop Objects objects 39 +228 toilet brush toilet brush 6 40 7 toilet brush otherprop Objects objects 39 +494 loft bed loft bed 6 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +226 cooking pot cooking pot 6 40 7 pot otherprop Objects objects 39 +91 heater heater 6 39 6 heater otherfurniture Furniture n03508101 heater.n.01 misc 40 +1072 messenger bag messenger bag 6 37 7 bag bag Objects objects 39 +435 stapler stapler 6 40 7 stapler otherprop Objects n04303497 stapler.n.01 objects 39 +1165 closet walls closet wall 5 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +345 scanner scanner 5 40 7 otherprop Objects appliances 37 +893 elliptical machine elliptical machine 5 40 7 machine otherprop Objects gym_equipment 33 +621 kettle kettle 5 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39 +1212 metronome metronome 5 40 7 otherprop Objects n03757604 metronome.n.01 objects 39 +297 dumbell dumbell 5 40 7 otherprop Objects objects 39 +1213 music book music book 5 23 2 book books Books n02870526 book.n.11 objects 39 +1214 rice cooker rice cooker 5 40 7 otherprop Objects objects 39 +1215 dart board dart board 5 38 7 board otherstructure Objects n03162940 dartboard.n.01 objects 39 +529 sewing machine sewing machine 5 40 7 sewing machine otherprop Objects n04179913 sewing_machine.n.01 objects 39 +1216 grab bar grab bar 5 38 7 railing otherstructure Objects railing 30 +1217 flowerpot 
flowerpot 5 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39 +1218 painting painting 5 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1219 railing railing 5 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +1220 stair stair 5 38 7 stairs otherstructure Objects stairs n04314914 step.n.04 stairs 16 +525 toolbox toolbox 5 39 6 chest otherfurniture Furniture n04452615 toolbox.n.01 objects 39 +204 nerf gun nerf gun 5 40 7 otherprop Objects objects 39 +693 binders binder 5 40 7 binder otherprop Objects objects 39 +179 desk lamp desk lamp 5 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1221 quadcopter quadcopter 5 40 7 otherprop Objects objects 39 +1222 pitcher pitcher 5 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +1223 hanging hanging 5 40 7 otherprop Objects misc 40 +1224 mail mail 5 40 7 otherprop Objects misc 40 +1225 closet ceiling closet ceiling 5 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +1226 hoverboard hoverboard 5 40 7 otherprop Objects objects 39 +1227 beanbag chair beanbag chair 5 39 6 bean bag otherfurniture Furniture n02816656 beanbag.n.01 chair 3 +571 water heater water heater 5 40 7 water heater otherprop Objects n04560113 water_heater.n.01 misc 40 +1228 spray bottle spray bottle 5 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +556 rope rope 5 40 7 rope otherprop Objects n04108268 rope.n.01 objects 39 +280 plastic container plastic container 5 40 7 container otherprop Objects objects 39 +1229 soap bottle soap bottle 5 40 7 soap otherprop Objects objects 39 +1230 ikea bag ikea bag 4 37 7 bag bag Objects 2773838 n02773838 bag.n.06 objects 39 +1231 sleeping bag sleeping bag 4 40 7 otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +1232 duffel bag duffel bag 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +746 frying pan frying pan 4 40 7 frying pan otherprop 
Objects n03400231 frying_pan.n.01 objects 39 +1233 oven mitt oven mitt 4 40 7 otherprop Objects objects 39 +1234 pot pot 4 40 7 pot otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +144 hand dryer hand dryer 4 40 7 otherprop Objects objects 39 +282 dollhouse dollhouse 4 39 6 doll house otherfurniture Furniture n03219483 dollhouse.n.01 objects 39 +167 shampoo bottle shampoo bottle 4 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1235 hair brush hair brush 4 40 7 otherprop Objects n02908217 brush.n.02 objects 39 +1236 tennis racket tennis racket 4 40 7 otherprop Objects n04409806 tennis_racket.n.01 objects 39 +1237 display case display case 4 40 7 case otherprop Objects objects 39 +234 ping pong table ping pong table 4 39 6 ping pong table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +563 boiler boiler 4 40 7 otherprop Objects misc 40 +1238 bag of coffee beans bag of coffee beans 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +1239 bananas banana 4 40 7 otherprop Objects n00021265 food.n.01 objects 39 +1240 carseat carseat 4 40 7 otherprop Objects misc 40 +366 helmet helmet 4 40 7 otherprop Objects helmet 3513137 n03513137 helmet.n.02 clothes 38 +816 umbrella umbrella 4 40 7 umbrella otherprop Objects n04507155 umbrella.n.01 objects 39 +1241 coffee box coffee box 4 40 7 otherprop Objects objects 39 +719 envelope envelope 4 40 7 envelope otherprop Objects n03291819 envelope.n.01 objects 39 +284 wet floor sign wet floor sign 4 40 7 sign otherprop Objects misc 40 +1242 clothing rack clothing rack 4 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +247 controller controller 4 40 7 otherprop Objects n03096960 control.n.09 objects 39 +1243 bath walls bathroom wall 4 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +1244 podium podium 4 39 6 otherfurniture Furniture n03159640 dais.n.01 furniture 36 +1245 storage box storage box 4 29 7 box box Objects 
n02883344 box.n.01 objects 39 +1246 dolly dolly 4 40 7 otherprop Objects misc 40 +1247 shampoo shampoo 3 40 7 otherprop Objects n04183516 shampoo.n.01 objects 39 +592 paper tray paper tray 3 40 7 paper tray otherprop Objects objects 39 +385 cabinet door cabinet door 3 8 12 door door Wall door door 4 +1248 changing station changing station 3 40 7 otherprop Objects misc 40 +1249 poster printer poster printer 3 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 +133 screen screen 3 40 7 otherprop Objects n03151077 curtain.n.01 curtain 12 +301 soap bar soap bar 3 38 7 bar otherstructure Objects objects 39 +1250 crutches crutches 3 40 7 otherprop Objects n03141823 crutch.n.01 objects 39 +379 studio light studio light 3 38 7 light otherstructure Objects lighting 28 +130 stack of cups cup 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +1251 toilet flush button toilet flush button 3 40 7 otherprop Objects objects 39 +450 trunk trunk 3 40 7 otherprop Objects misc 40 +1252 grocery bag grocery bag 3 37 7 bag bag Objects suitcase 2773838 n03461288 grocery_bag.n.01 objects 39 +316 plastic bin plastic bin 3 40 7 bin otherprop Objects objects 39 +1253 pizza box pizza box 3 29 7 box box Objects objects 39 +385 cabinet doors cabinet door 3 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 door 4 +1254 legs legs 3 31 7 person person Objects person n05217688 person.n.02 misc 40 +461 car car 3 40 7 car otherprop Objects car car 2958343 n02958343 car.n.01 misc 40 +1255 shaving cream shaving cream 3 40 7 otherprop Objects n04186051 shaving_cream.n.01 objects 39 +1256 luggage stand luggage stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +599 shredder shredder 3 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +281 statue statue 3 40 7 sculpture otherprop Objects n04306847 statue.n.01 misc 40 +1257 urinal urinal 3 33 7 toilet toilet Objects toilet toilet n04515991 
urinal.n.01 toilet 18 +1258 hose hose 3 40 7 otherprop Objects n03539875 hose.n.03 misc 40 +1259 bike pump bike pump 3 40 7 otherprop Objects objects 39 +319 coatrack coatrack 3 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1260 bear bear 3 40 7 otherprop Objects objects 39 +28 wall lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1261 humidifier humidifier 3 40 7 otherprop Objects objects 39 +546 toothpaste toothpaste 3 40 7 toothpaste otherprop Objects objects 39 +1262 mouthwash bottle mouthwash bottle 3 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1263 poster cutter poster cutter 3 40 7 otherprop Objects objects 39 +1264 golf bag golf bag 3 37 7 bag bag Objects suitcase 2773838 n03445617 golf_bag.n.01 objects 39 +1265 food container food container 3 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1266 camera camera 3 40 7 otherprop Objects objects 39 +28 table lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n04380533 table_lamp.n.01 lighting 28 +1267 yoga mat yoga mat 3 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1268 card card 3 40 7 otherprop Objects objects 39 +1269 mug mug 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +188 shower doors shower door 3 38 7 otherstructure Objects n04208936 shower.n.01 door 4 +689 cardboard cardboard 3 40 7 otherprop Objects objects 39 +1270 rack stand rack stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +1271 boxes of paper boxes of paper 3 29 7 box box Objects n02883344 box.n.01 objects 39 +1272 flag flag 3 40 7 otherprop Objects misc 40 +354 futon futon 3 39 6 mattress otherfurniture Furniture n03408444 futon.n.01 sofa 10 +339 magazine magazine 3 40 7 magazine otherprop Objects n06595351 magazine.n.01 objects 39 +1009 exit sign exit sign 3 40 7 exit sign otherprop Objects misc 40 +1273 rolled poster rolled poster 3 40 7 otherprop 
Objects objects 39 +1274 wheel wheel 3 40 7 otherprop Objects objects 39 +15 pictures picture 3 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1275 blackboard eraser blackboard eraser 3 40 7 eraser otherprop Objects n03294833 eraser.n.01 objects 39 +361 organizer organizer 3 40 7 otherprop Objects n03918737 personal_digital_assistant.n.01 objects 39 +1276 doll doll 3 40 7 toy otherprop Objects n03219135 doll.n.01 objects 39 +326 book rack book rack 3 39 6 bookrack otherfurniture Furniture objects 39 +1277 laundry bag laundry bag 3 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +1278 sponge sponge 3 40 7 otherprop Objects n01906749 sponge.n.04 objects 39 +116 seating seat 3 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +1184 folded chairs folded chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1279 lotion bottle lotion bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +212 can can 2 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1280 lunch box lunch box 2 40 7 otherprop Objects objects 39 +1281 food display food display 2 40 7 otherprop Objects misc 40 +794 storage shelf storage shelf 2 40 7 otherprop Objects shelving 31 +1282 sliding wood door sliding wood door 2 40 7 otherprop Objects door 4 +955 pants pants 2 40 7 otherprop Objects n04489008 trouser.n.01 clothes 38 +387 wood wood 2 40 7 otherprop Objects misc 40 +69 boards board 2 38 7 board otherstructure Objects board_panel 35 +65 bottles bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +523 washcloth washcloth 2 40 7 otherprop Objects n04554523 washcloth.n.01 towel 20 +389 workbench workbench 2 39 6 bench otherfurniture Furniture bench table 4379243 n04600486 workbench.n.01 table 5 +29 open kitchen cabinet kitchen cabinet 2 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 
cabinet 7 +1283 organizer shelf organizer shelf 2 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +146 frame frame 2 38 7 otherstructure Objects misc 40 +130 cups cup 2 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +372 exercise ball exercise ball 2 40 7 ball otherprop Objects n04285146 sports_equipment.n.01 gym_equipment 33 +289 easel easel 2 39 6 stand otherfurniture Furniture n03262809 easel.n.01 furniture 36 +440 garbage bag garbage bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +321 roomba roomba 2 40 7 otherprop Objects objects 39 +976 garage door garage door 2 38 7 garage door otherstructure Objects door door 4 +1256 luggage rack luggage stand 2 39 6 stand otherfurniture Furniture n04038440 shelving 31 +1284 bike lock bike lock 2 40 7 otherprop Objects objects 39 +1285 briefcase briefcase 2 40 7 otherprop Objects n02900705 briefcase.n.01 objects 39 +357 hand towel hand towel 2 27 7 towel towel Objects n03490006 hand_towel.n.01 towel 20 +1286 bath products bath product 2 40 7 otherprop Objects objects 39 +1287 star star 2 40 7 otherprop Objects n09444783 star.n.03 misc 40 +365 map map 2 40 7 map otherprop Objects n03720163 map.n.01 misc 40 +1288 coffee bean bag coffee bean bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +81 headboard headboard 2 39 6 headboard otherfurniture Furniture n03502200 headboard.n.01 bed 11 +1289 ipad ipad 2 40 7 otherprop Objects objects 39 +1290 display rack display rack 2 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +948 traffic cone traffic cone 2 40 7 cone otherprop Objects cone objects 39 +174 toiletry toiletry 2 40 7 otherprop Objects n04447443 toiletry.n.01 objects 39 +1028 canopy canopy 2 40 7 otherprop Objects misc 40 +1291 massage chair massage chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1292 paper organizer paper organizer 2 
40 7 otherprop Objects objects 39 +1005 barricade barricade 2 40 7 otherprop Objects misc 40 +235 platform platform 2 38 7 otherstructure Objects misc 40 +1293 cap cap 2 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +1294 dumbbell plates dumbbell plates 2 40 7 otherprop Objects objects 39 +1295 elevator elevator 2 38 7 otherstructure Objects misc 40 +1296 cooking pan cooking pan 2 40 7 pan otherprop Objects n03880531 pan.n.01 objects 39 +1297 trash bag trash bag 2 37 7 bag bag Objects objects 39 +1298 santa santa 2 40 7 otherprop Objects misc 40 +1299 jewelry box jewelry box 2 29 7 box box Objects n02883344 box.n.01 objects 39 +1300 boat boat 2 40 7 otherprop Objects misc 40 +1301 sock sock 2 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1051 kinect kinect 2 40 7 kinect otherprop Objects objects 39 +566 crib crib 2 39 6 crib otherfurniture Furniture furniture 36 +1302 plastic storage bin plastic storage bin 2 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1062 cooler cooler 2 24 6 refridgerator refridgerator Furniture n03102654 cooler.n.01 appliances 37 +1303 kitchen apron kitchen apron 2 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1304 dishwashing soap bottle dishwashing soap bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1305 xbox controller xbox controller 2 40 7 otherprop Objects objects 39 +1306 banana holder banana holder 2 40 7 otherprop Objects objects 39 +298 ping pong paddle ping pong paddle 2 40 7 otherprop Objects table 5 +1307 airplane airplane 2 40 7 otherprop Objects misc 40 +1308 conditioner bottle conditioner bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1309 tea kettle tea kettle 2 40 7 tea kettle otherprop Objects n04397768 teakettle.n.01 objects 39 +43 bedframe bedframe 2 39 6 otherfurniture Furniture n02822579 bedstead.n.01 bed 11 +1310 wood beam wood beam 2 38 7 otherstructure 
Objects beam 29 +593 toilet paper package toilet paper package 2 40 7 otherprop Objects objects 39 +1311 wall mounted coat rack wall mounted coat rack 2 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1312 film light film light 2 40 7 otherprop Objects lighting 28 +749 ceiling lamp ceiling lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +623 chain chain 1 40 7 otherprop Objects chair 3 +1313 sofa sofa 1 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +99 closet wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +265 sweater sweater 1 40 7 otherprop Objects n04370048 sweater.n.01 clothes 38 +1314 kitchen mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +99 wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1315 water softener water softener 1 40 7 otherprop Objects misc 40 +448 banister banister 1 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +257 trolley trolley 1 40 7 trolley otherprop Objects n04335435 streetcar.n.01 misc 40 +1316 pantry shelf pantry shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +786 sofa bed sofa bed 1 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +801 loofa loofa 1 40 7 otherprop Objects objects 39 +972 shower faucet handle shower faucet handle 1 40 7 handle otherprop Objects shower 23 +1317 toy piano toy piano 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1318 fish fish 1 40 7 otherprop Objects n02512053 fish.n.01 objects 39 +75 file cabinets file cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n03337140 file.n.03 cabinet 7 +657 cat litter box cat litter box 1 29 7 box box Objects objects 39 +561 electric panel electric panel 1 40 7 otherprop Objects misc 40 +93 suitcases suitcase 1 40 7 luggage otherprop Objects n02774630 baggage.n.01 
objects 39 +513 curtain rod curtain rod 1 38 7 curtain rod otherstructure Objects curtain 12 +411 bunk bed bunk bed 1 39 6 bunk bed otherfurniture Furniture bed bed bed 2818832 n02920259 bunk_bed.n.01 bed 11 +1122 chandelier chandelier 1 38 7 chandelier otherstructure Objects n03005285 chandelier.n.01 lighting 28 +922 tape tape 1 40 7 tape otherprop Objects objects 39 +88 plates plate 1 40 7 otherprop Objects n03959485 plate.n.04 objects 39 +518 alarm alarm 1 40 7 alarm otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39 +814 fire hose fire hose 1 40 7 otherprop Objects n03346004 fire_hose.n.01 misc 40 +1319 toy dinosaur toy dinosaur 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1320 cone cone 1 40 7 otherprop Objects objects 39 +649 glass doors glass door 1 8 12 door door Wall door n03221720 door.n.01 door 4 +607 hatrack hatrack 1 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +819 subwoofer subwoofer 1 40 7 speaker otherprop Objects speaker 3691459 n04349401 subwoofer.n.01 objects 39 +1321 fire sprinkler fire sprinkler 1 40 7 otherprop Objects misc 40 +1322 trash cabinet trash cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +1204 pantry walls pantry wall 1 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +227 photo photo 1 40 7 photo otherprop Objects n03925226 photograph.n.01 picture 6 +817 barrier barrier 1 40 7 otherprop Objects n02796623 barrier.n.01 misc 40 +130 stacks of cups cup 1 40 7 otherprop Objects n03147509 cup.n.01 objects 39 +712 beachball beachball 1 40 7 ball otherprop Objects n02814224 beach_ball.n.01 objects 39 +1323 folded boxes folded boxes 1 40 7 otherprop Objects objects 39 +1324 contact lens solution bottle contact lens solution bottle 1 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +673 covered box covered box 1 29 7 box box Objects objects 39 +459 folder folder 1 40 7 folder otherprop Objects n03376279 
folder.n.02 objects 39 +643 mail trays mail tray 1 40 7 mail tray otherprop Objects objects 39 +238 slipper slipper 1 40 7 otherprop Objects n04241394 slipper.n.01 clothes 38 +765 magazine rack magazine rack 1 39 6 stand otherfurniture Furniture n03704549 magazine_rack.n.01 shelving 31 +1008 sticker sticker 1 40 7 sticker otherprop Objects n07272545 gummed_label.n.01 objects 39 +225 lotion lotion 1 40 7 otherprop Objects n03690938 lotion.n.01 objects 39 +1083 buddha buddha 1 40 7 otherprop Objects objects 39 +813 file organizer file organizer 1 40 7 otherprop Objects objects 39 +138 paper towel rolls paper towel roll 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +1145 night lamp night lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +796 fuse box fuse box 1 40 7 otherprop Objects misc 40 +1325 knife block knife block 1 40 7 otherprop Objects objects 39 +363 furnace furnace 1 39 6 furnace otherfurniture Furniture n03404449 furnace.n.01 +1174 cd cases cd case 1 40 7 otherprop Objects objects 39 +38 stools stool 1 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +1326 hand sanitzer dispenser hand sanitzer dispenser 1 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +997 teapot teapot 1 40 7 tea pot otherprop Objects n04398044 teapot.n.01 objects 39 +1327 pen holder pen holder 1 40 7 otherprop Objects objects 39 +1328 tray rack tray rack 1 40 7 otherprop Objects objects 39 +1329 wig wig 1 40 7 otherprop Objects n04584207 wig.n.01 objects 39 +182 switch switch 1 40 7 otherprop Objects n04372370 switch.n.01 misc 40 +280 plastic containers plastic container 1 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1330 night light night light 1 40 7 otherprop Objects lighting 28 +1331 notepad notepad 1 40 7 otherprop Objects objects 39 +1332 mail bin mail bin 1 40 7 otherprop Objects misc 40 +1333 elevator button elevator button 1 40 7 otherprop Objects misc 40 +939 
gaming wheel gaming wheel 1 40 7 otherprop Objects objects 39 +1334 drum set drum set 1 40 7 otherprop Objects objects 39 +480 cosmetic bag cosmetic bag 1 37 7 bag bag Objects objects 39 +907 coffee mug coffee mug 1 40 7 vessel otherprop Objects cup or mug 3797390 n03063599 coffee_mug.n.01 objects 39 +1335 closet shelf closet shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +1336 baby mobile baby mobile 1 40 7 otherprop Objects objects 39 +829 diaper bin diaper bin 1 40 7 bin otherprop Objects objects 39 +947 door wall door wall 1 1 12 wall wall Wall wall 1 +1116 stepstool stepstool 1 40 7 step stool otherprop Objects objects 39 +599 paper shredder shredder 1 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +733 dress rack dress rack 1 40 7 otherprop Objects n03238762 dress_rack.n.01 misc 40 +123 cover cover 1 40 7 blanket otherprop Objects objects 39 +506 shopping bag shopping bag 1 37 7 bag bag Objects n04204081 shopping_bag.n.01 objects 39 +569 sliding door sliding door 1 8 12 door door Wall door n04239074 sliding_door.n.01 door 4 +1337 exercise bike exercise bike 1 40 7 machine otherprop Objects n04210120 shredder.n.01 gym_equipment 33 +1338 recliner chair recliner chair 1 5 4 chair chair Chair chair chair chair 3001627 n03238762 dress_rack.n.01 chair 3 +1314 kitchenaid mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +1339 soda can soda can 1 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1340 stovetop stovetop 1 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +851 stepladder stepladder 1 39 6 ladder otherfurniture Furniture stairs n04315599 step_ladder.n.01 stairs 16 +142 tap tap 1 40 7 faucet otherprop Objects faucet 3325088 n04559451 water_faucet.n.01 objects 39 +436 cable cable 1 40 7 cables otherprop Objects objects 39 +1341 baby changing station baby changing station 1 39 6 otherfurniture Furniture furniture 36 +1342 costume 
costume 1 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +885 rocking chair rocking chair 1 5 4 chair chair Chair chair chair chair 3001627 n04099969 rocking_chair.n.01 chair 3 +693 binder binder 1 40 7 binder otherprop Objects objects 39 +815 media center media center 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +401 towel rack towel rack 1 40 7 otherprop Objects n04459773 towel_rack.n.01 misc 40 +1343 medal medal 1 40 7 otherprop Objects objects 39 +1184 stack of folded chairs folded chair 1 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1344 telescope telescope 1 40 7 otherprop Objects n04403638 telescope.n.01 objects 39 +1345 closet doorframe closet doorframe 1 8 12 door door Wall door door 4 +160 glass glass 1 38 7 glass otherstructure Objects n03438257 glass.n.02 misc 40 +1126 baseball cap baseball cap 1 40 7 otherprop Objects cap 2954340 n02799323 baseball_cap.n.01 clothes 38 +1346 battery disposal jar battery disposal jar 1 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +332 mop mop 1 40 7 otherprop Objects n04367480 swab.n.02 objects 39 +397 tank tank 1 40 7 otherprop Objects objects 39 +643 mail tray mail tray 1 40 7 mail tray otherprop Objects objects 39 +551 centerpiece centerpiece 1 40 7 centerpiece otherprop Objects n02994419 centerpiece.n.02 objects 39 +1163 stick stick 1 40 7 stick otherprop Objects objects 39 +1347 closet floor closet floor 1 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +1348 dryer sheets dryer sheets 1 40 7 otherprop Objects objects 39 +803 bycicle bycicle 1 40 7 otherprop Objects misc 40 +484 flower stand flower stand 1 39 6 stand otherfurniture Furniture furniture 36 +1349 air mattress air mattress 1 4 1 bed bed Bed bed bed bed 2818832 n02690809 air_mattress.n.01 bed 11 +1350 clip clip 1 40 7 otherprop Objects objects 39 +222 side table side table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 
table 5 +1253 pizza boxes pizza box 1 29 7 box box Objects n02883344 box.n.01 objects 39 +1351 display display 1 39 7 otherfurniture Furniture n03211117 display.n.06 misc 40 +1352 postcard postcard 1 40 7 otherprop Objects objects 39 +828 display sign display sign 1 40 7 sign otherprop Objects misc 40 +1353 paper towel paper towel 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +612 boots boot 1 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1354 tennis racket bag tennis racket bag 1 40 7 otherprop Objects objects 39 +1355 air hockey table air hockey table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +1301 socks sock 1 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1356 food bag food bag 1 37 7 bag bag Objects objects 39 +1199 clothes hangers clothes hanger 1 40 7 otherprop Objects n03057920 coat_hanger.n.01 misc 40 +1357 starbucks cup starbucks cup 1 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 diff --git a/datasets/scannet_preprocess/meta_data/scannetv2-labels.combined.tsv b/datasets/scannet_preprocess/meta_data/scannetv2-labels.combined.tsv new file mode 100644 index 0000000000000000000000000000000000000000..cff61b132f3ebf4edd513445b76fd39db54462d2 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2-labels.combined.tsv @@ -0,0 +1,608 @@ +id raw_category category count nyu40id eigen13id nyuClass nyu40class eigen13class ModelNet40 ModelNet10 ShapeNetCore55 synsetoffset wnsynsetid wnsynsetkey mpcat40 mpcat40index +1 wall wall 8277 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +2 chair chair 4646 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +22 books book 1678 23 2 book books Books n02870526 book.n.11 objects 39 +3 floor floor 1553 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +5 door door 1483 8 12 door door Wall door n03221720 door.n.01 door 4 +1163 object object 1313 40 7 otherprop Objects 
objects 39 +16 window window 1209 9 13 window window Window n04587648 window.n.01 window 9 +4 table table 1170 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +56 trash can trash can 1090 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +13 pillow pillow 937 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +15 picture picture 862 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +41 ceiling ceiling 806 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +26 box box 775 29 7 box box Objects n02883344 box.n.01 objects 39 +161 doorframe doorframe 768 8 12 door door Wall door doorframe.n.01 door 4 +19 monitor monitor 765 40 7 monitor otherprop Objects monitor monitor tv or monitor 3211117 n03782190 monitor.n.04 objects 39 +7 cabinet cabinet 731 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +9 desk desk 680 14 10 desk desk Table desk desk table 4379243 n03179701 desk.n.01 table 5 +8 shelf shelf 641 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +10 office chair office chair 595 5 4 chair chair Chair chair chair chair 3001627 n04373704 swivel_chair.n.01 chair 3 +31 towel towel 570 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +6 couch couch 502 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +14 sink sink 488 34 7 sink sink Objects sink n04223580 sink.n.01 sink 15 +48 backpack backpack 479 40 7 backpack otherprop Objects n02769748 backpack.n.01 objects 39 +28 lamp lamp 419 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +11 bed bed 370 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +18 bookshelf bookshelf 360 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +71 mirror mirror 349 19 7 mirror mirror Objects n03773035 mirror.n.01 mirror 21 +21 curtain 
curtain 347 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +40 plant plant 331 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +52 whiteboard whiteboard 327 30 7 whiteboard whiteboard Objects n03211616 display_panel.n.01 board_panel 35 +96 radiator radiator 322 39 6 radiator otherfurniture Furniture n04041069 radiator.n.02 misc 40 +22 book book 318 23 2 book books Books n02870526 book.n.11 objects 39 +29 kitchen cabinet kitchen cabinet 310 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7 +49 toilet paper toilet paper 291 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +29 kitchen cabinets kitchen cabinet 289 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +23 armchair armchair 281 5 4 chair chair Chair chair chair chair 3001627 n02738535 armchair.n.01 chair 3 +63 shoes shoe 272 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +24 coffee table coffee table 258 7 10 coffee table table Table table table table 4379243 n03063968 coffee_table.n.01 table 5 +17 toilet toilet 256 33 7 toilet toilet Objects toilet toilet n04446276 toilet.n.01 toilet 18 +47 bag bag 252 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +32 clothes clothes 248 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +46 keyboard keyboard 246 40 7 keyboard otherprop Objects keyboard computer keyboard 3085013 n03085013 computer_keyboard.n.01 objects 39 +65 bottle bottle 226 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +97 recycling bin recycling bin 225 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +34 nightstand nightstand 224 32 6 night stand night stand Furniture night_stand night_stand n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +38 stool stool 221 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +33 tv tv 219 25 11 television television 
TV tv or monitor 3211117 n03211117 display.n.06 tv_monitor 22 +75 file cabinet file cabinet 217 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +36 dresser dresser 213 17 6 dresser dresser Furniture dresser dresser n03015254 chest_of_drawers.n.01 chest_of_drawers 13 +64 computer tower computer tower 203 40 7 computer otherprop Objects n03082979 computer.n.01 objects 39 +32 clothing clothes 165 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +101 telephone telephone 164 40 7 telephone otherprop Objects telephone 4401088 n04401088 telephone.n.01 objects 39 +130 cup cup 157 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +27 refrigerator refrigerator 154 24 6 refridgerator refridgerator Furniture n04070727 refrigerator.n.01 appliances 37 +44 end table end table 147 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +131 jacket jacket 146 40 7 jacket otherprop Objects n03589791 jacket.n.01 clothes 38 +55 shower curtain shower curtain 144 28 7 shower curtain shower curtain Objects curtain n04209239 shower_curtain.n.01 curtain 12 +42 bathtub bathtub 144 36 7 bathtub bathtub Objects bathtub bathtub tub 2808440 n02808440 bathtub.n.01 bathtub 25 +59 microwave microwave 141 40 7 microwave otherprop Objects microwave 3761084 n03761084 microwave.n.02 appliances 37 +159 kitchen counter kitchen counter 140 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +74 sofa chair sofa chair 129 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +82 paper towel dispenser paper towel dispenser 129 40 7 paper towel dispenser otherprop Objects objects 39 +1164 bathroom vanity bathroom vanity 126 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 table 5 +93 suitcase suitcase 118 40 7 luggage otherprop Objects n02773838 bag.n.06 objects 39 +77 laptop laptop 111 40 7 laptop otherprop Objects laptop laptop 
3642806 n03642806 laptop.n.01 objects 39 +67 ottoman ottoman 111 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +128 shower walls shower wall 109 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +50 printer printer 106 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 +35 counter counter 104 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +69 board board 100 38 7 board otherstructure Objects board_panel 35 +100 soap dispenser soap dispenser 99 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +62 stove stove 95 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +105 light light 93 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +1165 closet wall closet wall 90 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +165 mini fridge mini fridge 87 24 6 refridgerator refridgerator Furniture n03273913 electric_refrigerator.n.01 appliances 37 +7 cabinets cabinet 79 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +5 doors door 76 8 12 door door Wall door n03221720 door.n.01 door 4 +76 fan fan 75 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +230 tissue box tissue box 73 40 7 tissue box otherprop Objects n02883344 box.n.01 objects 39 +54 blanket blanket 72 40 7 blanket otherprop Objects n02849154 blanket.n.01 objects 39 +125 bathroom stall bathroom stall 71 38 7 otherstructure Objects n02873839 booth.n.02 misc 40 +72 copier copier 70 40 7 otherprop Objects n03257586 duplicator.n.01 appliances 37 +68 bench bench 66 39 6 bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +145 bar bar 66 38 7 bar otherstructure Objects n02788689 bar.n.03 misc 40 +157 soap dish soap dish 65 40 7 soap dish otherprop Objects n04254009 soap_dish.n.01 objects 39 +1166 laundry hamper laundry hamper 65 40 7 laundry basket otherprop Objects objects 39 +132 storage bin 
storage bin 63 40 7 storage bin otherprop Objects objects 39 +1167 bathroom stall door bathroom stall door 62 8 12 door door Wall door n03221720 door.n.01 door 4 +232 light switch light switch 61 38 7 light switch otherstructure Objects n04372370 switch.n.01 misc 40 +134 coffee maker coffee maker 61 40 7 otherprop Objects n03063338 coffee_maker.n.01 appliances 37 +51 tv stand tv stand 61 39 6 tv stand otherfurniture Furniture tv_stand n03290653 entertainment_center.n.01 furniture 36 +250 decoration decoration 60 40 7 otherprop Objects n03169390 decoration.n.01 misc 40 +1168 ceiling light ceiling light 59 38 7 light otherstructure Objects n03665366 light.n.02 lighting 28 +342 range hood range hood 59 38 7 range hood otherstructure Objects range_hood n04053677 range_hood.n.01 misc 40 +89 blackboard blackboard 58 38 7 blackboard otherstructure Objects n02846511 blackboard.n.01 board_panel 35 +103 clock clock 58 40 7 clock otherprop Objects clock 3046257 n03046257 clock.n.01 objects 39 +99 wardrobe closet wardrobe 54 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +95 rail rail 53 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +154 bulletin board bulletin board 53 38 7 board otherstructure Objects n03211616 display_panel.n.01 board_panel 35 +140 mat mat 52 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1169 trash bin trash bin 52 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +193 ledge ledge 51 38 7 otherstructure Objects n09337253 ledge.n.01 misc 40 +116 seat seat 49 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +202 mouse mouse 49 40 7 mouse otherprop Objects n03793489 mouse.n.04 objects 39 +73 basket basket 48 40 7 basket otherprop Objects basket 2801938 n02801938 basket.n.01 objects 39 +78 shower shower 48 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +1170 dumbbell dumbbell 48 40 7 otherprop Objects 
n03255030 dumbbell.n.01 objects 39 +79 paper paper 46 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +80 person person 46 31 7 person person Objects person n05217688 person.n.02 misc 40 +141 windowsill windowsill 45 38 7 otherstructure Objects n04590263 windowsill.n.01 window 9 +57 closet closet 45 39 6 wardrobe otherfurniture Furniture wardrobe misc 40 +102 bucket bucket 45 40 7 bucket otherprop Objects n02909870 bucket.n.01 misc 40 +261 sign sign 44 40 7 sign otherprop Objects n04217882 signboard.n.01 objects 39 +118 speaker speaker 43 40 7 speaker otherprop Objects speaker 3691459 n03691459 loudspeaker.n.01 objects 39 +136 dishwasher dishwasher 43 38 7 dishwasher otherstructure Objects dishwasher 3207941 n03207941 dishwasher.n.01 appliances 37 +98 container container 43 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1171 stair rail stair rail 42 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +170 shower curtain rod shower curtain rod 42 40 7 otherprop Objects curtain 12 +1172 tube tube 41 40 7 otherprop Objects misc 40 +1173 bathroom cabinet bathroom cabinet 39 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +79 papers paper 39 26 7 paper paper Objects n14974264 paper.n.01 objects 39 +221 storage container storage container 39 40 7 container otherprop Objects objects 39 +570 paper bag paper bag 39 37 7 bag bag Objects n04122825 sack.n.01 objects 39 +138 paper towel roll paper towel roll 39 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +168 ball ball 39 40 7 ball otherprop Objects objects 39 +276 closet doors closet door 38 8 12 door door Wall door n03221720 door.n.01 door 4 +106 laundry basket laundry basket 37 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +214 cart cart 37 40 7 cart otherprop Objects n03484083 handcart.n.01 shelving 31 +276 closet door closet door 35 8 12 door door Wall door n03221720 
door.n.01 door 4 +323 dish rack dish rack 35 40 7 dish rack otherprop Objects n03207630 dish_rack.n.01 objects 39 +58 stairs stairs 35 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +86 blinds blinds 35 13 13 blinds blinds Window n02851099 blind.n.03 blinds 32 +2 stack of chairs chair 35 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +399 purse purse 34 40 7 purse otherprop Objects n02774152 bag.n.04 objects 39 +121 bicycle bicycle 33 40 7 bicycle otherprop Objects bicycle 2834778 n02834778 bicycle.n.01 objects 39 +185 tray tray 32 40 7 tray otherprop Objects n04476259 tray.n.01 objects 39 +300 plunger plunger 30 40 7 otherprop Objects n03970156 plunger.n.03 objects 39 +180 paper cutter paper cutter 30 40 7 paper cutter otherprop Objects n03886940 paper_cutter.n.01 objects 39 +163 toilet paper dispenser toilet paper dispenser 29 40 7 otherprop Objects objects 39 +26 boxes box 29 29 7 box box Objects n02883344 box.n.01 objects 39 +66 bin bin 28 40 7 bin otherprop Objects n02839910 bin.n.01 objects 39 +208 toilet seat cover dispenser toilet seat cover dispenser 28 40 7 otherprop Objects objects 39 +112 guitar guitar 28 40 7 guitar otherprop Objects guitar guitar 3467517 n03467517 guitar.n.01 objects 39 +540 mailboxes mailbox 28 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +395 handicap bar handicap bar 27 38 7 bar otherstructure Objects misc 40 +166 fire extinguisher fire extinguisher 27 40 7 fire extinguisher otherprop Objects n03345837 fire_extinguisher.n.01 misc 40 +122 ladder ladder 27 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 stairs 16 +120 column column 26 38 7 column otherstructure Objects n03074380 column.n.06 column 24 +107 pipe pipe 25 40 7 pipe otherprop Objects n03944672 pipe.n.02 misc 40 +283 vacuum cleaner vacuum cleaner 25 40 7 otherprop Objects n04517823 vacuum.n.04 objects 39 +88 plate plate 24 40 7 plate otherprop Objects n03959485 plate.n.04 objects 
39 +90 piano piano 24 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +177 water cooler water cooler 24 39 6 water cooler otherfurniture Furniture n04559166 water_cooler.n.01 misc 40 +1174 cd case cd case 24 40 7 otherprop Objects objects 39 +562 bowl bowl 24 40 7 bowl otherprop Objects bowl bowl 2880940 n02880940 bowl.n.03 objects 39 +1175 closet rod closet rod 24 40 7 otherprop Objects n04100174 rod.n.01 misc 40 +1156 bathroom counter bathroom counter 24 12 6 counter counter Furniture table table table 4379243 n03116530 counter.n.01 counter 26 +84 oven oven 23 38 7 oven otherstructure Objects n03862676 oven.n.01 appliances 37 +104 stand stand 23 39 6 stand otherfurniture Furniture table table table 4379243 n04301000 stand.n.04 table 5 +229 scale scale 23 40 7 scale otherprop Objects n04141975 scale.n.07 objects 39 +70 washing machine washing machine 23 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +325 broom broom 22 40 7 broom otherprop Objects n02906734 broom.n.01 objects 39 +169 hat hat 22 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +128 shower wall shower wall 22 1 12 wall wall Wall n04208936 shower.n.01 wall 1 +331 guitar case guitar case 21 40 7 guitar case otherprop Objects objects 39 +87 rack rack 21 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +488 water pitcher water pitcher 21 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +776 laundry detergent laundry detergent 21 40 7 otherprop Objects objects 39 +370 hair dryer hair dryer 21 40 7 hair dryer otherprop Objects n03483316 hand_blower.n.01 objects 39 +191 pillar pillar 21 38 7 column otherstructure Objects n03073977 column.n.07 column 24 +748 divider divider 20 40 7 otherprop Objects wall 1 +242 power outlet power outlet 19 40 7 otherprop Objects misc 40 +45 dining table dining table 19 7 10 table table Table table table table 4379243 n04379243 
table.n.02 table 5 +417 shower floor shower floor 19 2 5 floor floor Floor n04208936 shower.n.01 floor 2 +70 washing machines washing machine 19 39 6 washing machine otherfurniture Furniture washing_machine 4554684 n04554684 washer.n.03 appliances 37 +188 shower door shower door 19 8 12 door door Wall door n04208936 shower.n.01 door 4 +1176 coffee kettle coffee kettle 18 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39 +1177 wardrobe cabinet wardrobe 18 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1178 structure structure 18 38 7 otherstructure Objects misc 40 +18 bookshelves bookshelf 17 10 6 bookshelf bookshelf Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +110 clothes dryer clothes dryer 17 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +148 toaster toaster 17 40 7 toaster otherprop Objects n04442312 toaster.n.02 appliances 37 +63 shoe shoe 17 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +155 ironing board ironing board 16 39 6 ironing board otherfurniture Furniture n03586090 ironing_board.n.01 objects 39 +572 alarm clock alarm clock 16 40 7 alarm clock otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39 +1179 shower head shower head 15 38 7 otherstructure Objects shower 23 +28 lamp base lamp 15 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +392 water bottle water bottle 15 40 7 bottle otherprop Objects bottle bottle 2876657 n04557648 water_bottle.n.01 objects 39 +1180 keyboard piano keyboard piano 15 39 6 piano otherfurniture Furniture piano piano 3928116 n03928116 piano.n.01 furniture 36 +609 projector screen projector screen 15 38 7 projector screen otherstructure Objects misc 40 +1181 case of water bottles case of water bottles 15 40 7 otherprop Objects objects 39 +195 toaster oven toaster oven 14 40 7 toaster oven otherprop Objects n04442441 toaster_oven.n.01 appliances 37 +581 music stand music stand 
14 39 6 music stand otherfurniture Furniture n03801760 music_stand.n.01 furniture 36 +58 staircase stairs 14 38 7 stairs otherstructure Objects n04298308 stairway.n.01 stairs 16 +1182 coat rack coat rack 14 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 3 +1183 storage organizer storage organizer 14 40 7 otherprop Objects shelving 3 +139 machine machine 14 40 7 machine otherprop Objects n03699975 machine.n.01 appliances 37 +1184 folded chair folded chair 14 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1185 fire alarm fire alarm 14 40 7 otherprop Objects n03343737 fire_alarm.n.02 misc 40 +156 fireplace fireplace 13 38 7 fireplace otherstructure Objects n03346455 fireplace.n.01 fireplace 27 +408 vent vent 13 40 7 otherprop Objects n04526241 vent.n.01 misc 40 +213 furniture furniture 13 39 6 furniture otherfurniture Furniture n03405725 furniture.n.01 furniture 36 +1186 power strip power strip 13 40 7 otherprop Objects objects 39 +1187 calendar calendar 13 40 7 otherprop Objects objects 39 +1188 poster poster 13 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +115 toilet paper holder toilet paper holder 13 40 7 toilet paper holder otherprop Objects objects 39 +1189 potted plant potted plant 12 40 7 plant otherprop Objects plant n00017222 plant.n.02 plant 14 +304 stuffed animal stuffed animal 12 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +1190 luggage luggage 12 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39 +21 curtains curtain 12 16 13 curtain curtain Window curtain n03151077 curtain.n.01 curtain 12 +312 headphones headphones 12 40 7 otherprop Objects n03261776 earphone.n.01 objects 39 +233 crate crate 12 39 6 crate otherfurniture Furniture n03127925 crate.n.01 objects 39 +286 candle candle 12 40 7 candle otherprop Objects lamp n02948072 candle.n.01 objects 39 +264 projector projector 12 40 7 projector otherprop Objects n04009552 projector.n.02 objects 39 +110 clothes 
dryers clothes dryer 12 39 6 otherfurniture Furniture n03251766 dryer.n.01 appliances 37 +1191 mattress mattress 12 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +356 dustpan dustpan 12 40 7 otherprop Objects n03259009 dustpan.n.02 objects 39 +25 drawer drawer 11 39 6 drawer otherfurniture Furniture n03233905 drawer.n.01 furniture 36 +750 rod rod 11 40 7 otherprop Objects pistol 3948459 n03427202 gat.n.01 misc 40 +269 globe globe 11 40 7 globe otherprop Objects objects 39 +307 footrest footrest 11 39 6 foot rest otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +410 piano bench piano bench 11 39 6 piano bench otherfurniture Furniture bench bench 2828884 n02828884 bench.n.01 seating 34 +730 breakfast bar breakfast bar 11 38 7 bar otherstructure Objects counter 26 +216 step stool step stool 11 40 7 step stool otherprop Objects stool n04315713 step_stool.n.01 stool 19 +1192 hand rail hand rail 11 38 7 railing otherstructure Objects railing 30 +119 vending machine vending machine 11 40 7 machine otherprop Objects n04525305 vending_machine.n.01 appliances 37 +682 ceiling fan ceiling fan 11 40 7 fan otherprop Objects n03320046 fan.n.01 misc 40 +434 swiffer swiffer 11 40 7 otherprop Objects objects 39 +126 foosball table foosball table 11 39 6 foosball table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +919 jar jar 11 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +85 footstool footstool 11 39 6 ottoman otherfurniture Furniture stool n03380724 footstool.n.01 stool 19 +1193 folded table folded table 10 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +108 round table round table 10 7 10 table table Table table table table 4379243 n04114554 round_table.n.02 table 5 +135 hamper hamper 10 40 7 basket otherprop Objects basket 2801938 n03482405 hamper.n.02 objects 39 +1194 poster tube poster tube 10 40 7 otherprop Objects objects 39 +432 case case 10 40 7 case 
otherprop Objects objects 39 +53 carpet carpet 10 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +1195 thermostat thermostat 10 40 7 otherprop Objects n04422875 thermostat.n.01 misc 40 +111 coat coat 10 40 7 jacket otherprop Objects n03057021 coat.n.01 clothes 38 +305 water fountain water fountain 10 38 7 water fountain otherstructure Objects n03241335 drinking_fountain.n.01 misc 40 +1125 smoke detector smoke detector 10 40 7 otherprop Objects misc 40 +13 pillows pillow 9 18 7 pillow pillow Objects pillow 3938244 n03938244 pillow.n.01 cushion 8 +1196 flip flops flip flops 9 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1197 cloth cloth 9 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1198 banner banner 9 40 7 otherprop Objects n02788021 banner.n.01 misc 40 +1199 clothes hanger clothes hanger 9 40 7 otherprop Objects n03057920 coat_hanger.n.01 objects 39 +1200 whiteboard eraser whiteboard eraser 9 40 7 otherprop Objects objects 39 +378 iron iron 9 40 7 otherprop Objects n03584829 iron.n.04 objects 39 +591 instrument case instrument case 9 40 7 case otherprop Objects objects 39 +49 toilet paper rolls toilet paper 9 40 7 toilet paper otherprop Objects n15075141 toilet_tissue.n.01 objects 39 +92 soap soap 9 40 7 soap otherprop Objects n04253437 soap.n.01 objects 39 +1098 block block 9 40 7 otherprop Objects misc 40 +291 wall hanging wall hanging 8 40 7 otherprop Objects n03491178 hanging.n.01 picture 6 +1063 kitchen island kitchen island 8 38 7 kitchen island otherstructure Objects n03620600 kitchen_island.n.01 counter 26 +107 pipes pipe 8 38 7 otherstructure Objects misc 40 +1135 toothbrush toothbrush 8 40 7 toothbrush otherprop Objects n04453156 toothbrush.n.01 objects 39 +189 shirt shirt 8 40 7 otherprop Objects n04197391 shirt.n.01 clothes 38 +245 cutting board cutting board 8 40 7 cutting board otherprop Objects n03025513 chopping_board.n.01 objects 39 +194 vase vase 8 40 7 vase otherprop Objects vase jar 3593526 n04522168 
vase.n.01 objects 39 +1201 shower control valve shower control valve 8 38 7 otherstructure Objects n04208936 shower.n.01 shower 23 +386 exercise machine exercise machine 8 40 7 machine otherprop Objects gym_equipment 33 +1202 compost bin compost bin 8 39 6 garbage bin otherfurniture Furniture trash_bin 2747177 n02747177 ashcan.n.01 objects 39 +857 shorts shorts 8 40 7 shorts otherprop Objects clothes 38 +452 tire tire 8 40 7 otherprop Objects n04440749 tire.n.01 objects 39 +1203 teddy bear teddy bear 7 40 7 stuffed animal otherprop Objects n04399382 teddy.n.01 objects 39 +346 bathrobe bathrobe 7 40 7 otherprop Objects n02807616 bathrobe.n.01 clothes 38 +152 handrail handrail 7 38 7 railing otherstructure Objects n02788148 bannister.n.02 railing 30 +83 faucet faucet 7 40 7 faucet otherprop Objects faucet 3325088 n03325088 faucet.n.01 misc 40 +1204 pantry wall pantry wall 7 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +726 thermos thermos 7 40 7 flask otherprop Objects bottle bottle 2876657 n04422727 thermos.n.01 objects 39 +61 rug rug 7 40 7 rug otherprop Objects n04118021 rug.n.01 floor 2 +39 couch cushions cushion 7 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1117 tripod tripod 7 39 6 stand otherfurniture Furniture n04485082 tripod.n.01 objects 39 +540 mailbox mailbox 7 29 7 box box Objects mailbox 3710193 n03710193 mailbox.n.01 misc 40 +1205 tupperware tupperware 7 40 7 otherprop Objects objects 39 +415 shoe rack shoe rack 7 40 7 shoe rack otherprop Objects shelving 31 +31 towels towel 6 27 7 towel towel Objects n04459362 towel.n.01 towel 20 +1206 beer bottles beer bottle 6 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +153 treadmill treadmill 6 39 6 treadmill otherfurniture Furniture n04477387 treadmill.n.01 gym_equipment 33 +1207 salt salt 6 40 7 otherprop Objects objects 39 +129 chest chest 6 39 6 chest otherfurniture Furniture dresser dresser chest_of_drawers 13 +220 dispenser dispenser 6 40 7 
otherprop Objects n03210683 dispenser.n.01 objects 39 +1208 mirror doors mirror door 6 8 12 door door Wall door n03221720 door.n.01 door 4 +231 remote remote 6 40 7 otherprop Objects remote_control 4074963 n04074963 remote_control.n.01 objects 39 +1209 folded ladder folded ladder 6 39 6 ladder otherfurniture Furniture stairs n03632277 ladder.n.01 misc 40 +39 cushion cushion 6 18 7 pillow pillow Objects n03151500 cushion.n.03 cushion 8 +1210 carton carton 6 40 7 otherprop Objects objects 39 +117 step step 6 38 7 otherstructure Objects n04314914 step.n.04 misc 40 +822 drying rack drying rack 6 39 6 drying rack otherfurniture Furniture shelving 31 +238 slippers slipper 6 40 7 shoe otherprop Objects n04241394 slipper.n.01 clothes 38 +143 pool table pool table 6 39 6 pool table otherfurniture Furniture table table table 4379243 n03982430 pool_table.n.01 table 5 +1211 soda stream soda stream 6 40 7 otherprop Objects objects 39 +228 toilet brush toilet brush 6 40 7 toilet brush otherprop Objects objects 39 +494 loft bed loft bed 6 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +226 cooking pot cooking pot 6 40 7 pot otherprop Objects objects 39 +91 heater heater 6 39 6 heater otherfurniture Furniture n03508101 heater.n.01 misc 40 +1072 messenger bag messenger bag 6 37 7 bag bag Objects objects 39 +435 stapler stapler 6 40 7 stapler otherprop Objects n04303497 stapler.n.01 objects 39 +1165 closet walls closet wall 5 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +345 scanner scanner 5 40 7 otherprop Objects appliances 37 +893 elliptical machine elliptical machine 5 40 7 machine otherprop Objects gym_equipment 33 +621 kettle kettle 5 40 7 pot otherprop Objects n03612814 kettle.n.01 objects 39 +1212 metronome metronome 5 40 7 otherprop Objects n03757604 metronome.n.01 objects 39 +297 dumbell dumbell 5 40 7 otherprop Objects objects 39 +1213 music book music book 5 23 2 book books Books n02870526 book.n.11 objects 39 +1214 rice cooker rice cooker 5 40 7 
otherprop Objects objects 39 +1215 dart board dart board 5 38 7 board otherstructure Objects n03162940 dartboard.n.01 objects 39 +529 sewing machine sewing machine 5 40 7 sewing machine otherprop Objects n04179913 sewing_machine.n.01 objects 39 +1216 grab bar grab bar 5 38 7 railing otherstructure Objects railing 30 +1217 flowerpot flowerpot 5 40 7 vase otherprop Objects vase jar 3593526 n04522168 vase.n.01 objects 39 +1218 painting painting 5 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1219 railing railing 5 38 7 railing otherstructure Objects n04047401 railing.n.01 railing 30 +1220 stair stair 5 38 7 stairs otherstructure Objects stairs n04314914 step.n.04 stairs 16 +525 toolbox toolbox 5 39 6 chest otherfurniture Furniture n04452615 toolbox.n.01 objects 39 +204 nerf gun nerf gun 5 40 7 otherprop Objects objects 39 +693 binders binder 5 40 7 binder otherprop Objects objects 39 +179 desk lamp desk lamp 5 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1221 quadcopter quadcopter 5 40 7 otherprop Objects objects 39 +1222 pitcher pitcher 5 40 7 pitcher otherprop Objects n03950228 pitcher.n.02 objects 39 +1223 hanging hanging 5 40 7 otherprop Objects misc 40 +1224 mail mail 5 40 7 otherprop Objects misc 40 +1225 closet ceiling closet ceiling 5 22 3 ceiling ceiling Ceiling n02990373 ceiling.n.01 ceiling 17 +1226 hoverboard hoverboard 5 40 7 otherprop Objects objects 39 +1227 beanbag chair beanbag chair 5 39 6 bean bag otherfurniture Furniture n02816656 beanbag.n.01 chair 3 +571 water heater water heater 5 40 7 water heater otherprop Objects n04560113 water_heater.n.01 misc 40 +1228 spray bottle spray bottle 5 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +556 rope rope 5 40 7 rope otherprop Objects n04108268 rope.n.01 objects 39 +280 plastic container plastic container 5 40 7 container otherprop Objects objects 39 +1229 soap bottle soap bottle 5 40 7 soap otherprop Objects objects 39 
+1230 ikea bag ikea bag 4 37 7 bag bag Objects 2773838 n02773838 bag.n.06 objects 39 +1231 sleeping bag sleeping bag 4 40 7 otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +1232 duffel bag duffel bag 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +746 frying pan frying pan 4 40 7 frying pan otherprop Objects n03400231 frying_pan.n.01 objects 39 +1233 oven mitt oven mitt 4 40 7 otherprop Objects objects 39 +1234 pot pot 4 40 7 pot otherprop Objects n04235860 sleeping_bag.n.01 objects 39 +144 hand dryer hand dryer 4 40 7 otherprop Objects objects 39 +282 dollhouse dollhouse 4 39 6 doll house otherfurniture Furniture n03219483 dollhouse.n.01 objects 39 +167 shampoo bottle shampoo bottle 4 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1235 hair brush hair brush 4 40 7 otherprop Objects n02908217 brush.n.02 objects 39 +1236 tennis racket tennis racket 4 40 7 otherprop Objects n04409806 tennis_racket.n.01 objects 39 +1237 display case display case 4 40 7 case otherprop Objects objects 39 +234 ping pong table ping pong table 4 39 6 ping pong table otherfurniture Furniture table table table 4379243 n04379243 table.n.02 table 5 +563 boiler boiler 4 40 7 otherprop Objects misc 40 +1238 bag of coffee beans bag of coffee beans 4 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +1239 bananas banana 4 40 7 otherprop Objects n00021265 food.n.01 objects 39 +1240 carseat carseat 4 40 7 otherprop Objects misc 40 +366 helmet helmet 4 40 7 otherprop Objects helmet 3513137 n03513137 helmet.n.02 clothes 38 +816 umbrella umbrella 4 40 7 umbrella otherprop Objects n04507155 umbrella.n.01 objects 39 +1241 coffee box coffee box 4 40 7 otherprop Objects objects 39 +719 envelope envelope 4 40 7 envelope otherprop Objects n03291819 envelope.n.01 objects 39 +284 wet floor sign wet floor sign 4 40 7 sign otherprop Objects misc 40 +1242 clothing rack clothing rack 4 39 6 stand otherfurniture Furniture 
n04038440 rack.n.05 shelving 31 +247 controller controller 4 40 7 otherprop Objects n03096960 control.n.09 objects 39 +1243 bath walls bathroom wall 4 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +1244 podium podium 4 39 6 otherfurniture Furniture n03159640 dais.n.01 furniture 36 +1245 storage box storage box 4 29 7 box box Objects n02883344 box.n.01 objects 39 +1246 dolly dolly 4 40 7 otherprop Objects misc 40 +1247 shampoo shampoo 3 40 7 otherprop Objects n04183516 shampoo.n.01 objects 39 +592 paper tray paper tray 3 40 7 paper tray otherprop Objects objects 39 +385 cabinet door cabinet door 3 8 12 door door Wall door door 4 +1248 changing station changing station 3 40 7 otherprop Objects misc 40 +1249 poster printer poster printer 3 40 7 printer otherprop Objects printer 4004475 n04004475 printer.n.03 appliances 37 +133 screen screen 3 40 7 otherprop Objects n03151077 curtain.n.01 curtain 12 +301 soap bar soap bar 3 38 7 bar otherstructure Objects objects 39 +1250 crutches crutches 3 40 7 otherprop Objects n03141823 crutch.n.01 objects 39 +379 studio light studio light 3 38 7 light otherstructure Objects lighting 28 +130 stack of cups cup 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +1251 toilet flush button toilet flush button 3 40 7 otherprop Objects objects 39 +450 trunk trunk 3 40 7 otherprop Objects misc 40 +1252 grocery bag grocery bag 3 37 7 bag bag Objects suitcase 2773838 n03461288 grocery_bag.n.01 objects 39 +316 plastic bin plastic bin 3 40 7 bin otherprop Objects objects 39 +1253 pizza box pizza box 3 29 7 box box Objects objects 39 +385 cabinet doors cabinet door 3 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 door 4 +1254 legs legs 3 31 7 person person Objects person n05217688 person.n.02 misc 40 +461 car car 3 40 7 car otherprop Objects car car 2958343 n02958343 car.n.01 misc 40 +1255 shaving cream shaving cream 3 40 7 otherprop Objects n04186051 shaving_cream.n.01 objects 39 +1256 
luggage stand luggage stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +599 shredder shredder 3 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +281 statue statue 3 40 7 sculpture otherprop Objects n04306847 statue.n.01 misc 40 +1257 urinal urinal 3 33 7 toilet toilet Objects toilet toilet n04515991 urinal.n.01 toilet 18 +1258 hose hose 3 40 7 otherprop Objects n03539875 hose.n.03 misc 40 +1259 bike pump bike pump 3 40 7 otherprop Objects objects 39 +319 coatrack coatrack 3 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1260 bear bear 3 40 7 otherprop Objects objects 39 +28 wall lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +1261 humidifier humidifier 3 40 7 otherprop Objects objects 39 +546 toothpaste toothpaste 3 40 7 toothpaste otherprop Objects objects 39 +1262 mouthwash bottle mouthwash bottle 3 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1263 poster cutter poster cutter 3 40 7 otherprop Objects objects 39 +1264 golf bag golf bag 3 37 7 bag bag Objects suitcase 2773838 n03445617 golf_bag.n.01 objects 39 +1265 food container food container 3 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1266 camera camera 3 40 7 otherprop Objects objects 39 +28 table lamp lamp 3 35 7 lamp lamp Objects lamp lamp 3636649 n04380533 table_lamp.n.01 lighting 28 +1267 yoga mat yoga mat 3 20 5 floor mat floor mat Floor n03727837 mat.n.01 floor 2 +1268 card card 3 40 7 otherprop Objects objects 39 +1269 mug mug 3 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +188 shower doors shower door 3 38 7 otherstructure Objects n04208936 shower.n.01 door 4 +689 cardboard cardboard 3 40 7 otherprop Objects objects 39 +1270 rack stand rack stand 3 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +1271 boxes of paper boxes of paper 3 29 7 box box Objects n02883344 box.n.01 objects 39 +1272 flag 
flag 3 40 7 otherprop Objects misc 40 +354 futon futon 3 39 6 mattress otherfurniture Furniture n03408444 futon.n.01 sofa 10 +339 magazine magazine 3 40 7 magazine otherprop Objects n06595351 magazine.n.01 objects 39 +1009 exit sign exit sign 3 40 7 exit sign otherprop Objects misc 40 +1273 rolled poster rolled poster 3 40 7 otherprop Objects objects 39 +1274 wheel wheel 3 40 7 otherprop Objects objects 39 +15 pictures picture 3 11 8 picture picture Picture n03931044 picture.n.01 picture 6 +1275 blackboard eraser blackboard eraser 3 40 7 eraser otherprop Objects n03294833 eraser.n.01 objects 39 +361 organizer organizer 3 40 7 otherprop Objects n03918737 personal_digital_assistant.n.01 objects 39 +1276 doll doll 3 40 7 toy otherprop Objects n03219135 doll.n.01 objects 39 +326 book rack book rack 3 39 6 bookrack otherfurniture Furniture objects 39 +1277 laundry bag laundry bag 3 40 7 laundry basket otherprop Objects basket 2801938 n03050864 clothes_hamper.n.01 objects 39 +1278 sponge sponge 3 40 7 otherprop Objects n01906749 sponge.n.04 objects 39 +116 seating seat 3 39 6 furniture otherfurniture Furniture n04161981 seat.n.03 furniture 36 +1184 folded chairs folded chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1279 lotion bottle lotion bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +212 can can 2 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1280 lunch box lunch box 2 40 7 otherprop Objects objects 39 +1281 food display food display 2 40 7 otherprop Objects misc 40 +794 storage shelf storage shelf 2 40 7 otherprop Objects shelving 31 +1282 sliding wood door sliding wood door 2 40 7 otherprop Objects door 4 +955 pants pants 2 40 7 otherprop Objects n04489008 trouser.n.01 clothes 38 +387 wood wood 2 40 7 otherprop Objects misc 40 +69 boards board 2 38 7 board otherstructure Objects board_panel 35 +65 bottles bottle 2 40 7 bottle otherprop Objects bottle bottle 
2876657 n02876657 bottle.n.01 objects 39 +523 washcloth washcloth 2 40 7 otherprop Objects n04554523 washcloth.n.01 towel 20 +389 workbench workbench 2 39 6 bench otherfurniture Furniture bench table 4379243 n04600486 workbench.n.01 table 5 +29 open kitchen cabinet kitchen cabinet 2 3 6 cabinet cabinet Furniture n02933112 cabinet.n.01 cabinet 7 +1283 organizer shelf organizer shelf 2 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +146 frame frame 2 38 7 otherstructure Objects misc 40 +130 cups cup 2 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 +372 exercise ball exercise ball 2 40 7 ball otherprop Objects n04285146 sports_equipment.n.01 gym_equipment 33 +289 easel easel 2 39 6 stand otherfurniture Furniture n03262809 easel.n.01 furniture 36 +440 garbage bag garbage bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +321 roomba roomba 2 40 7 otherprop Objects objects 39 +976 garage door garage door 2 38 7 garage door otherstructure Objects door door 4 +1256 luggage rack luggage stand 2 39 6 stand otherfurniture Furniture n04038440 shelving 31 +1284 bike lock bike lock 2 40 7 otherprop Objects objects 39 +1285 briefcase briefcase 2 40 7 otherprop Objects n02900705 briefcase.n.01 objects 39 +357 hand towel hand towel 2 27 7 towel towel Objects n03490006 hand_towel.n.01 towel 20 +1286 bath products bath product 2 40 7 otherprop Objects objects 39 +1287 star star 2 40 7 otherprop Objects n09444783 star.n.03 misc 40 +365 map map 2 40 7 map otherprop Objects n03720163 map.n.01 misc 40 +1288 coffee bean bag coffee bean bag 2 37 7 bag bag Objects suitcase 2773838 n02773838 bag.n.06 objects 39 +81 headboard headboard 2 39 6 headboard otherfurniture Furniture n03502200 headboard.n.01 bed 11 +1289 ipad ipad 2 40 7 otherprop Objects objects 39 +1290 display rack display rack 2 39 6 stand otherfurniture Furniture n04038440 rack.n.05 shelving 31 +948 traffic cone traffic 
cone 2 40 7 cone otherprop Objects cone objects 39 +174 toiletry toiletry 2 40 7 otherprop Objects n04447443 toiletry.n.01 objects 39 +1028 canopy canopy 2 40 7 otherprop Objects misc 40 +1291 massage chair massage chair 2 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1292 paper organizer paper organizer 2 40 7 otherprop Objects objects 39 +1005 barricade barricade 2 40 7 otherprop Objects misc 40 +235 platform platform 2 38 7 otherstructure Objects misc 40 +1293 cap cap 2 40 7 hat otherprop Objects n03497657 hat.n.01 clothes 38 +1294 dumbbell plates dumbbell plates 2 40 7 otherprop Objects objects 39 +1295 elevator elevator 2 38 7 otherstructure Objects misc 40 +1296 cooking pan cooking pan 2 40 7 pan otherprop Objects n03880531 pan.n.01 objects 39 +1297 trash bag trash bag 2 37 7 bag bag Objects objects 39 +1298 santa santa 2 40 7 otherprop Objects misc 40 +1299 jewelry box jewelry box 2 29 7 box box Objects n02883344 box.n.01 objects 39 +1300 boat boat 2 40 7 otherprop Objects misc 40 +1301 sock sock 2 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1051 kinect kinect 2 40 7 kinect otherprop Objects objects 39 +566 crib crib 2 39 6 crib otherfurniture Furniture furniture 36 +1302 plastic storage bin plastic storage bin 2 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1062 cooler cooler 2 24 6 refridgerator refridgerator Furniture n03102654 cooler.n.01 appliances 37 +1303 kitchen apron kitchen apron 2 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +1304 dishwashing soap bottle dishwashing soap bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1305 xbox controller xbox controller 2 40 7 otherprop Objects objects 39 +1306 banana holder banana holder 2 40 7 otherprop Objects objects 39 +298 ping pong paddle ping pong paddle 2 40 7 otherprop Objects table 5 +1307 airplane airplane 2 40 7 otherprop Objects misc 40 +1308 conditioner bottle 
conditioner bottle 2 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +1309 tea kettle tea kettle 2 40 7 tea kettle otherprop Objects n04397768 teakettle.n.01 objects 39 +43 bedframe bedframe 2 39 6 otherfurniture Furniture n02822579 bedstead.n.01 bed 11 +1310 wood beam wood beam 2 38 7 otherstructure Objects beam 29 +593 toilet paper package toilet paper package 2 40 7 otherprop Objects objects 39 +1311 wall mounted coat rack wall mounted coat rack 2 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +1312 film light film light 2 40 7 otherprop Objects lighting 28 +749 ceiling lamp ceiling lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +623 chain chain 1 40 7 otherprop Objects chair 3 +1313 sofa sofa 1 6 9 sofa sofa Sofa sofa sofa sofa 4256520 n04256520 sofa.n.01 sofa 10 +99 closet wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +265 sweater sweater 1 40 7 otherprop Objects n04370048 sweater.n.01 clothes 38 +1314 kitchen mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +99 wardrobe wardrobe 1 39 6 wardrobe otherfurniture Furniture wardrobe n04550184 wardrobe.n.01 furniture 36 +1315 water softener water softener 1 40 7 otherprop Objects misc 40 +448 banister banister 1 38 7 banister otherstructure Objects n02788148 bannister.n.02 railing 30 +257 trolley trolley 1 40 7 trolley otherprop Objects n04335435 streetcar.n.01 misc 40 +1316 pantry shelf pantry shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +786 sofa bed sofa bed 1 4 1 bed bed Bed bed bed bed 2818832 n02818832 bed.n.01 bed 11 +801 loofa loofa 1 40 7 otherprop Objects objects 39 +972 shower faucet handle shower faucet handle 1 40 7 handle otherprop Objects shower 23 +1317 toy piano toy piano 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1318 fish fish 1 40 7 otherprop Objects n02512053 
fish.n.01 objects 39 +75 file cabinets file cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n03337140 file.n.03 cabinet 7 +657 cat litter box cat litter box 1 29 7 box box Objects objects 39 +561 electric panel electric panel 1 40 7 otherprop Objects misc 40 +93 suitcases suitcase 1 40 7 luggage otherprop Objects n02774630 baggage.n.01 objects 39 +513 curtain rod curtain rod 1 38 7 curtain rod otherstructure Objects curtain 12 +411 bunk bed bunk bed 1 39 6 bunk bed otherfurniture Furniture bed bed bed 2818832 n02920259 bunk_bed.n.01 bed 11 +1122 chandelier chandelier 1 38 7 chandelier otherstructure Objects n03005285 chandelier.n.01 lighting 28 +922 tape tape 1 40 7 tape otherprop Objects objects 39 +88 plates plate 1 40 7 otherprop Objects n03959485 plate.n.04 objects 39 +518 alarm alarm 1 40 7 alarm otherprop Objects clock 3046257 n02694662 alarm_clock.n.01 objects 39 +814 fire hose fire hose 1 40 7 otherprop Objects n03346004 fire_hose.n.01 misc 40 +1319 toy dinosaur toy dinosaur 1 40 7 toy otherprop Objects n03964744 plaything.n.01 objects 39 +1320 cone cone 1 40 7 otherprop Objects objects 39 +649 glass doors glass door 1 8 12 door door Wall door n03221720 door.n.01 door 4 +607 hatrack hatrack 1 40 7 otherprop Objects n03059103 coatrack.n.01 shelving 31 +819 subwoofer subwoofer 1 40 7 speaker otherprop Objects speaker 3691459 n04349401 subwoofer.n.01 objects 39 +1321 fire sprinkler fire sprinkler 1 40 7 otherprop Objects misc 40 +1322 trash cabinet trash cabinet 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +1204 pantry walls pantry wall 1 1 12 wall wall Wall n04546855 wall.n.01 wall 1 +227 photo photo 1 40 7 photo otherprop Objects n03925226 photograph.n.01 picture 6 +817 barrier barrier 1 40 7 otherprop Objects n02796623 barrier.n.01 misc 40 +130 stacks of cups cup 1 40 7 otherprop Objects n03147509 cup.n.01 objects 39 +712 beachball beachball 1 40 7 ball otherprop Objects n02814224 beach_ball.n.01 objects 39 +1323 
folded boxes folded boxes 1 40 7 otherprop Objects objects 39 +1324 contact lens solution bottle contact lens solution bottle 1 40 7 bottle otherprop Objects bottle bottle 2876657 n02876657 bottle.n.01 objects 39 +673 covered box covered box 1 29 7 box box Objects objects 39 +459 folder folder 1 40 7 folder otherprop Objects n03376279 folder.n.02 objects 39 +643 mail trays mail tray 1 40 7 mail tray otherprop Objects objects 39 +238 slipper slipper 1 40 7 otherprop Objects n04241394 slipper.n.01 clothes 38 +765 magazine rack magazine rack 1 39 6 stand otherfurniture Furniture n03704549 magazine_rack.n.01 shelving 31 +1008 sticker sticker 1 40 7 sticker otherprop Objects n07272545 gummed_label.n.01 objects 39 +225 lotion lotion 1 40 7 otherprop Objects n03690938 lotion.n.01 objects 39 +1083 buddha buddha 1 40 7 otherprop Objects objects 39 +813 file organizer file organizer 1 40 7 otherprop Objects objects 39 +138 paper towel rolls paper towel roll 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +1145 night lamp night lamp 1 35 7 lamp lamp Objects lamp lamp 3636649 n03636649 lamp.n.02 lighting 28 +796 fuse box fuse box 1 40 7 otherprop Objects misc 40 +1325 knife block knife block 1 40 7 otherprop Objects objects 39 +363 furnace furnace 1 39 6 furnace otherfurniture Furniture n03404449 furnace.n.01 +1174 cd cases cd case 1 40 7 otherprop Objects objects 39 +38 stools stool 1 40 7 stool otherprop Objects stool n04326896 stool.n.01 stool 19 +1326 hand sanitzer dispenser hand sanitzer dispenser 1 40 7 otherprop Objects n04254120 soap_dispenser.n.01 objects 39 +997 teapot teapot 1 40 7 tea pot otherprop Objects n04398044 teapot.n.01 objects 39 +1327 pen holder pen holder 1 40 7 otherprop Objects objects 39 +1328 tray rack tray rack 1 40 7 otherprop Objects objects 39 +1329 wig wig 1 40 7 otherprop Objects n04584207 wig.n.01 objects 39 +182 switch switch 1 40 7 otherprop Objects n04372370 switch.n.01 misc 40 +280 plastic containers plastic 
container 1 40 7 container otherprop Objects n03094503 container.n.01 objects 39 +1330 night light night light 1 40 7 otherprop Objects lighting 28 +1331 notepad notepad 1 40 7 otherprop Objects objects 39 +1332 mail bin mail bin 1 40 7 otherprop Objects misc 40 +1333 elevator button elevator button 1 40 7 otherprop Objects misc 40 +939 gaming wheel gaming wheel 1 40 7 otherprop Objects objects 39 +1334 drum set drum set 1 40 7 otherprop Objects objects 39 +480 cosmetic bag cosmetic bag 1 37 7 bag bag Objects objects 39 +907 coffee mug coffee mug 1 40 7 vessel otherprop Objects cup or mug 3797390 n03063599 coffee_mug.n.01 objects 39 +1335 closet shelf closet shelf 1 15 6 shelves shelves Furniture bookshelf bookshelf 2871439 n02871439 bookshelf.n.01 shelving 31 +1336 baby mobile baby mobile 1 40 7 otherprop Objects objects 39 +829 diaper bin diaper bin 1 40 7 bin otherprop Objects objects 39 +947 door wall door wall 1 1 12 wall wall Wall wall 1 +1116 stepstool stepstool 1 40 7 step stool otherprop Objects objects 39 +599 paper shredder shredder 1 40 7 otherprop Objects n04210120 shredder.n.01 objects 39 +733 dress rack dress rack 1 40 7 otherprop Objects n03238762 dress_rack.n.01 misc 40 +123 cover cover 1 40 7 blanket otherprop Objects objects 39 +506 shopping bag shopping bag 1 37 7 bag bag Objects n04204081 shopping_bag.n.01 objects 39 +569 sliding door sliding door 1 8 12 door door Wall door n04239074 sliding_door.n.01 door 4 +1337 exercise bike exercise bike 1 40 7 machine otherprop Objects n04210120 shredder.n.01 gym_equipment 33 +1338 recliner chair recliner chair 1 5 4 chair chair Chair chair chair chair 3001627 n03238762 dress_rack.n.01 chair 3 +1314 kitchenaid mixer kitchen mixer 1 40 7 otherprop Objects appliances 37 +1339 soda can soda can 1 40 7 can otherprop Objects can 2946921 n02946921 can.n.01 objects 39 +1340 stovetop stovetop 1 38 7 stove otherstructure Objects stove 4330267 n04330267 stove.n.02 appliances 37 +851 stepladder stepladder 1 39 6 
ladder otherfurniture Furniture stairs n04315599 step_ladder.n.01 stairs 16 +142 tap tap 1 40 7 faucet otherprop Objects faucet 3325088 n04559451 water_faucet.n.01 objects 39 +436 cable cable 1 40 7 cables otherprop Objects objects 39 +1341 baby changing station baby changing station 1 39 6 otherfurniture Furniture furniture 36 +1342 costume costume 1 21 7 clothes clothes Objects n02728440 apparel.n.01 clothes 38 +885 rocking chair rocking chair 1 5 4 chair chair Chair chair chair chair 3001627 n04099969 rocking_chair.n.01 chair 3 +693 binder binder 1 40 7 binder otherprop Objects objects 39 +815 media center media center 1 3 6 cabinet cabinet Furniture cabinet 2933112 n02933112 cabinet.n.01 cabinet 7 +401 towel rack towel rack 1 40 7 otherprop Objects n04459773 towel_rack.n.01 misc 40 +1343 medal medal 1 40 7 otherprop Objects objects 39 +1184 stack of folded chairs folded chair 1 5 4 chair chair Chair chair chair chair 3001627 n03001627 chair.n.01 chair 3 +1344 telescope telescope 1 40 7 otherprop Objects n04403638 telescope.n.01 objects 39 +1345 closet doorframe closet doorframe 1 8 12 door door Wall door door 4 +160 glass glass 1 38 7 glass otherstructure Objects n03438257 glass.n.02 misc 40 +1126 baseball cap baseball cap 1 40 7 otherprop Objects cap 2954340 n02799323 baseball_cap.n.01 clothes 38 +1346 battery disposal jar battery disposal jar 1 40 7 jar otherprop Objects jar 3593526 n03593526 jar.n.01 objects 39 +332 mop mop 1 40 7 otherprop Objects n04367480 swab.n.02 objects 39 +397 tank tank 1 40 7 otherprop Objects objects 39 +643 mail tray mail tray 1 40 7 mail tray otherprop Objects objects 39 +551 centerpiece centerpiece 1 40 7 centerpiece otherprop Objects n02994419 centerpiece.n.02 objects 39 +1163 object stick 1 40 7 stick otherprop Objects objects 39 +1347 closet floor closet floor 1 2 5 floor floor Floor n03365592 floor.n.01 floor 2 +1348 dryer sheets dryer sheets 1 40 7 otherprop Objects objects 39 +803 bycicle bycicle 1 40 7 otherprop Objects 
misc 40 +484 flower stand flower stand 1 39 6 stand otherfurniture Furniture furniture 36 +1349 air mattress air mattress 1 4 1 bed bed Bed bed bed bed 2818832 n02690809 air_mattress.n.01 bed 11 +1350 clip clip 1 40 7 otherprop Objects objects 39 +222 side table side table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +1253 pizza boxes pizza box 1 29 7 box box Objects n02883344 box.n.01 objects 39 +1351 display display 1 39 7 otherfurniture Furniture n03211117 display.n.06 misc 40 +1352 postcard postcard 1 40 7 otherprop Objects objects 39 +828 display sign display sign 1 40 7 sign otherprop Objects misc 40 +1353 paper towel paper towel 1 40 7 paper towel otherprop Objects n03887697 paper_towel.n.01 towel 20 +612 boots boot 1 40 7 shoe otherprop Objects n04199027 shoe.n.01 clothes 38 +1354 tennis racket bag tennis racket bag 1 40 7 otherprop Objects objects 39 +1355 air hockey table air hockey table 1 7 10 table table Table table table table 4379243 n04379243 table.n.02 table 5 +1301 socks sock 1 21 7 clothes clothes Objects n04254777 sock.n.01 clothes 38 +1356 food bag food bag 1 37 7 bag bag Objects objects 39 +1199 clothes hangers clothes hanger 1 40 7 otherprop Objects n03057920 coat_hanger.n.01 misc 40 +1357 starbucks cup starbucks cup 1 40 7 cup otherprop Objects cup cup or mug 3797390 n03797390 mug.n.04 objects 39 \ No newline at end of file diff --git a/datasets/scannet_preprocess/meta_data/scannetv2_test.txt b/datasets/scannet_preprocess/meta_data/scannetv2_test.txt new file mode 100644 index 0000000000000000000000000000000000000000..79d15b0ee4afa889883562a722b837b78ee8ce4b --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2_test.txt @@ -0,0 +1,100 @@ +scene0707_00 +scene0708_00 +scene0709_00 +scene0710_00 +scene0711_00 +scene0712_00 +scene0713_00 +scene0714_00 +scene0715_00 +scene0716_00 +scene0717_00 +scene0718_00 +scene0719_00 +scene0720_00 +scene0721_00 +scene0722_00 +scene0723_00 +scene0724_00 
+scene0725_00 +scene0726_00 +scene0727_00 +scene0728_00 +scene0729_00 +scene0730_00 +scene0731_00 +scene0732_00 +scene0733_00 +scene0734_00 +scene0735_00 +scene0736_00 +scene0737_00 +scene0738_00 +scene0739_00 +scene0740_00 +scene0741_00 +scene0742_00 +scene0743_00 +scene0744_00 +scene0745_00 +scene0746_00 +scene0747_00 +scene0748_00 +scene0749_00 +scene0750_00 +scene0751_00 +scene0752_00 +scene0753_00 +scene0754_00 +scene0755_00 +scene0756_00 +scene0757_00 +scene0758_00 +scene0759_00 +scene0760_00 +scene0761_00 +scene0762_00 +scene0763_00 +scene0764_00 +scene0765_00 +scene0766_00 +scene0767_00 +scene0768_00 +scene0769_00 +scene0770_00 +scene0771_00 +scene0772_00 +scene0773_00 +scene0774_00 +scene0775_00 +scene0776_00 +scene0777_00 +scene0778_00 +scene0779_00 +scene0780_00 +scene0781_00 +scene0782_00 +scene0783_00 +scene0784_00 +scene0785_00 +scene0786_00 +scene0787_00 +scene0788_00 +scene0789_00 +scene0790_00 +scene0791_00 +scene0792_00 +scene0793_00 +scene0794_00 +scene0795_00 +scene0796_00 +scene0797_00 +scene0798_00 +scene0799_00 +scene0800_00 +scene0801_00 +scene0802_00 +scene0803_00 +scene0804_00 +scene0805_00 +scene0806_00 diff --git a/datasets/scannet_preprocess/meta_data/scannetv2_train.txt b/datasets/scannet_preprocess/meta_data/scannetv2_train.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef625f120b812fea5ac507d3b7049fc7ebd2e7e4 --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2_train.txt @@ -0,0 +1,1201 @@ +scene0191_00 +scene0191_01 +scene0191_02 +scene0119_00 +scene0230_00 +scene0528_00 +scene0528_01 +scene0705_00 +scene0705_01 +scene0705_02 +scene0415_00 +scene0415_01 +scene0415_02 +scene0007_00 +scene0141_00 +scene0141_01 +scene0141_02 +scene0515_00 +scene0515_01 +scene0515_02 +scene0447_00 +scene0447_01 +scene0447_02 +scene0531_00 +scene0503_00 +scene0285_00 +scene0069_00 +scene0584_00 +scene0584_01 +scene0584_02 +scene0581_00 +scene0581_01 +scene0581_02 +scene0620_00 +scene0620_01 +scene0263_00 
+scene0263_01 +scene0481_00 +scene0481_01 +scene0020_00 +scene0020_01 +scene0291_00 +scene0291_01 +scene0291_02 +scene0469_00 +scene0469_01 +scene0469_02 +scene0659_00 +scene0659_01 +scene0024_00 +scene0024_01 +scene0024_02 +scene0564_00 +scene0117_00 +scene0027_00 +scene0027_01 +scene0027_02 +scene0028_00 +scene0330_00 +scene0418_00 +scene0418_01 +scene0418_02 +scene0233_00 +scene0233_01 +scene0673_00 +scene0673_01 +scene0673_02 +scene0673_03 +scene0673_04 +scene0673_05 +scene0585_00 +scene0585_01 +scene0362_00 +scene0362_01 +scene0362_02 +scene0362_03 +scene0035_00 +scene0035_01 +scene0358_00 +scene0358_01 +scene0358_02 +scene0037_00 +scene0194_00 +scene0321_00 +scene0293_00 +scene0293_01 +scene0623_00 +scene0623_01 +scene0592_00 +scene0592_01 +scene0569_00 +scene0569_01 +scene0413_00 +scene0313_00 +scene0313_01 +scene0313_02 +scene0480_00 +scene0480_01 +scene0401_00 +scene0517_00 +scene0517_01 +scene0517_02 +scene0032_00 +scene0032_01 +scene0613_00 +scene0613_01 +scene0613_02 +scene0306_00 +scene0306_01 +scene0052_00 +scene0052_01 +scene0052_02 +scene0053_00 +scene0444_00 +scene0444_01 +scene0055_00 +scene0055_01 +scene0055_02 +scene0560_00 +scene0589_00 +scene0589_01 +scene0589_02 +scene0610_00 +scene0610_01 +scene0610_02 +scene0364_00 +scene0364_01 +scene0383_00 +scene0383_01 +scene0383_02 +scene0006_00 +scene0006_01 +scene0006_02 +scene0275_00 +scene0451_00 +scene0451_01 +scene0451_02 +scene0451_03 +scene0451_04 +scene0451_05 +scene0135_00 +scene0065_00 +scene0065_01 +scene0065_02 +scene0104_00 +scene0674_00 +scene0674_01 +scene0448_00 +scene0448_01 +scene0448_02 +scene0502_00 +scene0502_01 +scene0502_02 +scene0440_00 +scene0440_01 +scene0440_02 +scene0071_00 +scene0072_00 +scene0072_01 +scene0072_02 +scene0509_00 +scene0509_01 +scene0509_02 +scene0649_00 +scene0649_01 +scene0602_00 +scene0694_00 +scene0694_01 +scene0101_00 +scene0101_01 +scene0101_02 +scene0101_03 +scene0101_04 +scene0101_05 +scene0218_00 +scene0218_01 +scene0579_00 +scene0579_01 
+scene0579_02 +scene0039_00 +scene0039_01 +scene0493_00 +scene0493_01 +scene0242_00 +scene0242_01 +scene0242_02 +scene0083_00 +scene0083_01 +scene0127_00 +scene0127_01 +scene0662_00 +scene0662_01 +scene0662_02 +scene0018_00 +scene0087_00 +scene0087_01 +scene0087_02 +scene0332_00 +scene0332_01 +scene0332_02 +scene0628_00 +scene0628_01 +scene0628_02 +scene0134_00 +scene0134_01 +scene0134_02 +scene0238_00 +scene0238_01 +scene0092_00 +scene0092_01 +scene0092_02 +scene0092_03 +scene0092_04 +scene0022_00 +scene0022_01 +scene0467_00 +scene0392_00 +scene0392_01 +scene0392_02 +scene0424_00 +scene0424_01 +scene0424_02 +scene0646_00 +scene0646_01 +scene0646_02 +scene0098_00 +scene0098_01 +scene0044_00 +scene0044_01 +scene0044_02 +scene0510_00 +scene0510_01 +scene0510_02 +scene0571_00 +scene0571_01 +scene0166_00 +scene0166_01 +scene0166_02 +scene0563_00 +scene0172_00 +scene0172_01 +scene0388_00 +scene0388_01 +scene0215_00 +scene0215_01 +scene0252_00 +scene0287_00 +scene0668_00 +scene0572_00 +scene0572_01 +scene0572_02 +scene0026_00 +scene0224_00 +scene0113_00 +scene0113_01 +scene0551_00 +scene0381_00 +scene0381_01 +scene0381_02 +scene0371_00 +scene0371_01 +scene0460_00 +scene0118_00 +scene0118_01 +scene0118_02 +scene0417_00 +scene0008_00 +scene0634_00 +scene0521_00 +scene0123_00 +scene0123_01 +scene0123_02 +scene0045_00 +scene0045_01 +scene0511_00 +scene0511_01 +scene0114_00 +scene0114_01 +scene0114_02 +scene0070_00 +scene0029_00 +scene0029_01 +scene0029_02 +scene0129_00 +scene0103_00 +scene0103_01 +scene0002_00 +scene0002_01 +scene0132_00 +scene0132_01 +scene0132_02 +scene0124_00 +scene0124_01 +scene0143_00 +scene0143_01 +scene0143_02 +scene0604_00 +scene0604_01 +scene0604_02 +scene0507_00 +scene0105_00 +scene0105_01 +scene0105_02 +scene0428_00 +scene0428_01 +scene0311_00 +scene0140_00 +scene0140_01 +scene0182_00 +scene0182_01 +scene0182_02 +scene0142_00 +scene0142_01 +scene0399_00 +scene0399_01 +scene0012_00 +scene0012_01 +scene0012_02 +scene0060_00 +scene0060_01 
+scene0370_00 +scene0370_01 +scene0370_02 +scene0310_00 +scene0310_01 +scene0310_02 +scene0661_00 +scene0650_00 +scene0152_00 +scene0152_01 +scene0152_02 +scene0158_00 +scene0158_01 +scene0158_02 +scene0482_00 +scene0482_01 +scene0600_00 +scene0600_01 +scene0600_02 +scene0393_00 +scene0393_01 +scene0393_02 +scene0562_00 +scene0174_00 +scene0174_01 +scene0157_00 +scene0157_01 +scene0161_00 +scene0161_01 +scene0161_02 +scene0159_00 +scene0254_00 +scene0254_01 +scene0115_00 +scene0115_01 +scene0115_02 +scene0162_00 +scene0163_00 +scene0163_01 +scene0523_00 +scene0523_01 +scene0523_02 +scene0459_00 +scene0459_01 +scene0175_00 +scene0085_00 +scene0085_01 +scene0279_00 +scene0279_01 +scene0279_02 +scene0201_00 +scene0201_01 +scene0201_02 +scene0283_00 +scene0456_00 +scene0456_01 +scene0429_00 +scene0043_00 +scene0043_01 +scene0419_00 +scene0419_01 +scene0419_02 +scene0368_00 +scene0368_01 +scene0348_00 +scene0348_01 +scene0348_02 +scene0442_00 +scene0178_00 +scene0380_00 +scene0380_01 +scene0380_02 +scene0165_00 +scene0165_01 +scene0165_02 +scene0181_00 +scene0181_01 +scene0181_02 +scene0181_03 +scene0333_00 +scene0614_00 +scene0614_01 +scene0614_02 +scene0404_00 +scene0404_01 +scene0404_02 +scene0185_00 +scene0126_00 +scene0126_01 +scene0126_02 +scene0519_00 +scene0236_00 +scene0236_01 +scene0189_00 +scene0075_00 +scene0267_00 +scene0192_00 +scene0192_01 +scene0192_02 +scene0281_00 +scene0420_00 +scene0420_01 +scene0420_02 +scene0195_00 +scene0195_01 +scene0195_02 +scene0597_00 +scene0597_01 +scene0597_02 +scene0041_00 +scene0041_01 +scene0111_00 +scene0111_01 +scene0111_02 +scene0666_00 +scene0666_01 +scene0666_02 +scene0200_00 +scene0200_01 +scene0200_02 +scene0536_00 +scene0536_01 +scene0536_02 +scene0390_00 +scene0280_00 +scene0280_01 +scene0280_02 +scene0344_00 +scene0344_01 +scene0205_00 +scene0205_01 +scene0205_02 +scene0484_00 +scene0484_01 +scene0009_00 +scene0009_01 +scene0009_02 +scene0302_00 +scene0302_01 +scene0209_00 +scene0209_01 +scene0209_02 
+scene0210_00 +scene0210_01 +scene0395_00 +scene0395_01 +scene0395_02 +scene0683_00 +scene0601_00 +scene0601_01 +scene0214_00 +scene0214_01 +scene0214_02 +scene0477_00 +scene0477_01 +scene0439_00 +scene0439_01 +scene0468_00 +scene0468_01 +scene0468_02 +scene0546_00 +scene0466_00 +scene0466_01 +scene0220_00 +scene0220_01 +scene0220_02 +scene0122_00 +scene0122_01 +scene0130_00 +scene0110_00 +scene0110_01 +scene0110_02 +scene0327_00 +scene0156_00 +scene0266_00 +scene0266_01 +scene0001_00 +scene0001_01 +scene0228_00 +scene0199_00 +scene0219_00 +scene0464_00 +scene0232_00 +scene0232_01 +scene0232_02 +scene0299_00 +scene0299_01 +scene0530_00 +scene0363_00 +scene0453_00 +scene0453_01 +scene0570_00 +scene0570_01 +scene0570_02 +scene0183_00 +scene0239_00 +scene0239_01 +scene0239_02 +scene0373_00 +scene0373_01 +scene0241_00 +scene0241_01 +scene0241_02 +scene0188_00 +scene0622_00 +scene0622_01 +scene0244_00 +scene0244_01 +scene0691_00 +scene0691_01 +scene0206_00 +scene0206_01 +scene0206_02 +scene0247_00 +scene0247_01 +scene0061_00 +scene0061_01 +scene0082_00 +scene0250_00 +scene0250_01 +scene0250_02 +scene0501_00 +scene0501_01 +scene0501_02 +scene0320_00 +scene0320_01 +scene0320_02 +scene0320_03 +scene0631_00 +scene0631_01 +scene0631_02 +scene0255_00 +scene0255_01 +scene0255_02 +scene0047_00 +scene0265_00 +scene0265_01 +scene0265_02 +scene0004_00 +scene0336_00 +scene0336_01 +scene0058_00 +scene0058_01 +scene0260_00 +scene0260_01 +scene0260_02 +scene0243_00 +scene0603_00 +scene0603_01 +scene0093_00 +scene0093_01 +scene0093_02 +scene0109_00 +scene0109_01 +scene0434_00 +scene0434_01 +scene0434_02 +scene0290_00 +scene0627_00 +scene0627_01 +scene0470_00 +scene0470_01 +scene0137_00 +scene0137_01 +scene0137_02 +scene0270_00 +scene0270_01 +scene0270_02 +scene0271_00 +scene0271_01 +scene0504_00 +scene0274_00 +scene0274_01 +scene0274_02 +scene0036_00 +scene0036_01 +scene0276_00 +scene0276_01 +scene0272_00 +scene0272_01 +scene0499_00 +scene0698_00 +scene0698_01 +scene0051_00 
+scene0051_01 +scene0051_02 +scene0051_03 +scene0108_00 +scene0245_00 +scene0369_00 +scene0369_01 +scene0369_02 +scene0284_00 +scene0289_00 +scene0289_01 +scene0286_00 +scene0286_01 +scene0286_02 +scene0286_03 +scene0031_00 +scene0031_01 +scene0031_02 +scene0545_00 +scene0545_01 +scene0545_02 +scene0557_00 +scene0557_01 +scene0557_02 +scene0533_00 +scene0533_01 +scene0116_00 +scene0116_01 +scene0116_02 +scene0611_00 +scene0611_01 +scene0688_00 +scene0294_00 +scene0294_01 +scene0294_02 +scene0295_00 +scene0295_01 +scene0296_00 +scene0296_01 +scene0596_00 +scene0596_01 +scene0596_02 +scene0532_00 +scene0532_01 +scene0637_00 +scene0638_00 +scene0121_00 +scene0121_01 +scene0121_02 +scene0040_00 +scene0040_01 +scene0197_00 +scene0197_01 +scene0197_02 +scene0410_00 +scene0410_01 +scene0305_00 +scene0305_01 +scene0615_00 +scene0615_01 +scene0703_00 +scene0703_01 +scene0555_00 +scene0297_00 +scene0297_01 +scene0297_02 +scene0582_00 +scene0582_01 +scene0582_02 +scene0023_00 +scene0094_00 +scene0013_00 +scene0013_01 +scene0013_02 +scene0136_00 +scene0136_01 +scene0136_02 +scene0407_00 +scene0407_01 +scene0062_00 +scene0062_01 +scene0062_02 +scene0386_00 +scene0318_00 +scene0554_00 +scene0554_01 +scene0497_00 +scene0213_00 +scene0258_00 +scene0323_00 +scene0323_01 +scene0324_00 +scene0324_01 +scene0016_00 +scene0016_01 +scene0016_02 +scene0681_00 +scene0398_00 +scene0398_01 +scene0227_00 +scene0090_00 +scene0066_00 +scene0262_00 +scene0262_01 +scene0155_00 +scene0155_01 +scene0155_02 +scene0352_00 +scene0352_01 +scene0352_02 +scene0038_00 +scene0038_01 +scene0038_02 +scene0335_00 +scene0335_01 +scene0335_02 +scene0261_00 +scene0261_01 +scene0261_02 +scene0261_03 +scene0640_00 +scene0640_01 +scene0640_02 +scene0080_00 +scene0080_01 +scene0080_02 +scene0403_00 +scene0403_01 +scene0282_00 +scene0282_01 +scene0282_02 +scene0682_00 +scene0173_00 +scene0173_01 +scene0173_02 +scene0522_00 +scene0687_00 +scene0345_00 +scene0345_01 +scene0612_00 +scene0612_01 +scene0411_00 
+scene0411_01 +scene0411_02 +scene0625_00 +scene0625_01 +scene0211_00 +scene0211_01 +scene0211_02 +scene0211_03 +scene0676_00 +scene0676_01 +scene0179_00 +scene0498_00 +scene0498_01 +scene0498_02 +scene0547_00 +scene0547_01 +scene0547_02 +scene0269_00 +scene0269_01 +scene0269_02 +scene0366_00 +scene0680_00 +scene0680_01 +scene0588_00 +scene0588_01 +scene0588_02 +scene0588_03 +scene0346_00 +scene0346_01 +scene0359_00 +scene0359_01 +scene0014_00 +scene0120_00 +scene0120_01 +scene0212_00 +scene0212_01 +scene0212_02 +scene0176_00 +scene0049_00 +scene0259_00 +scene0259_01 +scene0586_00 +scene0586_01 +scene0586_02 +scene0309_00 +scene0309_01 +scene0125_00 +scene0455_00 +scene0177_00 +scene0177_01 +scene0177_02 +scene0326_00 +scene0372_00 +scene0171_00 +scene0171_01 +scene0374_00 +scene0654_00 +scene0654_01 +scene0445_00 +scene0445_01 +scene0475_00 +scene0475_01 +scene0475_02 +scene0349_00 +scene0349_01 +scene0234_00 +scene0669_00 +scene0669_01 +scene0375_00 +scene0375_01 +scene0375_02 +scene0387_00 +scene0387_01 +scene0387_02 +scene0312_00 +scene0312_01 +scene0312_02 +scene0384_00 +scene0385_00 +scene0385_01 +scene0385_02 +scene0000_00 +scene0000_01 +scene0000_02 +scene0376_00 +scene0376_01 +scene0376_02 +scene0301_00 +scene0301_01 +scene0301_02 +scene0322_00 +scene0542_00 +scene0079_00 +scene0079_01 +scene0099_00 +scene0099_01 +scene0476_00 +scene0476_01 +scene0476_02 +scene0394_00 +scene0394_01 +scene0147_00 +scene0147_01 +scene0067_00 +scene0067_01 +scene0067_02 +scene0397_00 +scene0397_01 +scene0337_00 +scene0337_01 +scene0337_02 +scene0431_00 +scene0223_00 +scene0223_01 +scene0223_02 +scene0010_00 +scene0010_01 +scene0402_00 +scene0268_00 +scene0268_01 +scene0268_02 +scene0679_00 +scene0679_01 +scene0405_00 +scene0128_00 +scene0408_00 +scene0408_01 +scene0190_00 +scene0107_00 +scene0076_00 +scene0167_00 +scene0361_00 +scene0361_01 +scene0361_02 +scene0216_00 +scene0202_00 +scene0303_00 +scene0303_01 +scene0303_02 +scene0446_00 +scene0446_01 +scene0089_00 
+scene0089_01 +scene0089_02 +scene0360_00 +scene0150_00 +scene0150_01 +scene0150_02 +scene0421_00 +scene0421_01 +scene0421_02 +scene0454_00 +scene0626_00 +scene0626_01 +scene0626_02 +scene0186_00 +scene0186_01 +scene0538_00 +scene0479_00 +scene0479_01 +scene0479_02 +scene0656_00 +scene0656_01 +scene0656_02 +scene0656_03 +scene0525_00 +scene0525_01 +scene0525_02 +scene0308_00 +scene0396_00 +scene0396_01 +scene0396_02 +scene0624_00 +scene0292_00 +scene0292_01 +scene0632_00 +scene0253_00 +scene0021_00 +scene0325_00 +scene0325_01 +scene0437_00 +scene0437_01 +scene0438_00 +scene0590_00 +scene0590_01 +scene0400_00 +scene0400_01 +scene0541_00 +scene0541_01 +scene0541_02 +scene0677_00 +scene0677_01 +scene0677_02 +scene0443_00 +scene0315_00 +scene0288_00 +scene0288_01 +scene0288_02 +scene0422_00 +scene0672_00 +scene0672_01 +scene0184_00 +scene0449_00 +scene0449_01 +scene0449_02 +scene0048_00 +scene0048_01 +scene0138_00 +scene0452_00 +scene0452_01 +scene0452_02 +scene0667_00 +scene0667_01 +scene0667_02 +scene0463_00 +scene0463_01 +scene0078_00 +scene0078_01 +scene0078_02 +scene0636_00 +scene0457_00 +scene0457_01 +scene0457_02 +scene0465_00 +scene0465_01 +scene0577_00 +scene0151_00 +scene0151_01 +scene0339_00 +scene0573_00 +scene0573_01 +scene0154_00 +scene0096_00 +scene0096_01 +scene0096_02 +scene0235_00 +scene0168_00 +scene0168_01 +scene0168_02 +scene0594_00 +scene0587_00 +scene0587_01 +scene0587_02 +scene0587_03 +scene0229_00 +scene0229_01 +scene0229_02 +scene0512_00 +scene0106_00 +scene0106_01 +scene0106_02 +scene0472_00 +scene0472_01 +scene0472_02 +scene0489_00 +scene0489_01 +scene0489_02 +scene0425_00 +scene0425_01 +scene0641_00 +scene0526_00 +scene0526_01 +scene0317_00 +scene0317_01 +scene0544_00 +scene0017_00 +scene0017_01 +scene0017_02 +scene0042_00 +scene0042_01 +scene0042_02 +scene0576_00 +scene0576_01 +scene0576_02 +scene0347_00 +scene0347_01 +scene0347_02 +scene0436_00 +scene0226_00 +scene0226_01 +scene0485_00 +scene0486_00 +scene0487_00 +scene0487_01 
+scene0619_00 +scene0097_00 +scene0367_00 +scene0367_01 +scene0491_00 +scene0492_00 +scene0492_01 +scene0005_00 +scene0005_01 +scene0543_00 +scene0543_01 +scene0543_02 +scene0657_00 +scene0341_00 +scene0341_01 +scene0534_00 +scene0534_01 +scene0319_00 +scene0273_00 +scene0273_01 +scene0225_00 +scene0198_00 +scene0003_00 +scene0003_01 +scene0003_02 +scene0409_00 +scene0409_01 +scene0331_00 +scene0331_01 +scene0505_00 +scene0505_01 +scene0505_02 +scene0505_03 +scene0505_04 +scene0506_00 +scene0057_00 +scene0057_01 +scene0074_00 +scene0074_01 +scene0074_02 +scene0091_00 +scene0112_00 +scene0112_01 +scene0112_02 +scene0240_00 +scene0102_00 +scene0102_01 +scene0513_00 +scene0514_00 +scene0514_01 +scene0537_00 +scene0516_00 +scene0516_01 +scene0495_00 +scene0617_00 +scene0133_00 +scene0520_00 +scene0520_01 +scene0635_00 +scene0635_01 +scene0054_00 +scene0473_00 +scene0473_01 +scene0524_00 +scene0524_01 +scene0379_00 +scene0471_00 +scene0471_01 +scene0471_02 +scene0566_00 +scene0248_00 +scene0248_01 +scene0248_02 +scene0529_00 +scene0529_01 +scene0529_02 +scene0391_00 +scene0264_00 +scene0264_01 +scene0264_02 +scene0675_00 +scene0675_01 +scene0350_00 +scene0350_01 +scene0350_02 +scene0450_00 +scene0068_00 +scene0068_01 +scene0237_00 +scene0237_01 +scene0365_00 +scene0365_01 +scene0365_02 +scene0605_00 +scene0605_01 +scene0539_00 +scene0539_01 +scene0539_02 +scene0540_00 +scene0540_01 +scene0540_02 +scene0170_00 +scene0170_01 +scene0170_02 +scene0433_00 +scene0340_00 +scene0340_01 +scene0340_02 +scene0160_00 +scene0160_01 +scene0160_02 +scene0160_03 +scene0160_04 +scene0059_00 +scene0059_01 +scene0059_02 +scene0056_00 +scene0056_01 +scene0478_00 +scene0478_01 +scene0548_00 +scene0548_01 +scene0548_02 +scene0204_00 +scene0204_01 +scene0204_02 +scene0033_00 +scene0145_00 +scene0483_00 +scene0508_00 +scene0508_01 +scene0508_02 +scene0180_00 +scene0148_00 +scene0556_00 +scene0556_01 +scene0416_00 +scene0416_01 +scene0416_02 +scene0416_03 +scene0416_04 +scene0073_00 
+scene0073_01 +scene0073_02 +scene0073_03 +scene0034_00 +scene0034_01 +scene0034_02 +scene0639_00 +scene0561_00 +scene0561_01 +scene0298_00 +scene0692_00 +scene0692_01 +scene0692_02 +scene0692_03 +scene0692_04 +scene0642_00 +scene0642_01 +scene0642_02 +scene0642_03 +scene0630_00 +scene0630_01 +scene0630_02 +scene0630_03 +scene0630_04 +scene0630_05 +scene0630_06 +scene0706_00 +scene0567_00 +scene0567_01 diff --git a/datasets/scannet_preprocess/meta_data/scannetv2_val.txt b/datasets/scannet_preprocess/meta_data/scannetv2_val.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9e7d9205321e8ca047a527466f4b7100c9c9d2c --- /dev/null +++ b/datasets/scannet_preprocess/meta_data/scannetv2_val.txt @@ -0,0 +1,312 @@ +scene0568_00 +scene0568_01 +scene0568_02 +scene0304_00 +scene0488_00 +scene0488_01 +scene0412_00 +scene0412_01 +scene0217_00 +scene0019_00 +scene0019_01 +scene0414_00 +scene0575_00 +scene0575_01 +scene0575_02 +scene0426_00 +scene0426_01 +scene0426_02 +scene0426_03 +scene0549_00 +scene0549_01 +scene0578_00 +scene0578_01 +scene0578_02 +scene0665_00 +scene0665_01 +scene0050_00 +scene0050_01 +scene0050_02 +scene0257_00 +scene0025_00 +scene0025_01 +scene0025_02 +scene0583_00 +scene0583_01 +scene0583_02 +scene0701_00 +scene0701_01 +scene0701_02 +scene0580_00 +scene0580_01 +scene0565_00 +scene0169_00 +scene0169_01 +scene0655_00 +scene0655_01 +scene0655_02 +scene0063_00 +scene0221_00 +scene0221_01 +scene0591_00 +scene0591_01 +scene0591_02 +scene0678_00 +scene0678_01 +scene0678_02 +scene0462_00 +scene0427_00 +scene0595_00 +scene0193_00 +scene0193_01 +scene0164_00 +scene0164_01 +scene0164_02 +scene0164_03 +scene0598_00 +scene0598_01 +scene0598_02 +scene0599_00 +scene0599_01 +scene0599_02 +scene0328_00 +scene0300_00 +scene0300_01 +scene0354_00 +scene0458_00 +scene0458_01 +scene0423_00 +scene0423_01 +scene0423_02 +scene0307_00 +scene0307_01 +scene0307_02 +scene0606_00 +scene0606_01 +scene0606_02 +scene0432_00 +scene0432_01 +scene0608_00 +scene0608_01 
+scene0608_02 +scene0651_00 +scene0651_01 +scene0651_02 +scene0430_00 +scene0430_01 +scene0689_00 +scene0357_00 +scene0357_01 +scene0574_00 +scene0574_01 +scene0574_02 +scene0329_00 +scene0329_01 +scene0329_02 +scene0153_00 +scene0153_01 +scene0616_00 +scene0616_01 +scene0671_00 +scene0671_01 +scene0618_00 +scene0382_00 +scene0382_01 +scene0490_00 +scene0621_00 +scene0607_00 +scene0607_01 +scene0149_00 +scene0695_00 +scene0695_01 +scene0695_02 +scene0695_03 +scene0389_00 +scene0377_00 +scene0377_01 +scene0377_02 +scene0342_00 +scene0139_00 +scene0629_00 +scene0629_01 +scene0629_02 +scene0496_00 +scene0633_00 +scene0633_01 +scene0518_00 +scene0652_00 +scene0406_00 +scene0406_01 +scene0406_02 +scene0144_00 +scene0144_01 +scene0494_00 +scene0278_00 +scene0278_01 +scene0316_00 +scene0609_00 +scene0609_01 +scene0609_02 +scene0609_03 +scene0084_00 +scene0084_01 +scene0084_02 +scene0696_00 +scene0696_01 +scene0696_02 +scene0351_00 +scene0351_01 +scene0643_00 +scene0644_00 +scene0645_00 +scene0645_01 +scene0645_02 +scene0081_00 +scene0081_01 +scene0081_02 +scene0647_00 +scene0647_01 +scene0535_00 +scene0353_00 +scene0353_01 +scene0353_02 +scene0559_00 +scene0559_01 +scene0559_02 +scene0593_00 +scene0593_01 +scene0246_00 +scene0653_00 +scene0653_01 +scene0064_00 +scene0064_01 +scene0356_00 +scene0356_01 +scene0356_02 +scene0030_00 +scene0030_01 +scene0030_02 +scene0222_00 +scene0222_01 +scene0338_00 +scene0338_01 +scene0338_02 +scene0378_00 +scene0378_01 +scene0378_02 +scene0660_00 +scene0553_00 +scene0553_01 +scene0553_02 +scene0527_00 +scene0663_00 +scene0663_01 +scene0663_02 +scene0664_00 +scene0664_01 +scene0664_02 +scene0334_00 +scene0334_01 +scene0334_02 +scene0046_00 +scene0046_01 +scene0046_02 +scene0203_00 +scene0203_01 +scene0203_02 +scene0088_00 +scene0088_01 +scene0088_02 +scene0088_03 +scene0086_00 +scene0086_01 +scene0086_02 +scene0670_00 +scene0670_01 +scene0256_00 +scene0256_01 +scene0256_02 +scene0249_00 +scene0441_00 +scene0658_00 +scene0704_00 
+scene0704_01 +scene0187_00 +scene0187_01 +scene0131_00 +scene0131_01 +scene0131_02 +scene0207_00 +scene0207_01 +scene0207_02 +scene0461_00 +scene0011_00 +scene0011_01 +scene0343_00 +scene0251_00 +scene0077_00 +scene0077_01 +scene0684_00 +scene0684_01 +scene0550_00 +scene0686_00 +scene0686_01 +scene0686_02 +scene0208_00 +scene0500_00 +scene0500_01 +scene0552_00 +scene0552_01 +scene0648_00 +scene0648_01 +scene0435_00 +scene0435_01 +scene0435_02 +scene0435_03 +scene0690_00 +scene0690_01 +scene0693_00 +scene0693_01 +scene0693_02 +scene0700_00 +scene0700_01 +scene0700_02 +scene0699_00 +scene0231_00 +scene0231_01 +scene0231_02 +scene0697_00 +scene0697_01 +scene0697_02 +scene0697_03 +scene0474_00 +scene0474_01 +scene0474_02 +scene0474_03 +scene0474_04 +scene0474_05 +scene0355_00 +scene0355_01 +scene0146_00 +scene0146_01 +scene0146_02 +scene0196_00 +scene0702_00 +scene0702_01 +scene0702_02 +scene0314_00 +scene0277_00 +scene0277_01 +scene0277_02 +scene0095_00 +scene0095_01 +scene0015_00 +scene0100_00 +scene0100_01 +scene0100_02 +scene0558_00 +scene0558_01 +scene0558_02 +scene0685_00 +scene0685_01 +scene0685_02 diff --git a/datasets/scannet_preprocess/prepare_2d_data/SensorData.py b/datasets/scannet_preprocess/prepare_2d_data/SensorData.py new file mode 100644 index 0000000000000000000000000000000000000000..dec0d116a63d4ea55a72979266a75e9be89a9fc9 --- /dev/null +++ b/datasets/scannet_preprocess/prepare_2d_data/SensorData.py @@ -0,0 +1,121 @@ + +import os, struct +import numpy as np +import zlib +import imageio +import cv2 + +COMPRESSION_TYPE_COLOR = {-1:'unknown', 0:'raw', 1:'png', 2:'jpeg'} +COMPRESSION_TYPE_DEPTH = {-1:'unknown', 0:'raw_ushort', 1:'zlib_ushort', 2:'occi_ushort'} + +class RGBDFrame(): + + def load(self, file_handle): + self.camera_to_world = np.asarray(struct.unpack('f'*16, file_handle.read(16*4)), dtype=np.float32).reshape(4, 4) + self.timestamp_color = struct.unpack('Q', file_handle.read(8))[0] + self.timestamp_depth = struct.unpack('Q', 
file_handle.read(8))[0] + self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.color_data = b''.join(struct.unpack('c'*self.color_size_bytes, file_handle.read(self.color_size_bytes))) + self.depth_data = b''.join(struct.unpack('c'*self.depth_size_bytes, file_handle.read(self.depth_size_bytes))) + + + def decompress_depth(self, compression_type): + if compression_type == 'zlib_ushort': + return self.decompress_depth_zlib() + else: + raise + + + def decompress_depth_zlib(self): + return zlib.decompress(self.depth_data) + + + def decompress_color(self, compression_type): + if compression_type == 'jpeg': + return self.decompress_color_jpeg() + else: + raise + + + def decompress_color_jpeg(self): + return imageio.imread(self.color_data) + + +class SensorData: + + def __init__(self, filename): + self.version = 4 + self.load(filename) + + + def load(self, filename): + with open(filename, 'rb') as f: + version = struct.unpack('I', f.read(4))[0] + assert self.version == version + strlen = struct.unpack('Q', f.read(8))[0] + self.sensor_name = b''.join(struct.unpack('c'*strlen, f.read(strlen))) + self.intrinsic_color = np.asarray(struct.unpack('f'*16, f.read(16*4)), dtype=np.float32).reshape(4, 4) + self.extrinsic_color = np.asarray(struct.unpack('f'*16, f.read(16*4)), dtype=np.float32).reshape(4, 4) + self.intrinsic_depth = np.asarray(struct.unpack('f'*16, f.read(16*4)), dtype=np.float32).reshape(4, 4) + self.extrinsic_depth = np.asarray(struct.unpack('f'*16, f.read(16*4)), dtype=np.float32).reshape(4, 4) + self.color_compression_type = COMPRESSION_TYPE_COLOR[struct.unpack('i', f.read(4))[0]] + self.depth_compression_type = COMPRESSION_TYPE_DEPTH[struct.unpack('i', f.read(4))[0]] + self.color_width = struct.unpack('I', f.read(4))[0] + self.color_height = struct.unpack('I', f.read(4))[0] + self.depth_width = struct.unpack('I', f.read(4))[0] + self.depth_height = struct.unpack('I', 
f.read(4))[0] + self.depth_shift = struct.unpack('f', f.read(4))[0] + num_frames = struct.unpack('Q', f.read(8))[0] + self.frames = [] + for i in range(num_frames): + frame = RGBDFrame() + frame.load(f) + self.frames.append(frame) + + + def export_depth_images(self, output_path, image_size=None, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + # print 'exporting', len(self.frames)//frame_skip, ' depth frames to', output_path + for f in range(0, len(self.frames), frame_skip): + depth_data = self.frames[f].decompress_depth(self.depth_compression_type) + depth = np.fromstring(depth_data, dtype=np.uint16).reshape(self.depth_height, self.depth_width) + if image_size is not None: + depth = cv2.resize(depth, (image_size[1], image_size[0]), interpolation=cv2.INTER_NEAREST) + imageio.imwrite(os.path.join(output_path, str(f) + '.png'), depth) + + + def export_color_images(self, output_path, image_size=None, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + # print 'exporting', len(self.frames)//frame_skip, 'color frames to', output_path + for f in range(0, len(self.frames), frame_skip): + color = self.frames[f].decompress_color(self.color_compression_type) + if image_size is not None: + color = cv2.resize(color, (image_size[1], image_size[0]), interpolation=cv2.INTER_NEAREST) + imageio.imwrite(os.path.join(output_path, str(f) + '.jpg'), color) + + + def save_mat_to_file(self, matrix, filename): + with open(filename, 'w') as f: + for line in matrix: + np.savetxt(f, line[np.newaxis], fmt='%f') + + + def export_poses(self, output_path, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + # print 'exporting', len(self.frames)//frame_skip, 'camera poses to', output_path + for f in range(0, len(self.frames), frame_skip): + self.save_mat_to_file(self.frames[f].camera_to_world, os.path.join(output_path, str(f) + '.txt')) + + + def export_intrinsics(self, output_path): + if not 
os.path.exists(output_path): + os.makedirs(output_path) + # print 'exporting camera intrinsics to', output_path + self.save_mat_to_file(self.intrinsic_color, os.path.join(output_path, 'intrinsic_color.txt')) + self.save_mat_to_file(self.extrinsic_color, os.path.join(output_path, 'extrinsic_color.txt')) + self.save_mat_to_file(self.intrinsic_depth, os.path.join(output_path, 'intrinsic_depth.txt')) + self.save_mat_to_file(self.extrinsic_depth, os.path.join(output_path, 'extrinsic_depth.txt')) \ No newline at end of file diff --git a/datasets/scannet_preprocess/prepare_2d_data/prepare_2d_data.py b/datasets/scannet_preprocess/prepare_2d_data/prepare_2d_data.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc37236cefe0201d89be8eb8feb9d6369505446 --- /dev/null +++ b/datasets/scannet_preprocess/prepare_2d_data/prepare_2d_data.py @@ -0,0 +1,123 @@ +# pre-process ScanNet 2D data +# note: depends on the sens file reader from ScanNet: +# https://github.com/ScanNet/ScanNet/blob/master/SensReader/python/SensorData.py +# if export_label_images flag is on: +# - depends on https://github.com/ScanNet/ScanNet/tree/master/BenchmarkScripts/util.py +# - also assumes that label images are unzipped as scene*/label*/*.png +# expected file structure: +# - prepare_2d_data.py +# - https://github.com/ScanNet/ScanNet/tree/master/BenchmarkScripts/util.py +# - https://github.com/ScanNet/ScanNet/blob/master/SensReader/python/SensorData.py +# +# example usage: +# python prepare_2d_data.py --scannet_path data/scannetv2 --output_path data/scannetv2_images --export_label_images + +import argparse +import os, sys +import numpy as np +import skimage.transform as sktf +import imageio +from SensorData import SensorData +import util +# try: +# from prepare_2d_data.SensorData import SensorData +# except: +# print('Failed to import SensorData (from ScanNet code toolbox)') +# sys.exit(-1) +# try: +# from prepare_2d_data import util +# except: +# print('Failed to import ScanNet code 
toolbox util') +# sys.exit(-1) + +# params +parser = argparse.ArgumentParser() +parser.add_argument('--scannet_path', required=True, help='path to scannet data') +parser.add_argument('--output_path', required=True, help='where to output 2d data') +parser.add_argument('--export_label_images', dest='export_label_images', action='store_true') +parser.add_argument('--label_type', default='label-filt', help='which labels (label or label-filt)') +parser.add_argument('--frame_skip', type=int, default=20, help='export every nth frame') +parser.add_argument('--label_map_file', default='scannet-preprocess/meta_data/scannetv2-labels.combined.tsv', + help='path to scannetv2-labels.combined.tsv (required for label export only)') +parser.add_argument('--output_image_width', type=int, default=640, help='export image width') +parser.add_argument('--output_image_height', type=int, default=480, help='export image height') + +parser.set_defaults(export_label_images=False) +opt = parser.parse_args() +if opt.export_label_images: + assert opt.label_map_file != '' +print(opt) + + +def print_error(message): + sys.stderr.write('ERROR: ' + str(message) + '\n') + sys.exit(-1) + + +# from https://github.com/ScanNet/ScanNet/tree/master/BenchmarkScripts/2d_helpers/convert_scannet_label_image.py +def map_label_image(image, label_mapping): + mapped = np.copy(image) + for k, v in label_mapping.iteritems(): + mapped[image == k] = v + return mapped.astype(np.uint8) + + +def main(): + if not os.path.exists(opt.output_path): + os.makedirs(opt.output_path) + + label_mapping = None + if opt.export_label_images: + label_map = util.read_label_mapping(opt.label_map_file, label_from='id', label_to='nyu40id') + + scenes = [d for d in os.listdir(opt.scannet_path) if os.path.isdir(os.path.join(opt.scannet_path, d))] + print('Found %d scenes' % len(scenes)) + for i in range(0,len(scenes)): + if scenes[i] != 'scene0000_00': continue + sens_file = os.path.join(opt.scannet_path, scenes[i], scenes[i] + '.sens') + 
label_path = os.path.join(opt.scannet_path, scenes[i], opt.label_type) + if opt.export_label_images and not os.path.isdir(label_path): + print_error('Error: using export_label_images option but label path %s does not exist' % label_path) + output_color_path = os.path.join(opt.output_path, scenes[i], 'color') + if not os.path.isdir(output_color_path): + os.makedirs(output_color_path) + output_depth_path = os.path.join(opt.output_path, scenes[i], 'depth') + if not os.path.isdir(output_depth_path): + os.makedirs(output_depth_path) + output_pose_path = os.path.join(opt.output_path, scenes[i], 'pose') + if not os.path.isdir(output_pose_path): + os.makedirs(output_pose_path) + output_label_path = os.path.join(opt.output_path, scenes[i], 'label') + if opt.export_label_images and not os.path.isdir(output_label_path): + os.makedirs(output_label_path) + output_intrinsics_path = os.path.join(opt.output_path, scenes[i], 'intrinsics') + if opt.export_label_images and not os.path.isdir(output_label_path): + os.makedirs(output_label_path) + + # read and export + sys.stdout.write('\r[ %d | %d ] %s\tloading...' % ((i + 1), len(scenes), scenes[i])) + sys.stdout.flush() + sd = SensorData(sens_file) + sys.stdout.write('\r[ %d | %d ] %s\texporting...' 
% ((i + 1), len(scenes), scenes[i])) + sys.stdout.flush() + sd.export_color_images(output_color_path, image_size=[opt.output_image_height, opt.output_image_width], + frame_skip=opt.frame_skip) + sd.export_depth_images(output_depth_path, image_size=[opt.output_image_height, opt.output_image_width], + frame_skip=opt.frame_skip) + sd.export_poses(output_pose_path, frame_skip=opt.frame_skip) + sd.export_intrinsics(output_intrinsics_path) + + if opt.export_label_images: + + for f in range(0, len(sd.frames), opt.frame_skip): + label_file = os.path.join(label_path, str(f) + '.png') + image = np.array(imageio.imread(label_file)) + image = sktf.resize(image, [opt.output_image_height, opt.output_image_width], order=0, + preserve_range=True) + mapped_image = map_label_image(image, label_map) + imageio.imwrite(os.path.join(output_label_path, str(f) + '.png'), mapped_image) + print('') + + +if __name__ == '__main__': + main() diff --git a/datasets/scannet_preprocess/prepare_2d_data/util.py b/datasets/scannet_preprocess/prepare_2d_data/util.py new file mode 100644 index 0000000000000000000000000000000000000000..0b781c2559a62e24df5859e462b90eac8d894d0b --- /dev/null +++ b/datasets/scannet_preprocess/prepare_2d_data/util.py @@ -0,0 +1,127 @@ +import os, sys +import csv + +try: + import numpy as np +except: + # print "Failed to import numpy package." 
+ sys.exit(-1) +try: + import imageio +except: + print("Please install the module 'imageio' for image processing, e.g.") + print("pip install imageio") + sys.exit(-1) + + +# print an error message and quit +def print_error(message, user_fault=False): + sys.stderr.write('ERROR: ' + str(message) + '\n') + if user_fault: + sys.exit(2) + sys.exit(-1) + + +# if string s represents an int +def represents_int(s): + try: + int(s) + return True + except ValueError: + return False + + +def read_label_mapping(filename, label_from='raw_category', label_to='nyu40id'): + assert os.path.isfile(filename) + mapping = dict() + with open(filename) as csvfile: + reader = csv.DictReader(csvfile, delimiter='\t') + for row in reader: + mapping[row[label_from]] = int(row[label_to]) + # if ints convert + if represents_int(list(mapping.keys())[0]): + mapping = {int(k): v for k, v in mapping.items()} + return mapping + + +# input: scene_types.txt or scene_types_all.txt +def read_scene_types_mapping(filename, remove_spaces=True): + assert os.path.isfile(filename) + mapping = dict() + lines = open(filename).read().splitlines() + lines = [line.split('\t') for line in lines] + if remove_spaces: + mapping = {x[1].strip(): int(x[0]) for x in lines} + else: + mapping = {x[1]: int(x[0]) for x in lines} + return mapping + + +# color by label +def visualize_label_image(filename, image): + height = image.shape[0] + width = image.shape[1] + vis_image = np.zeros([height, width, 3], dtype=np.uint8) + color_palette = create_color_palette() + for idx, color in enumerate(color_palette): + vis_image[image == idx] = color + imageio.imwrite(filename, vis_image) + + +# color by different instances (mod length of color palette) +def visualize_instance_image(filename, image): + height = image.shape[0] + width = image.shape[1] + vis_image = np.zeros([height, width, 3], dtype=np.uint8) + color_palette = create_color_palette() + instances = np.unique(image) + for idx, inst in enumerate(instances): + vis_image[image 
== inst] = color_palette[inst % len(color_palette)] + imageio.imwrite(filename, vis_image) + + +# color palette for nyu40 labels +def create_color_palette(): + return [ + (0, 0, 0), + (174, 199, 232), # wall + (152, 223, 138), # floor + (31, 119, 180), # cabinet + (255, 187, 120), # bed + (188, 189, 34), # chair + (140, 86, 75), # sofa + (255, 152, 150), # table + (214, 39, 40), # door + (197, 176, 213), # window + (148, 103, 189), # bookshelf + (196, 156, 148), # picture + (23, 190, 207), # counter + (178, 76, 76), + (247, 182, 210), # desk + (66, 188, 102), + (219, 219, 141), # curtain + (140, 57, 197), + (202, 185, 52), + (51, 176, 203), + (200, 54, 131), + (92, 193, 61), + (78, 71, 183), + (172, 114, 82), + (255, 127, 14), # refrigerator + (91, 163, 138), + (153, 98, 156), + (140, 153, 101), + (158, 218, 229), # shower curtain + (100, 125, 154), + (178, 127, 135), + (120, 185, 128), + (146, 111, 194), + (44, 160, 44), # toilet + (112, 128, 144), # sink + (96, 207, 209), + (227, 119, 194), # bathtub + (213, 92, 176), + (94, 106, 211), + (82, 84, 163), # otherfurn + (100, 85, 144) + ] diff --git a/datasets/scannet_preprocess/preprocess_scannet.py b/datasets/scannet_preprocess/preprocess_scannet.py new file mode 100644 index 0000000000000000000000000000000000000000..714a18d9ceeb7825824ce1c8e0f81d7e33969894 --- /dev/null +++ b/datasets/scannet_preprocess/preprocess_scannet.py @@ -0,0 +1,215 @@ +""" +Preprocessing Script for ScanNet 20/200 + +Author: Xiaoyang Wu (xiaoyang.wu.cs@gmail.com) +Please cite our work if the code is helpful to you. 
+""" + +import warnings + +import torch + +warnings.filterwarnings("ignore", category=DeprecationWarning) + +import sys +import os +import argparse +import glob +import json +import plyfile +import numpy as np +import pandas as pd +import multiprocessing as mp +from concurrent.futures import ProcessPoolExecutor +from itertools import repeat + +# Load external constants +from meta_data.scannet200_constants import VALID_CLASS_IDS_200, VALID_CLASS_IDS_20 + +CLOUD_FILE_PFIX = '_vh_clean_2' +SEGMENTS_FILE_PFIX = '.0.010000.segs.json' +AGGREGATIONS_FILE_PFIX = '.aggregation.json' +CLASS_IDS200 = VALID_CLASS_IDS_200 +CLASS_IDS20 = VALID_CLASS_IDS_20 +IGNORE_INDEX = -1 + + +def read_plymesh(filepath): + """Read ply file and return it as numpy array. Returns None if emtpy.""" + with open(filepath, 'rb') as f: + plydata = plyfile.PlyData.read(f) + if plydata.elements: + vertices = pd.DataFrame(plydata['vertex'].data).values + faces = np.stack(plydata['face'].data['vertex_indices'], axis=0) + return vertices, faces + + +# Map the raw category id to the point cloud +def point_indices_from_group(seg_indices, group, labels_pd): + group_segments = np.array(group['segments']) + label = group['label'] + + # Map the category name to id + label_id20 = labels_pd[labels_pd['raw_category'] == label]['nyu40id'] + label_id20 = int(label_id20.iloc[0]) if len(label_id20) > 0 else 0 + label_id200 = labels_pd[labels_pd['raw_category'] == label]['id'] + label_id200 = int(label_id200.iloc[0]) if len(label_id200) > 0 else 0 + + # Only store for the valid categories + if label_id20 in CLASS_IDS20: + label_id20 = CLASS_IDS20.index(label_id20) + else: + label_id20 = IGNORE_INDEX + + if label_id200 in CLASS_IDS200: + label_id200 = CLASS_IDS200.index(label_id200) + else: + label_id200 = IGNORE_INDEX + + # get points, where segment indices (points labelled with segment ids) are in the group segment list + point_idx = np.where(np.isin(seg_indices, group_segments))[0] + return point_idx, label_id20, 
label_id200 + + +def face_normal(vertex, face): + v01 = vertex[face[:, 1]] - vertex[face[:, 0]] + v02 = vertex[face[:, 2]] - vertex[face[:, 0]] + vec = np.cross(v01, v02) + length = np.sqrt(np.sum(vec ** 2, axis=1, keepdims=True)) + 1.0e-8 + nf = vec / length + area = length * 0.5 + return nf, area + + +def vertex_normal(vertex, face): + nf, area = face_normal(vertex, face) + nf = nf * area + + nv = np.zeros_like(vertex) + for i in range(face.shape[0]): + nv[face[i]] += nf[i] + + length = np.sqrt(np.sum(nv ** 2, axis=1, keepdims=True)) + 1.0e-8 + nv = nv / length + return nv + + +def handle_process(scene_path, output_path, labels_pd, train_scenes, val_scenes, parse_normals=True): + scene_id = os.path.basename(scene_path) + mesh_path = os.path.join(scene_path, f'{scene_id}{CLOUD_FILE_PFIX}.ply') + segments_file = os.path.join(scene_path, f'{scene_id}{CLOUD_FILE_PFIX}{SEGMENTS_FILE_PFIX}') + aggregations_file = os.path.join(scene_path, f'{scene_id}{AGGREGATIONS_FILE_PFIX}') + info_file = os.path.join(scene_path, f'{scene_id}.txt') + + if scene_id in train_scenes: + output_file = os.path.join(output_path, 'train', f'{scene_id}.pth') + split_name = 'train' + elif scene_id in val_scenes: + output_file = os.path.join(output_path, 'val', f'{scene_id}.pth') + split_name = 'val' + else: + output_file = os.path.join(output_path, 'test', f'{scene_id}.pth') + split_name = 'test' + + print(f'Processing: {scene_id} in {split_name}') + + vertices, faces = read_plymesh(mesh_path) + coords = vertices[:, :3] + colors = vertices[:, 3:6] + save_dict = dict(coord=coords, color=colors, scene_id=scene_id) + + # # Rotating the mesh to axis aligned + # info_dict = {} + # with open(info_file) as f: + # for line in f: + # (key, val) = line.split(" = ") + # info_dict[key] = np.fromstring(val, sep=' ') + # + # if 'axisAlignment' not in info_dict: + # rot_matrix = np.identity(4) + # else: + # rot_matrix = info_dict['axisAlignment'].reshape(4, 4) + # r_coords = coords.transpose() + # r_coords = 
np.append(r_coords, np.ones((1, r_coords.shape[1])), axis=0) + # r_coords = np.dot(rot_matrix, r_coords) + # coords = r_coords + + # Parse Normals + if parse_normals: + save_dict["normal"] = vertex_normal(coords, faces) + + # Load segments file + if split_name != "test": + with open(segments_file) as f: + segments = json.load(f) + seg_indices = np.array(segments['segIndices']) + + # Load Aggregations file + with open(aggregations_file) as f: + aggregation = json.load(f) + seg_groups = np.array(aggregation['segGroups']) + + # Generate new labels + semantic_gt20 = np.ones((vertices.shape[0])) * IGNORE_INDEX + semantic_gt200 = np.ones((vertices.shape[0])) * IGNORE_INDEX + instance_ids = np.ones((vertices.shape[0])) * IGNORE_INDEX + for group in seg_groups: + point_idx, label_id20, label_id200 = \ + point_indices_from_group(seg_indices, group, labels_pd) + + semantic_gt20[point_idx] = label_id20 + semantic_gt200[point_idx] = label_id200 + instance_ids[point_idx] = group['id'] + + semantic_gt20 = semantic_gt20.astype(int) + semantic_gt200 = semantic_gt200.astype(int) + instance_ids = instance_ids.astype(int) + + save_dict["semantic_gt20"] = semantic_gt20 + save_dict["semantic_gt200"] = semantic_gt200 + save_dict["instance_gt"] = instance_ids + + # Concatenate with original cloud + processed_vertices = np.hstack((semantic_gt200, instance_ids)) + + if np.any(np.isnan(processed_vertices)) or not np.all(np.isfinite(processed_vertices)): + raise ValueError(f'Find NaN in Scene: {scene_id}') + + # Save processed data + torch.save(save_dict, output_file) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--dataset_root', required=True, help='Path to the ScanNet dataset containing scene folders') + parser.add_argument('--output_root', required=True, help='Output path where train/val folders will be located') + parser.add_argument('--parse_normals', default=True, type=bool, help='Whether parse point normals') + config = parser.parse_args() 
+ + # Load label map + labels_pd = pd.read_csv('scannet-preprocess/meta_data/scannetv2-labels.combined.tsv', + sep='\t', header=0) + + # Load train/val splits + with open('scannet-preprocess/meta_data/scannetv2_train.txt') as train_file: + train_scenes = train_file.read().splitlines() + with open('scannet-preprocess/meta_data/scannetv2_val.txt') as val_file: + val_scenes = val_file.read().splitlines() + + # Create output directories + train_output_dir = os.path.join(config.output_root, 'train') + os.makedirs(train_output_dir, exist_ok=True) + val_output_dir = os.path.join(config.output_root, 'val') + os.makedirs(val_output_dir, exist_ok=True) + test_output_dir = os.path.join(config.output_root, 'test') + os.makedirs(test_output_dir, exist_ok=True) + + # Load scene paths + scene_paths = sorted(glob.glob(config.dataset_root + '/scans*/scene*')) + + # Preprocess data. + print('Processing scenes...') + pool = ProcessPoolExecutor(max_workers=mp.cpu_count()) + # pool = ProcessPoolExecutor(max_workers=1) + _ = list(pool.map(handle_process, scene_paths, repeat(config.output_root), repeat(labels_pd), repeat(train_scenes), + repeat(val_scenes), repeat(config.parse_normals))) diff --git a/datasets/scannet_preprocess/scannet_pair/SensorData.py b/datasets/scannet_preprocess/scannet_pair/SensorData.py new file mode 100644 index 0000000000000000000000000000000000000000..6a35bfb52a4f389d3368e4d3ce014d008674f21b --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/SensorData.py @@ -0,0 +1,121 @@ +import os, struct +import numpy as np +import zlib +import imageio +import cv2 + +COMPRESSION_TYPE_COLOR = {-1: 'unknown', 0: 'raw', 1: 'png', 2: 'jpeg'} +COMPRESSION_TYPE_DEPTH = {-1: 'unknown', 0: 'raw_ushort', 1: 'zlib_ushort', 2: 'occi_ushort'} + + +class RGBDFrame(): + + def load(self, file_handle): + self.camera_to_world = np.asarray(struct.unpack('f' * 16, file_handle.read(16 * 4)), dtype=np.float32).reshape( + 4, 4) + self.timestamp_color = struct.unpack('Q', 
file_handle.read(8))[0] + self.timestamp_depth = struct.unpack('Q', file_handle.read(8))[0] + self.color_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.depth_size_bytes = struct.unpack('Q', file_handle.read(8))[0] + self.color_data = b''.join(struct.unpack('c' * self.color_size_bytes, file_handle.read(self.color_size_bytes))) + self.depth_data = b''.join(struct.unpack('c' * self.depth_size_bytes, file_handle.read(self.depth_size_bytes))) + + def decompress_depth(self, compression_type): + if compression_type == 'zlib_ushort': + return self.decompress_depth_zlib() + else: + raise + + def decompress_depth_zlib(self): + return zlib.decompress(self.depth_data) + + def decompress_color(self, compression_type): + if compression_type == 'jpeg': + return self.decompress_color_jpeg() + else: + raise + + def decompress_color_jpeg(self): + return imageio.imread(self.color_data) + + +class SensorData: + def __init__(self, filename): + self.version = 4 + self.load(filename) + + def load(self, filename): + with open(filename, 'rb') as f: + version = struct.unpack('I', f.read(4))[0] + assert self.version == version + strlen = struct.unpack('Q', f.read(8))[0] + self.sensor_name = b''.join(struct.unpack('c' * strlen, f.read(strlen))) + self.intrinsic_color = np.asarray(struct.unpack('f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4) + self.extrinsic_color = np.asarray(struct.unpack('f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4) + self.intrinsic_depth = np.asarray(struct.unpack('f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4) + self.extrinsic_depth = np.asarray(struct.unpack('f' * 16, f.read(16 * 4)), dtype=np.float32).reshape(4, 4) + self.color_compression_type = COMPRESSION_TYPE_COLOR[struct.unpack('i', f.read(4))[0]] + self.depth_compression_type = COMPRESSION_TYPE_DEPTH[struct.unpack('i', f.read(4))[0]] + self.color_width = struct.unpack('I', f.read(4))[0] + self.color_height = struct.unpack('I', f.read(4))[0] + self.depth_width = 
struct.unpack('I', f.read(4))[0] + self.depth_height = struct.unpack('I', f.read(4))[0] + self.depth_shift = struct.unpack('f', f.read(4))[0] + num_frames = struct.unpack('Q', f.read(8))[0] + self.frames = [] + for i in range(num_frames): + frame = RGBDFrame() + frame.load(f) + self.frames.append(frame) + + def export_depth_images(self, output_path, image_size=None, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + print('exporting', len(self.frames) // frame_skip, ' depth frames to', output_path) + for f in range(0, len(self.frames), frame_skip): + if os.path.exists((os.path.join(output_path, str(f) + '.png'))): + continue + if f % 100 == 0: + print('exporting', f, 'th depth frames to', os.path.join(output_path, str(f) + '.png')) + + depth_data = self.frames[f].decompress_depth(self.depth_compression_type) + depth = np.fromstring(depth_data, dtype=np.uint16).reshape(self.depth_height, self.depth_width) + if image_size is not None: + depth = cv2.resize(depth, (image_size[1], image_size[0]), interpolation=cv2.INTER_NEAREST) + imageio.imwrite(os.path.join(output_path, str(f) + '.png'), depth) + + def export_color_images(self, output_path, image_size=None, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + print('exporting', len(self.frames) // frame_skip, 'color frames to', output_path) + for f in range(0, len(self.frames), frame_skip): + if os.path.exists((os.path.join(output_path, str(f) + '.png'))): + continue + if f % 100 == 0: + print('exporting', f, 'th color frames to', os.path.join(output_path, str(f) + '.png')) + color = self.frames[f].decompress_color(self.color_compression_type) + if image_size is not None: + color = cv2.resize(color, (image_size[1], image_size[0]), interpolation=cv2.INTER_NEAREST) + # imageio.imwrite(os.path.join(output_path, str(f) + '.jpg'), color) + imageio.imwrite(os.path.join(output_path, str(f) + '.png'), color) + + def save_mat_to_file(self, matrix, filename): + with 
open(filename, 'w') as f: + for line in matrix: + np.savetxt(f, line[np.newaxis], fmt='%f') + + def export_poses(self, output_path, frame_skip=1): + if not os.path.exists(output_path): + os.makedirs(output_path) + print('exporting', len(self.frames) // frame_skip, 'camera poses to', output_path) + for f in range(0, len(self.frames), frame_skip): + self.save_mat_to_file(self.frames[f].camera_to_world, os.path.join(output_path, str(f) + '.txt')) + + def export_intrinsics(self, output_path): + if not os.path.exists(output_path): + os.makedirs(output_path) + print('exporting camera intrinsics to', output_path) + self.save_mat_to_file(self.intrinsic_color, os.path.join(output_path, 'intrinsic_color.txt')) + self.save_mat_to_file(self.extrinsic_color, os.path.join(output_path, 'extrinsic_color.txt')) + self.save_mat_to_file(self.intrinsic_depth, os.path.join(output_path, 'intrinsic_depth.txt')) + self.save_mat_to_file(self.extrinsic_depth, os.path.join(output_path, 'extrinsic_depth.txt')) diff --git a/datasets/scannet_preprocess/scannet_pair/compute_full_overlapping.py b/datasets/scannet_preprocess/scannet_pair/compute_full_overlapping.py new file mode 100644 index 0000000000000000000000000000000000000000..1b6e2ec653a4019ac85491ca9b847a257a8b3b0f --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/compute_full_overlapping.py @@ -0,0 +1,80 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
import copy
import torch
import numpy as np
import math
import glob, os
import argparse
import open3d as o3d


def make_open3d_point_cloud(xyz, color=None, voxel_size=None):
    """Build an Open3D point cloud from an (N, >=3) coordinate array.

    Returns None when the coordinates contain any NaN (the caller filters
    such clouds out). Optionally attaches per-point colors and/or
    voxel-downsamples the cloud.
    """
    if np.isnan(xyz).any():
        return None

    xyz = xyz[:, :3]
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(xyz)
    if color is not None:
        pcd.colors = o3d.utility.Vector3dVector(color)
    if voxel_size is not None:
        pcd = pcd.voxel_down_sample(voxel_size)

    return pcd


def compute_overlap_ratio(pcd0, pcd1, voxel_size):
    """Return the symmetric overlap ratio between two point clouds.

    Both clouds are voxel-downsampled, then the fraction of points in each
    cloud that has a neighbor in the other (within 1.5 * voxel_size) is
    computed; the larger of the two fractions is returned.
    """
    pcd0_down = pcd0.voxel_down_sample(voxel_size)
    pcd1_down = pcd1.voxel_down_sample(voxel_size)
    # BUG FIX: get_matching_indices() expects a KDTreeFlann as its second
    # argument (it calls search_radius_vector_3d on it). The original code
    # passed the raw downsampled PointCloud, which would raise
    # AttributeError on every call. Build the KD-trees explicitly, matching
    # how compute_full_overlapping() below uses the helper.
    pcd0_tree = o3d.geometry.KDTreeFlann(pcd0_down)
    pcd1_tree = o3d.geometry.KDTreeFlann(pcd1_down)
    matching01 = get_matching_indices(pcd0_down, pcd1_tree, voxel_size * 1.5, 1)
    matching10 = get_matching_indices(pcd1_down, pcd0_tree, voxel_size * 1.5, 1)
    overlap0 = float(len(matching01)) / float(len(pcd0_down.points))
    overlap1 = float(len(matching10)) / float(len(pcd1_down.points))
    return max(overlap0, overlap1)


def get_matching_indices(source, pcd_tree, search_voxel_size, K=None):
    """Return (i, j) index pairs matching `source` points into `pcd_tree`.

    `pcd_tree` must be an o3d.geometry.KDTreeFlann built over the target
    cloud. For each source point, up to K neighbors (all if K is None)
    within `search_voxel_size` are paired with it.
    """
    match_inds = []
    for i, point in enumerate(source.points):
        [_, idx, _] = pcd_tree.search_radius_vector_3d(point, search_voxel_size)
        if K is not None:
            idx = idx[:K]
        for j in idx:
            match_inds.append((i, j))
    return match_inds


def compute_full_overlapping(data_root, scene_id, voxel_size=0.05):
    """Compute pairwise overlap ratios for all frame point clouds of a scene.

    Loads every `<data_root>/<scene_id>/pcd/*.pth` cloud (NaN clouds are
    filtered out), fills a full matching matrix, and writes one
    "<name0> <name1> <overlap>" line per unordered pair to
    `<data_root>/<scene_id>/pcd/overlap.txt`.
    """
    _points = [
        (pcd_name, make_open3d_point_cloud(torch.load(pcd_name)['coord'], voxel_size=voxel_size))
        for pcd_name in glob.glob(os.path.join(data_root, scene_id, "pcd", "*.pth"))
    ]
    points = [(pcd_name, pcd) for (pcd_name, pcd) in _points if pcd is not None]
    print('load {} point clouds ({} invalid has been filtered), computing matching/overlapping'.format(
        len(points), len(_points) - len(points)))

    matching_matrix = np.zeros((len(points), len(points)))
    for i, (pcd0_name, pcd0) in enumerate(points):
        print('matching to...{}'.format(pcd0_name))
        # KD-tree over cloud i; deepcopy keeps the tree independent of pcd0.
        pcd0_tree = o3d.geometry.KDTreeFlann(copy.deepcopy(pcd0))
        for j, (pcd1_name, pcd1) in enumerate(points):
            if i == j:
                continue
            # fraction of cloud j's points with a neighbor in cloud i
            matching_matrix[i, j] = float(len(get_matching_indices(pcd1, pcd0_tree, 1.5 * voxel_size, 1))) / float(
                len(pcd1.points))

    # write to file: symmetric overlap = max of the two directed fractions
    with open(os.path.join(data_root, scene_id, "pcd", "overlap.txt"), 'w') as f:
        for i, (pcd0_name, pcd0) in enumerate(points):
            for j, (pcd1_name, pcd1) in enumerate(points):
                if i < j:
                    overlap = max(matching_matrix[i, j], matching_matrix[j, i])
                    f.write("{} {} {}\n".format(
                        pcd0_name.replace(data_root, ""), pcd1_name.replace(data_root, ""), overlap
                    ))


# --- patch residue: header of the next file in the original diff ---
# diff --git a/datasets/scannet_preprocess/scannet_pair/generage_list.py
#          b/datasets/scannet_preprocess/scannet_pair/generage_list.py
# new file mode 100644 (index 0000000..8faf9fe)
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
+ + +import argparse +import glob, os, sys + +from SensorData import SensorData + +# params +parser = argparse.ArgumentParser() +# data paths +parser.add_argument('--target_dir', required=True, help='path to the target dir') + +opt = parser.parse_args() +print(opt) + +def main(): + overlaps = glob.glob(os.path.join(opt.target_dir, "*/pcd/overlap.txt")) + with open(os.path.join(opt.target_dir, 'overlap30.txt'), 'w') as f: + for fo in overlaps: + for line in open(fo): + pcd0, pcd1, op = line.strip().split() + if float(op) >= 0.3: + print('{} {} {}'.format(pcd0, pcd1, op), file=f) + print('done') + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/datasets/scannet_preprocess/scannet_pair/plyfile.py b/datasets/scannet_preprocess/scannet_pair/plyfile.py new file mode 100644 index 0000000000000000000000000000000000000000..69c2aa9e898a999406ee4ecfa856c715f14b9251 --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/plyfile.py @@ -0,0 +1,916 @@ +# Copyright 2014 Darsh Ranjan +# +# This file is part of python-plyfile. +# +# python-plyfile is free software: you can redistribute it and/or +# modify it under the terms of the GNU General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# python-plyfile is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with python-plyfile. If not, see +# . 
+ +from itertools import islice as _islice + +import numpy as _np +from sys import byteorder as _byteorder + + +try: + _range = xrange +except NameError: + _range = range + + +# Many-many relation +_data_type_relation = [ + ('int8', 'i1'), + ('char', 'i1'), + ('uint8', 'u1'), + ('uchar', 'b1'), + ('uchar', 'u1'), + ('int16', 'i2'), + ('short', 'i2'), + ('uint16', 'u2'), + ('ushort', 'u2'), + ('int32', 'i4'), + ('int', 'i4'), + ('uint32', 'u4'), + ('uint', 'u4'), + ('float32', 'f4'), + ('float', 'f4'), + ('float64', 'f8'), + ('double', 'f8') +] + +_data_types = dict(_data_type_relation) +_data_type_reverse = dict((b, a) for (a, b) in _data_type_relation) + +_types_list = [] +_types_set = set() +for (_a, _b) in _data_type_relation: + if _a not in _types_set: + _types_list.append(_a) + _types_set.add(_a) + if _b not in _types_set: + _types_list.append(_b) + _types_set.add(_b) + + +_byte_order_map = { + 'ascii': '=', + 'binary_little_endian': '<', + 'binary_big_endian': '>' +} + +_byte_order_reverse = { + '<': 'binary_little_endian', + '>': 'binary_big_endian' +} + +_native_byte_order = {'little': '<', 'big': '>'}[_byteorder] + + +def _lookup_type(type_str): + if type_str not in _data_type_reverse: + try: + type_str = _data_types[type_str] + except KeyError: + raise ValueError("field type %r not in %r" % + (type_str, _types_list)) + + return _data_type_reverse[type_str] + + +def _split_line(line, n): + fields = line.split(None, n) + if len(fields) == n: + fields.append('') + + assert len(fields) == n + 1 + + return fields + + +def make2d(array, cols=None, dtype=None): + ''' + Make a 2D array from an array of arrays. The `cols' and `dtype' + arguments can be omitted if the array is not empty. 
+ + ''' + if (cols is None or dtype is None) and not len(array): + raise RuntimeError("cols and dtype must be specified for empty " + "array") + + if cols is None: + cols = len(array[0]) + + if dtype is None: + dtype = array[0].dtype + + return _np.fromiter(array, [('_', dtype, (cols,))], + count=len(array))['_'] + + +class PlyParseError(Exception): + + ''' + Raised when a PLY file cannot be parsed. + + The attributes `element', `row', `property', and `message' give + additional information. + + ''' + + def __init__(self, message, element=None, row=None, prop=None): + self.message = message + self.element = element + self.row = row + self.prop = prop + + s = '' + if self.element: + s += 'element %r: ' % self.element.name + if self.row is not None: + s += 'row %d: ' % self.row + if self.prop: + s += 'property %r: ' % self.prop.name + s += self.message + + Exception.__init__(self, s) + + def __repr__(self): + return ('PlyParseError(%r, element=%r, row=%r, prop=%r)' % + self.message, self.element, self.row, self.prop) + + +class PlyData(object): + + ''' + PLY file header and data. + + A PlyData instance is created in one of two ways: by the static + method PlyData.read (to read a PLY file), or directly from __init__ + given a sequence of elements (which can then be written to a PLY + file). + + ''' + + def __init__(self, elements=[], text=False, byte_order='=', + comments=[], obj_info=[]): + ''' + elements: sequence of PlyElement instances. + + text: whether the resulting PLY file will be text (True) or + binary (False). + + byte_order: '<' for little-endian, '>' for big-endian, or '=' + for native. This is only relevant if `text' is False. + + comments: sequence of strings that will be placed in the header + between the 'ply' and 'format ...' lines. + + obj_info: like comments, but will be placed in the header with + "obj_info ..." instead of "comment ...". 
+ + ''' + if byte_order == '=' and not text: + byte_order = _native_byte_order + + self.byte_order = byte_order + self.text = text + + self.comments = list(comments) + self.obj_info = list(obj_info) + self.elements = elements + + def _get_elements(self): + return self._elements + + def _set_elements(self, elements): + self._elements = tuple(elements) + self._index() + + elements = property(_get_elements, _set_elements) + + def _get_byte_order(self): + return self._byte_order + + def _set_byte_order(self, byte_order): + if byte_order not in ['<', '>', '=']: + raise ValueError("byte order must be '<', '>', or '='") + + self._byte_order = byte_order + + byte_order = property(_get_byte_order, _set_byte_order) + + def _index(self): + self._element_lookup = dict((elt.name, elt) for elt in + self._elements) + if len(self._element_lookup) != len(self._elements): + raise ValueError("two elements with same name") + + @staticmethod + def _parse_header(stream): + ''' + Parse a PLY header from a readable file-like stream. 
+ + ''' + lines = [] + comments = {'comment': [], 'obj_info': []} + while True: + line = stream.readline().decode('ascii').strip() + fields = _split_line(line, 1) + + if fields[0] == 'end_header': + break + + elif fields[0] in comments.keys(): + lines.append(fields) + else: + lines.append(line.split()) + + a = 0 + if lines[a] != ['ply']: + raise PlyParseError("expected 'ply'") + + a += 1 + while lines[a][0] in comments.keys(): + comments[lines[a][0]].append(lines[a][1]) + a += 1 + + if lines[a][0] != 'format': + raise PlyParseError("expected 'format'") + + if lines[a][2] != '1.0': + raise PlyParseError("expected version '1.0'") + + if len(lines[a]) != 3: + raise PlyParseError("too many fields after 'format'") + + fmt = lines[a][1] + + if fmt not in _byte_order_map: + raise PlyParseError("don't understand format %r" % fmt) + + byte_order = _byte_order_map[fmt] + text = fmt == 'ascii' + + a += 1 + while a < len(lines) and lines[a][0] in comments.keys(): + comments[lines[a][0]].append(lines[a][1]) + a += 1 + + return PlyData(PlyElement._parse_multi(lines[a:]), + text, byte_order, + comments['comment'], comments['obj_info']) + + @staticmethod + def read(stream): + ''' + Read PLY data from a readable file-like object or filename. + + ''' + (must_close, stream) = _open_stream(stream, 'read') + try: + data = PlyData._parse_header(stream) + for elt in data: + elt._read(stream, data.text, data.byte_order) + finally: + if must_close: + stream.close() + + return data + + def write(self, stream): + ''' + Write PLY data to a writeable file-like object or filename. + + ''' + (must_close, stream) = _open_stream(stream, 'write') + try: + stream.write(self.header.encode('ascii')) + stream.write(b'\r\n') + for elt in self: + elt._write(stream, self.text, self.byte_order) + finally: + if must_close: + stream.close() + + @property + def header(self): + ''' + Provide PLY-formatted metadata for the instance. 
+ + ''' + lines = ['ply'] + + if self.text: + lines.append('format ascii 1.0') + else: + lines.append('format ' + + _byte_order_reverse[self.byte_order] + + ' 1.0') + + # Some information is lost here, since all comments are placed + # between the 'format' line and the first element. + for c in self.comments: + lines.append('comment ' + c) + + for c in self.obj_info: + lines.append('obj_info ' + c) + + lines.extend(elt.header for elt in self.elements) + lines.append('end_header') + return '\r\n'.join(lines) + + def __iter__(self): + return iter(self.elements) + + def __len__(self): + return len(self.elements) + + def __contains__(self, name): + return name in self._element_lookup + + def __getitem__(self, name): + return self._element_lookup[name] + + def __str__(self): + return self.header + + def __repr__(self): + return ('PlyData(%r, text=%r, byte_order=%r, ' + 'comments=%r, obj_info=%r)' % + (self.elements, self.text, self.byte_order, + self.comments, self.obj_info)) + + +def _open_stream(stream, read_or_write): + if hasattr(stream, read_or_write): + return (False, stream) + try: + return (True, open(stream, read_or_write[0] + 'b')) + except TypeError: + raise RuntimeError("expected open file or filename") + + +class PlyElement(object): + + ''' + PLY file element. + + A client of this library doesn't normally need to instantiate this + directly, so the following is only for the sake of documenting the + internals. + + Creating a PlyElement instance is generally done in one of two ways: + as a byproduct of PlyData.read (when reading a PLY file) and by + PlyElement.describe (before writing a PLY file). + + ''' + + def __init__(self, name, properties, count, comments=[]): + ''' + This is not part of the public interface. The preferred methods + of obtaining PlyElement instances are PlyData.read (to read from + a file) and PlyElement.describe (to construct from a numpy + array). 
+ + ''' + self._name = str(name) + self._check_name() + self._count = count + + self._properties = tuple(properties) + self._index() + + self.comments = list(comments) + + self._have_list = any(isinstance(p, PlyListProperty) + for p in self.properties) + + @property + def count(self): + return self._count + + def _get_data(self): + return self._data + + def _set_data(self, data): + self._data = data + self._count = len(data) + self._check_sanity() + + data = property(_get_data, _set_data) + + def _check_sanity(self): + for prop in self.properties: + if prop.name not in self._data.dtype.fields: + raise ValueError("dangling property %r" % prop.name) + + def _get_properties(self): + return self._properties + + def _set_properties(self, properties): + self._properties = tuple(properties) + self._check_sanity() + self._index() + + properties = property(_get_properties, _set_properties) + + def _index(self): + self._property_lookup = dict((prop.name, prop) + for prop in self._properties) + if len(self._property_lookup) != len(self._properties): + raise ValueError("two properties with same name") + + def ply_property(self, name): + return self._property_lookup[name] + + @property + def name(self): + return self._name + + def _check_name(self): + if any(c.isspace() for c in self._name): + msg = "element name %r contains spaces" % self._name + raise ValueError(msg) + + def dtype(self, byte_order='='): + ''' + Return the numpy dtype of the in-memory representation of the + data. (If there are no list properties, and the PLY format is + binary, then this also accurately describes the on-disk + representation of the element.) + + ''' + return [(prop.name, prop.dtype(byte_order)) + for prop in self.properties] + + @staticmethod + def _parse_multi(header_lines): + ''' + Parse a list of PLY element definitions. 
+ + ''' + elements = [] + while header_lines: + (elt, header_lines) = PlyElement._parse_one(header_lines) + elements.append(elt) + + return elements + + @staticmethod + def _parse_one(lines): + ''' + Consume one element definition. The unconsumed input is + returned along with a PlyElement instance. + + ''' + a = 0 + line = lines[a] + + if line[0] != 'element': + raise PlyParseError("expected 'element'") + if len(line) > 3: + raise PlyParseError("too many fields after 'element'") + if len(line) < 3: + raise PlyParseError("too few fields after 'element'") + + (name, count) = (line[1], int(line[2])) + + comments = [] + properties = [] + while True: + a += 1 + if a >= len(lines): + break + + if lines[a][0] == 'comment': + comments.append(lines[a][1]) + elif lines[a][0] == 'property': + properties.append(PlyProperty._parse_one(lines[a])) + else: + break + + return (PlyElement(name, properties, count, comments), + lines[a:]) + + @staticmethod + def describe(data, name, len_types={}, val_types={}, + comments=[]): + ''' + Construct a PlyElement from an array's metadata. + + len_types and val_types can be given as mappings from list + property names to type strings (like 'u1', 'f4', etc., or + 'int8', 'float32', etc.). These can be used to define the length + and value types of list properties. List property lengths + always default to type 'u1' (8-bit unsigned integer), and value + types default to 'i4' (32-bit integer). + + ''' + if not isinstance(data, _np.ndarray): + raise TypeError("only numpy arrays are supported") + + if len(data.shape) != 1: + raise ValueError("only one-dimensional arrays are " + "supported") + + count = len(data) + + properties = [] + descr = data.dtype.descr + + for t in descr: + if not isinstance(t[1], str): + raise ValueError("nested records not supported") + + if not t[0]: + raise ValueError("field with empty name") + + if len(t) != 2 or t[1][1] == 'O': + # non-scalar field, which corresponds to a list + # property in PLY. 
+ + if t[1][1] == 'O': + if len(t) != 2: + raise ValueError("non-scalar object fields not " + "supported") + + len_str = _data_type_reverse[len_types.get(t[0], 'u1')] + if t[1][1] == 'O': + val_type = val_types.get(t[0], 'i4') + val_str = _lookup_type(val_type) + else: + val_str = _lookup_type(t[1][1:]) + + prop = PlyListProperty(t[0], len_str, val_str) + else: + val_str = _lookup_type(t[1][1:]) + prop = PlyProperty(t[0], val_str) + + properties.append(prop) + + elt = PlyElement(name, properties, count, comments) + elt.data = data + + return elt + + def _read(self, stream, text, byte_order): + ''' + Read the actual data from a PLY file. + + ''' + if text: + self._read_txt(stream) + else: + if self._have_list: + # There are list properties, so a simple load is + # impossible. + self._read_bin(stream, byte_order) + else: + # There are no list properties, so loading the data is + # much more straightforward. + self._data = _np.fromfile(stream, + self.dtype(byte_order), + self.count) + + if len(self._data) < self.count: + k = len(self._data) + del self._data + raise PlyParseError("early end-of-file", self, k) + + self._check_sanity() + + def _write(self, stream, text, byte_order): + ''' + Write the data to a PLY file. + + ''' + if text: + self._write_txt(stream) + else: + if self._have_list: + # There are list properties, so serialization is + # slightly complicated. + self._write_bin(stream, byte_order) + else: + # no list properties, so serialization is + # straightforward. + self.data.astype(self.dtype(byte_order), + copy=False).tofile(stream) + + def _read_txt(self, stream): + ''' + Load a PLY element from an ASCII-format PLY file. The element + may contain list properties. 
+ + ''' + self._data = _np.empty(self.count, dtype=self.dtype()) + + k = 0 + for line in _islice(iter(stream.readline, b''), self.count): + fields = iter(line.strip().split()) + for prop in self.properties: + try: + self._data[prop.name][k] = prop._from_fields(fields) + except StopIteration: + raise PlyParseError("early end-of-line", + self, k, prop) + except ValueError: + raise PlyParseError("malformed input", + self, k, prop) + try: + next(fields) + except StopIteration: + pass + else: + raise PlyParseError("expected end-of-line", self, k) + k += 1 + + if k < self.count: + del self._data + raise PlyParseError("early end-of-file", self, k) + + def _write_txt(self, stream): + ''' + Save a PLY element to an ASCII-format PLY file. The element may + contain list properties. + + ''' + for rec in self.data: + fields = [] + for prop in self.properties: + fields.extend(prop._to_fields(rec[prop.name])) + + _np.savetxt(stream, [fields], '%.18g', newline='\r\n') + + def _read_bin(self, stream, byte_order): + ''' + Load a PLY element from a binary PLY file. The element may + contain list properties. + + ''' + self._data = _np.empty(self.count, dtype=self.dtype(byte_order)) + + for k in _range(self.count): + for prop in self.properties: + try: + self._data[prop.name][k] = \ + prop._read_bin(stream, byte_order) + except StopIteration: + raise PlyParseError("early end-of-file", + self, k, prop) + + def _write_bin(self, stream, byte_order): + ''' + Save a PLY element to a binary PLY file. The element may + contain list properties. + + ''' + for rec in self.data: + for prop in self.properties: + prop._write_bin(rec[prop.name], stream, byte_order) + + @property + def header(self): + ''' + Format this element's metadata as it would appear in a PLY + header. + + ''' + lines = ['element %s %d' % (self.name, self.count)] + + # Some information is lost here, since all comments are placed + # between the 'element' line and the first property definition. 
+ for c in self.comments: + lines.append('comment ' + c) + + lines.extend(list(map(str, self.properties))) + + return '\r\n'.join(lines) + + def __getitem__(self, key): + return self.data[key] + + def __setitem__(self, key, value): + self.data[key] = value + + def __str__(self): + return self.header + + def __repr__(self): + return ('PlyElement(%r, %r, count=%d, comments=%r)' % + (self.name, self.properties, self.count, + self.comments)) + + +class PlyProperty(object): + + ''' + PLY property description. This class is pure metadata; the data + itself is contained in PlyElement instances. + + ''' + + def __init__(self, name, val_dtype): + self._name = str(name) + self._check_name() + self.val_dtype = val_dtype + + def _get_val_dtype(self): + return self._val_dtype + + def _set_val_dtype(self, val_dtype): + self._val_dtype = _data_types[_lookup_type(val_dtype)] + + val_dtype = property(_get_val_dtype, _set_val_dtype) + + @property + def name(self): + return self._name + + def _check_name(self): + if any(c.isspace() for c in self._name): + msg = "Error: property name %r contains spaces" % self._name + raise RuntimeError(msg) + + @staticmethod + def _parse_one(line): + assert line[0] == 'property' + + if line[1] == 'list': + if len(line) > 5: + raise PlyParseError("too many fields after " + "'property list'") + if len(line) < 5: + raise PlyParseError("too few fields after " + "'property list'") + + return PlyListProperty(line[4], line[2], line[3]) + + else: + if len(line) > 3: + raise PlyParseError("too many fields after " + "'property'") + if len(line) < 3: + raise PlyParseError("too few fields after " + "'property'") + + return PlyProperty(line[2], line[1]) + + def dtype(self, byte_order='='): + ''' + Return the numpy dtype description for this property (as a tuple + of strings). + + ''' + return byte_order + self.val_dtype + + def _from_fields(self, fields): + ''' + Parse from generator. Raise StopIteration if the property could + not be read. 
+ + ''' + return _np.dtype(self.dtype()).type(next(fields)) + + def _to_fields(self, data): + ''' + Return generator over one item. + + ''' + yield _np.dtype(self.dtype()).type(data) + + def _read_bin(self, stream, byte_order): + ''' + Read data from a binary stream. Raise StopIteration if the + property could not be read. + + ''' + try: + return _np.fromfile(stream, self.dtype(byte_order), 1)[0] + except IndexError: + raise StopIteration + + def _write_bin(self, data, stream, byte_order): + ''' + Write data to a binary stream. + + ''' + _np.dtype(self.dtype(byte_order)).type(data).tofile(stream) + + def __str__(self): + val_str = _data_type_reverse[self.val_dtype] + return 'property %s %s' % (val_str, self.name) + + def __repr__(self): + return 'PlyProperty(%r, %r)' % (self.name, + _lookup_type(self.val_dtype)) + + +class PlyListProperty(PlyProperty): + + ''' + PLY list property description. + + ''' + + def __init__(self, name, len_dtype, val_dtype): + PlyProperty.__init__(self, name, val_dtype) + + self.len_dtype = len_dtype + + def _get_len_dtype(self): + return self._len_dtype + + def _set_len_dtype(self, len_dtype): + self._len_dtype = _data_types[_lookup_type(len_dtype)] + + len_dtype = property(_get_len_dtype, _set_len_dtype) + + def dtype(self, byte_order='='): + ''' + List properties always have a numpy dtype of "object". + + ''' + return '|O' + + def list_dtype(self, byte_order='='): + ''' + Return the pair (len_dtype, val_dtype) (both numpy-friendly + strings). + + ''' + return (byte_order + self.len_dtype, + byte_order + self.val_dtype) + + def _from_fields(self, fields): + (len_t, val_t) = self.list_dtype() + + n = int(_np.dtype(len_t).type(next(fields))) + + data = _np.loadtxt(list(_islice(fields, n)), val_t, ndmin=1) + if len(data) < n: + raise StopIteration + + return data + + def _to_fields(self, data): + ''' + Return generator over the (numerical) PLY representation of the + list data (length followed by actual data). 
+ + ''' + (len_t, val_t) = self.list_dtype() + + data = _np.asarray(data, dtype=val_t).ravel() + + yield _np.dtype(len_t).type(data.size) + for x in data: + yield x + + def _read_bin(self, stream, byte_order): + (len_t, val_t) = self.list_dtype(byte_order) + + try: + n = _np.fromfile(stream, len_t, 1)[0] + except IndexError: + raise StopIteration + + data = _np.fromfile(stream, val_t, n) + if len(data) < n: + raise StopIteration + + return data + + def _write_bin(self, data, stream, byte_order): + ''' + Write data to a binary stream. + + ''' + (len_t, val_t) = self.list_dtype(byte_order) + + data = _np.asarray(data, dtype=val_t).ravel() + + _np.array(data.size, dtype=len_t).tofile(stream) + data.tofile(stream) + + def __str__(self): + len_str = _data_type_reverse[self.len_dtype] + val_str = _data_type_reverse[self.val_dtype] + return 'property list %s %s %s' % (len_str, val_str, self.name) + + def __repr__(self): + return ('PlyListProperty(%r, %r, %r)' % + (self.name, + _lookup_type(self.len_dtype), + _lookup_type(self.val_dtype))) diff --git a/datasets/scannet_preprocess/scannet_pair/point_cloud_extractor.py b/datasets/scannet_preprocess/scannet_pair/point_cloud_extractor.py new file mode 100644 index 0000000000000000000000000000000000000000..e2b8568eef6413e2e024025b21f3d07401b0f223 --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/point_cloud_extractor.py @@ -0,0 +1,89 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import glob, os +import numpy as np +import cv2 +import torch + + +def extractor(input_path, output_path): + if not os.path.exists(output_path): + os.mkdir(output_path) + + # Load Depth Camera Intrinsic + depth_intrinsic = np.loadtxt(input_path + '/intrinsic/intrinsic_depth.txt') + print('Depth intrinsic: ') + print(depth_intrinsic) + + # Compute Camrea Distance (just for demo, so you can choose the camera distance in frame sampling) + poses = sorted(glob.glob(input_path + '/pose/*.txt'), key=lambda a: int(os.path.basename(a).split('.')[0])) + depths = sorted(glob.glob(input_path + '/depth/*.png'), key=lambda a: int(os.path.basename(a).split('.')[0])) + colors = sorted(glob.glob(input_path + '/color/*.png'), key=lambda a: int(os.path.basename(a).split('.')[0])) + + # # Get Aligned Point Clouds. + for ind, (pose, depth, color) in enumerate(zip(poses, depths, colors)): + name = os.path.basename(pose).split('.')[0] + + if os.path.exists(output_path + '/{}.npz'.format(name)): + continue + + try: + print('=' * 50, ': {}'.format(pose)) + depth_img = cv2.imread(depth, -1) # read 16bit grayscale image + mask = (depth_img != 0) + color_image = cv2.imread(color) + color_image = cv2.resize(color_image, (640, 480)) + color_image = np.reshape(color_image[mask], [-1, 3]) + colors = np.zeros_like(color_image) + colors[:, 0] = color_image[:, 2] + colors[:, 1] = color_image[:, 1] + colors[:, 2] = color_image[:, 0] + + pose = np.loadtxt(poses[ind]) + print('Camera pose: ') + print(pose) + + depth_shift = 1000.0 + x, y = np.meshgrid(np.linspace(0, depth_img.shape[1] - 1, depth_img.shape[1]), + np.linspace(0, depth_img.shape[0] - 1, depth_img.shape[0])) + uv_depth = np.zeros((depth_img.shape[0], depth_img.shape[1], 3)) + uv_depth[:, :, 0] = x + uv_depth[:, :, 1] = y + uv_depth[:, :, 2] = depth_img / depth_shift + uv_depth = np.reshape(uv_depth, [-1, 3]) + uv_depth = uv_depth[np.where(uv_depth[:, 2] != 0), :].squeeze() + + intrinsic_inv = np.linalg.inv(depth_intrinsic) + fx = 
depth_intrinsic[0, 0] + fy = depth_intrinsic[1, 1] + cx = depth_intrinsic[0, 2] + cy = depth_intrinsic[1, 2] + bx = depth_intrinsic[0, 3] + by = depth_intrinsic[1, 3] + point_list = [] + n = uv_depth.shape[0] + points = np.ones((n, 4)) + X = (uv_depth[:, 0] - cx) * uv_depth[:, 2] / fx + bx + Y = (uv_depth[:, 1] - cy) * uv_depth[:, 2] / fy + by + points[:, 0] = X + points[:, 1] = Y + points[:, 2] = uv_depth[:, 2] + points_world = np.dot(points, np.transpose(pose)) + print(points_world.shape) + + pcd = dict(coord=points_world[:, :3], color=colors) + # pcd_save = np.zeros((points_world.shape[0], 7)) + # pcd_save[:, :3] = points_world[:, :3] + # pcd_save[:, 3:6] = colors + + # print('Saving npz file...') + # np.savez(output_path + '/{}.npz'.format(name), pcd=pcd_save) + torch.save(pcd, output_path + '/{}.pth'.format(name)) + except: + continue + + diff --git a/datasets/scannet_preprocess/scannet_pair/preprocess.py b/datasets/scannet_preprocess/scannet_pair/preprocess.py new file mode 100644 index 0000000000000000000000000000000000000000..7c17a67936bf30ab41564625318079b92d173182 --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/preprocess.py @@ -0,0 +1,38 @@ +import os +import argparse +import glob +import multiprocessing as mp +from concurrent.futures import ProcessPoolExecutor +from itertools import repeat +from reader import reader +from point_cloud_extractor import extractor +from compute_full_overlapping import compute_full_overlapping + + +frame_skip = 25 + + +def parse_sens(sens_dir, output_dir): + scene_id = os.path.basename(os.path.dirname(sens_dir)) + print(f"Parsing sens data{sens_dir}") + reader(sens_dir, os.path.join(output_dir, scene_id), frame_skip, + export_color_images=True, export_depth_images=True, export_poses=True, export_intrinsics=True) + extractor(os.path.join(output_dir, scene_id), os.path.join(output_dir, scene_id, "pcd")) + compute_full_overlapping(output_dir, scene_id) + + +if __name__ == '__main__': + parser = 
argparse.ArgumentParser() + parser.add_argument('--dataset_root', required=True, help='Path to the ScanNet dataset containing scene folders') + parser.add_argument('--output_root', required=True, help='Output path where train/val folders will be located') + opt = parser.parse_args() + sens_list = sorted(glob.glob(os.path.join(opt.dataset_root, "scans/scene*/*.sens"))) + # Preprocess data. + pool = ProcessPoolExecutor(max_workers=mp.cpu_count()) + # pool = ProcessPoolExecutor(max_workers=1) + print('Processing scenes...') + _ = list(pool.map(parse_sens, sens_list, repeat(opt.output_root))) + + # sens_dir = "/home/gofinge/Documents/datasets/scannet/scans/scene0024_00/scene0024_00.sens" + # output_dir = "/home/gofinge/Downloads" + # parse_sens(sens_dir, output_dir) diff --git a/datasets/scannet_preprocess/scannet_pair/reader.py b/datasets/scannet_preprocess/scannet_pair/reader.py new file mode 100644 index 0000000000000000000000000000000000000000..3077ec828988957c95fa85e8399ed382273a0257 --- /dev/null +++ b/datasets/scannet_preprocess/scannet_pair/reader.py @@ -0,0 +1,27 @@ +import argparse +import os, sys + +from SensorData import SensorData + + +def reader(filename, + output_path, + frame_skip, + export_color_images=False, + export_depth_images=False, + export_poses=False, + export_intrinsics=False): + if not os.path.exists(output_path): + os.makedirs(output_path) + + # load the data + print('loading %s...' 
% filename) + sd = SensorData(filename) + if export_depth_images: + sd.export_depth_images(os.path.join(output_path, 'depth'), frame_skip=frame_skip) + if export_color_images: + sd.export_color_images(os.path.join(output_path, 'color'), frame_skip=frame_skip) + if export_poses: + sd.export_poses(os.path.join(output_path, 'pose'), frame_skip=frame_skip) + if export_intrinsics: + sd.export_intrinsics(os.path.join(output_path, 'intrinsic')) diff --git a/demo.py b/demo.py new file mode 100644 index 0000000000000000000000000000000000000000..8055add6448d1ae28a79576bb4a738a7da7afb44 --- /dev/null +++ b/demo.py @@ -0,0 +1,123 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import argparse +import glob +import multiprocessing as mp +import os +import time +import cv2 +import tqdm +import numpy as np + +from detectron2.config import get_cfg + +from detectron2.projects.deeplab import add_deeplab_config +from detectron2.data.detection_utils import read_image +from detectron2.utils.logger import setup_logger +from open_vocab_seg import add_ovseg_config + +from open_vocab_seg.utils import VisualizationDemo + +# constants +WINDOW_NAME = "Open vocabulary segmentation" + + +def setup_cfg(args): + # load config from file and command-line arguments + cfg = get_cfg() + # for poly lr schedule + add_deeplab_config(cfg) + add_ovseg_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + return cfg + + +def get_parser(): + parser = argparse.ArgumentParser(description="Detectron2 demo for open vocabulary segmentation") + parser.add_argument( + "--config-file", + default="configs/ovseg_swinB_vitL_demo.yaml", + metavar="FILE", + help="path to config file", + ) + parser.add_argument( + "--input", + nargs="+", + help="A list of space separated input images; " + "or a single glob pattern such as 'directory/*.jpg'", + ) + parser.add_argument( + "--class-names", + nargs="+", + help="A 
list of user-defined class_names" + ) + parser.add_argument( + "--output", + help="A file or directory to save output visualizations. " + "If not given, will show output in an OpenCV window.", + ) + parser.add_argument( + "--opts", + help="Modify config options using the command-line 'KEY VALUE' pairs", + default=[], + nargs=argparse.REMAINDER, + ) + return parser + + +if __name__ == "__main__": + mp.set_start_method("spawn", force=True) + args = get_parser().parse_args() + setup_logger(name="fvcore") + logger = setup_logger() + logger.info("Arguments: " + str(args)) + + cfg = setup_cfg(args) + + demo = VisualizationDemo(cfg) + class_names = args.class_names + if args.input: + if len(args.input) == 1: + args.input = glob.glob(os.path.expanduser(args.input[0])) + assert args.input, "The input path(s) was not found" + for path in tqdm.tqdm(args.input, disable=not args.output): + # use PIL, to be consistent with evaluation + start_time = time.time() + predictions, visualized_output_rgb, visualized_output_depth, visualized_output_rgb_sam, visualized_output_depth_sam = demo.run_on_image_sam(path, class_names) + logger.info( + "{}: {} in {:.2f}s".format( + path, + "detected {} instances".format(len(predictions["instances"])) + if "instances" in predictions + else "finished", + time.time() - start_time, + ) + ) + + if args.output: + if os.path.isdir(args.output): + assert os.path.isdir(args.output), args.output + out_filename = os.path.join(args.output, os.path.basename(path)) + else: + assert len(args.input) == 1, "Please specify a directory with args.output" + out_filename = args.output + visualized_output_rgb.save('RGB_Semantic_SAM.png') + visualized_output_depth.save('Depth_Semantic_SAM.png') + visualized_output_rgb_sam.save('RGB_Semantic_SAM_Mask.png') + visualized_output_depth_sam.save('Depth_Semantic_SAM_Mask.png') + rgb_3d_sam = demo.get_xyzrgb('RGB_Semantic_SAM.png', path) + depth_3d_sam = demo.get_xyzrgb('Depth_Semantic_SAM.png', path) + rgb_3d_sam_mask = 
demo.get_xyzrgb('RGB_Semantic_SAM_Mask.png', path) + depth_3d_sam_mask = demo.get_xyzrgb('Depth_Semantic_SAM_Mask.png', path) + np.savez('xyzrgb.npz', rgb_3d_sam = rgb_3d_sam, depth_3d_sam = depth_3d_sam, rgb_3d_sam_mask = rgb_3d_sam_mask, depth_3d_sam_mask = depth_3d_sam_mask) + demo.render_3d_video('xyzrgb.npz', path) + else: + cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL) + cv2.imshow(WINDOW_NAME, visualized_output_rgb.get_image()[:, :, ::-1]) + if cv2.waitKey(0) == 27: + break # esc to quit + else: + raise NotImplementedError diff --git a/flagged/log.csv b/flagged/log.csv new file mode 100644 index 0000000000000000000000000000000000000000..afcc908ea4df4a8b0353ab174a38818fc1d859b5 --- /dev/null +++ b/flagged/log.csv @@ -0,0 +1,3 @@ +name,output,flag,username,timestamp +t,/mnt/lustre/jkyang/PSG4D/segment_anything_sailvos3d/ov-seg/flagged/output/tmpii192qpn.png,,,2023-04-23 12:23:23.301078 +t,/mnt/lustre/jkyang/PSG4D/segment_anything_sailvos3d/ov-seg/flagged/output/tmpqm122tsi.png,,,2023-04-23 12:26:06.661559 diff --git a/flagged/output/tmpii192qpn.png b/flagged/output/tmpii192qpn.png new file mode 100644 index 0000000000000000000000000000000000000000..fbb8f921b3dae75f2a719d74170f8f4c87b127c0 Binary files /dev/null and b/flagged/output/tmpii192qpn.png differ diff --git a/flagged/output/tmpqm122tsi.png b/flagged/output/tmpqm122tsi.png new file mode 100644 index 0000000000000000000000000000000000000000..fbb8f921b3dae75f2a719d74170f8f4c87b127c0 Binary files /dev/null and b/flagged/output/tmpqm122tsi.png differ diff --git a/open_vocab_seg/__init__.py b/open_vocab_seg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b54fce14b8a029f1355bc8b74c20884e880ee9c4 --- /dev/null +++ b/open_vocab_seg/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from . import data +from . 
import modeling +from .config import add_ovseg_config + +from .test_time_augmentation import SemanticSegmentorWithTTA +from .ovseg_model import OVSeg, OVSegDEMO diff --git a/open_vocab_seg/config.py b/open_vocab_seg/config.py new file mode 100644 index 0000000000000000000000000000000000000000..400e9a05d4995e3f3401b34a22ae687b2c9c90e0 --- /dev/null +++ b/open_vocab_seg/config.py @@ -0,0 +1,133 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from detectron2.config import CfgNode as CN + + +def add_mask_former_default_config(cfg): + # data config + # select the dataset mapper + cfg.INPUT.DATASET_MAPPER_NAME = "mask_former_semantic" + # Color augmentation + cfg.INPUT.COLOR_AUG_SSD = False + # We retry random cropping until no single category in semantic segmentation GT occupies more + # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. + cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 + # Pad image and segmentation GT in dataset mapper. 
+ cfg.INPUT.SIZE_DIVISIBILITY = -1 + + # solver config + # test batch size + cfg.SOLVER.TEST_IMS_PER_BATCH = 1 + # weight decay on embedding + cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 + # optimizer + cfg.SOLVER.OPTIMIZER = "ADAMW" + cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 + + # mask_former model config + cfg.MODEL.MASK_FORMER = CN() + + # loss + cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION = True + cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT = 0.1 + cfg.MODEL.MASK_FORMER.DICE_WEIGHT = 1.0 + cfg.MODEL.MASK_FORMER.MASK_WEIGHT = 20.0 + + # transformer config + cfg.MODEL.MASK_FORMER.NHEADS = 8 + cfg.MODEL.MASK_FORMER.DROPOUT = 0.1 + cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD = 2048 + cfg.MODEL.MASK_FORMER.ENC_LAYERS = 0 + cfg.MODEL.MASK_FORMER.DEC_LAYERS = 6 + cfg.MODEL.MASK_FORMER.PRE_NORM = False + + cfg.MODEL.MASK_FORMER.HIDDEN_DIM = 256 + cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES = 100 + + cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE = "res5" + cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ = False + + # mask_former inference config + cfg.MODEL.MASK_FORMER.TEST = CN() + cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON = False + cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD = 0.0 + cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD = 0.0 + cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False + + # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. 
ResNet) + # you can use this config to override + cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY = 32 + + # pixel decoder config + cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 + # adding transformer in pixel decoder + cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 + # pixel decoder + cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" + + # swin transformer backbone + cfg.MODEL.SWIN = CN() + cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 + cfg.MODEL.SWIN.PATCH_SIZE = 4 + cfg.MODEL.SWIN.EMBED_DIM = 96 + cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] + cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] + cfg.MODEL.SWIN.WINDOW_SIZE = 7 + cfg.MODEL.SWIN.MLP_RATIO = 4.0 + cfg.MODEL.SWIN.QKV_BIAS = True + cfg.MODEL.SWIN.QK_SCALE = None + cfg.MODEL.SWIN.NORM_INDICES = None + cfg.MODEL.SWIN.PROJECTION = False + cfg.MODEL.SWIN.PROJECT_DIM = 256 + cfg.MODEL.SWIN.DROP_RATE = 0.0 + cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 + cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 + cfg.MODEL.SWIN.APE = False + cfg.MODEL.SWIN.PATCH_NORM = True + cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] + + +def add_our_config(cfg): + cfg.TEST.SLIDING_WINDOW = False + cfg.TEST.SLIDING_TILE_SIZE = 224 + cfg.TEST.SLIDING_OVERLAP = 2 / 3.0 + # whether to use dense crf + cfg.TEST.DENSE_CRF = False + cfg.DATASETS.SAMPLE_PER_CLASS = -1 + cfg.DATASETS.SAMPLE_SEED = 0 + # embedding head + cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM = 512 + cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM = 1024 + cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS = 2 + # clip_adapter + cfg.MODEL.CLIP_ADAPTER = CN() + cfg.MODEL.CLIP_ADAPTER.TEXT_TEMPLATES = "vild" + # for predefined + cfg.MODEL.CLIP_ADAPTER.PREDEFINED_PROMPT_TEMPLATES = ["a photo of a {}."] + # for learnable prompt + cfg.MODEL.CLIP_ADAPTER.PROMPT_CHECKPOINT = "" + cfg.MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME = "ViT-B/16" + cfg.MODEL.CLIP_ADAPTER.MASK_FILL = "mean" + cfg.MODEL.CLIP_ADAPTER.MASK_EXPAND_RATIO = 1.0 + cfg.MODEL.CLIP_ADAPTER.MASK_THR = 0.4 + cfg.MODEL.CLIP_ADAPTER.MASK_MATTING = False + 
cfg.MODEL.CLIP_ADAPTER.REGION_RESIZED = True + cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE = True + cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT = 0.7 + # for mask prompt + cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_DEPTH = 3 + cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD = False + + # wandb + cfg.WANDB = CN() + cfg.WANDB.PROJECT = "open_vocab_seg" + cfg.WANDB.NAME = None + + +def add_ovseg_config(cfg): + """ + Add config for open_vocab_seg. + """ + add_mask_former_default_config(cfg) + add_our_config(cfg) diff --git a/open_vocab_seg/data/__init__.py b/open_vocab_seg/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..970e2c8ce7f90afab089bf84e249af5ee7124951 --- /dev/null +++ b/open_vocab_seg/data/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from .dataset_mappers import * +from . import datasets +from .build import ( + build_detection_train_loader, + build_detection_test_loader, +) diff --git a/open_vocab_seg/data/augmentations.py b/open_vocab_seg/data/augmentations.py new file mode 100644 index 0000000000000000000000000000000000000000..44e4906d4827812fa707f50e703f253a64ab6e43 --- /dev/null +++ b/open_vocab_seg/data/augmentations.py @@ -0,0 +1,202 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import math +import numbers +import numpy as np +from detectron2.data.transforms.augmentation import Augmentation +from detectron2.data.transforms.transform import ( + CropTransform, + ResizeTransform, + TransformList, +) +from PIL import Image +from fvcore.transforms.transform import PadTransform + + +def mask2box(mask: np.ndarray): + # use naive way + row = np.nonzero(mask.sum(axis=0))[0] + if len(row) == 0: + return None + x1 = row.min() + x2 = row.max() + col = np.nonzero(mask.sum(axis=1))[0] + y1 = col.min() + y2 = col.max() + return x1, y1, x2 + 1 - x1, y2 + 1 - y1 + + +def expand_box(x, y, w, h, expand_ratio=1.0, max_h=None, max_w=None): + cx = x + 0.5 * w + cy = y + 0.5 * h + w = w * expand_ratio + h = h * expand_ratio + box = [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h] + if max_h is not None: + box[1] = max(0, box[1]) + box[3] = min(max_h - 1, box[3]) + if max_w is not None: + box[0] = max(0, box[0]) + box[2] = min(max_w - 1, box[2]) + box[2] = box[2] - box[0] + box[3] = box[3] - box[1] + + return [int(b) for b in box] + + +class CropImageWithMask(Augmentation): + def __init__(self, expand_ratio=1.0, mode="choice"): + if isinstance(expand_ratio, numbers.Number): + expand_ratio = (expand_ratio, expand_ratio) + self.mode = mode + self.expand_ratio = expand_ratio + if self.mode == "range": + assert len(expand_ratio) == 2 and expand_ratio[0] < expand_ratio[1] + + def get_transform(self, image, sem_seg, category_id): + input_size = image.shape[:2] + bin_mask = sem_seg == category_id + x, y, w, h = mask2box(bin_mask) + if self.mode == "choice": + expand_ratio = np.random.choice(self.expand_ratio) + else: + expand_ratio = np.random.uniform(self.expand_ratio[0], self.expand_ratio[1]) + x, y, w, h = expand_box(x, y, w, h, expand_ratio, *input_size) + w = max(w, 1) + h = max(h, 1) + return CropTransform(x, y, w, h, input_size[1], input_size[0]) + + +class CropImageWithBox(Augmentation): + def __init__(self, expand_ratio=1.0, 
mode="choice"): + if isinstance(expand_ratio, numbers.Number): + expand_ratio = (expand_ratio, expand_ratio) + self.mode = mode + self.expand_ratio = expand_ratio + if self.mode == "range": + assert len(expand_ratio) == 2 and expand_ratio[0] < expand_ratio[1] + + def get_transform(self, image, boxes): + input_size = image.shape[:2] + x, y, x2, y2 = boxes[0] + w = x2 - x + 1 + h = y2 - y + 1 + if self.mode == "choice": + expand_ratio = np.random.choice(self.expand_ratio) + else: + expand_ratio = np.random.uniform(self.expand_ratio[0], self.expand_ratio[1]) + x, y, w, h = expand_box(x, y, w, h, expand_ratio, *input_size) + w = max(w, 1) + h = max(h, 1) + return CropTransform(x, y, w, h, input_size[1], input_size[0]) + + +class RandomResizedCrop(Augmentation): + def __init__( + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4.0, 4.0 / 3.0), + interpolation=Image.BILINEAR, + ): + if isinstance(size, int): + size = (size, size) + else: + assert isinstance(size, (tuple, list)) and len(size) == 2 + + self.size = size + + self.scale = scale + self.ratio = ratio + self.interpolation = interpolation + + def get_transform(self, image): + height, width = image.shape[:2] + area = height * width + + log_ratio = np.log(np.array(self.ratio)) + is_success = False + for _ in range(10): + target_area = area * np.random.uniform(self.scale[0], self.scale[1]) + aspect_ratio = np.exp(np.random.uniform(log_ratio[0], log_ratio[1])) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if 0 < w <= width and 0 < h <= height: + i = np.random.randint(0, width - w + 1) + j = np.random.randint(0, height - h + 1) + + is_success = True + break + + if not is_success: + # Fallback to central crop + in_ratio = float(width) / float(height) + if in_ratio < min(self.ratio): + w = width + h = int(round(w / min(self.ratio))) + elif in_ratio > max(self.ratio): + h = height + w = int(round(h * max(self.ratio))) + else: # whole image + w = 
width + h = height + i = (width - w) // 2 + j = (height - h) // 2 + return TransformList( + [ + CropTransform(i, j, w, h, width, height), + ResizeTransform( + h, w, self.size[1], self.size[0], interp=self.interpolation + ), + ] + ) + + +class CenterCrop(Augmentation): + def __init__(self, size, seg_ignore_label): + if isinstance(size, numbers.Number): + size = (int(size), int(size)) + elif isinstance(size, (tuple, list)) and len(size) == 1: + size = (size[0], size[0]) + self.size = size + self.seg_ignore_label = seg_ignore_label + + def get_transform(self, image): + + image_height, image_width = image.shape[:2] + crop_height, crop_width = self.size + + transforms = [] + if crop_width > image_width or crop_height > image_height: + padding_ltrb = [ + (crop_width - image_width) // 2 if crop_width > image_width else 0, + (crop_height - image_height) // 2 if crop_height > image_height else 0, + (crop_width - image_width + 1) // 2 if crop_width > image_width else 0, + (crop_height - image_height + 1) // 2 + if crop_height > image_height + else 0, + ] + transforms.append( + PadTransform( + *padding_ltrb, + orig_w=image_width, + orig_h=image_height, + seg_pad_value=self.seg_ignore_label + ) + ) + image_width, image_height = ( + image_width + padding_ltrb[0] + padding_ltrb[2], + image_height + padding_ltrb[1] + padding_ltrb[3], + ) + + crop_top = int(round((image_height - crop_height) / 2.0)) + crop_left = int(round((image_width - crop_width) / 2.0)) + transforms.append( + CropTransform( + crop_left, crop_top, crop_width, crop_height, image_width, image_height + ) + ) + return TransformList(transforms) diff --git a/open_vocab_seg/data/build.py b/open_vocab_seg/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..bcd3b9dcebb86c319b91a632c25bcf7827292c3f --- /dev/null +++ b/open_vocab_seg/data/build.py @@ -0,0 +1,344 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import itertools +import logging +import numpy as np +from collections import Counter +import torch.utils.data +from tabulate import tabulate +from termcolor import colored + +from detectron2.utils.logger import _log_api_usage, log_first_n +from detectron2.data.catalog import DatasetCatalog, MetadataCatalog +import torch.utils.data +from detectron2.config import configurable +from detectron2.data.build import ( + build_batch_data_loader, + trivial_batch_collator, + load_proposals_into_dataset, + filter_images_with_only_crowd_annotations, + filter_images_with_few_keypoints, + print_instances_class_histogram, +) + +from detectron2.data.common import DatasetFromList, MapDataset +from detectron2.data.dataset_mapper import DatasetMapper +from detectron2.data.detection_utils import check_metadata_consistency +from detectron2.data.samplers import ( + InferenceSampler, + RandomSubsetTrainingSampler, + RepeatFactorTrainingSampler, + TrainingSampler, +) + +""" +This file contains the default logic to build a dataloader for training or testing. +""" + +__all__ = [ + "build_detection_train_loader", + "build_detection_test_loader", +] + + +def print_classification_instances_class_histogram(dataset_dicts, class_names): + """ + Args: + dataset_dicts (list[dict]): list of dataset dicts. + class_names (list[str]): list of class names (zero-indexed). + """ + num_classes = len(class_names) + hist_bins = np.arange(num_classes + 1) + histogram = np.zeros((num_classes,), dtype=np.int) + for entry in dataset_dicts: + classes = np.asarray([entry["category_id"]], dtype=np.int) + if len(classes): + assert classes.min() >= 0, f"Got an invalid category_id={classes.min()}" + assert ( + classes.max() < num_classes + ), f"Got an invalid category_id={classes.max()} for a dataset of {num_classes} classes" + histogram += np.histogram(classes, bins=hist_bins)[0] + + N_COLS = min(6, len(class_names) * 2) + + def short_name(x): + # make long class names shorter. 
useful for lvis + if len(x) > 13: + return x[:11] + ".." + return x + + data = list( + itertools.chain( + *[[short_name(class_names[i]), int(v)] for i, v in enumerate(histogram)] + ) + ) + total_num_instances = sum(data[1::2]) + data.extend([None] * (N_COLS - (len(data) % N_COLS))) + if num_classes > 1: + data.extend(["total", total_num_instances]) + data = itertools.zip_longest(*[data[i::N_COLS] for i in range(N_COLS)]) + table = tabulate( + data, + headers=["category", "#instances"] * (N_COLS // 2), + tablefmt="pipe", + numalign="left", + stralign="center", + ) + log_first_n( + logging.INFO, + "Distribution of instances among all {} categories:\n".format(num_classes) + + colored(table, "cyan"), + key="message", + ) + + +def wrap_metas(dataset_dict, **kwargs): + def _assign_attr(data_dict: dict, **kwargs): + assert not any( + [key in data_dict for key in kwargs] + ), "Assigned attributes should not exist in the original sample." + data_dict.update(kwargs) + return data_dict + + return [_assign_attr(sample, meta=kwargs) for sample in dataset_dict] + + +def get_detection_dataset_dicts( + names, filter_empty=True, min_keypoints=0, proposal_files=None +): + """ + Load and prepare dataset dicts for instance detection/segmentation and semantic segmentation. + + Args: + names (str or list[str]): a dataset name or a list of dataset names + filter_empty (bool): whether to filter out images without instance annotations + min_keypoints (int): filter out images with fewer keypoints than + `min_keypoints`. Set to 0 to do nothing. + proposal_files (list[str]): if given, a list of object proposal files + that match each dataset in `names`. + + Returns: + list[dict]: a list of dicts following the standard dataset dict format. 
+ """ + if isinstance(names, str): + names = [names] + assert len(names), names + dataset_dicts = [ + wrap_metas(DatasetCatalog.get(dataset_name), dataset_name=dataset_name) + for dataset_name in names + ] + for dataset_name, dicts in zip(names, dataset_dicts): + assert len(dicts), "Dataset '{}' is empty!".format(dataset_name) + + if proposal_files is not None: + assert len(names) == len(proposal_files) + # load precomputed proposals from proposal files + dataset_dicts = [ + load_proposals_into_dataset(dataset_i_dicts, proposal_file) + for dataset_i_dicts, proposal_file in zip(dataset_dicts, proposal_files) + ] + + dataset_dicts = list(itertools.chain.from_iterable(dataset_dicts)) + + has_instances = "annotations" in dataset_dicts[0] + if filter_empty and has_instances: + dataset_dicts = filter_images_with_only_crowd_annotations(dataset_dicts) + if min_keypoints > 0 and has_instances: + dataset_dicts = filter_images_with_few_keypoints(dataset_dicts, min_keypoints) + + if has_instances: + try: + class_names = MetadataCatalog.get(names[0]).thing_classes + check_metadata_consistency("thing_classes", names) + print_instances_class_histogram(dataset_dicts, class_names) + except AttributeError: # class names are not available for this dataset + pass + + assert len(dataset_dicts), "No valid data found in {}.".format(",".join(names)) + return dataset_dicts + + +def _train_loader_from_config(cfg, mapper=None, *, dataset=None, sampler=None): + if dataset is None: + dataset = get_detection_dataset_dicts( + cfg.DATASETS.TRAIN, + filter_empty=cfg.DATALOADER.FILTER_EMPTY_ANNOTATIONS, + min_keypoints=cfg.MODEL.ROI_KEYPOINT_HEAD.MIN_KEYPOINTS_PER_IMAGE + if cfg.MODEL.KEYPOINT_ON + else 0, + proposal_files=cfg.DATASETS.PROPOSAL_FILES_TRAIN + if cfg.MODEL.LOAD_PROPOSALS + else None, + ) + _log_api_usage("dataset." 
+ cfg.DATASETS.TRAIN[0]) + + if mapper is None: + mapper = DatasetMapper(cfg, True) + + if sampler is None: + sampler_name = cfg.DATALOADER.SAMPLER_TRAIN + logger = logging.getLogger(__name__) + logger.info("Using training sampler {}".format(sampler_name)) + if sampler_name == "TrainingSampler": + sampler = TrainingSampler(len(dataset)) + elif sampler_name == "RepeatFactorTrainingSampler": + repeat_factors = ( + RepeatFactorTrainingSampler.repeat_factors_from_category_frequency( + dataset, cfg.DATALOADER.REPEAT_THRESHOLD + ) + ) + sampler = RepeatFactorTrainingSampler(repeat_factors) + elif sampler_name == "RandomSubsetTrainingSampler": + sampler = RandomSubsetTrainingSampler( + len(dataset), cfg.DATALOADER.RANDOM_SUBSET_RATIO + ) + else: + raise ValueError("Unknown training sampler: {}".format(sampler_name)) + + return { + "dataset": dataset, + "sampler": sampler, + "mapper": mapper, + "total_batch_size": cfg.SOLVER.IMS_PER_BATCH, + "aspect_ratio_grouping": cfg.DATALOADER.ASPECT_RATIO_GROUPING, + "num_workers": cfg.DATALOADER.NUM_WORKERS, + } + + +# TODO can allow dataset as an iterable or IterableDataset to make this function more general +@configurable(from_config=_train_loader_from_config) +def build_detection_train_loader( + dataset, + *, + mapper, + sampler=None, + total_batch_size, + aspect_ratio_grouping=True, + num_workers=0, +): + """ + Build a dataloader for object detection with some default features. + This interface is experimental. + + Args: + dataset (list or torch.utils.data.Dataset): a list of dataset dicts, + or a map-style pytorch dataset. They can be obtained by using + :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. + mapper (callable): a callable which takes a sample (dict) from dataset and + returns the format to be consumed by the model. + When using cfg, the default choice is ``DatasetMapper(cfg, is_train=True)``. 
+ sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces + indices to be applied on ``dataset``. Default to :class:`TrainingSampler`, + which coordinates an infinite random shuffle sequence across all workers. + total_batch_size (int): total batch size across all workers. Batching + simply puts data into a list. + aspect_ratio_grouping (bool): whether to group images with similar + aspect ratio for efficiency. When enabled, it requires each + element in dataset be a dict with keys "width" and "height". + num_workers (int): number of parallel data loading workers + + Returns: + torch.utils.data.DataLoader: + a dataloader. Each output from it is a ``list[mapped_element]`` of length + ``total_batch_size / num_workers``, where ``mapped_element`` is produced + by the ``mapper``. + """ + if isinstance(dataset, list): + dataset = DatasetFromList(dataset, copy=False) + if mapper is not None: + dataset = MapDataset(dataset, mapper) + if sampler is None: + sampler = TrainingSampler(len(dataset)) + assert isinstance(sampler, torch.utils.data.sampler.Sampler) + return build_batch_data_loader( + dataset, + sampler, + total_batch_size, + aspect_ratio_grouping=aspect_ratio_grouping, + num_workers=num_workers, + ) + + +def _test_loader_from_config(cfg, dataset_name, mapper=None): + """ + Uses the given `dataset_name` argument (instead of the names in cfg), because the + standard practice is to evaluate each test set individually (not combining them). 
+ """ + if isinstance(dataset_name, str): + dataset_name = [dataset_name] + + dataset = get_detection_dataset_dicts( + dataset_name, + filter_empty=False, + proposal_files=[ + cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] + for x in dataset_name + ] + if cfg.MODEL.LOAD_PROPOSALS + else None, + ) + if mapper is None: + mapper = DatasetMapper(cfg, False) + return { + "dataset": dataset, + "mapper": mapper, + "num_workers": 0, + "samples_per_gpu": cfg.SOLVER.TEST_IMS_PER_BATCH, + } + + +@configurable(from_config=_test_loader_from_config) +def build_detection_test_loader( + dataset, *, mapper, sampler=None, num_workers=0, samples_per_gpu=1 +): + """ + Similar to `build_detection_train_loader`, but uses a batch size of 1, + and :class:`InferenceSampler`. This sampler coordinates all workers to + produce the exact set of all samples. + This interface is experimental. + + Args: + dataset (list or torch.utils.data.Dataset): a list of dataset dicts, + or a map-style pytorch dataset. They can be obtained by using + :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. + mapper (callable): a callable which takes a sample (dict) from dataset + and returns the format to be consumed by the model. + When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. + sampler (torch.utils.data.sampler.Sampler or None): a sampler that produces + indices to be applied on ``dataset``. Default to :class:`InferenceSampler`, + which splits the dataset across all workers. + num_workers (int): number of parallel data loading workers + + Returns: + DataLoader: a torch DataLoader, that loads the given detection + dataset, with test-time transformation and batching. 
+ + Examples: + :: + data_loader = build_detection_test_loader( + DatasetRegistry.get("my_test"), + mapper=DatasetMapper(...)) + + # or, instantiate with a CfgNode: + data_loader = build_detection_test_loader(cfg, "my_test") + """ + if isinstance(dataset, list): + dataset = DatasetFromList(dataset, copy=False) + if mapper is not None: + dataset = MapDataset(dataset, mapper) + if sampler is None: + sampler = InferenceSampler(len(dataset)) + # Always use 1 image per worker during inference since this is the + # standard when reporting inference time in papers. + batch_sampler = torch.utils.data.sampler.BatchSampler( + sampler, samples_per_gpu, drop_last=False + ) + data_loader = torch.utils.data.DataLoader( + dataset, + num_workers=num_workers, + batch_sampler=batch_sampler, + collate_fn=trivial_batch_collator, + ) + return data_loader + diff --git a/open_vocab_seg/data/dataset_mappers/__init__.py b/open_vocab_seg/data/dataset_mappers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f63cd5c034fcb60af8c78431205ae9b410f33250 --- /dev/null +++ b/open_vocab_seg/data/dataset_mappers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from .mask_former_semantic_dataset_mapper import MaskFormerSemanticDatasetMapper diff --git a/open_vocab_seg/data/dataset_mappers/mask_former_semantic_dataset_mapper.py b/open_vocab_seg/data/dataset_mappers/mask_former_semantic_dataset_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..2836579942cf91c726cb34cbbd2d137c975bee37 --- /dev/null +++ b/open_vocab_seg/data/dataset_mappers/mask_former_semantic_dataset_mapper.py @@ -0,0 +1,208 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import copy +import logging + +import numpy as np +import torch +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.data import MetadataCatalog +from detectron2.data import detection_utils as utils +from detectron2.data import transforms as T +from detectron2.projects.point_rend import ColorAugSSDTransform +from detectron2.structures import BitMasks, Instances + +__all__ = ["MaskFormerSemanticDatasetMapper"] + + +class MaskFormerSemanticDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by MaskFormer for semantic segmentation. + + The callable currently does the following: + + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + + @configurable + def __init__( + self, + is_train=True, + *, + augmentations, + image_format, + ignore_label, + size_divisibility, + ): + """ + NOTE: this interface is experimental. + Args: + is_train: for training or inference + augmentations: a list of augmentations or deterministic transforms to apply + image_format: an image format supported by :func:`detection_utils.read_image`. 
+ ignore_label: the label that is ignored to evaluation + size_divisibility: pad image size to be divisible by this value + """ + self.is_train = is_train + self.tfm_gens = augmentations + self.img_format = image_format + self.ignore_label = ignore_label + self.size_divisibility = size_divisibility + + logger = logging.getLogger(__name__) + mode = "training" if is_train else "inference" + logger.info( + f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}" + ) + + @classmethod + def from_config(cls, cfg, is_train=True): + # Build augmentation + if is_train: + augs = [ + T.ResizeShortestEdge( + cfg.INPUT.MIN_SIZE_TRAIN, + cfg.INPUT.MAX_SIZE_TRAIN, + cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, + ) + ] + if cfg.INPUT.CROP.ENABLED: + augs.append( + T.RandomCrop_CategoryAreaConstraint( + cfg.INPUT.CROP.TYPE, + cfg.INPUT.CROP.SIZE, + cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, + cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + ) + ) + if cfg.INPUT.COLOR_AUG_SSD: + augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) + augs.append(T.RandomFlip()) + + # Assume always applies to the training set. + dataset_names = cfg.DATASETS.TRAIN + else: + min_size = cfg.INPUT.MIN_SIZE_TEST + max_size = cfg.INPUT.MAX_SIZE_TEST + sample_style = "choice" + augs = [T.ResizeShortestEdge(min_size, max_size, sample_style)] + dataset_names = cfg.DATASETS.TEST + meta = MetadataCatalog.get(dataset_names[0]) + ignore_label = meta.ignore_label + + ret = { + "is_train": is_train, + "augmentations": augs, + "image_format": cfg.INPUT.FORMAT, + "ignore_label": ignore_label, + "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY if is_train else -1, + } + return ret + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + + Returns: + dict: a format that builtin models in detectron2 accept + """ + # assert self.is_train, "MaskFormerSemanticDatasetMapper should only be used for training!" 
+ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + if "sem_seg_file_name" in dataset_dict: + # PyTorch transformation not implemented for uint16, so converting it to double first + sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype( + "double" + ) + else: + sem_seg_gt = None + + if sem_seg_gt is None: + raise ValueError( + "Cannot find 'sem_seg_file_name' for semantic segmentation dataset {}.".format( + dataset_dict["file_name"] + ) + ) + + aug_input = T.AugInput(image, sem_seg=sem_seg_gt) + aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) + image = aug_input.image + sem_seg_gt = aug_input.sem_seg + + # Pad image and segmentation label here! + image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + if sem_seg_gt is not None: + sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) + + if self.size_divisibility > 0: + image_size = (image.shape[-2], image.shape[-1]) + padding_size = [ + 0, + self.size_divisibility - image_size[1], + 0, + self.size_divisibility - image_size[0], + ] + image = F.pad(image, padding_size, value=128).contiguous() + if sem_seg_gt is not None: + sem_seg_gt = F.pad( + sem_seg_gt, padding_size, value=self.ignore_label + ).contiguous() + + image_shape = (image.shape[-2], image.shape[-1]) # h, w + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = image + + if sem_seg_gt is not None: + dataset_dict["sem_seg"] = sem_seg_gt.long() + + if "annotations" in dataset_dict: + raise ValueError( + "Semantic segmentation dataset should not have 'annotations'." 
+ ) + + # Prepare per-category binary masks + if sem_seg_gt is not None: + sem_seg_gt = sem_seg_gt.numpy() + instances = Instances(image_shape) + classes = np.unique(sem_seg_gt) + # remove ignored region + classes = classes[classes != self.ignore_label] + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + + masks = [] + for class_id in classes: + masks.append(sem_seg_gt == class_id) + + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros( + (0, sem_seg_gt.shape[-2], sem_seg_gt.shape[-1]) + ) + else: + masks = BitMasks( + torch.stack( + [ + torch.from_numpy(np.ascontiguousarray(x.copy())) + for x in masks + ] + ) + ) + instances.gt_masks = masks.tensor + + dataset_dict["instances"] = instances + + return dataset_dict diff --git a/open_vocab_seg/data/datasets/__init__.py b/open_vocab_seg/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..352792b6fcdbffefa229d5d67a5c7375769fa345 --- /dev/null +++ b/open_vocab_seg/data/datasets/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from . import register_coco_stuff, register_voc_seg +from . import register_cc3m +from . import register_ade20k_full +from . import register_pascal_context \ No newline at end of file diff --git a/open_vocab_seg/data/datasets/csv_data.py b/open_vocab_seg/data/datasets/csv_data.py new file mode 100644 index 0000000000000000000000000000000000000000..3a4c9e52b0b792d49c48fe8bc2693be5ea879581 --- /dev/null +++ b/open_vocab_seg/data/datasets/csv_data.py @@ -0,0 +1,459 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
import ast
import json
import logging
import math
import os
import random
import sys
import time
from dataclasses import dataclass
from multiprocessing import Value

import braceexpand
import numpy as np
import pandas as pd
import torch
import torchvision.datasets as datasets
import webdataset as wds
from PIL import Image
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler, IterableDataset, get_worker_info
from torch.utils.data.distributed import DistributedSampler
from webdataset.filters import _shuffle
from webdataset.tariterators import base_plus_ext, url_opener, tar_file_expander, valid_sample

try:
    import horovod.torch as hvd
except ImportError:
    hvd = None

from clip import tokenize


class CsvDataset(Dataset):
    """Image/caption pairs read from a CSV (or TSV) manifest.

    Each row supplies an image path (``img_key`` column) and a caption
    (``caption_key`` column). Images are opened lazily in ``__getitem__``
    and captions are CLIP-tokenized.
    """

    def __init__(self, input_filename, transforms, img_key, caption_key, sep="\t"):
        logging.debug(f'Loading csv data from {input_filename}.')
        df = pd.read_csv(input_filename, sep=sep)

        self.images = df[img_key].tolist()
        self.captions = df[caption_key].tolist()
        self.transforms = transforms
        logging.debug('Done loading data.')

    def __len__(self):
        return len(self.captions)

    def __getitem__(self, idx):
        # NOTE(review): the image is not force-converted to RGB here; the
        # supplied `transforms` presumably handles mode differences — confirm
        # against the preprocessing pipeline used by callers.
        images = self.transforms(Image.open(str(self.images[idx])))
        texts = tokenize([str(self.captions[idx])])[0]
        return images, texts


class SharedEpoch:
    """Epoch counter stored in shared memory so dataloader worker processes
    see updates made by the main training process."""

    def __init__(self, epoch: int = 0):
        self.shared_epoch = Value('i', epoch)

    def set_value(self, epoch):
        self.shared_epoch.value = epoch

    def get_value(self):
        return self.shared_epoch.value


@dataclass
class DataInfo:
    """Bundle of a dataloader plus the objects needed to advance its epoch."""
    dataloader: DataLoader
    sampler: DistributedSampler = None
    shared_epoch: SharedEpoch = None

    def set_epoch(self, epoch):
        """Propagate the new epoch to the shared counter and/or the sampler."""
        if self.shared_epoch is not None:
            self.shared_epoch.set_value(epoch)
        if self.sampler is not None and isinstance(self.sampler, DistributedSampler):
            self.sampler.set_epoch(epoch)


def preprocess_txt(text):
    """CLIP-tokenize a single caption, returning a 1-D token tensor."""
    return tokenize([str(text)])[0]


def get_dataset_size(shards):
    """Return ``(total_samples_or_None, num_shards)`` for a shard spec.

    ``shards`` is a brace-notation URL pattern. The sample count is read
    from a sibling ``sizes.json`` (per-shard counts) or ``__len__`` file;
    if neither exists it is None and the caller must supply the count
    (e.g. via ``--train-num-samples``).
    """
    shards_list = list(braceexpand.braceexpand(shards))
    dir_path = os.path.dirname(shards)
    sizes_filename = os.path.join(dir_path, 'sizes.json')
    len_filename = os.path.join(dir_path, '__len__')
    if os.path.exists(sizes_filename):
        # Context managers close the metadata files promptly (the original
        # leaked the handles from bare open() calls).
        with open(sizes_filename, 'r') as f:
            sizes = json.load(f)
        total_size = sum(int(sizes[os.path.basename(shard)]) for shard in shards_list)
    elif os.path.exists(len_filename):
        # FIXME this used to be eval(open(...)) but that seemed rather unsafe
        with open(len_filename, 'r') as f:
            total_size = ast.literal_eval(f.read())
    else:
        total_size = None  # num samples undefined
    # some common dataset sizes (at time of authors last download)
    # CC3M (train): 2905954
    # CC12M: 10968539
    # LAION-400M: 407332084
    # LAION-2B (english): 2170337258
    num_shards = len(shards_list)
    return total_size, num_shards


def get_imagenet(args, preprocess_fns, split):
    """Build an ImageNet dataloader for ``split`` in {"train", "val", "v2"}.

    For "train", a fixed 50 images per class (1000 classes) are randomly
    subsampled to keep evaluation cheap; "v2" loads ImageNetV2 instead.
    Returns a DataInfo wrapping the dataloader (and sampler, if any).
    """
    assert split in ["train", "val", "v2"]
    is_train = split == "train"
    preprocess_train, preprocess_val = preprocess_fns

    if split == "v2":
        from imagenetv2_pytorch import ImageNetV2Dataset
        dataset = ImageNetV2Dataset(location=args.imagenet_v2, transform=preprocess_val)
    else:
        if is_train:
            data_path = args.imagenet_train
            preprocess_fn = preprocess_train
        else:
            data_path = args.imagenet_val
            preprocess_fn = preprocess_val
        assert data_path

        dataset = datasets.ImageFolder(data_path, transform=preprocess_fn)

    if is_train:
        # Mark at most k=50 samples per class; SubsetRandomSampler then
        # draws only the marked indices.
        idxs = np.zeros(len(dataset.targets))
        target_array = np.array(dataset.targets)
        k = 50
        for c in range(1000):
            m = target_array == c
            n = len(idxs[m])
            arr = np.zeros(n)
            arr[:k] = 1
            np.random.shuffle(arr)
            idxs[m] = arr

        idxs = idxs.astype('int')
        sampler = SubsetRandomSampler(np.where(idxs)[0])
    else:
        sampler = None

    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        num_workers=args.workers,
        sampler=sampler,
    )

    return DataInfo(dataloader=dataloader, sampler=sampler)
def count_samples(dataloader):
    """Iterate a (image, text) dataloader once and return (n_samples, n_batches)."""
    os.environ["WDS_EPOCH"] = "0"
    n_elements, n_batches = 0, 0
    for images, texts in dataloader:
        n_batches += 1
        n_elements += len(images)
        assert len(images) == len(texts)
    return n_elements, n_batches


def filter_no_caption(sample):
    """Keep only webdataset samples that carry a 'txt' (caption) entry."""
    return 'txt' in sample


def log_and_continue(exn):
    """Call in an exception handler to ignore any exception, issue a warning, and continue."""
    logging.warning(f'Handling webdataset error ({repr(exn)}). Ignoring.')
    return True


def group_by_keys_nothrow(data, keys=base_plus_ext, lcase=True, suffixes=None, handler=None):
    """Return function over iterator that groups key, value pairs into samples.

    :param keys: function that splits the key into key and extension (base_plus_ext)
    :param lcase: convert suffixes to lower case (Default value = True)
    """
    current_sample = None
    for filesample in data:
        assert isinstance(filesample, dict)
        fname, value = filesample["fname"], filesample["data"]
        prefix, suffix = keys(fname)
        if prefix is None:
            continue
        if lcase:
            suffix = suffix.lower()
        # FIXME webdataset version throws if suffix in current_sample, but we have a potential for
        # this happening in the current LAION400m dataset if a tar ends with same prefix as the next
        # begins, rare, but can happen since prefix aren't unique across tar files in that dataset
        if current_sample is None or prefix != current_sample["__key__"] or suffix in current_sample:
            if valid_sample(current_sample):
                yield current_sample
            current_sample = dict(__key__=prefix, __url__=filesample["__url__"])
        if suffixes is None or suffix in suffixes:
            current_sample[suffix] = value
    if valid_sample(current_sample):
        yield current_sample


def tarfile_to_samples_nothrow(src, handler=log_and_continue):
    """Expand tar shards into grouped samples without raising on bad entries."""
    # NOTE this is a re-impl of the webdataset impl with group_by_keys that doesn't throw
    streams = url_opener(src, handler=handler)
    files = tar_file_expander(streams, handler=handler)
    samples = group_by_keys_nothrow(files, handler=handler)
    return samples


def pytorch_worker_seed():
    """get dataloader worker seed from pytorch"""
    worker_info = get_worker_info()
    if worker_info is not None:
        # favour the seed already created for pytorch dataloader workers if it exists
        return worker_info.seed
    # fallback to wds rank based seed
    return wds.utils.pytorch_worker_seed()


# Shuffle-buffer sizes: shards are shuffled with a small buffer, individual
# samples with a larger one.
_SHARD_SHUFFLE_SIZE = 2000
_SHARD_SHUFFLE_INITIAL = 500
_SAMPLE_SHUFFLE_SIZE = 5000
_SAMPLE_SHUFFLE_INITIAL = 1000


class detshuffle2(wds.PipelineStage):
    """Deterministic shuffle stage whose RNG seed is derived from (seed, epoch),
    so every rank/worker reshuffles identically per epoch."""

    def __init__(
        self,
        bufsize=1000,
        initial=100,
        seed=0,
        epoch=-1,
    ):
        self.bufsize = bufsize
        self.initial = initial
        self.seed = seed
        self.epoch = epoch

    def run(self, src):
        if isinstance(self.epoch, SharedEpoch):
            epoch = self.epoch.get_value()
        else:
            # NOTE: this is epoch tracking is problematic in a multiprocess (dataloader workers or train)
            # situation as different workers may wrap at different times (or not at all).
            self.epoch += 1
            epoch = self.epoch
        rng = random.Random()
        if self.seed < 0:
            # Negative seed: derive from the per-worker pytorch seed instead.
            seed = pytorch_worker_seed() + epoch
        else:
            seed = self.seed + epoch
        rng.seed(seed)
        return _shuffle(src, self.bufsize, self.initial, rng)


class ResampledShards2(IterableDataset):
    """An iterable dataset yielding a list of urls."""

    def __init__(
        self,
        urls,
        nshards=sys.maxsize,
        worker_seed=None,
        deterministic=False,
        epoch=-1,
    ):
        """Sample shards from the shard list with replacement.

        :param urls: a list of URLs as a Python list or brace notation string
        """
        super().__init__()
        urls = wds.shardlists.expand_urls(urls)
        self.urls = urls
        assert isinstance(self.urls[0], str)
        self.nshards = nshards
        self.rng = random.Random()
        self.worker_seed = pytorch_worker_seed if worker_seed is None else worker_seed
        self.deterministic = deterministic
        self.epoch = epoch

    def __iter__(self):
        """Return an iterator over the shards."""
        if isinstance(self.epoch, SharedEpoch):
            epoch = self.epoch.get_value()
        else:
            # NOTE: this is epoch tracking is problematic in a multiprocess (dataloader workers or train)
            # situation as different workers may wrap at different times (or not at all).
            self.epoch += 1
            epoch = self.epoch
        if self.deterministic:
            # reset seed w/ epoch if deterministic, worker seed should be deterministic due to arg.seed
            self.rng.seed(self.worker_seed() + epoch)
        for _ in range(self.nshards):
            yield dict(url=self.rng.choice(self.urls))


def get_wds_dataset(args, preprocess_img, is_train, epoch=0, floor=False):
    """Build a webdataset pipeline + WebLoader for tar-shard data.

    Training pipelines shuffle shards and samples (or resample shards with
    replacement when ``args.dataset_resampled``); eval pipelines just split
    shards across workers. Batch counts are rounded so each worker/node sees
    the same number of full batches. Returns a DataInfo with a SharedEpoch
    used to re-seed shuffling each epoch.
    """
    input_shards = args.train_data if is_train else args.val_data
    assert input_shards is not None
    resampled = getattr(args, 'dataset_resampled', False) and is_train

    num_samples, num_shards = get_dataset_size(input_shards)
    if not num_samples:
        if is_train:
            num_samples = args.train_num_samples
            if not num_samples:
                raise RuntimeError(
                    'Currently, number of dataset samples must be specified for training dataset. '
                    'Please specify via `--train-num-samples` if no dataset length info present.')
        else:
            num_samples = args.val_num_samples or 0  # eval will just exhaust the iterator if not specified

    shared_epoch = SharedEpoch(epoch=epoch)  # create a shared epoch store to sync epoch to dataloader worker proc
    if resampled:
        pipeline = [ResampledShards2(input_shards, deterministic=True, epoch=shared_epoch)]
    else:
        pipeline = [wds.SimpleShardList(input_shards)]

    # at this point we have an iterator over all the shards
    if is_train:
        if not resampled:
            pipeline.extend([
                detshuffle2(
                    bufsize=_SHARD_SHUFFLE_SIZE,
                    initial=_SHARD_SHUFFLE_INITIAL,
                    seed=args.seed,
                    epoch=shared_epoch,
                ),
                wds.split_by_node,
                wds.split_by_worker,
            ])
        pipeline.extend([
            # at this point, we have an iterator over the shards assigned to each worker at each node
            tarfile_to_samples_nothrow,  # wds.tarfile_to_samples(handler=log_and_continue),
            wds.shuffle(
                bufsize=_SAMPLE_SHUFFLE_SIZE,
                initial=_SAMPLE_SHUFFLE_INITIAL,
            ),
        ])
    else:
        pipeline.extend([
            wds.split_by_worker,
            # at this point, we have an iterator over the shards assigned to each worker
            wds.tarfile_to_samples(handler=log_and_continue),
        ])
    pipeline.extend([
        wds.select(filter_no_caption),
        wds.decode("pilrgb", handler=log_and_continue),
        wds.rename(image="jpg;png", text="txt"),
        wds.map_dict(image=preprocess_img, text=preprocess_txt),
        wds.to_tuple("image", "text"),
        wds.batched(args.batch_size, partial=not is_train),
    ])

    dataset = wds.DataPipeline(*pipeline)
    if is_train:
        if not resampled:
            assert num_shards >= args.workers * args.world_size, 'number of shards must be >= total workers'
        # roll over and repeat a few samples to get same number of full batches on each node
        round_fn = math.floor if floor else math.ceil
        global_batch_size = args.batch_size * args.world_size
        num_batches = round_fn(num_samples / global_batch_size)
        num_workers = max(1, args.workers)
        num_worker_batches = round_fn(num_batches / num_workers)  # per dataloader worker
        num_batches = num_worker_batches * num_workers
        num_samples = num_batches * global_batch_size
        dataset = dataset.with_epoch(num_worker_batches)  # each worker is iterating over this
    else:
        # last batches are partial, eval is done on single (master) node
        num_batches = math.ceil(num_samples / args.batch_size)

    dataloader = wds.WebLoader(
        dataset,
        batch_size=None,
        shuffle=False,
        num_workers=args.workers,
        persistent_workers=True,
    )

    # FIXME not clear which approach is better, with_epoch before vs after dataloader?
    # hoping to resolve via https://github.com/webdataset/webdataset/issues/169
    # if is_train:
    #     # roll over and repeat a few samples to get same number of full batches on each node
    #     global_batch_size = args.batch_size * args.world_size
    #     num_batches = math.ceil(num_samples / global_batch_size)
    #     num_workers = max(1, args.workers)
    #     num_batches = math.ceil(num_batches / num_workers) * num_workers
    #     num_samples = num_batches * global_batch_size
    #     dataloader = dataloader.with_epoch(num_batches)
    # else:
    #     # last batches are partial, eval is done on single (master) node
    #     num_batches = math.ceil(num_samples / args.batch_size)

    # add meta-data to dataloader instance for convenience
    dataloader.num_batches = num_batches
    dataloader.num_samples = num_samples

    return DataInfo(dataloader=dataloader, shared_epoch=shared_epoch)


def get_csv_dataset(args, preprocess_fn, is_train, epoch=0):
    """Build a DataLoader over a CsvDataset; distributed-sharded when training.

    ``epoch`` is accepted for signature parity with get_wds_dataset but unused.
    """
    input_filename = args.train_data if is_train else args.val_data
    assert input_filename
    dataset = CsvDataset(
        input_filename,
        preprocess_fn,
        img_key=args.csv_img_key,
        caption_key=args.csv_caption_key,
        sep=args.csv_separator)
    num_samples = len(dataset)
    sampler = DistributedSampler(dataset) if args.distributed and is_train else None
    shuffle = is_train and sampler is None

    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=shuffle,
        num_workers=args.workers,
        pin_memory=True,
        sampler=sampler,
        drop_last=is_train,
    )
    dataloader.num_samples = num_samples
    dataloader.num_batches = len(dataloader)

    return DataInfo(dataloader, sampler)


def get_dataset_fn(data_path, dataset_type):
    """Map a dataset type ("webdataset" | "csv" | "auto") to its builder.

    "auto" infers the type from the file extension of ``data_path``.
    Raises ValueError for unknown types or un-inferable extensions.
    """
    if dataset_type == "webdataset":
        return get_wds_dataset
    elif dataset_type == "csv":
        return get_csv_dataset
    elif dataset_type == "auto":
        ext = data_path.split('.')[-1]
        if ext in ['csv', 'tsv']:
            return get_csv_dataset
        elif ext in ['tar']:
            return get_wds_dataset
        else:
            raise ValueError(
                f"Tried to figure out dataset type, but failed for extension {ext}.")
    else:
        raise ValueError(f"Unsupported dataset type: {dataset_type}")


def get_data(args, preprocess_fns, epoch=0):
    """Assemble the dict of DataInfo objects requested by ``args``.

    Keys: "train", "val", "imagenet-val", "imagenet-v2" — each present only
    when the corresponding args field is set.
    """
    preprocess_train, preprocess_val = preprocess_fns
    data = {}

    if args.train_data:
        data["train"] = get_dataset_fn(args.train_data, args.dataset_type)(
            args, preprocess_train, is_train=True, epoch=epoch)

    if args.val_data:
        data["val"] = get_dataset_fn(args.val_data, args.dataset_type)(
            args, preprocess_val, is_train=False)

    if args.imagenet_val is not None:
        data["imagenet-val"] = get_imagenet(args, preprocess_fns, "val")

    if args.imagenet_v2 is not None:
        data["imagenet-v2"] = get_imagenet(args, preprocess_fns, "v2")

    return data


# ---- next file in the original dump: open_vocab_seg/data/datasets/register_ade20k_full.py ----
# Copyright (c) Facebook, Inc. and its affiliates.
+import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + +ADE20K_SEM_SEG_FULL_CATEGORIES = [ + {"name": "wall", "id": 2978, "trainId": 0}, + {"name": "building, edifice", "id": 312, "trainId": 1}, + {"name": "sky", "id": 2420, "trainId": 2}, + {"name": "tree", "id": 2855, "trainId": 3}, + {"name": "road, route", "id": 2131, "trainId": 4}, + {"name": "floor, flooring", "id": 976, "trainId": 5}, + {"name": "ceiling", "id": 447, "trainId": 6}, + {"name": "bed", "id": 165, "trainId": 7}, + {"name": "sidewalk, pavement", "id": 2377, "trainId": 8}, + {"name": "earth, ground", "id": 838, "trainId": 9}, + {"name": "cabinet", "id": 350, "trainId": 10}, + { + "name": "person, individual, someone, somebody, mortal, soul", + "id": 1831, + "trainId": 11, + }, + {"name": "grass", "id": 1125, "trainId": 12}, + {"name": "windowpane, window", "id": 3055, "trainId": 13}, + {"name": "car, auto, automobile, machine, motorcar", "id": 401, "trainId": 14}, + {"name": "mountain, mount", "id": 1610, "trainId": 15}, + {"name": "plant, flora, plant life", "id": 1910, "trainId": 16}, + {"name": "table", "id": 2684, "trainId": 17}, + {"name": "chair", "id": 471, "trainId": 18}, + {"name": "curtain, drape, drapery, mantle, pall", "id": 687, "trainId": 19}, + {"name": "door", "id": 774, "trainId": 20}, + {"name": "sofa, couch, lounge", "id": 2473, "trainId": 21}, + {"name": "sea", "id": 2264, "trainId": 22}, + {"name": "painting, picture", "id": 1735, "trainId": 23}, + {"name": "water", "id": 2994, "trainId": 24}, + {"name": "mirror", "id": 1564, "trainId": 25}, + {"name": "house", "id": 1276, "trainId": 26}, + {"name": "rug, carpet, carpeting", "id": 2178, "trainId": 27}, + {"name": "shelf", "id": 2329, "trainId": 28}, + {"name": "armchair", "id": 57, "trainId": 29}, + {"name": "fence, fencing", "id": 907, "trainId": 30}, + {"name": "field", "id": 913, "trainId": 31}, + {"name": "lamp", "id": 1395, "trainId": 32}, + {"name": 
"rock, stone", "id": 2138, "trainId": 33}, + {"name": "seat", "id": 2272, "trainId": 34}, + {"name": "river", "id": 2128, "trainId": 35}, + {"name": "desk", "id": 724, "trainId": 36}, + {"name": "bathtub, bathing tub, bath, tub", "id": 155, "trainId": 37}, + {"name": "railing, rail", "id": 2053, "trainId": 38}, + {"name": "signboard, sign", "id": 2380, "trainId": 39}, + {"name": "cushion", "id": 689, "trainId": 40}, + {"name": "path", "id": 1788, "trainId": 41}, + {"name": "work surface", "id": 3087, "trainId": 42}, + {"name": "stairs, steps", "id": 2530, "trainId": 43}, + {"name": "column, pillar", "id": 581, "trainId": 44}, + {"name": "sink", "id": 2388, "trainId": 45}, + {"name": "wardrobe, closet, press", "id": 2985, "trainId": 46}, + {"name": "snow", "id": 2454, "trainId": 47}, + {"name": "refrigerator, icebox", "id": 2096, "trainId": 48}, + {"name": "base, pedestal, stand", "id": 137, "trainId": 49}, + {"name": "bridge, span", "id": 294, "trainId": 50}, + {"name": "blind, screen", "id": 212, "trainId": 51}, + {"name": "runway", "id": 2185, "trainId": 52}, + {"name": "cliff, drop, drop-off", "id": 524, "trainId": 53}, + {"name": "sand", "id": 2212, "trainId": 54}, + {"name": "fireplace, hearth, open fireplace", "id": 943, "trainId": 55}, + {"name": "pillow", "id": 1869, "trainId": 56}, + {"name": "screen door, screen", "id": 2251, "trainId": 57}, + { + "name": "toilet, can, commode, crapper, pot, potty, stool, throne", + "id": 2793, + "trainId": 58, + }, + {"name": "skyscraper", "id": 2423, "trainId": 59}, + {"name": "grandstand, covered stand", "id": 1121, "trainId": 60}, + {"name": "box", "id": 266, "trainId": 61}, + {"name": "pool table, billiard table, snooker table", "id": 1948, "trainId": 62}, + {"name": "palm, palm tree", "id": 1744, "trainId": 63}, + {"name": "double door", "id": 783, "trainId": 64}, + {"name": "coffee table, cocktail table", "id": 571, "trainId": 65}, + {"name": "counter", "id": 627, "trainId": 66}, + {"name": "countertop", "id": 629, 
"trainId": 67}, + {"name": "chest of drawers, chest, bureau, dresser", "id": 491, "trainId": 68}, + {"name": "kitchen island", "id": 1374, "trainId": 69}, + {"name": "boat", "id": 223, "trainId": 70}, + {"name": "waterfall, falls", "id": 3016, "trainId": 71}, + { + "name": "stove, kitchen stove, range, kitchen range, cooking stove", + "id": 2598, + "trainId": 72, + }, + {"name": "flower", "id": 978, "trainId": 73}, + {"name": "bookcase", "id": 239, "trainId": 74}, + {"name": "controls", "id": 608, "trainId": 75}, + {"name": "book", "id": 236, "trainId": 76}, + {"name": "stairway, staircase", "id": 2531, "trainId": 77}, + {"name": "streetlight, street lamp", "id": 2616, "trainId": 78}, + { + "name": "computer, computing machine, computing device, data processor, electronic computer, information processing system", + "id": 591, + "trainId": 79, + }, + { + "name": "bus, autobus, coach, charabanc, double-decker, jitney, motorbus, motorcoach, omnibus, passenger vehicle", + "id": 327, + "trainId": 80, + }, + {"name": "swivel chair", "id": 2679, "trainId": 81}, + {"name": "light, light source", "id": 1451, "trainId": 82}, + {"name": "bench", "id": 181, "trainId": 83}, + {"name": "case, display case, showcase, vitrine", "id": 420, "trainId": 84}, + {"name": "towel", "id": 2821, "trainId": 85}, + {"name": "fountain", "id": 1023, "trainId": 86}, + {"name": "embankment", "id": 855, "trainId": 87}, + { + "name": "television receiver, television, television set, tv, tv set, idiot box, boob tube, telly, goggle box", + "id": 2733, + "trainId": 88, + }, + {"name": "van", "id": 2928, "trainId": 89}, + {"name": "hill", "id": 1240, "trainId": 90}, + {"name": "awning, sunshade, sunblind", "id": 77, "trainId": 91}, + {"name": "poster, posting, placard, notice, bill, card", "id": 1969, "trainId": 92}, + {"name": "truck, motortruck", "id": 2880, "trainId": 93}, + {"name": "airplane, aeroplane, plane", "id": 14, "trainId": 94}, + {"name": "pole", "id": 1936, "trainId": 95}, + {"name": 
"tower", "id": 2828, "trainId": 96}, + {"name": "court", "id": 631, "trainId": 97}, + {"name": "ball", "id": 103, "trainId": 98}, + { + "name": "aircraft carrier, carrier, flattop, attack aircraft carrier", + "id": 3144, + "trainId": 99, + }, + {"name": "buffet, counter, sideboard", "id": 308, "trainId": 100}, + {"name": "hovel, hut, hutch, shack, shanty", "id": 1282, "trainId": 101}, + {"name": "apparel, wearing apparel, dress, clothes", "id": 38, "trainId": 102}, + {"name": "minibike, motorbike", "id": 1563, "trainId": 103}, + { + "name": "animal, animate being, beast, brute, creature, fauna", + "id": 29, + "trainId": 104, + }, + {"name": "chandelier, pendant, pendent", "id": 480, "trainId": 105}, + {"name": "step, stair", "id": 2569, "trainId": 106}, + {"name": "booth, cubicle, stall, kiosk", "id": 247, "trainId": 107}, + {"name": "bicycle, bike, wheel, cycle", "id": 187, "trainId": 108}, + {"name": "doorframe, doorcase", "id": 778, "trainId": 109}, + {"name": "sconce", "id": 2243, "trainId": 110}, + {"name": "pond", "id": 1941, "trainId": 111}, + {"name": "trade name, brand name, brand, marque", "id": 2833, "trainId": 112}, + { + "name": "bannister, banister, balustrade, balusters, handrail", + "id": 120, + "trainId": 113, + }, + {"name": "bag", "id": 95, "trainId": 114}, + {"name": "traffic light, traffic signal, stoplight", "id": 2836, "trainId": 115}, + {"name": "gazebo", "id": 1087, "trainId": 116}, + {"name": "escalator, moving staircase, moving stairway", "id": 868, "trainId": 117}, + {"name": "land, ground, soil", "id": 1401, "trainId": 118}, + {"name": "board, plank", "id": 220, "trainId": 119}, + {"name": "arcade machine", "id": 47, "trainId": 120}, + {"name": "eiderdown, duvet, continental quilt", "id": 843, "trainId": 121}, + {"name": "bar", "id": 123, "trainId": 122}, + {"name": "stall, stand, sales booth", "id": 2537, "trainId": 123}, + {"name": "playground", "id": 1927, "trainId": 124}, + {"name": "ship", "id": 2337, "trainId": 125}, + {"name": 
"ottoman, pouf, pouffe, puff, hassock", "id": 1702, "trainId": 126}, + { + "name": "ashcan, trash can, garbage can, wastebin, ash bin, ash-bin, ashbin, dustbin, trash barrel, trash bin", + "id": 64, + "trainId": 127, + }, + {"name": "bottle", "id": 249, "trainId": 128}, + {"name": "cradle", "id": 642, "trainId": 129}, + {"name": "pot, flowerpot", "id": 1981, "trainId": 130}, + { + "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", + "id": 609, + "trainId": 131, + }, + {"name": "train, railroad train", "id": 2840, "trainId": 132}, + {"name": "stool", "id": 2586, "trainId": 133}, + {"name": "lake", "id": 1393, "trainId": 134}, + {"name": "tank, storage tank", "id": 2704, "trainId": 135}, + {"name": "ice, water ice", "id": 1304, "trainId": 136}, + {"name": "basket, handbasket", "id": 146, "trainId": 137}, + {"name": "manhole", "id": 1494, "trainId": 138}, + {"name": "tent, collapsible shelter", "id": 2739, "trainId": 139}, + {"name": "canopy", "id": 389, "trainId": 140}, + {"name": "microwave, microwave oven", "id": 1551, "trainId": 141}, + {"name": "barrel, cask", "id": 131, "trainId": 142}, + {"name": "dirt track", "id": 738, "trainId": 143}, + {"name": "beam", "id": 161, "trainId": 144}, + {"name": "dishwasher, dish washer, dishwashing machine", "id": 747, "trainId": 145}, + {"name": "plate", "id": 1919, "trainId": 146}, + {"name": "screen, crt screen", "id": 3109, "trainId": 147}, + {"name": "ruins", "id": 2179, "trainId": 148}, + {"name": "washer, automatic washer, washing machine", "id": 2989, "trainId": 149}, + {"name": "blanket, cover", "id": 206, "trainId": 150}, + {"name": "plaything, toy", "id": 1930, "trainId": 151}, + {"name": "food, solid food", "id": 1002, "trainId": 152}, + {"name": "screen, silver screen, projection screen", "id": 2254, "trainId": 153}, + {"name": "oven", "id": 1708, "trainId": 154}, + {"name": "stage", "id": 2526, "trainId": 155}, + {"name": "beacon, lighthouse, beacon light, pharos", "id": 160, "trainId": 156}, 
+ {"name": "umbrella", "id": 2901, "trainId": 157}, + {"name": "sculpture", "id": 2262, "trainId": 158}, + {"name": "aqueduct", "id": 44, "trainId": 159}, + {"name": "container", "id": 597, "trainId": 160}, + {"name": "scaffolding, staging", "id": 2235, "trainId": 161}, + {"name": "hood, exhaust hood", "id": 1260, "trainId": 162}, + {"name": "curb, curbing, kerb", "id": 682, "trainId": 163}, + {"name": "roller coaster", "id": 2151, "trainId": 164}, + {"name": "horse, equus caballus", "id": 3107, "trainId": 165}, + {"name": "catwalk", "id": 432, "trainId": 166}, + {"name": "glass, drinking glass", "id": 1098, "trainId": 167}, + {"name": "vase", "id": 2932, "trainId": 168}, + {"name": "central reservation", "id": 461, "trainId": 169}, + {"name": "carousel", "id": 410, "trainId": 170}, + {"name": "radiator", "id": 2046, "trainId": 171}, + {"name": "closet", "id": 533, "trainId": 172}, + {"name": "machine", "id": 1481, "trainId": 173}, + {"name": "pier, wharf, wharfage, dock", "id": 1858, "trainId": 174}, + {"name": "fan", "id": 894, "trainId": 175}, + {"name": "inflatable bounce game", "id": 1322, "trainId": 176}, + {"name": "pitch", "id": 1891, "trainId": 177}, + {"name": "paper", "id": 1756, "trainId": 178}, + {"name": "arcade, colonnade", "id": 49, "trainId": 179}, + {"name": "hot tub", "id": 1272, "trainId": 180}, + {"name": "helicopter", "id": 1229, "trainId": 181}, + {"name": "tray", "id": 2850, "trainId": 182}, + {"name": "partition, divider", "id": 1784, "trainId": 183}, + {"name": "vineyard", "id": 2962, "trainId": 184}, + {"name": "bowl", "id": 259, "trainId": 185}, + {"name": "bullring", "id": 319, "trainId": 186}, + {"name": "flag", "id": 954, "trainId": 187}, + {"name": "pot", "id": 1974, "trainId": 188}, + {"name": "footbridge, overcrossing, pedestrian bridge", "id": 1013, "trainId": 189}, + {"name": "shower", "id": 2356, "trainId": 190}, + { + "name": "bag, traveling bag, travelling bag, grip, suitcase", + "id": 97, + "trainId": 191, + }, + {"name": 
"bulletin board, notice board", "id": 318, "trainId": 192}, + {"name": "confessional booth", "id": 592, "trainId": 193}, + {"name": "trunk, tree trunk, bole", "id": 2885, "trainId": 194}, + {"name": "forest", "id": 1017, "trainId": 195}, + {"name": "elevator door", "id": 851, "trainId": 196}, + {"name": "laptop, laptop computer", "id": 1407, "trainId": 197}, + {"name": "instrument panel", "id": 1332, "trainId": 198}, + {"name": "bucket, pail", "id": 303, "trainId": 199}, + {"name": "tapestry, tapis", "id": 2714, "trainId": 200}, + {"name": "platform", "id": 1924, "trainId": 201}, + {"name": "jacket", "id": 1346, "trainId": 202}, + {"name": "gate", "id": 1081, "trainId": 203}, + {"name": "monitor, monitoring device", "id": 1583, "trainId": 204}, + { + "name": "telephone booth, phone booth, call box, telephone box, telephone kiosk", + "id": 2727, + "trainId": 205, + }, + {"name": "spotlight, spot", "id": 2509, "trainId": 206}, + {"name": "ring", "id": 2123, "trainId": 207}, + {"name": "control panel", "id": 602, "trainId": 208}, + {"name": "blackboard, chalkboard", "id": 202, "trainId": 209}, + {"name": "air conditioner, air conditioning", "id": 10, "trainId": 210}, + {"name": "chest", "id": 490, "trainId": 211}, + {"name": "clock", "id": 530, "trainId": 212}, + {"name": "sand dune", "id": 2213, "trainId": 213}, + {"name": "pipe, pipage, piping", "id": 1884, "trainId": 214}, + {"name": "vault", "id": 2934, "trainId": 215}, + {"name": "table football", "id": 2687, "trainId": 216}, + {"name": "cannon", "id": 387, "trainId": 217}, + {"name": "swimming pool, swimming bath, natatorium", "id": 2668, "trainId": 218}, + {"name": "fluorescent, fluorescent fixture", "id": 982, "trainId": 219}, + {"name": "statue", "id": 2547, "trainId": 220}, + { + "name": "loudspeaker, speaker, speaker unit, loudspeaker system, speaker system", + "id": 1474, + "trainId": 221, + }, + {"name": "exhibitor", "id": 877, "trainId": 222}, + {"name": "ladder", "id": 1391, "trainId": 223}, + {"name": 
"carport", "id": 414, "trainId": 224}, + {"name": "dam", "id": 698, "trainId": 225}, + {"name": "pulpit", "id": 2019, "trainId": 226}, + {"name": "skylight, fanlight", "id": 2422, "trainId": 227}, + {"name": "water tower", "id": 3010, "trainId": 228}, + {"name": "grill, grille, grillwork", "id": 1139, "trainId": 229}, + {"name": "display board", "id": 753, "trainId": 230}, + {"name": "pane, pane of glass, window glass", "id": 1747, "trainId": 231}, + {"name": "rubbish, trash, scrap", "id": 2175, "trainId": 232}, + {"name": "ice rink", "id": 1301, "trainId": 233}, + {"name": "fruit", "id": 1033, "trainId": 234}, + {"name": "patio", "id": 1789, "trainId": 235}, + {"name": "vending machine", "id": 2939, "trainId": 236}, + {"name": "telephone, phone, telephone set", "id": 2730, "trainId": 237}, + {"name": "net", "id": 1652, "trainId": 238}, + { + "name": "backpack, back pack, knapsack, packsack, rucksack, haversack", + "id": 90, + "trainId": 239, + }, + {"name": "jar", "id": 1349, "trainId": 240}, + {"name": "track", "id": 2830, "trainId": 241}, + {"name": "magazine", "id": 1485, "trainId": 242}, + {"name": "shutter", "id": 2370, "trainId": 243}, + {"name": "roof", "id": 2155, "trainId": 244}, + {"name": "banner, streamer", "id": 118, "trainId": 245}, + {"name": "landfill", "id": 1402, "trainId": 246}, + {"name": "post", "id": 1957, "trainId": 247}, + {"name": "altarpiece, reredos", "id": 3130, "trainId": 248}, + {"name": "hat, chapeau, lid", "id": 1197, "trainId": 249}, + {"name": "arch, archway", "id": 52, "trainId": 250}, + {"name": "table game", "id": 2688, "trainId": 251}, + {"name": "bag, handbag, pocketbook, purse", "id": 96, "trainId": 252}, + {"name": "document, written document, papers", "id": 762, "trainId": 253}, + {"name": "dome", "id": 772, "trainId": 254}, + {"name": "pier", "id": 1857, "trainId": 255}, + {"name": "shanties", "id": 2315, "trainId": 256}, + {"name": "forecourt", "id": 1016, "trainId": 257}, + {"name": "crane", "id": 643, "trainId": 258}, 
+ {"name": "dog, domestic dog, canis familiaris", "id": 3105, "trainId": 259}, + {"name": "piano, pianoforte, forte-piano", "id": 1849, "trainId": 260}, + {"name": "drawing", "id": 791, "trainId": 261}, + {"name": "cabin", "id": 349, "trainId": 262}, + { + "name": "ad, advertisement, advertizement, advertising, advertizing, advert", + "id": 6, + "trainId": 263, + }, + {"name": "amphitheater, amphitheatre, coliseum", "id": 3114, "trainId": 264}, + {"name": "monument", "id": 1587, "trainId": 265}, + {"name": "henhouse", "id": 1233, "trainId": 266}, + {"name": "cockpit", "id": 559, "trainId": 267}, + {"name": "heater, warmer", "id": 1223, "trainId": 268}, + {"name": "windmill, aerogenerator, wind generator", "id": 3049, "trainId": 269}, + {"name": "pool", "id": 1943, "trainId": 270}, + {"name": "elevator, lift", "id": 853, "trainId": 271}, + {"name": "decoration, ornament, ornamentation", "id": 709, "trainId": 272}, + {"name": "labyrinth", "id": 1390, "trainId": 273}, + {"name": "text, textual matter", "id": 2748, "trainId": 274}, + {"name": "printer", "id": 2007, "trainId": 275}, + {"name": "mezzanine, first balcony", "id": 1546, "trainId": 276}, + {"name": "mattress", "id": 1513, "trainId": 277}, + {"name": "straw", "id": 2600, "trainId": 278}, + {"name": "stalls", "id": 2538, "trainId": 279}, + {"name": "patio, terrace", "id": 1790, "trainId": 280}, + {"name": "billboard, hoarding", "id": 194, "trainId": 281}, + {"name": "bus stop", "id": 326, "trainId": 282}, + {"name": "trouser, pant", "id": 2877, "trainId": 283}, + {"name": "console table, console", "id": 594, "trainId": 284}, + {"name": "rack", "id": 2036, "trainId": 285}, + {"name": "notebook", "id": 1662, "trainId": 286}, + {"name": "shrine", "id": 2366, "trainId": 287}, + {"name": "pantry", "id": 1754, "trainId": 288}, + {"name": "cart", "id": 418, "trainId": 289}, + {"name": "steam shovel", "id": 2553, "trainId": 290}, + {"name": "porch", "id": 1951, "trainId": 291}, + {"name": "postbox, mailbox, letter 
box", "id": 1963, "trainId": 292}, + {"name": "figurine, statuette", "id": 918, "trainId": 293}, + {"name": "recycling bin", "id": 2086, "trainId": 294}, + {"name": "folding screen", "id": 997, "trainId": 295}, + {"name": "telescope", "id": 2731, "trainId": 296}, + {"name": "deck chair, beach chair", "id": 704, "trainId": 297}, + {"name": "kennel", "id": 1365, "trainId": 298}, + {"name": "coffee maker", "id": 569, "trainId": 299}, + {"name": "altar, communion table, lord's table", "id": 3108, "trainId": 300}, + {"name": "fish", "id": 948, "trainId": 301}, + {"name": "easel", "id": 839, "trainId": 302}, + {"name": "artificial golf green", "id": 63, "trainId": 303}, + {"name": "iceberg", "id": 1305, "trainId": 304}, + {"name": "candlestick, candle holder", "id": 378, "trainId": 305}, + {"name": "shower stall, shower bath", "id": 2362, "trainId": 306}, + {"name": "television stand", "id": 2734, "trainId": 307}, + { + "name": "wall socket, wall plug, electric outlet, electrical outlet, outlet, electric receptacle", + "id": 2982, + "trainId": 308, + }, + {"name": "skeleton", "id": 2398, "trainId": 309}, + {"name": "grand piano, grand", "id": 1119, "trainId": 310}, + {"name": "candy, confect", "id": 382, "trainId": 311}, + {"name": "grille door", "id": 1141, "trainId": 312}, + {"name": "pedestal, plinth, footstall", "id": 1805, "trainId": 313}, + {"name": "jersey, t-shirt, tee shirt", "id": 3102, "trainId": 314}, + {"name": "shoe", "id": 2341, "trainId": 315}, + {"name": "gravestone, headstone, tombstone", "id": 1131, "trainId": 316}, + {"name": "shanty", "id": 2316, "trainId": 317}, + {"name": "structure", "id": 2626, "trainId": 318}, + {"name": "rocking chair, rocker", "id": 3104, "trainId": 319}, + {"name": "bird", "id": 198, "trainId": 320}, + {"name": "place mat", "id": 1896, "trainId": 321}, + {"name": "tomb", "id": 2800, "trainId": 322}, + {"name": "big top", "id": 190, "trainId": 323}, + { + "name": "gas pump, gasoline pump, petrol pump, island dispenser", + 
"id": 3131, + "trainId": 324, + }, + {"name": "lockers", "id": 1463, "trainId": 325}, + {"name": "cage", "id": 357, "trainId": 326}, + {"name": "finger", "id": 929, "trainId": 327}, + {"name": "bleachers", "id": 209, "trainId": 328}, + {"name": "ferris wheel", "id": 912, "trainId": 329}, + {"name": "hairdresser chair", "id": 1164, "trainId": 330}, + {"name": "mat", "id": 1509, "trainId": 331}, + {"name": "stands", "id": 2539, "trainId": 332}, + {"name": "aquarium, fish tank, marine museum", "id": 3116, "trainId": 333}, + { + "name": "streetcar, tram, tramcar, trolley, trolley car", + "id": 2615, + "trainId": 334, + }, + {"name": "napkin, table napkin, serviette", "id": 1644, "trainId": 335}, + {"name": "dummy", "id": 818, "trainId": 336}, + {"name": "booklet, brochure, folder, leaflet, pamphlet", "id": 242, "trainId": 337}, + {"name": "sand trap", "id": 2217, "trainId": 338}, + {"name": "shop, store", "id": 2347, "trainId": 339}, + {"name": "table cloth", "id": 2686, "trainId": 340}, + {"name": "service station", "id": 2300, "trainId": 341}, + {"name": "coffin", "id": 572, "trainId": 342}, + {"name": "drawer", "id": 789, "trainId": 343}, + {"name": "cages", "id": 358, "trainId": 344}, + {"name": "slot machine, coin machine", "id": 2443, "trainId": 345}, + {"name": "balcony", "id": 101, "trainId": 346}, + {"name": "volleyball court", "id": 2969, "trainId": 347}, + {"name": "table tennis", "id": 2692, "trainId": 348}, + {"name": "control table", "id": 606, "trainId": 349}, + {"name": "shirt", "id": 2339, "trainId": 350}, + {"name": "merchandise, ware, product", "id": 1533, "trainId": 351}, + {"name": "railway", "id": 2060, "trainId": 352}, + {"name": "parterre", "id": 1782, "trainId": 353}, + {"name": "chimney", "id": 495, "trainId": 354}, + {"name": "can, tin, tin can", "id": 371, "trainId": 355}, + {"name": "tanks", "id": 2707, "trainId": 356}, + {"name": "fabric, cloth, material, textile", "id": 889, "trainId": 357}, + {"name": "alga, algae", "id": 3156, 
"trainId": 358}, + {"name": "system", "id": 2683, "trainId": 359}, + {"name": "map", "id": 1499, "trainId": 360}, + {"name": "greenhouse", "id": 1135, "trainId": 361}, + {"name": "mug", "id": 1619, "trainId": 362}, + {"name": "barbecue", "id": 125, "trainId": 363}, + {"name": "trailer", "id": 2838, "trainId": 364}, + { + "name": "toilet tissue, toilet paper, bathroom tissue", + "id": 2792, + "trainId": 365, + }, + {"name": "organ", "id": 1695, "trainId": 366}, + {"name": "dishrag, dishcloth", "id": 746, "trainId": 367}, + {"name": "island", "id": 1343, "trainId": 368}, + {"name": "keyboard", "id": 1370, "trainId": 369}, + {"name": "trench", "id": 2858, "trainId": 370}, + {"name": "basket, basketball hoop, hoop", "id": 145, "trainId": 371}, + {"name": "steering wheel, wheel", "id": 2565, "trainId": 372}, + {"name": "pitcher, ewer", "id": 1892, "trainId": 373}, + {"name": "goal", "id": 1103, "trainId": 374}, + {"name": "bread, breadstuff, staff of life", "id": 286, "trainId": 375}, + {"name": "beds", "id": 170, "trainId": 376}, + {"name": "wood", "id": 3073, "trainId": 377}, + {"name": "file cabinet", "id": 922, "trainId": 378}, + {"name": "newspaper, paper", "id": 1655, "trainId": 379}, + {"name": "motorboat", "id": 1602, "trainId": 380}, + {"name": "rope", "id": 2160, "trainId": 381}, + {"name": "guitar", "id": 1151, "trainId": 382}, + {"name": "rubble", "id": 2176, "trainId": 383}, + {"name": "scarf", "id": 2239, "trainId": 384}, + {"name": "barrels", "id": 132, "trainId": 385}, + {"name": "cap", "id": 394, "trainId": 386}, + {"name": "leaves", "id": 1424, "trainId": 387}, + {"name": "control tower", "id": 607, "trainId": 388}, + {"name": "dashboard", "id": 700, "trainId": 389}, + {"name": "bandstand", "id": 116, "trainId": 390}, + {"name": "lectern", "id": 1425, "trainId": 391}, + {"name": "switch, electric switch, electrical switch", "id": 2676, "trainId": 392}, + {"name": "baseboard, mopboard, skirting board", "id": 141, "trainId": 393}, + {"name": "shower 
room", "id": 2360, "trainId": 394}, + {"name": "smoke", "id": 2449, "trainId": 395}, + {"name": "faucet, spigot", "id": 897, "trainId": 396}, + {"name": "bulldozer", "id": 317, "trainId": 397}, + {"name": "saucepan", "id": 2228, "trainId": 398}, + {"name": "shops", "id": 2351, "trainId": 399}, + {"name": "meter", "id": 1543, "trainId": 400}, + {"name": "crevasse", "id": 656, "trainId": 401}, + {"name": "gear", "id": 1088, "trainId": 402}, + {"name": "candelabrum, candelabra", "id": 373, "trainId": 403}, + {"name": "sofa bed", "id": 2472, "trainId": 404}, + {"name": "tunnel", "id": 2892, "trainId": 405}, + {"name": "pallet", "id": 1740, "trainId": 406}, + {"name": "wire, conducting wire", "id": 3067, "trainId": 407}, + {"name": "kettle, boiler", "id": 1367, "trainId": 408}, + {"name": "bidet", "id": 188, "trainId": 409}, + { + "name": "baby buggy, baby carriage, carriage, perambulator, pram, stroller, go-cart, pushchair, pusher", + "id": 79, + "trainId": 410, + }, + {"name": "music stand", "id": 1633, "trainId": 411}, + {"name": "pipe, tube", "id": 1885, "trainId": 412}, + {"name": "cup", "id": 677, "trainId": 413}, + {"name": "parking meter", "id": 1779, "trainId": 414}, + {"name": "ice hockey rink", "id": 1297, "trainId": 415}, + {"name": "shelter", "id": 2334, "trainId": 416}, + {"name": "weeds", "id": 3027, "trainId": 417}, + {"name": "temple", "id": 2735, "trainId": 418}, + {"name": "patty, cake", "id": 1791, "trainId": 419}, + {"name": "ski slope", "id": 2405, "trainId": 420}, + {"name": "panel", "id": 1748, "trainId": 421}, + {"name": "wallet", "id": 2983, "trainId": 422}, + {"name": "wheel", "id": 3035, "trainId": 423}, + {"name": "towel rack, towel horse", "id": 2824, "trainId": 424}, + {"name": "roundabout", "id": 2168, "trainId": 425}, + {"name": "canister, cannister, tin", "id": 385, "trainId": 426}, + {"name": "rod", "id": 2148, "trainId": 427}, + {"name": "soap dispenser", "id": 2465, "trainId": 428}, + {"name": "bell", "id": 175, "trainId": 429}, + 
{"name": "canvas", "id": 390, "trainId": 430}, + {"name": "box office, ticket office, ticket booth", "id": 268, "trainId": 431}, + {"name": "teacup", "id": 2722, "trainId": 432}, + {"name": "trellis", "id": 2857, "trainId": 433}, + {"name": "workbench", "id": 3088, "trainId": 434}, + {"name": "valley, vale", "id": 2926, "trainId": 435}, + {"name": "toaster", "id": 2782, "trainId": 436}, + {"name": "knife", "id": 1378, "trainId": 437}, + {"name": "podium", "id": 1934, "trainId": 438}, + {"name": "ramp", "id": 2072, "trainId": 439}, + {"name": "tumble dryer", "id": 2889, "trainId": 440}, + {"name": "fireplug, fire hydrant, plug", "id": 944, "trainId": 441}, + {"name": "gym shoe, sneaker, tennis shoe", "id": 1158, "trainId": 442}, + {"name": "lab bench", "id": 1383, "trainId": 443}, + {"name": "equipment", "id": 867, "trainId": 444}, + {"name": "rocky formation", "id": 2145, "trainId": 445}, + {"name": "plastic", "id": 1915, "trainId": 446}, + {"name": "calendar", "id": 361, "trainId": 447}, + {"name": "caravan", "id": 402, "trainId": 448}, + {"name": "check-in-desk", "id": 482, "trainId": 449}, + {"name": "ticket counter", "id": 2761, "trainId": 450}, + {"name": "brush", "id": 300, "trainId": 451}, + {"name": "mill", "id": 1554, "trainId": 452}, + {"name": "covered bridge", "id": 636, "trainId": 453}, + {"name": "bowling alley", "id": 260, "trainId": 454}, + {"name": "hanger", "id": 1186, "trainId": 455}, + {"name": "excavator", "id": 871, "trainId": 456}, + {"name": "trestle", "id": 2859, "trainId": 457}, + {"name": "revolving door", "id": 2103, "trainId": 458}, + {"name": "blast furnace", "id": 208, "trainId": 459}, + {"name": "scale, weighing machine", "id": 2236, "trainId": 460}, + {"name": "projector", "id": 2012, "trainId": 461}, + {"name": "soap", "id": 2462, "trainId": 462}, + {"name": "locker", "id": 1462, "trainId": 463}, + {"name": "tractor", "id": 2832, "trainId": 464}, + {"name": "stretcher", "id": 2617, "trainId": 465}, + {"name": "frame", "id": 1024, 
"trainId": 466}, + {"name": "grating", "id": 1129, "trainId": 467}, + {"name": "alembic", "id": 18, "trainId": 468}, + {"name": "candle, taper, wax light", "id": 376, "trainId": 469}, + {"name": "barrier", "id": 134, "trainId": 470}, + {"name": "cardboard", "id": 407, "trainId": 471}, + {"name": "cave", "id": 434, "trainId": 472}, + {"name": "puddle", "id": 2017, "trainId": 473}, + {"name": "tarp", "id": 2717, "trainId": 474}, + {"name": "price tag", "id": 2005, "trainId": 475}, + {"name": "watchtower", "id": 2993, "trainId": 476}, + {"name": "meters", "id": 1545, "trainId": 477}, + { + "name": "light bulb, lightbulb, bulb, incandescent lamp, electric light, electric-light bulb", + "id": 1445, + "trainId": 478, + }, + {"name": "tracks", "id": 2831, "trainId": 479}, + {"name": "hair dryer", "id": 1161, "trainId": 480}, + {"name": "skirt", "id": 2411, "trainId": 481}, + {"name": "viaduct", "id": 2949, "trainId": 482}, + {"name": "paper towel", "id": 1769, "trainId": 483}, + {"name": "coat", "id": 552, "trainId": 484}, + {"name": "sheet", "id": 2327, "trainId": 485}, + {"name": "fire extinguisher, extinguisher, asphyxiator", "id": 939, "trainId": 486}, + {"name": "water wheel", "id": 3013, "trainId": 487}, + {"name": "pottery, clayware", "id": 1986, "trainId": 488}, + {"name": "magazine rack", "id": 1486, "trainId": 489}, + {"name": "teapot", "id": 2723, "trainId": 490}, + {"name": "microphone, mike", "id": 1549, "trainId": 491}, + {"name": "support", "id": 2649, "trainId": 492}, + {"name": "forklift", "id": 1020, "trainId": 493}, + {"name": "canyon", "id": 392, "trainId": 494}, + {"name": "cash register, register", "id": 422, "trainId": 495}, + {"name": "leaf, leafage, foliage", "id": 1419, "trainId": 496}, + {"name": "remote control, remote", "id": 2099, "trainId": 497}, + {"name": "soap dish", "id": 2464, "trainId": 498}, + {"name": "windshield, windscreen", "id": 3058, "trainId": 499}, + {"name": "cat", "id": 430, "trainId": 500}, + {"name": "cue, cue stick, pool 
cue, pool stick", "id": 675, "trainId": 501}, + {"name": "vent, venthole, vent-hole, blowhole", "id": 2941, "trainId": 502}, + {"name": "videos", "id": 2955, "trainId": 503}, + {"name": "shovel", "id": 2355, "trainId": 504}, + {"name": "eaves", "id": 840, "trainId": 505}, + {"name": "antenna, aerial, transmitting aerial", "id": 32, "trainId": 506}, + {"name": "shipyard", "id": 2338, "trainId": 507}, + {"name": "hen, biddy", "id": 1232, "trainId": 508}, + {"name": "traffic cone", "id": 2834, "trainId": 509}, + {"name": "washing machines", "id": 2991, "trainId": 510}, + {"name": "truck crane", "id": 2879, "trainId": 511}, + {"name": "cds", "id": 444, "trainId": 512}, + {"name": "niche", "id": 1657, "trainId": 513}, + {"name": "scoreboard", "id": 2246, "trainId": 514}, + {"name": "briefcase", "id": 296, "trainId": 515}, + {"name": "boot", "id": 245, "trainId": 516}, + {"name": "sweater, jumper", "id": 2661, "trainId": 517}, + {"name": "hay", "id": 1202, "trainId": 518}, + {"name": "pack", "id": 1714, "trainId": 519}, + {"name": "bottle rack", "id": 251, "trainId": 520}, + {"name": "glacier", "id": 1095, "trainId": 521}, + {"name": "pergola", "id": 1828, "trainId": 522}, + {"name": "building materials", "id": 311, "trainId": 523}, + {"name": "television camera", "id": 2732, "trainId": 524}, + {"name": "first floor", "id": 947, "trainId": 525}, + {"name": "rifle", "id": 2115, "trainId": 526}, + {"name": "tennis table", "id": 2738, "trainId": 527}, + {"name": "stadium", "id": 2525, "trainId": 528}, + {"name": "safety belt", "id": 2194, "trainId": 529}, + {"name": "cover", "id": 634, "trainId": 530}, + {"name": "dish rack", "id": 740, "trainId": 531}, + {"name": "synthesizer", "id": 2682, "trainId": 532}, + {"name": "pumpkin", "id": 2020, "trainId": 533}, + {"name": "gutter", "id": 1156, "trainId": 534}, + {"name": "fruit stand", "id": 1036, "trainId": 535}, + {"name": "ice floe, floe", "id": 1295, "trainId": 536}, + {"name": "handle, grip, handgrip, hold", "id": 1181, 
"trainId": 537}, + {"name": "wheelchair", "id": 3037, "trainId": 538}, + {"name": "mousepad, mouse mat", "id": 1614, "trainId": 539}, + {"name": "diploma", "id": 736, "trainId": 540}, + {"name": "fairground ride", "id": 893, "trainId": 541}, + {"name": "radio", "id": 2047, "trainId": 542}, + {"name": "hotplate", "id": 1274, "trainId": 543}, + {"name": "junk", "id": 1361, "trainId": 544}, + {"name": "wheelbarrow", "id": 3036, "trainId": 545}, + {"name": "stream", "id": 2606, "trainId": 546}, + {"name": "toll plaza", "id": 2797, "trainId": 547}, + {"name": "punching bag", "id": 2022, "trainId": 548}, + {"name": "trough", "id": 2876, "trainId": 549}, + {"name": "throne", "id": 2758, "trainId": 550}, + {"name": "chair desk", "id": 472, "trainId": 551}, + {"name": "weighbridge", "id": 3028, "trainId": 552}, + {"name": "extractor fan", "id": 882, "trainId": 553}, + {"name": "hanging clothes", "id": 1189, "trainId": 554}, + {"name": "dish, dish aerial, dish antenna, saucer", "id": 743, "trainId": 555}, + {"name": "alarm clock, alarm", "id": 3122, "trainId": 556}, + {"name": "ski lift", "id": 2401, "trainId": 557}, + {"name": "chain", "id": 468, "trainId": 558}, + {"name": "garage", "id": 1061, "trainId": 559}, + {"name": "mechanical shovel", "id": 1523, "trainId": 560}, + {"name": "wine rack", "id": 3059, "trainId": 561}, + {"name": "tramway", "id": 2843, "trainId": 562}, + {"name": "treadmill", "id": 2853, "trainId": 563}, + {"name": "menu", "id": 1529, "trainId": 564}, + {"name": "block", "id": 214, "trainId": 565}, + {"name": "well", "id": 3032, "trainId": 566}, + {"name": "witness stand", "id": 3071, "trainId": 567}, + {"name": "branch", "id": 277, "trainId": 568}, + {"name": "duck", "id": 813, "trainId": 569}, + {"name": "casserole", "id": 426, "trainId": 570}, + {"name": "frying pan", "id": 1039, "trainId": 571}, + {"name": "desk organizer", "id": 727, "trainId": 572}, + {"name": "mast", "id": 1508, "trainId": 573}, + {"name": "spectacles, specs, eyeglasses, 
glasses", "id": 2490, "trainId": 574}, + {"name": "service elevator", "id": 2299, "trainId": 575}, + {"name": "dollhouse", "id": 768, "trainId": 576}, + {"name": "hammock", "id": 1172, "trainId": 577}, + {"name": "clothes hanging", "id": 537, "trainId": 578}, + {"name": "photocopier", "id": 1847, "trainId": 579}, + {"name": "notepad", "id": 1664, "trainId": 580}, + {"name": "golf cart", "id": 1110, "trainId": 581}, + {"name": "footpath", "id": 1014, "trainId": 582}, + {"name": "cross", "id": 662, "trainId": 583}, + {"name": "baptismal font", "id": 121, "trainId": 584}, + {"name": "boiler", "id": 227, "trainId": 585}, + {"name": "skip", "id": 2410, "trainId": 586}, + {"name": "rotisserie", "id": 2165, "trainId": 587}, + {"name": "tables", "id": 2696, "trainId": 588}, + {"name": "water mill", "id": 3005, "trainId": 589}, + {"name": "helmet", "id": 1231, "trainId": 590}, + {"name": "cover curtain", "id": 635, "trainId": 591}, + {"name": "brick", "id": 292, "trainId": 592}, + {"name": "table runner", "id": 2690, "trainId": 593}, + {"name": "ashtray", "id": 65, "trainId": 594}, + {"name": "street box", "id": 2607, "trainId": 595}, + {"name": "stick", "id": 2574, "trainId": 596}, + {"name": "hangers", "id": 1188, "trainId": 597}, + {"name": "cells", "id": 456, "trainId": 598}, + {"name": "urinal", "id": 2913, "trainId": 599}, + {"name": "centerpiece", "id": 459, "trainId": 600}, + {"name": "portable fridge", "id": 1955, "trainId": 601}, + {"name": "dvds", "id": 827, "trainId": 602}, + {"name": "golf club", "id": 1111, "trainId": 603}, + {"name": "skirting board", "id": 2412, "trainId": 604}, + {"name": "water cooler", "id": 2997, "trainId": 605}, + {"name": "clipboard", "id": 528, "trainId": 606}, + {"name": "camera, photographic camera", "id": 366, "trainId": 607}, + {"name": "pigeonhole", "id": 1863, "trainId": 608}, + {"name": "chips", "id": 500, "trainId": 609}, + {"name": "food processor", "id": 1001, "trainId": 610}, + {"name": "post box", "id": 1958, "trainId": 
611}, + {"name": "lid", "id": 1441, "trainId": 612}, + {"name": "drum", "id": 809, "trainId": 613}, + {"name": "blender", "id": 210, "trainId": 614}, + {"name": "cave entrance", "id": 435, "trainId": 615}, + {"name": "dental chair", "id": 718, "trainId": 616}, + {"name": "obelisk", "id": 1674, "trainId": 617}, + {"name": "canoe", "id": 388, "trainId": 618}, + {"name": "mobile", "id": 1572, "trainId": 619}, + {"name": "monitors", "id": 1584, "trainId": 620}, + {"name": "pool ball", "id": 1944, "trainId": 621}, + {"name": "cue rack", "id": 674, "trainId": 622}, + {"name": "baggage carts", "id": 99, "trainId": 623}, + {"name": "shore", "id": 2352, "trainId": 624}, + {"name": "fork", "id": 1019, "trainId": 625}, + {"name": "paper filer", "id": 1763, "trainId": 626}, + {"name": "bicycle rack", "id": 185, "trainId": 627}, + {"name": "coat rack", "id": 554, "trainId": 628}, + {"name": "garland", "id": 1066, "trainId": 629}, + {"name": "sports bag", "id": 2508, "trainId": 630}, + {"name": "fish tank", "id": 951, "trainId": 631}, + {"name": "towel dispenser", "id": 2822, "trainId": 632}, + {"name": "carriage", "id": 415, "trainId": 633}, + {"name": "brochure", "id": 297, "trainId": 634}, + {"name": "plaque", "id": 1914, "trainId": 635}, + {"name": "stringer", "id": 2619, "trainId": 636}, + {"name": "iron", "id": 1338, "trainId": 637}, + {"name": "spoon", "id": 2505, "trainId": 638}, + {"name": "flag pole", "id": 955, "trainId": 639}, + {"name": "toilet brush", "id": 2786, "trainId": 640}, + {"name": "book stand", "id": 238, "trainId": 641}, + {"name": "water faucet, water tap, tap, hydrant", "id": 3000, "trainId": 642}, + {"name": "ticket office", "id": 2763, "trainId": 643}, + {"name": "broom", "id": 299, "trainId": 644}, + {"name": "dvd", "id": 822, "trainId": 645}, + {"name": "ice bucket", "id": 1288, "trainId": 646}, + {"name": "carapace, shell, cuticle, shield", "id": 3101, "trainId": 647}, + {"name": "tureen", "id": 2894, "trainId": 648}, + {"name": "folders", "id": 
992, "trainId": 649}, + {"name": "chess", "id": 489, "trainId": 650}, + {"name": "root", "id": 2157, "trainId": 651}, + {"name": "sewing machine", "id": 2309, "trainId": 652}, + {"name": "model", "id": 1576, "trainId": 653}, + {"name": "pen", "id": 1810, "trainId": 654}, + {"name": "violin", "id": 2964, "trainId": 655}, + {"name": "sweatshirt", "id": 2662, "trainId": 656}, + {"name": "recycling materials", "id": 2087, "trainId": 657}, + {"name": "mitten", "id": 1569, "trainId": 658}, + {"name": "chopping board, cutting board", "id": 503, "trainId": 659}, + {"name": "mask", "id": 1505, "trainId": 660}, + {"name": "log", "id": 1468, "trainId": 661}, + {"name": "mouse, computer mouse", "id": 1613, "trainId": 662}, + {"name": "grill", "id": 1138, "trainId": 663}, + {"name": "hole", "id": 1256, "trainId": 664}, + {"name": "target", "id": 2715, "trainId": 665}, + {"name": "trash bag", "id": 2846, "trainId": 666}, + {"name": "chalk", "id": 477, "trainId": 667}, + {"name": "sticks", "id": 2576, "trainId": 668}, + {"name": "balloon", "id": 108, "trainId": 669}, + {"name": "score", "id": 2245, "trainId": 670}, + {"name": "hair spray", "id": 1162, "trainId": 671}, + {"name": "roll", "id": 2149, "trainId": 672}, + {"name": "runner", "id": 2183, "trainId": 673}, + {"name": "engine", "id": 858, "trainId": 674}, + {"name": "inflatable glove", "id": 1324, "trainId": 675}, + {"name": "games", "id": 1055, "trainId": 676}, + {"name": "pallets", "id": 1741, "trainId": 677}, + {"name": "baskets", "id": 149, "trainId": 678}, + {"name": "coop", "id": 615, "trainId": 679}, + {"name": "dvd player", "id": 825, "trainId": 680}, + {"name": "rocking horse", "id": 2143, "trainId": 681}, + {"name": "buckets", "id": 304, "trainId": 682}, + {"name": "bread rolls", "id": 283, "trainId": 683}, + {"name": "shawl", "id": 2322, "trainId": 684}, + {"name": "watering can", "id": 3017, "trainId": 685}, + {"name": "spotlights", "id": 2510, "trainId": 686}, + {"name": "post-it", "id": 1960, "trainId": 687}, 
+ {"name": "bowls", "id": 265, "trainId": 688}, + {"name": "security camera", "id": 2282, "trainId": 689}, + {"name": "runner cloth", "id": 2184, "trainId": 690}, + {"name": "lock", "id": 1461, "trainId": 691}, + {"name": "alarm, warning device, alarm system", "id": 3113, "trainId": 692}, + {"name": "side", "id": 2372, "trainId": 693}, + {"name": "roulette", "id": 2166, "trainId": 694}, + {"name": "bone", "id": 232, "trainId": 695}, + {"name": "cutlery", "id": 693, "trainId": 696}, + {"name": "pool balls", "id": 1945, "trainId": 697}, + {"name": "wheels", "id": 3039, "trainId": 698}, + {"name": "spice rack", "id": 2494, "trainId": 699}, + {"name": "plant pots", "id": 1908, "trainId": 700}, + {"name": "towel ring", "id": 2827, "trainId": 701}, + {"name": "bread box", "id": 280, "trainId": 702}, + {"name": "video", "id": 2950, "trainId": 703}, + {"name": "funfair", "id": 1044, "trainId": 704}, + {"name": "breads", "id": 288, "trainId": 705}, + {"name": "tripod", "id": 2863, "trainId": 706}, + {"name": "ironing board", "id": 1342, "trainId": 707}, + {"name": "skimmer", "id": 2409, "trainId": 708}, + {"name": "hollow", "id": 1258, "trainId": 709}, + {"name": "scratching post", "id": 2249, "trainId": 710}, + {"name": "tricycle", "id": 2862, "trainId": 711}, + {"name": "file box", "id": 920, "trainId": 712}, + {"name": "mountain pass", "id": 1607, "trainId": 713}, + {"name": "tombstones", "id": 2802, "trainId": 714}, + {"name": "cooker", "id": 610, "trainId": 715}, + {"name": "card game, cards", "id": 3129, "trainId": 716}, + {"name": "golf bag", "id": 1108, "trainId": 717}, + {"name": "towel paper", "id": 2823, "trainId": 718}, + {"name": "chaise lounge", "id": 476, "trainId": 719}, + {"name": "sun", "id": 2641, "trainId": 720}, + {"name": "toilet paper holder", "id": 2788, "trainId": 721}, + {"name": "rake", "id": 2070, "trainId": 722}, + {"name": "key", "id": 1368, "trainId": 723}, + {"name": "umbrella stand", "id": 2903, "trainId": 724}, + {"name": "dartboard", "id": 
699, "trainId": 725}, + {"name": "transformer", "id": 2844, "trainId": 726}, + {"name": "fireplace utensils", "id": 942, "trainId": 727}, + {"name": "sweatshirts", "id": 2663, "trainId": 728}, + { + "name": "cellular telephone, cellular phone, cellphone, cell, mobile phone", + "id": 457, + "trainId": 729, + }, + {"name": "tallboy", "id": 2701, "trainId": 730}, + {"name": "stapler", "id": 2540, "trainId": 731}, + {"name": "sauna", "id": 2231, "trainId": 732}, + {"name": "test tube", "id": 2746, "trainId": 733}, + {"name": "palette", "id": 1738, "trainId": 734}, + {"name": "shopping carts", "id": 2350, "trainId": 735}, + {"name": "tools", "id": 2808, "trainId": 736}, + {"name": "push button, push, button", "id": 2025, "trainId": 737}, + {"name": "star", "id": 2541, "trainId": 738}, + {"name": "roof rack", "id": 2156, "trainId": 739}, + {"name": "barbed wire", "id": 126, "trainId": 740}, + {"name": "spray", "id": 2512, "trainId": 741}, + {"name": "ear", "id": 831, "trainId": 742}, + {"name": "sponge", "id": 2503, "trainId": 743}, + {"name": "racket", "id": 2039, "trainId": 744}, + {"name": "tins", "id": 2774, "trainId": 745}, + {"name": "eyeglasses", "id": 886, "trainId": 746}, + {"name": "file", "id": 919, "trainId": 747}, + {"name": "scarfs", "id": 2240, "trainId": 748}, + {"name": "sugar bowl", "id": 2636, "trainId": 749}, + {"name": "flip flop", "id": 963, "trainId": 750}, + {"name": "headstones", "id": 1218, "trainId": 751}, + {"name": "laptop bag", "id": 1406, "trainId": 752}, + {"name": "leash", "id": 1420, "trainId": 753}, + {"name": "climbing frame", "id": 526, "trainId": 754}, + {"name": "suit hanger", "id": 2639, "trainId": 755}, + {"name": "floor spotlight", "id": 975, "trainId": 756}, + {"name": "plate rack", "id": 1921, "trainId": 757}, + {"name": "sewer", "id": 2305, "trainId": 758}, + {"name": "hard drive", "id": 1193, "trainId": 759}, + {"name": "sprinkler", "id": 2517, "trainId": 760}, + {"name": "tools box", "id": 2809, "trainId": 761}, + {"name": 
"necklace", "id": 1647, "trainId": 762}, + {"name": "bulbs", "id": 314, "trainId": 763}, + {"name": "steel industry", "id": 2560, "trainId": 764}, + {"name": "club", "id": 545, "trainId": 765}, + {"name": "jack", "id": 1345, "trainId": 766}, + {"name": "door bars", "id": 775, "trainId": 767}, + { + "name": "control panel, instrument panel, control board, board, panel", + "id": 603, + "trainId": 768, + }, + {"name": "hairbrush", "id": 1163, "trainId": 769}, + {"name": "napkin holder", "id": 1641, "trainId": 770}, + {"name": "office", "id": 1678, "trainId": 771}, + {"name": "smoke detector", "id": 2450, "trainId": 772}, + {"name": "utensils", "id": 2915, "trainId": 773}, + {"name": "apron", "id": 42, "trainId": 774}, + {"name": "scissors", "id": 2242, "trainId": 775}, + {"name": "terminal", "id": 2741, "trainId": 776}, + {"name": "grinder", "id": 1143, "trainId": 777}, + {"name": "entry phone", "id": 862, "trainId": 778}, + {"name": "newspaper stand", "id": 1654, "trainId": 779}, + {"name": "pepper shaker", "id": 1826, "trainId": 780}, + {"name": "onions", "id": 1689, "trainId": 781}, + { + "name": "central processing unit, cpu, c p u , central processor, processor, mainframe", + "id": 3124, + "trainId": 782, + }, + {"name": "tape", "id": 2710, "trainId": 783}, + {"name": "bat", "id": 152, "trainId": 784}, + {"name": "coaster", "id": 549, "trainId": 785}, + {"name": "calculator", "id": 360, "trainId": 786}, + {"name": "potatoes", "id": 1982, "trainId": 787}, + {"name": "luggage rack", "id": 1478, "trainId": 788}, + {"name": "salt", "id": 2203, "trainId": 789}, + {"name": "street number", "id": 2612, "trainId": 790}, + {"name": "viewpoint", "id": 2956, "trainId": 791}, + {"name": "sword", "id": 2681, "trainId": 792}, + {"name": "cd", "id": 437, "trainId": 793}, + {"name": "rowing machine", "id": 2171, "trainId": 794}, + {"name": "plug", "id": 1933, "trainId": 795}, + {"name": "andiron, firedog, dog, dog-iron", "id": 3110, "trainId": 796}, + {"name": "pepper", "id": 
1824, "trainId": 797}, + {"name": "tongs", "id": 2803, "trainId": 798}, + {"name": "bonfire", "id": 234, "trainId": 799}, + {"name": "dog dish", "id": 764, "trainId": 800}, + {"name": "belt", "id": 177, "trainId": 801}, + {"name": "dumbbells", "id": 817, "trainId": 802}, + {"name": "videocassette recorder, vcr", "id": 3145, "trainId": 803}, + {"name": "hook", "id": 1262, "trainId": 804}, + {"name": "envelopes", "id": 864, "trainId": 805}, + {"name": "shower faucet", "id": 2359, "trainId": 806}, + {"name": "watch", "id": 2992, "trainId": 807}, + {"name": "padlock", "id": 1725, "trainId": 808}, + {"name": "swimming pool ladder", "id": 2667, "trainId": 809}, + {"name": "spanners", "id": 2484, "trainId": 810}, + {"name": "gravy boat", "id": 1133, "trainId": 811}, + {"name": "notice board", "id": 1667, "trainId": 812}, + {"name": "trash bags", "id": 2847, "trainId": 813}, + {"name": "fire alarm", "id": 932, "trainId": 814}, + {"name": "ladle", "id": 1392, "trainId": 815}, + {"name": "stethoscope", "id": 2573, "trainId": 816}, + {"name": "rocket", "id": 2140, "trainId": 817}, + {"name": "funnel", "id": 1046, "trainId": 818}, + {"name": "bowling pins", "id": 264, "trainId": 819}, + {"name": "valve", "id": 2927, "trainId": 820}, + {"name": "thermometer", "id": 2752, "trainId": 821}, + {"name": "cups", "id": 679, "trainId": 822}, + {"name": "spice jar", "id": 2493, "trainId": 823}, + {"name": "night light", "id": 1658, "trainId": 824}, + {"name": "soaps", "id": 2466, "trainId": 825}, + {"name": "games table", "id": 1057, "trainId": 826}, + {"name": "slotted spoon", "id": 2444, "trainId": 827}, + {"name": "reel", "id": 2093, "trainId": 828}, + {"name": "scourer", "id": 2248, "trainId": 829}, + {"name": "sleeping robe", "id": 2432, "trainId": 830}, + {"name": "desk mat", "id": 726, "trainId": 831}, + {"name": "dumbbell", "id": 816, "trainId": 832}, + {"name": "hammer", "id": 1171, "trainId": 833}, + {"name": "tie", "id": 2766, "trainId": 834}, + {"name": "typewriter", "id": 
2900, "trainId": 835}, + {"name": "shaker", "id": 2313, "trainId": 836}, + {"name": "cheese dish", "id": 488, "trainId": 837}, + {"name": "sea star", "id": 2265, "trainId": 838}, + {"name": "racquet", "id": 2043, "trainId": 839}, + {"name": "butane gas cylinder", "id": 332, "trainId": 840}, + {"name": "paper weight", "id": 1771, "trainId": 841}, + {"name": "shaving brush", "id": 2320, "trainId": 842}, + {"name": "sunglasses", "id": 2646, "trainId": 843}, + {"name": "gear shift", "id": 1089, "trainId": 844}, + {"name": "towel rail", "id": 2826, "trainId": 845}, + {"name": "adding machine, totalizer, totaliser", "id": 3148, "trainId": 846}, +] + + +def _get_ade20k_full_meta(): + stuff_ids = [k["id"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES] + assert len(stuff_ids) == 847, len(stuff_ids) + + stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} + stuff_classes = [k["name"] for k in ADE20K_SEM_SEG_FULL_CATEGORIES] + + ret = { + "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, + "stuff_classes": stuff_classes, + } + return ret + + +def register_all_ade20k_full(root): + meta = _get_ade20k_full_meta() + for name, dirname in [("val", "validation")]: + image_dir = os.path.join(root, "ADE20K_2021_17_01/images_detectron2", dirname) + gt_dir = os.path.join(root, "ADE20K_2021_17_01/annotations_detectron2", dirname) + name = f"ade20k_full_sem_seg_{name}" + DatasetCatalog.register( + name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="tif", image_ext="jpg" + ), + ) + MetadataCatalog.get(name).set( + stuff_classes=meta["stuff_classes"][:], + thing_classes=meta["stuff_classes"][:], # the same as stuff_classes + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=65535, # NOTE: gt is saved in 16-bit TIFF images + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_ade20k_full(_root) diff --git a/open_vocab_seg/data/datasets/register_cc3m.py 
b/open_vocab_seg/data/datasets/register_cc3m.py new file mode 100644 index 0000000000000000000000000000000000000000..8aa5cb07bc99b574505b6319835750789bb3ee26 --- /dev/null +++ b/open_vocab_seg/data/datasets/register_cc3m.py @@ -0,0 +1,457 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import os + +import pandas as pd +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg +from detectron2.utils.file_io import PathManager + + +COCO_CATEGORIES = [ + {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, + {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, + {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, + {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, + {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, + {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, + {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, + {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, + {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, + {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, + {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, + {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, + {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, + {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, + {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, + {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, + {"color": [174, 57, 
255], "isthing": 1, "id": 23, "name": "bear"}, + {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, + {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, + {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, + {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, + {"color": [209, 0, 151], "isthing": 1, "id": 31, "name": "handbag"}, + {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, + {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, + {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, + {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, + {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, + {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, + {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, + {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, + {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, + {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, + {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, + {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, + {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, + {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, + {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, + {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, + {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, + {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, + {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, + {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, + {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, + {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, + {"color": 
[255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, + {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, + {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, + {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, + {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, + {"color": [227, 255, 205], "isthing": 1, "id": 60, "name": "donut"}, + {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, + {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, + {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, + {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, + {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, + {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, + {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, + {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, + {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, + {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, + {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, + {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, + {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, + {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, + {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, + {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, + {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, + {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, + {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, + {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, + {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, + {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, + {"color": [201, 57, 1], 
"isthing": 1, "id": 88, "name": "teddy bear"}, + {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, + {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, + {"id": 92, "name": "banner", "supercategory": "textile"}, + {"id": 93, "name": "blanket", "supercategory": "textile"}, + {"id": 94, "name": "branch", "supercategory": "plant"}, + {"id": 95, "name": "bridge", "supercategory": "building"}, + {"id": 96, "name": "building-other", "supercategory": "building"}, + {"id": 97, "name": "bush", "supercategory": "plant"}, + {"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"}, + {"id": 99, "name": "cage", "supercategory": "structural"}, + {"id": 100, "name": "cardboard", "supercategory": "raw-material"}, + {"id": 101, "name": "carpet", "supercategory": "floor"}, + {"id": 102, "name": "ceiling-other", "supercategory": "ceiling"}, + {"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"}, + {"id": 104, "name": "cloth", "supercategory": "textile"}, + {"id": 105, "name": "clothes", "supercategory": "textile"}, + {"id": 106, "name": "clouds", "supercategory": "sky"}, + {"id": 107, "name": "counter", "supercategory": "furniture-stuff"}, + {"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"}, + {"id": 109, "name": "curtain", "supercategory": "textile"}, + {"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"}, + {"id": 111, "name": "dirt", "supercategory": "ground"}, + {"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"}, + {"id": 113, "name": "fence", "supercategory": "structural"}, + {"id": 114, "name": "floor-marble", "supercategory": "floor"}, + {"id": 115, "name": "floor-other", "supercategory": "floor"}, + {"id": 116, "name": "floor-stone", "supercategory": "floor"}, + {"id": 117, "name": "floor-tile", "supercategory": "floor"}, + {"id": 118, "name": "floor-wood", "supercategory": "floor"}, + {"id": 119, "name": "flower", "supercategory": "plant"}, + {"id": 120, 
"name": "fog", "supercategory": "water"}, + {"id": 121, "name": "food-other", "supercategory": "food-stuff"}, + {"id": 122, "name": "fruit", "supercategory": "food-stuff"}, + {"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"}, + {"id": 124, "name": "grass", "supercategory": "plant"}, + {"id": 125, "name": "gravel", "supercategory": "ground"}, + {"id": 126, "name": "ground-other", "supercategory": "ground"}, + {"id": 127, "name": "hill", "supercategory": "solid"}, + {"id": 128, "name": "house", "supercategory": "building"}, + {"id": 129, "name": "leaves", "supercategory": "plant"}, + {"id": 130, "name": "light", "supercategory": "furniture-stuff"}, + {"id": 131, "name": "mat", "supercategory": "textile"}, + {"id": 132, "name": "metal", "supercategory": "raw-material"}, + {"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"}, + {"id": 134, "name": "moss", "supercategory": "plant"}, + {"id": 135, "name": "mountain", "supercategory": "solid"}, + {"id": 136, "name": "mud", "supercategory": "ground"}, + {"id": 137, "name": "napkin", "supercategory": "textile"}, + {"id": 138, "name": "net", "supercategory": "structural"}, + {"id": 139, "name": "paper", "supercategory": "raw-material"}, + {"id": 140, "name": "pavement", "supercategory": "ground"}, + {"id": 141, "name": "pillow", "supercategory": "textile"}, + {"id": 142, "name": "plant-other", "supercategory": "plant"}, + {"id": 143, "name": "plastic", "supercategory": "raw-material"}, + {"id": 144, "name": "platform", "supercategory": "ground"}, + {"id": 145, "name": "playingfield", "supercategory": "ground"}, + {"id": 146, "name": "railing", "supercategory": "structural"}, + {"id": 147, "name": "railroad", "supercategory": "ground"}, + {"id": 148, "name": "river", "supercategory": "water"}, + {"id": 149, "name": "road", "supercategory": "ground"}, + {"id": 150, "name": "rock", "supercategory": "solid"}, + {"id": 151, "name": "roof", "supercategory": "building"}, + {"id": 152, 
"name": "rug", "supercategory": "textile"}, + {"id": 153, "name": "salad", "supercategory": "food-stuff"}, + {"id": 154, "name": "sand", "supercategory": "ground"}, + {"id": 155, "name": "sea", "supercategory": "water"}, + {"id": 156, "name": "shelf", "supercategory": "furniture-stuff"}, + {"id": 157, "name": "sky-other", "supercategory": "sky"}, + {"id": 158, "name": "skyscraper", "supercategory": "building"}, + {"id": 159, "name": "snow", "supercategory": "ground"}, + {"id": 160, "name": "solid-other", "supercategory": "solid"}, + {"id": 161, "name": "stairs", "supercategory": "furniture-stuff"}, + {"id": 162, "name": "stone", "supercategory": "solid"}, + {"id": 163, "name": "straw", "supercategory": "plant"}, + {"id": 164, "name": "structural-other", "supercategory": "structural"}, + {"id": 165, "name": "table", "supercategory": "furniture-stuff"}, + {"id": 166, "name": "tent", "supercategory": "building"}, + {"id": 167, "name": "textile-other", "supercategory": "textile"}, + {"id": 168, "name": "towel", "supercategory": "textile"}, + {"id": 169, "name": "tree", "supercategory": "plant"}, + {"id": 170, "name": "vegetable", "supercategory": "food-stuff"}, + {"id": 171, "name": "wall-brick", "supercategory": "wall"}, + {"id": 172, "name": "wall-concrete", "supercategory": "wall"}, + {"id": 173, "name": "wall-other", "supercategory": "wall"}, + {"id": 174, "name": "wall-panel", "supercategory": "wall"}, + {"id": 175, "name": "wall-stone", "supercategory": "wall"}, + {"id": 176, "name": "wall-tile", "supercategory": "wall"}, + {"id": 177, "name": "wall-wood", "supercategory": "wall"}, + {"id": 178, "name": "water-other", "supercategory": "water"}, + {"id": 179, "name": "waterdrops", "supercategory": "water"}, + {"id": 180, "name": "window-blind", "supercategory": "window"}, + {"id": 181, "name": "window-other", "supercategory": "window"}, + {"id": 182, "name": "wood", "supercategory": "solid"}, +] + + +ADE20K_150_CATEGORIES = [ + {"color": [120, 120, 120], "id": 0, 
"isthing": 0, "name": "wall"}, + {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"}, + {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"}, + {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"}, + {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"}, + {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"}, + {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"}, + {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"}, + {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "}, + {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"}, + {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"}, + {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"}, + {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"}, + {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"}, + {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"}, + {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"}, + {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"}, + {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"}, + {"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"}, + {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"}, + {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"}, + {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"}, + {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"}, + {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"}, + {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"}, + {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"}, + {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"}, + {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"}, + {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": 
"rug"}, + {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"}, + {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"}, + {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"}, + {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"}, + {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"}, + {"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"}, + {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"}, + {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"}, + {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"}, + {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"}, + {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"}, + {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"}, + {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"}, + {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"}, + {"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"}, + { + "color": [6, 51, 255], + "id": 44, + "isthing": 1, + "name": "chest of drawers, chest, bureau, dresser", + }, + {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"}, + {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"}, + {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"}, + {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"}, + {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"}, + {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"}, + {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"}, + {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"}, + {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"}, + {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"}, + {"color": [0, 0, 255], "id": 55, "isthing": 1, 
"name": "case, display case, showcase, vitrine"}, + { + "color": [255, 71, 0], + "id": 56, + "isthing": 1, + "name": "pool table, billiard table, snooker table", + }, + {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"}, + {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"}, + {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"}, + {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"}, + {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"}, + {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"}, + {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"}, + {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"}, + { + "color": [0, 255, 133], + "id": 65, + "isthing": 1, + "name": "toilet, can, commode, crapper, pot, potty, stool, throne", + }, + {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"}, + {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"}, + {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"}, + {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"}, + {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"}, + {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"}, + {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"}, + {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"}, + {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"}, + {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"}, + {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"}, + {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"}, + {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"}, + {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"}, + {"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"}, + 
{"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"}, + {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"}, + {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"}, + {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"}, + {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"}, + {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"}, + {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"}, + {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"}, + {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"}, + {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"}, + {"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"}, + {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"}, + {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"}, + {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"}, + { + "color": [0, 122, 255], + "id": 95, + "isthing": 1, + "name": "bannister, banister, balustrade, balusters, handrail", + }, + { + "color": [0, 255, 163], + "id": 96, + "isthing": 0, + "name": "escalator, moving staircase, moving stairway", + }, + { + "color": [255, 153, 0], + "id": 97, + "isthing": 1, + "name": "ottoman, pouf, pouffe, puff, hassock", + }, + {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"}, + {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"}, + { + "color": [143, 255, 0], + "id": 100, + "isthing": 0, + "name": "poster, posting, placard, notice, bill, card", + }, + {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"}, + {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"}, + {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"}, + {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"}, + { + "color": [133, 0, 255], + "id": 105, + "isthing": 0, + "name": 
"conveyer belt, conveyor belt, conveyer, conveyor, transporter", + }, + {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"}, + { + "color": [184, 0, 255], + "id": 107, + "isthing": 1, + "name": "washer, automatic washer, washing machine", + }, + {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"}, + {"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"}, + {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"}, + {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"}, + {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"}, + {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"}, + {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"}, + {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"}, + {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"}, + {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"}, + {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"}, + {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"}, + {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"}, + {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"}, + {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"}, + {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"}, + {"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"}, + {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"}, + {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"}, + {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"}, + {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"}, + {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"}, + {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"}, + {"color": [20, 0, 255], "id": 131, 
"isthing": 0, "name": "blanket, cover"},
+    {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"},
+    {"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"},
+    {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"},
+    {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"},
+    {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"},
+    {"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"},
+    {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"},
+    {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"},
+    {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"},
+    {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"},
+    {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"},
+    {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"},
+    {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"},
+    {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"},
+    {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"},
+    {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"},
+    {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"},
+    {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"},
+]
+
+# Two-class toy category list; it is the metadata source for the "cc_3m_train"
+# dataset registered by register_cc_3m() below.
+TEST_CATEGORIES = [
+    {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "Oculus"},
+    {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "Ukulele"},
+]
+
+# Base/novel split of COCO_CATEGORIES for open-vocabulary experiments.
+# NOTE(review): the filter compares c["id"] - 1 against a fixed index list,
+# but COCO category ids above are non-contiguous (e.g. 12, 26, 29-30 are
+# absent), so id-1 is NOT the contiguous training index — confirm the split
+# list was authored against raw ids, not contiguous ids.
+COCO_BASE_CATEGORIES = [
+    c
+    for i, c in enumerate(COCO_CATEGORIES)
+    if c["id"] - 1
+    not in [20, 24, 32, 33, 40, 56, 86, 99, 105, 123, 144, 147, 148, 168, 171]
+]
+# Complement of COCO_BASE_CATEGORIES: the held-out ("novel") classes.
+COCO_NOVEL_CATEGORIES = [
+    c
+    for i, c in enumerate(COCO_CATEGORIES)
+    if c["id"] - 1
+    in [20, 24, 32, 33, 40, 56, 86, 99, 105, 123, 144, 147, 148, 168, 171]
+]
+
+
+def load_cc_image(csv_file, img_key='filepath', caption_key='title', sep="\t"):
+    """Read (image path, caption) records from a delimited file.
+
+    Args:
+        csv_file: path to a CSV/TSV file readable by pandas.
+        img_key: column holding image file paths.
+        caption_key: column holding the caption text.
+        sep: field separator (tab by default, i.e. a TSV file).
+
+    Returns a list of detectron2-style dataset dicts with keys
+    "file_name" and "caption".
+    """
+    print(f'Loading csv data from {csv_file}.')
+    df
= pd.read_csv(csv_file, sep=sep) + + input_files = df[img_key].tolist() + captions = df[caption_key].tolist() + + print("Loaded {} images".format(len(input_files))) + + dataset_dicts = [] + for (img_path, text) in zip(input_files, captions): + record = {} + record["file_name"] = img_path + record["caption"] = text + dataset_dicts.append(record) + + return dataset_dicts + + +def _get_coco_stuff_meta(cat_list): + # Id 0 is reserved for ignore_label, we change ignore_label for 0 + # to 255 in our pre-processing. + stuff_ids = [k["id"] for k in cat_list] + + # For semantic segmentation, this mapping maps from contiguous stuff id + # (in [0, 91], used in models) to ids in the dataset (used for processing results) + stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} + stuff_classes = [k["name"] for k in cat_list] + + ret = { + "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, + "stuff_classes": stuff_classes, + } + return ret + + +def register_cc_3m(csv_file): + + meta = _get_coco_stuff_meta(TEST_CATEGORIES) + name = "cc_3m_train" + + DatasetCatalog.register( + name, + lambda x=csv_file: load_cc_image(x), + ) + MetadataCatalog.get(name).set( + csv_file=csv_file, + evaluator_type="dummy", + ignore_label=255, + **meta, + ) + + +# _csv_file = "/home/jeffliang/zsseg/datasets/coco/coco_train_merge_captions.csv" +_csv_file = "/home/jeffliang/zsseg/configs/masked_images/pred/samples.csv" +register_cc_3m(_csv_file) diff --git a/open_vocab_seg/data/datasets/register_coco_stuff.py b/open_vocab_seg/data/datasets/register_coco_stuff.py new file mode 100644 index 0000000000000000000000000000000000000000..d1a0f5b571a971fe20ebc8932d27499de856a565 --- /dev/null +++ b/open_vocab_seg/data/datasets/register_coco_stuff.py @@ -0,0 +1,250 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + + +COCO_CATEGORIES = [ + {"color": [220, 20, 60], "isthing": 1, "id": 1, "name": "person"}, + {"color": [119, 11, 32], "isthing": 1, "id": 2, "name": "bicycle"}, + {"color": [0, 0, 142], "isthing": 1, "id": 3, "name": "car"}, + {"color": [0, 0, 230], "isthing": 1, "id": 4, "name": "motorcycle"}, + {"color": [106, 0, 228], "isthing": 1, "id": 5, "name": "airplane"}, + {"color": [0, 60, 100], "isthing": 1, "id": 6, "name": "bus"}, + {"color": [0, 80, 100], "isthing": 1, "id": 7, "name": "train"}, + {"color": [0, 0, 70], "isthing": 1, "id": 8, "name": "truck"}, + {"color": [0, 0, 192], "isthing": 1, "id": 9, "name": "boat"}, + {"color": [250, 170, 30], "isthing": 1, "id": 10, "name": "traffic light"}, + {"color": [100, 170, 30], "isthing": 1, "id": 11, "name": "fire hydrant"}, + {"color": [220, 220, 0], "isthing": 1, "id": 13, "name": "stop sign"}, + {"color": [175, 116, 175], "isthing": 1, "id": 14, "name": "parking meter"}, + {"color": [250, 0, 30], "isthing": 1, "id": 15, "name": "bench"}, + {"color": [165, 42, 42], "isthing": 1, "id": 16, "name": "bird"}, + {"color": [255, 77, 255], "isthing": 1, "id": 17, "name": "cat"}, + {"color": [0, 226, 252], "isthing": 1, "id": 18, "name": "dog"}, + {"color": [182, 182, 255], "isthing": 1, "id": 19, "name": "horse"}, + {"color": [0, 82, 0], "isthing": 1, "id": 20, "name": "sheep"}, + {"color": [120, 166, 157], "isthing": 1, "id": 21, "name": "cow"}, + {"color": [110, 76, 0], "isthing": 1, "id": 22, "name": "elephant"}, + {"color": [174, 57, 255], "isthing": 1, "id": 23, "name": "bear"}, + {"color": [199, 100, 0], "isthing": 1, "id": 24, "name": "zebra"}, + {"color": [72, 0, 118], "isthing": 1, "id": 25, "name": "giraffe"}, + {"color": [255, 179, 240], "isthing": 1, "id": 27, "name": "backpack"}, + {"color": [0, 125, 92], "isthing": 1, "id": 28, "name": "umbrella"}, + {"color": [209, 0, 151], 
"isthing": 1, "id": 31, "name": "handbag"}, + {"color": [188, 208, 182], "isthing": 1, "id": 32, "name": "tie"}, + {"color": [0, 220, 176], "isthing": 1, "id": 33, "name": "suitcase"}, + {"color": [255, 99, 164], "isthing": 1, "id": 34, "name": "frisbee"}, + {"color": [92, 0, 73], "isthing": 1, "id": 35, "name": "skis"}, + {"color": [133, 129, 255], "isthing": 1, "id": 36, "name": "snowboard"}, + {"color": [78, 180, 255], "isthing": 1, "id": 37, "name": "sports ball"}, + {"color": [0, 228, 0], "isthing": 1, "id": 38, "name": "kite"}, + {"color": [174, 255, 243], "isthing": 1, "id": 39, "name": "baseball bat"}, + {"color": [45, 89, 255], "isthing": 1, "id": 40, "name": "baseball glove"}, + {"color": [134, 134, 103], "isthing": 1, "id": 41, "name": "skateboard"}, + {"color": [145, 148, 174], "isthing": 1, "id": 42, "name": "surfboard"}, + {"color": [255, 208, 186], "isthing": 1, "id": 43, "name": "tennis racket"}, + {"color": [197, 226, 255], "isthing": 1, "id": 44, "name": "bottle"}, + {"color": [171, 134, 1], "isthing": 1, "id": 46, "name": "wine glass"}, + {"color": [109, 63, 54], "isthing": 1, "id": 47, "name": "cup"}, + {"color": [207, 138, 255], "isthing": 1, "id": 48, "name": "fork"}, + {"color": [151, 0, 95], "isthing": 1, "id": 49, "name": "knife"}, + {"color": [9, 80, 61], "isthing": 1, "id": 50, "name": "spoon"}, + {"color": [84, 105, 51], "isthing": 1, "id": 51, "name": "bowl"}, + {"color": [74, 65, 105], "isthing": 1, "id": 52, "name": "banana"}, + {"color": [166, 196, 102], "isthing": 1, "id": 53, "name": "apple"}, + {"color": [208, 195, 210], "isthing": 1, "id": 54, "name": "sandwich"}, + {"color": [255, 109, 65], "isthing": 1, "id": 55, "name": "orange"}, + {"color": [0, 143, 149], "isthing": 1, "id": 56, "name": "broccoli"}, + {"color": [179, 0, 194], "isthing": 1, "id": 57, "name": "carrot"}, + {"color": [209, 99, 106], "isthing": 1, "id": 58, "name": "hot dog"}, + {"color": [5, 121, 0], "isthing": 1, "id": 59, "name": "pizza"}, + {"color": [227, 
255, 205], "isthing": 1, "id": 60, "name": "donut"}, + {"color": [147, 186, 208], "isthing": 1, "id": 61, "name": "cake"}, + {"color": [153, 69, 1], "isthing": 1, "id": 62, "name": "chair"}, + {"color": [3, 95, 161], "isthing": 1, "id": 63, "name": "couch"}, + {"color": [163, 255, 0], "isthing": 1, "id": 64, "name": "potted plant"}, + {"color": [119, 0, 170], "isthing": 1, "id": 65, "name": "bed"}, + {"color": [0, 182, 199], "isthing": 1, "id": 67, "name": "dining table"}, + {"color": [0, 165, 120], "isthing": 1, "id": 70, "name": "toilet"}, + {"color": [183, 130, 88], "isthing": 1, "id": 72, "name": "tv"}, + {"color": [95, 32, 0], "isthing": 1, "id": 73, "name": "laptop"}, + {"color": [130, 114, 135], "isthing": 1, "id": 74, "name": "mouse"}, + {"color": [110, 129, 133], "isthing": 1, "id": 75, "name": "remote"}, + {"color": [166, 74, 118], "isthing": 1, "id": 76, "name": "keyboard"}, + {"color": [219, 142, 185], "isthing": 1, "id": 77, "name": "cell phone"}, + {"color": [79, 210, 114], "isthing": 1, "id": 78, "name": "microwave"}, + {"color": [178, 90, 62], "isthing": 1, "id": 79, "name": "oven"}, + {"color": [65, 70, 15], "isthing": 1, "id": 80, "name": "toaster"}, + {"color": [127, 167, 115], "isthing": 1, "id": 81, "name": "sink"}, + {"color": [59, 105, 106], "isthing": 1, "id": 82, "name": "refrigerator"}, + {"color": [142, 108, 45], "isthing": 1, "id": 84, "name": "book"}, + {"color": [196, 172, 0], "isthing": 1, "id": 85, "name": "clock"}, + {"color": [95, 54, 80], "isthing": 1, "id": 86, "name": "vase"}, + {"color": [128, 76, 255], "isthing": 1, "id": 87, "name": "scissors"}, + {"color": [201, 57, 1], "isthing": 1, "id": 88, "name": "teddy bear"}, + {"color": [246, 0, 122], "isthing": 1, "id": 89, "name": "hair drier"}, + {"color": [191, 162, 208], "isthing": 1, "id": 90, "name": "toothbrush"}, + {"id": 92, "name": "banner", "supercategory": "textile"}, + {"id": 93, "name": "blanket", "supercategory": "textile"}, + {"id": 94, "name": "branch", 
"supercategory": "plant"}, + {"id": 95, "name": "bridge", "supercategory": "building"}, + {"id": 96, "name": "building-other", "supercategory": "building"}, + {"id": 97, "name": "bush", "supercategory": "plant"}, + {"id": 98, "name": "cabinet", "supercategory": "furniture-stuff"}, + {"id": 99, "name": "cage", "supercategory": "structural"}, + {"id": 100, "name": "cardboard", "supercategory": "raw-material"}, + {"id": 101, "name": "carpet", "supercategory": "floor"}, + {"id": 102, "name": "ceiling-other", "supercategory": "ceiling"}, + {"id": 103, "name": "ceiling-tile", "supercategory": "ceiling"}, + {"id": 104, "name": "cloth", "supercategory": "textile"}, + {"id": 105, "name": "clothes", "supercategory": "textile"}, + {"id": 106, "name": "clouds", "supercategory": "sky"}, + {"id": 107, "name": "counter", "supercategory": "furniture-stuff"}, + {"id": 108, "name": "cupboard", "supercategory": "furniture-stuff"}, + {"id": 109, "name": "curtain", "supercategory": "textile"}, + {"id": 110, "name": "desk-stuff", "supercategory": "furniture-stuff"}, + {"id": 111, "name": "dirt", "supercategory": "ground"}, + {"id": 112, "name": "door-stuff", "supercategory": "furniture-stuff"}, + {"id": 113, "name": "fence", "supercategory": "structural"}, + {"id": 114, "name": "floor-marble", "supercategory": "floor"}, + {"id": 115, "name": "floor-other", "supercategory": "floor"}, + {"id": 116, "name": "floor-stone", "supercategory": "floor"}, + {"id": 117, "name": "floor-tile", "supercategory": "floor"}, + {"id": 118, "name": "floor-wood", "supercategory": "floor"}, + {"id": 119, "name": "flower", "supercategory": "plant"}, + {"id": 120, "name": "fog", "supercategory": "water"}, + {"id": 121, "name": "food-other", "supercategory": "food-stuff"}, + {"id": 122, "name": "fruit", "supercategory": "food-stuff"}, + {"id": 123, "name": "furniture-other", "supercategory": "furniture-stuff"}, + {"id": 124, "name": "grass", "supercategory": "plant"}, + {"id": 125, "name": "gravel", 
"supercategory": "ground"}, + {"id": 126, "name": "ground-other", "supercategory": "ground"}, + {"id": 127, "name": "hill", "supercategory": "solid"}, + {"id": 128, "name": "house", "supercategory": "building"}, + {"id": 129, "name": "leaves", "supercategory": "plant"}, + {"id": 130, "name": "light", "supercategory": "furniture-stuff"}, + {"id": 131, "name": "mat", "supercategory": "textile"}, + {"id": 132, "name": "metal", "supercategory": "raw-material"}, + {"id": 133, "name": "mirror-stuff", "supercategory": "furniture-stuff"}, + {"id": 134, "name": "moss", "supercategory": "plant"}, + {"id": 135, "name": "mountain", "supercategory": "solid"}, + {"id": 136, "name": "mud", "supercategory": "ground"}, + {"id": 137, "name": "napkin", "supercategory": "textile"}, + {"id": 138, "name": "net", "supercategory": "structural"}, + {"id": 139, "name": "paper", "supercategory": "raw-material"}, + {"id": 140, "name": "pavement", "supercategory": "ground"}, + {"id": 141, "name": "pillow", "supercategory": "textile"}, + {"id": 142, "name": "plant-other", "supercategory": "plant"}, + {"id": 143, "name": "plastic", "supercategory": "raw-material"}, + {"id": 144, "name": "platform", "supercategory": "ground"}, + {"id": 145, "name": "playingfield", "supercategory": "ground"}, + {"id": 146, "name": "railing", "supercategory": "structural"}, + {"id": 147, "name": "railroad", "supercategory": "ground"}, + {"id": 148, "name": "river", "supercategory": "water"}, + {"id": 149, "name": "road", "supercategory": "ground"}, + {"id": 150, "name": "rock", "supercategory": "solid"}, + {"id": 151, "name": "roof", "supercategory": "building"}, + {"id": 152, "name": "rug", "supercategory": "textile"}, + {"id": 153, "name": "salad", "supercategory": "food-stuff"}, + {"id": 154, "name": "sand", "supercategory": "ground"}, + {"id": 155, "name": "sea", "supercategory": "water"}, + {"id": 156, "name": "shelf", "supercategory": "furniture-stuff"}, + {"id": 157, "name": "sky-other", "supercategory": 
"sky"}, + {"id": 158, "name": "skyscraper", "supercategory": "building"}, + {"id": 159, "name": "snow", "supercategory": "ground"}, + {"id": 160, "name": "solid-other", "supercategory": "solid"}, + {"id": 161, "name": "stairs", "supercategory": "furniture-stuff"}, + {"id": 162, "name": "stone", "supercategory": "solid"}, + {"id": 163, "name": "straw", "supercategory": "plant"}, + {"id": 164, "name": "structural-other", "supercategory": "structural"}, + {"id": 165, "name": "table", "supercategory": "furniture-stuff"}, + {"id": 166, "name": "tent", "supercategory": "building"}, + {"id": 167, "name": "textile-other", "supercategory": "textile"}, + {"id": 168, "name": "towel", "supercategory": "textile"}, + {"id": 169, "name": "tree", "supercategory": "plant"}, + {"id": 170, "name": "vegetable", "supercategory": "food-stuff"}, + {"id": 171, "name": "wall-brick", "supercategory": "wall"}, + {"id": 172, "name": "wall-concrete", "supercategory": "wall"}, + {"id": 173, "name": "wall-other", "supercategory": "wall"}, + {"id": 174, "name": "wall-panel", "supercategory": "wall"}, + {"id": 175, "name": "wall-stone", "supercategory": "wall"}, + {"id": 176, "name": "wall-tile", "supercategory": "wall"}, + {"id": 177, "name": "wall-wood", "supercategory": "wall"}, + {"id": 178, "name": "water-other", "supercategory": "water"}, + {"id": 179, "name": "waterdrops", "supercategory": "water"}, + {"id": 180, "name": "window-blind", "supercategory": "window"}, + {"id": 181, "name": "window-other", "supercategory": "window"}, + {"id": 182, "name": "wood", "supercategory": "solid"}, +] + +def _get_coco_stuff_meta(cat_list): + # Id 0 is reserved for ignore_label, we change ignore_label for 0 + # to 255 in our pre-processing. 
+ stuff_ids = [k["id"] for k in cat_list] + + # For semantic segmentation, this mapping maps from contiguous stuff id + # (in [0, 91], used in models) to ids in the dataset (used for processing results) + stuff_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(stuff_ids)} + stuff_classes = [k["name"] for k in cat_list] + + ret = { + "stuff_dataset_id_to_contiguous_id": stuff_dataset_id_to_contiguous_id, + "stuff_classes": stuff_classes, + } + return ret + + +def register_all_coco_stuff_10k(root): + root = os.path.join(root, "coco", "coco_stuff_10k") + meta = _get_coco_stuff_meta(COCO_CATEGORIES) + for name, image_dirname, sem_seg_dirname in [ + ("train", "images_detectron2/train", "annotations_detectron2/train"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + name = f"coco_2017_{name}_stuff_10k_sem_seg" + DatasetCatalog.register( + name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="png", image_ext="jpg" + ), + ) + MetadataCatalog.get(name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=255, + **meta, + ) + + +def register_all_coco_stuff(root): + root = os.path.join(root, "coco") + meta = _get_coco_stuff_meta(COCO_CATEGORIES) + + for name, image_dirname, sem_seg_dirname in [ + ("train", "train2017", "stuffthingmaps_detectron2/train2017"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + all_name = f"coco_2017_{name}_stuff_sem_seg" + DatasetCatalog.register( + all_name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="png", image_ext="jpg" + ), + ) + MetadataCatalog.get(all_name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=255, + **meta, + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_coco_stuff_10k(_root) +register_all_coco_stuff(_root) diff --git 
a/open_vocab_seg/data/datasets/register_pascal_context.py b/open_vocab_seg/data/datasets/register_pascal_context.py new file mode 100644 index 0000000000000000000000000000000000000000..e40f87c945da20e78c0a3ea230bc9f36d1800071 --- /dev/null +++ b/open_vocab_seg/data/datasets/register_pascal_context.py @@ -0,0 +1,588 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + +PASCALCONTEX59_NAMES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "table", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", + "bag", + "bed", + "bench", + "book", + "building", + "cabinet", + "ceiling", + "cloth", + "computer", + "cup", + "door", + "fence", + "floor", + "flower", + "food", + "grass", + "ground", + "keyboard", + "light", + "mountain", + "mouse", + "curtain", + "platform", + "sign", + "plate", + "road", + "rock", + "shelves", + "sidewalk", + "sky", + "snow", + "bedclothes", + "track", + "tree", + "truck", + "wall", + "water", + "window", + "wood", +) + +PASCALCONTEX459_NAMES = ( + "accordion", + "aeroplane", + "air conditioner", + "antenna", + "artillery", + "ashtray", + "atrium", + "baby carriage", + "bag", + "ball", + "balloon", + "bamboo weaving", + "barrel", + "baseball bat", + "basket", + "basketball backboard", + "bathtub", + "bed", + "bedclothes", + "beer", + "bell", + "bench", + "bicycle", + "binoculars", + "bird", + "bird cage", + "bird feeder", + "bird nest", + "blackboard", + "board", + "boat", + "bone", + "book", + "bottle", + "bottle opener", + "bowl", + "box", + "bracelet", + "brick", + "bridge", + "broom", + "brush", + "bucket", + "building", + "bus", + "cabinet", + "cabinet door", + "cage", + "cake", + "calculator", + "calendar", + "camel", + "camera", + "camera lens", + "can", + "candle", + "candle holder", + "cap", 
+ "car", + "card", + "cart", + "case", + "casette recorder", + "cash register", + "cat", + "cd", + "cd player", + "ceiling", + "cell phone", + "cello", + "chain", + "chair", + "chessboard", + "chicken", + "chopstick", + "clip", + "clippers", + "clock", + "closet", + "cloth", + "clothes tree", + "coffee", + "coffee machine", + "comb", + "computer", + "concrete", + "cone", + "container", + "control booth", + "controller", + "cooker", + "copying machine", + "coral", + "cork", + "corkscrew", + "counter", + "court", + "cow", + "crabstick", + "crane", + "crate", + "cross", + "crutch", + "cup", + "curtain", + "cushion", + "cutting board", + "dais", + "disc", + "disc case", + "dishwasher", + "dock", + "dog", + "dolphin", + "door", + "drainer", + "dray", + "drink dispenser", + "drinking machine", + "drop", + "drug", + "drum", + "drum kit", + "duck", + "dumbbell", + "earphone", + "earrings", + "egg", + "electric fan", + "electric iron", + "electric pot", + "electric saw", + "electronic keyboard", + "engine", + "envelope", + "equipment", + "escalator", + "exhibition booth", + "extinguisher", + "eyeglass", + "fan", + "faucet", + "fax machine", + "fence", + "ferris wheel", + "fire extinguisher", + "fire hydrant", + "fire place", + "fish", + "fish tank", + "fishbowl", + "fishing net", + "fishing pole", + "flag", + "flagstaff", + "flame", + "flashlight", + "floor", + "flower", + "fly", + "foam", + "food", + "footbridge", + "forceps", + "fork", + "forklift", + "fountain", + "fox", + "frame", + "fridge", + "frog", + "fruit", + "funnel", + "furnace", + "game controller", + "game machine", + "gas cylinder", + "gas hood", + "gas stove", + "gift box", + "glass", + "glass marble", + "globe", + "glove", + "goal", + "grandstand", + "grass", + "gravestone", + "ground", + "guardrail", + "guitar", + "gun", + "hammer", + "hand cart", + "handle", + "handrail", + "hanger", + "hard disk drive", + "hat", + "hay", + "headphone", + "heater", + "helicopter", + "helmet", + "holder", + "hook", + 
"horse", + "horse-drawn carriage", + "hot-air balloon", + "hydrovalve", + "ice", + "inflator pump", + "ipod", + "iron", + "ironing board", + "jar", + "kart", + "kettle", + "key", + "keyboard", + "kitchen range", + "kite", + "knife", + "knife block", + "ladder", + "ladder truck", + "ladle", + "laptop", + "leaves", + "lid", + "life buoy", + "light", + "light bulb", + "lighter", + "line", + "lion", + "lobster", + "lock", + "machine", + "mailbox", + "mannequin", + "map", + "mask", + "mat", + "match book", + "mattress", + "menu", + "metal", + "meter box", + "microphone", + "microwave", + "mirror", + "missile", + "model", + "money", + "monkey", + "mop", + "motorbike", + "mountain", + "mouse", + "mouse pad", + "musical instrument", + "napkin", + "net", + "newspaper", + "oar", + "ornament", + "outlet", + "oven", + "oxygen bottle", + "pack", + "pan", + "paper", + "paper box", + "paper cutter", + "parachute", + "parasol", + "parterre", + "patio", + "pelage", + "pen", + "pen container", + "pencil", + "person", + "photo", + "piano", + "picture", + "pig", + "pillar", + "pillow", + "pipe", + "pitcher", + "plant", + "plastic", + "plate", + "platform", + "player", + "playground", + "pliers", + "plume", + "poker", + "poker chip", + "pole", + "pool table", + "postcard", + "poster", + "pot", + "pottedplant", + "printer", + "projector", + "pumpkin", + "rabbit", + "racket", + "radiator", + "radio", + "rail", + "rake", + "ramp", + "range hood", + "receiver", + "recorder", + "recreational machines", + "remote control", + "road", + "robot", + "rock", + "rocket", + "rocking horse", + "rope", + "rug", + "ruler", + "runway", + "saddle", + "sand", + "saw", + "scale", + "scanner", + "scissors", + "scoop", + "screen", + "screwdriver", + "sculpture", + "scythe", + "sewer", + "sewing machine", + "shed", + "sheep", + "shell", + "shelves", + "shoe", + "shopping cart", + "shovel", + "sidecar", + "sidewalk", + "sign", + "signal light", + "sink", + "skateboard", + "ski", + "sky", + "sled", + 
"slippers", + "smoke", + "snail", + "snake", + "snow", + "snowmobiles", + "sofa", + "spanner", + "spatula", + "speaker", + "speed bump", + "spice container", + "spoon", + "sprayer", + "squirrel", + "stage", + "stair", + "stapler", + "stick", + "sticky note", + "stone", + "stool", + "stove", + "straw", + "stretcher", + "sun", + "sunglass", + "sunshade", + "surveillance camera", + "swan", + "sweeper", + "swim ring", + "swimming pool", + "swing", + "switch", + "table", + "tableware", + "tank", + "tap", + "tape", + "tarp", + "telephone", + "telephone booth", + "tent", + "tire", + "toaster", + "toilet", + "tong", + "tool", + "toothbrush", + "towel", + "toy", + "toy car", + "track", + "train", + "trampoline", + "trash bin", + "tray", + "tree", + "tricycle", + "tripod", + "trophy", + "truck", + "tube", + "turtle", + "tvmonitor", + "tweezers", + "typewriter", + "umbrella", + "unknown", + "vacuum cleaner", + "vending machine", + "video camera", + "video game console", + "video player", + "video tape", + "violin", + "wakeboard", + "wall", + "wallet", + "wardrobe", + "washing machine", + "watch", + "water", + "water dispenser", + "water pipe", + "water skate board", + "watermelon", + "whale", + "wharf", + "wheel", + "wheelchair", + "window", + "window blinds", + "wineglass", + "wire", + "wood", + "wool", + +) + + +def _get_voc_meta(cat_list): + ret = { + "stuff_classes": cat_list, + } + return ret + + +def register_pascal_context_59(root): + root = os.path.join(root, "VOCdevkit/VOC2010") + meta = _get_voc_meta(PASCALCONTEX59_NAMES) + for name, image_dirname, sem_seg_dirname in [ + ("val", "JPEGImages", "annotations_detectron2/pc59_val"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + all_name = f"pascal_context_59_sem_seg_{name}" + DatasetCatalog.register( + all_name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="png", image_ext="jpg" + ), + ) + MetadataCatalog.get(all_name).set( + image_root=image_dir, + 
sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=255, + **meta, + ) + +def register_pascal_context_459(root): + root = os.path.join(root, "VOCdevkit/VOC2010") + meta = _get_voc_meta(PASCALCONTEX459_NAMES) + for name, image_dirname, sem_seg_dirname in [ + ("val", "JPEGImages", "annotations_detectron2/pc459_val"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + all_name = f"pascal_context_459_sem_seg_{name}" + DatasetCatalog.register( + all_name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="tif", image_ext="jpg" + ), + ) + MetadataCatalog.get(all_name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=65535, # NOTE: gt is saved in 16-bit TIFF images + **meta, + ) + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_pascal_context_59(_root) +register_pascal_context_459(_root) diff --git a/open_vocab_seg/data/datasets/register_voc_seg.py b/open_vocab_seg/data/datasets/register_voc_seg.py new file mode 100644 index 0000000000000000000000000000000000000000..b8c2be16f4bb5348de8f1051f3579e02e362488f --- /dev/null +++ b/open_vocab_seg/data/datasets/register_voc_seg.py @@ -0,0 +1,62 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+import os + +from detectron2.data import DatasetCatalog, MetadataCatalog +from detectron2.data.datasets import load_sem_seg + +PASCALVOC20_NAMES = ( + "aeroplane", + "bicycle", + "bird", + "boat", + "bottle", + "bus", + "car", + "cat", + "chair", + "cow", + "diningtable", + "dog", + "horse", + "motorbike", + "person", + "pottedplant", + "sheep", + "sofa", + "train", + "tvmonitor", +) + +def _get_voc_meta(cat_list): + ret = { + "stuff_classes": cat_list, + } + return ret + + +def register_pascalvoc(root): + root = os.path.join(root, "VOCdevkit/VOC2012") + meta = _get_voc_meta(PASCALVOC20_NAMES) + + for name, image_dirname, sem_seg_dirname in [ + ("val", "JPEGImages", "annotations_detectron2/val"), + ]: + image_dir = os.path.join(root, image_dirname) + gt_dir = os.path.join(root, sem_seg_dirname) + all_name = f"pascalvoc20_sem_seg_{name}" + DatasetCatalog.register( + all_name, + lambda x=image_dir, y=gt_dir: load_sem_seg( + y, x, gt_ext="png", image_ext="jpg" + ), + ) + MetadataCatalog.get(all_name).set( + image_root=image_dir, + sem_seg_root=gt_dir, + evaluator_type="sem_seg", + ignore_label=255, + **meta, + ) + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_pascalvoc(_root) diff --git a/open_vocab_seg/evaluation/__init__.py b/open_vocab_seg/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b9d36d8e9659a1d31471273a6a0f82c2642ea982 --- /dev/null +++ b/open_vocab_seg/evaluation/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

import itertools
import json
import numpy as np
import os
from collections import OrderedDict
import PIL.Image as Image
import torch

from detectron2.data import DatasetCatalog, MetadataCatalog
from detectron2.utils.comm import all_gather, is_main_process, synchronize
from detectron2.utils.file_io import PathManager

from detectron2.evaluation import SemSegEvaluator


class GeneralizedSemSegEvaluator(SemSegEvaluator):
    """
    Evaluate semantic segmentation metrics.

    Extends detectron2's SemSegEvaluator with an optional post-processing
    hook applied to each prediction and optional per-subset metric
    breakdowns driven by the dataset's ``evaluation_set`` metadata
    (e.g. seen vs. unseen categories in open-vocabulary settings).
    """

    def __init__(
        self,
        dataset_name,
        distributed=True,
        output_dir=None,
        *,
        num_classes=None,
        ignore_label=None,
        post_process_func=None,
    ):
        """
        Args:
            dataset_name (str): name of a registered dataset.
            distributed (bool): gather results from all ranks before evaluating.
            output_dir (str): optional directory for prediction/eval dumps.
            num_classes, ignore_label: forwarded to SemSegEvaluator.
            post_process_func (callable): optional ``f(sem_seg, image=...)``
                applied to each raw prediction; defaults to identity.
        """
        super().__init__(
            dataset_name,
            distributed=distributed,
            output_dir=output_dir,
            num_classes=num_classes,
            ignore_label=ignore_label,
        )
        meta = MetadataCatalog.get(dataset_name)
        # `evaluation_set` (optional metadata) maps a set name to the class
        # indices belonging to that set; absent for ordinary datasets.
        try:
            self._evaluation_set = meta.evaluation_set
        except AttributeError:
            self._evaluation_set = None
        self.post_process_func = (
            post_process_func
            if post_process_func is not None
            else lambda x, **kwargs: x
        )

    def process(self, inputs, outputs):
        """
        Args:
            inputs: the inputs to a model.
                It is a list of dicts. Each dict corresponds to an image and
                contains keys like "height", "width", "file_name".
            outputs: the outputs of a model. It is either list of semantic
                segmentation predictions (Tensor [H, W]) or list of dicts with
                key "sem_seg" that contains semantic segmentation prediction
                in the same format.
        """
        for input, output in zip(inputs, outputs):
            output = self.post_process_func(
                output["sem_seg"], image=np.array(Image.open(input["file_name"]))
            )
            output = output.argmax(dim=0).to(self._cpu_device)
            # NOTE: np.int/np.float/np.bool aliases were removed in NumPy 1.24;
            # the builtin types are exact drop-in replacements here.
            pred = np.array(output, dtype=int)
            with PathManager.open(
                self.input_file_to_gt_file[input["file_name"]], "rb"
            ) as f:
                gt = np.array(Image.open(f), dtype=int)

            # Ignored pixels are folded into an extra (num_classes-th) bin of
            # the confusion matrix so they drop out of the per-class stats.
            gt[gt == self._ignore_label] = self._num_classes

            self._conf_matrix += np.bincount(
                (self._num_classes + 1) * pred.reshape(-1) + gt.reshape(-1),
                minlength=self._conf_matrix.size,
            ).reshape(self._conf_matrix.shape)

            self._predictions.extend(self.encode_json_sem_seg(pred, input["file_name"]))

    def evaluate(self):
        """
        Evaluates standard semantic segmentation metrics (http://cocodataset.org/#stuff-eval):

        * Mean intersection-over-union averaged across classes (mIoU)
        * Frequency Weighted IoU (fwIoU)
        * Mean pixel accuracy averaged across classes (mACC)
        * Pixel Accuracy (pACC)

        Returns:
            OrderedDict with a "sem_seg" entry mapping metric names to values,
            or None on non-main ranks in distributed mode.
        """
        if self._distributed:
            synchronize()
            conf_matrix_list = all_gather(self._conf_matrix)
            self._predictions = all_gather(self._predictions)
            self._predictions = list(itertools.chain(*self._predictions))
            if not is_main_process():
                return

            self._conf_matrix = np.zeros_like(self._conf_matrix)
            for conf_matrix in conf_matrix_list:
                self._conf_matrix += conf_matrix

        if self._output_dir:
            PathManager.mkdirs(self._output_dir)
            file_path = os.path.join(self._output_dir, "sem_seg_predictions.json")
            with PathManager.open(file_path, "w") as f:
                f.write(json.dumps(self._predictions))

        # The last row/column of the confusion matrix holds ignored pixels and
        # is excluded from all statistics.
        acc = np.full(self._num_classes, np.nan, dtype=float)
        iou = np.full(self._num_classes, np.nan, dtype=float)
        tp = self._conf_matrix.diagonal()[:-1].astype(float)
        pos_gt = np.sum(self._conf_matrix[:-1, :-1], axis=0).astype(float)
        class_weights = pos_gt / np.sum(pos_gt)
        pos_pred = np.sum(self._conf_matrix[:-1, :-1], axis=1).astype(float)
        acc_valid = pos_gt > 0
        acc[acc_valid] = tp[acc_valid] / pos_gt[acc_valid]
        iou_valid = (pos_gt + pos_pred) > 0
        union = pos_gt + pos_pred - tp
        # NOTE(review): indexing with acc_valid (not iou_valid) mirrors
        # detectron2's SemSegEvaluator; kept as-is for parity.
        iou[acc_valid] = tp[acc_valid] / union[acc_valid]
        macc = np.sum(acc[acc_valid]) / np.sum(acc_valid)
        miou = np.sum(iou[acc_valid]) / np.sum(iou_valid)
        fiou = np.sum(iou[acc_valid] * class_weights[acc_valid])
        pacc = np.sum(tp) / np.sum(pos_gt)

        res = {}
        res["mIoU"] = 100 * miou
        res["fwIoU"] = 100 * fiou
        for i, name in enumerate(self._class_names):
            res["IoU-{}".format(name)] = 100 * iou[i]
        res["mACC"] = 100 * macc
        res["pACC"] = 100 * pacc
        for i, name in enumerate(self._class_names):
            res["ACC-{}".format(name)] = 100 * acc[i]
        if self._evaluation_set is not None:
            # Report metrics separately for each named subset and its
            # complement ("un<set>"), plus their harmonic mean (hIoU).
            for set_name, set_inds in self._evaluation_set.items():
                iou_list = []
                set_inds = np.array(set_inds, dtype=int)
                mask = np.zeros((len(iou),)).astype(bool)
                mask[set_inds] = 1
                miou = np.sum(iou[mask][acc_valid[mask]]) / np.sum(iou_valid[mask])
                pacc = np.sum(tp[mask]) / np.sum(pos_gt[mask])
                res["mIoU-{}".format(set_name)] = 100 * miou
                res["pAcc-{}".format(set_name)] = 100 * pacc
                iou_list.append(miou)
                miou = np.sum(iou[~mask][acc_valid[~mask]]) / np.sum(iou_valid[~mask])
                pacc = np.sum(tp[~mask]) / np.sum(pos_gt[~mask])
                res["mIoU-un{}".format(set_name)] = 100 * miou
                res["pAcc-un{}".format(set_name)] = 100 * pacc
                iou_list.append(miou)
                res["hIoU-{}".format(set_name)] = (
                    100 * len(iou_list) / sum([1 / iou for iou in iou_list])
                )
        if self._output_dir:
            file_path = os.path.join(self._output_dir, "sem_seg_evaluation.pth")
            with PathManager.open(file_path, "wb") as f:
                torch.save(res, f)
        results = OrderedDict({"sem_seg": res})
        self._logger.info(results)
        return results
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

from typing import Tuple

import torch
from torch import nn
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head
from detectron2.modeling.backbone import Backbone
from detectron2.modeling.postprocessing import sem_seg_postprocess
from detectron2.structures import ImageList

from .modeling.criterion import SetCriterion
from .modeling.matcher import HungarianMatcher


@META_ARCH_REGISTRY.register()
class MaskFormer(nn.Module):
    """
    Main class for mask classification semantic segmentation architectures.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        criterion: nn.Module,
        num_queries: int,
        panoptic_on: bool,
        object_mask_threshold: float,
        overlap_threshold: float,
        metadata,
        size_divisibility: int,
        sem_seg_postprocess_before_inference: bool,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            criterion: a module that defines the loss
            num_queries: int, number of queries
            panoptic_on: bool, whether to output panoptic segmentation prediction
            object_mask_threshold: float, threshold to filter query based on classification score
                for panoptic segmentation inference
            overlap_threshold: overlap threshold used in general inference for panoptic segmentation
            metadata: dataset meta, get `thing` and `stuff` category names for panoptic
                segmentation inference
            size_divisibility: Some backbones require the input height and width to be divisible by a
                specific integer. We can use this to override such requirement.
            sem_seg_postprocess_before_inference: whether to resize the prediction back
                to original input size before semantic segmentation inference or after.
                For high-resolution dataset like Mapillary, resizing predictions before
                inference will cause OOM error.
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__()
        self.backbone = backbone
        self.sem_seg_head = sem_seg_head
        self.criterion = criterion
        self.num_queries = num_queries
        self.overlap_threshold = overlap_threshold
        self.panoptic_on = panoptic_on
        self.object_mask_threshold = object_mask_threshold
        self.metadata = metadata
        if size_divisibility < 0:
            # use backbone size_divisibility if not set
            size_divisibility = self.backbone.size_divisibility
        self.size_divisibility = size_divisibility
        self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference
        # Registered as (non-persistent) buffers so they follow the module's
        # device/dtype moves without being saved into checkpoints.
        self.register_buffer(
            "pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False
        )
        self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False)

    @classmethod
    def from_config(cls, cfg):
        """Translate a detectron2 config node into constructor kwargs."""
        backbone = build_backbone(cfg)
        sem_seg_head = build_sem_seg_head(cfg, backbone.output_shape())

        # Loss parameters:
        deep_supervision = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        no_object_weight = cfg.MODEL.MASK_FORMER.NO_OBJECT_WEIGHT
        dice_weight = cfg.MODEL.MASK_FORMER.DICE_WEIGHT
        mask_weight = cfg.MODEL.MASK_FORMER.MASK_WEIGHT

        # building criterion
        matcher = HungarianMatcher(
            cost_class=1,
            cost_mask=mask_weight,
            cost_dice=dice_weight,
        )

        weight_dict = {"loss_ce": 1, "loss_mask": mask_weight, "loss_dice": dice_weight}
        if deep_supervision:
            # Replicate each loss weight once per auxiliary decoder layer
            # (keys are suffixed "_0", "_1", ... to match the criterion).
            dec_layers = cfg.MODEL.MASK_FORMER.DEC_LAYERS
            aux_weight_dict = {}
            for i in range(dec_layers - 1):
                aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
            weight_dict.update(aux_weight_dict)

        losses = ["labels", "masks"]

        criterion = SetCriterion(
            sem_seg_head.num_classes,
            matcher=matcher,
            weight_dict=weight_dict,
            eos_coef=no_object_weight,
            losses=losses,
        )

        return {
            "backbone": backbone,
            "sem_seg_head": sem_seg_head,
            "criterion": criterion,
            "num_queries": cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES,
            "panoptic_on": cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON,
            "object_mask_threshold": cfg.MODEL.MASK_FORMER.TEST.OBJECT_MASK_THRESHOLD,
            "overlap_threshold": cfg.MODEL.MASK_FORMER.TEST.OVERLAP_THRESHOLD,
            "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]),
            "size_divisibility": cfg.MODEL.MASK_FORMER.SIZE_DIVISIBILITY,
            "sem_seg_postprocess_before_inference": (
                cfg.MODEL.MASK_FORMER.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE
                or cfg.MODEL.MASK_FORMER.TEST.PANOPTIC_ON
            ),
            "pixel_mean": cfg.MODEL.PIXEL_MEAN,
            "pixel_std": cfg.MODEL.PIXEL_STD,
        }

    @property
    def device(self):
        """Device the model currently lives on (tracked via the mean buffer)."""
        return self.pixel_mean.device

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item in the list contains the inputs for one image.
                For now, each item in the list is a dict that contains:
                   * "image": Tensor, image in (C, H, W) format.
                   * "instances": per-region ground truth
                   * Other information that's included in the original dicts, such as:
                     "height", "width" (int): the output resolution of the model (may be different
                     from input resolution), used in inference.
        Returns:
            list[dict]:
                each dict has the results for one image. The dict contains the following keys:

                * "sem_seg":
                    A Tensor that represents the
                    per-pixel segmentation prediced by the head.
                    The prediction has shape KxHxW that represents the logits of
                    each class for each pixel.
                * "panoptic_seg":
                    A tuple that represent panoptic output
                    panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment.
                    segments_info (list[dict]): Describe each segment in `panoptic_seg`.
                    Each dict contains keys "id", "category_id", "isthing".
        """
        # Normalize each image, then pad to a common size respecting the
        # backbone's size divisibility.
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.size_divisibility)

        features = self.backbone(images.tensor)
        outputs = self.sem_seg_head(features)

        if self.training:
            # mask classification target
            if "instances" in batched_inputs[0]:
                gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
                targets = self.prepare_targets(gt_instances, images)
            else:
                targets = None

            # bipartite matching-based loss
            losses = self.criterion(outputs, targets)

            for k in list(losses.keys()):
                if k in self.criterion.weight_dict:
                    losses[k] *= self.criterion.weight_dict[k]
                else:
                    # remove this loss if not specified in `weight_dict`
                    losses.pop(k)

            return losses
        else:
            mask_cls_results = outputs["pred_logits"]
            mask_pred_results = outputs["pred_masks"]
            # upsample masks
            mask_pred_results = F.interpolate(
                mask_pred_results,
                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
                mode="bilinear",
                align_corners=False,
            )

            processed_results = []
            for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
                mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
            ):
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])

                if self.sem_seg_postprocess_before_inference:
                    mask_pred_result = sem_seg_postprocess(
                        mask_pred_result, image_size, height, width
                    )

                # semantic segmentation inference
                r = self.semantic_inference(mask_cls_result, mask_pred_result)
                if not self.sem_seg_postprocess_before_inference:
                    r = sem_seg_postprocess(r, image_size, height, width)
                processed_results.append({"sem_seg": r})

                # panoptic segmentation inference
                if self.panoptic_on:
                    panoptic_r = self.panoptic_inference(
                        mask_cls_result, mask_pred_result
                    )
                    processed_results[-1]["panoptic_seg"] = panoptic_r

            return processed_results

    def prepare_targets(self, targets, images):
        """Pad per-image ground-truth masks to the padded batch resolution.

        Returns one dict per image with "labels" (class ids) and "masks"
        (masks zero-padded to the batch's padded H x W).
        """
        h, w = images.tensor.shape[-2:]
        new_targets = []
        for targets_per_image in targets:
            # pad gt
            gt_masks = targets_per_image.gt_masks
            padded_masks = torch.zeros(
                (gt_masks.shape[0], h, w), dtype=gt_masks.dtype, device=gt_masks.device
            )
            padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks
            new_targets.append(
                {
                    "labels": targets_per_image.gt_classes,
                    "masks": padded_masks,
                }
            )
        return new_targets

    def semantic_inference(self, mask_cls, mask_pred):
        """Combine per-query class scores and masks into per-pixel class logits.

        Drops the last ("no object") class column, then marginalizes over the
        queries: semseg[c, h, w] = sum_q cls[q, c] * mask[q, h, w].
        """
        mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1]
        mask_pred = mask_pred.sigmoid()
        semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred)
        return semseg
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

from collections import OrderedDict
import torch
import torch.nn as nn
from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec


class Bottleneck(nn.Module):
    """CLIP-style anti-aliased ResNet bottleneck block."""

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1):
        """
        Args:
            inplanes (int): number of input channels.
            planes (int): bottleneck width; the block outputs
                ``planes * expansion`` channels.
            stride (int): spatial stride, realized via an AvgPool2d
                (anti-aliasing) rather than a strided convolution.
            dilation (int): dilation of the 3x3 convolution.
        """
        super().__init__()

        # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(
            planes, planes, 3, padding=1 * dilation, bias=False, dilation=dilation
        )
        self.bn2 = nn.BatchNorm2d(planes)

        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

        self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * self.expansion)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        if stride > 1 or inplanes != planes * Bottleneck.expansion:
            # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
            self.downsample = nn.Sequential(
                OrderedDict(
                    [
                        ("-1", nn.AvgPool2d(stride)),
                        (
                            "0",
                            nn.Conv2d(
                                inplanes,
                                planes * self.expansion,
                                1,
                                stride=1,
                                bias=False,
                            ),
                        ),
                        ("1", nn.BatchNorm2d(planes * self.expansion)),
                    ]
                )
            )

    def forward(self, x: torch.Tensor):
        identity = x

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.avgpool(out)
        out = self.bn3(self.conv3(out))

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)
        return out


class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    # NOTE: default arguments are tuples (immutable) instead of lists to avoid
    # the shared-mutable-default pitfall; they are only read, never mutated.
    def __init__(self, layers, width=64, strides=(2, 1, 2, 2, 2), multi_grid=(1, 1, 1)):
        """
        Args:
            layers (sequence[int]): number of blocks in each of the 4 stages.
            width (int): base channel width of the network.
            strides (sequence[int]): strides of [stem, layer1..layer4].
            multi_grid (sequence[int]): per-block dilations for layer4.
        """
        super().__init__()

        # the 3-layer stem
        self.conv1 = nn.Conv2d(
            3, width // 2, kernel_size=3, stride=2, padding=1, bias=False
        )
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.conv2 = nn.Conv2d(
            width // 2, width // 2, kernel_size=3, padding=1, bias=False
        )
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(strides[0]) if strides[0] > 1 else nn.Identity()
        self.relu = nn.ReLU(inplace=True)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0], stride=strides[1])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=strides[2])
        self.layer3 = self._make_layer(width * 4, layers[2], stride=strides[3])
        self.layer4 = self._make_layer(
            width * 8, layers[3], stride=strides[4], dilations=multi_grid
        )
        self.num_features = [width * 4, width * 8, width * 16, width * 32]

    def _make_layer(self, planes, blocks, stride=1, dilations=None):
        """Stack `blocks` Bottlenecks; only the first one carries the stride."""
        if dilations is None:
            dilations = [1] * blocks
        layers = [Bottleneck(self._inplanes, planes, stride, dilation=dilations[0])]
        self._inplanes = planes * Bottleneck.expansion

        for i in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes, dilation=dilations[i]))

        return nn.Sequential(*layers)

    def forward(self, x):
        """Return a dict of multi-scale features {"res2": ..., ..., "res5": ...}."""

        def stem(x):
            for conv, bn in [
                (self.conv1, self.bn1),
                (self.conv2, self.bn2),
                (self.conv3, self.bn3),
            ]:
                x = self.relu(bn(conv(x)))
            x = self.avgpool(x)
            return x

        output = {}
        # Match the stem weights' dtype (e.g. fp16 backbones).
        x = x.type(self.conv1.weight.dtype)
        x = stem(x)  # 1/4,1/4
        x = self.layer1(x)
        output["res2"] = x
        x = self.layer2(x)  # 1/8,1/8
        output["res3"] = x
        x = self.layer3(x)  # 1/16,1/16
        output["res4"] = x
        x = self.layer4(x)  # 1/32,1/32
        output["res5"] = x
        return output


@BACKBONE_REGISTRY.register()
class D2ModifiedResNet(ModifiedResNet, Backbone):
    """detectron2 Backbone wrapper around ModifiedResNet, built from a config."""

    def __init__(self, cfg, input_shape):
        depth = cfg.MODEL.RESNETS.DEPTH
        num_groups = cfg.MODEL.RESNETS.NUM_GROUPS
        width_per_group = cfg.MODEL.RESNETS.WIDTH_PER_GROUP
        bottleneck_channels = num_groups * width_per_group
        num_blocks_per_stage = {
            18: [2, 2, 2, 2],
            34: [3, 4, 6, 3],
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }[depth]
        strides = [2, 1, 2, 2, 2]
        multi_grid = cfg.MODEL.RESNETS.RES5_MULTI_GRID
        if cfg.MODEL.RESNETS.STEM_TYPE == "deeplab":
            # DeepLab-style stem keeps full resolution after the stem.
            strides = [1, 1, 2, 2, 2]
        super().__init__(
            num_blocks_per_stage,
            bottleneck_channels,
            strides=strides,
            multi_grid=multi_grid,
        )
        self._out_features = cfg.MODEL.RESNETS.OUT_FEATURES

        self._out_feature_strides = {
            "res2": 4,
            "res3": 8,
            "res4": 16,
            "res5": 32,
        }
        self._out_feature_channels = {
            "res2": self.num_features[0],
            "res3": self.num_features[1],
            "res4": self.num_features[2],
            "res5": self.num_features[3],
        }

    def forward(self, x):
        """
        Args:
            x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``.
        Returns:
            dict[str->Tensor]: names and the corresponding features
        """
        outputs = {}
        y = super().forward(x)
        for k in y.keys():
            if k in self._out_features:
                outputs[k] = y[k]
        return outputs

    def output_shape(self):
        """Describe channel count and stride of each requested output feature."""
        return {
            name: ShapeSpec(
                channels=self._out_feature_channels[name],
                stride=self._out_feature_strides[name],
            )
            for name in self._out_features
        }

    @property
    def size_divisibility(self):
        # The deepest stride produced by the network.
        return 32
All Rights Reserved

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.checkpoint as checkpoint
from timm.models.layers import DropPath, to_2tuple, trunc_normal_

from detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec


class Mlp(nn.Module):
    """Two-layer feed-forward network (Linear -> act -> Linear).

    Used as the FFN inside each Swin Transformer block; the same dropout
    module is applied after both projections.
    """

    def __init__(
        self,
        in_features,
        hidden_features=None,
        out_features=None,
        act_layer=nn.GELU,
        drop=0.0,
    ):
        super().__init__()
        # hidden/output widths default to the input width when not given
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


def window_partition(x, window_size):
    """Split a feature map into non-overlapping square windows.

    Args:
        x: (B, H, W, C); H and W must be divisible by ``window_size``
        window_size (int): window size
    Returns:
        windows: (num_windows*B, window_size, window_size, C)
    """
    B, H, W, C = x.shape
    x = x.view(B, H // window_size, window_size, W // window_size, window_size, C)
    windows = (
        x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C)
    )
    return windows


def window_reverse(windows, window_size, H, W):
    """Inverse of :func:`window_partition`: stitch windows back into a map.

    Args:
        windows: (num_windows*B, window_size, window_size, C)
        window_size (int): Window size
        H (int): Height of image
        W (int): Width of image
    Returns:
        x: (B, H, W, C)
    """
    # recover the batch size from the total number of windows
    B = int(windows.shape[0] / (H * W / window_size / window_size))
    x = windows.view(
        B, H // window_size, W // window_size, window_size, window_size, -1
    )
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1)
    return x


class WindowAttention(nn.Module):
    """Window based multi-head self attention (W-MSA) module with relative position bias.
    It supports both of shifted and non-shifted window.
    Args:
        dim (int): Number of input channels.
        window_size (tuple[int]): The height and width of the window.
        num_heads (int): Number of attention heads.
        qkv_bias (bool, optional):  If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set
        attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0
        proj_drop (float, optional): Dropout ratio of output. Default: 0.0
    """

    def __init__(
        self,
        dim,
        window_size,
        num_heads,
        qkv_bias=True,
        qk_scale=None,
        attn_drop=0.0,
        proj_drop=0.0,
    ):

        super().__init__()
        self.dim = dim
        self.window_size = window_size  # Wh, Ww
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = qk_scale or head_dim ** -0.5

        # define a parameter table of relative position bias:
        # one learnable bias per (relative offset, head). Offsets range over
        # [-(Wh-1), Wh-1] x [-(Ww-1), Ww-1], hence (2*Wh-1)*(2*Ww-1) rows.
        self.relative_position_bias_table = nn.Parameter(
            torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads)
        )  # 2*Wh-1 * 2*Ww-1, nH

        # get pair-wise relative position index for each token inside the window
        coords_h = torch.arange(self.window_size[0])
        coords_w = torch.arange(self.window_size[1])
        coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
        coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
        relative_coords = (
            coords_flatten[:, :, None] - coords_flatten[:, None, :]
        )  # 2, Wh*Ww, Wh*Ww
        relative_coords = relative_coords.permute(
            1, 2, 0
        ).contiguous()  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += self.window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += self.window_size[1] - 1
        # fold the 2-D offset into a single flat index into the bias table
        relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
        relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        # buffer, not a parameter: fixed lookup indices saved with the module
        self.register_buffer("relative_position_index", relative_position_index)

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

        trunc_normal_(self.relative_position_bias_table, std=0.02)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, x, mask=None):
        """Forward function.
        Args:
            x: input features with shape of (num_windows*B, N, C)
            mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None
        """
        B_, N, C = x.shape
        # project to q,k,v in one matmul, then split heads
        qkv = (
            self.qkv(x)
            .reshape(B_, N, 3, self.num_heads, C // self.num_heads)
            .permute(2, 0, 3, 1, 4)
        )
        q, k, v = (
            qkv[0],
            qkv[1],
            qkv[2],
        )  # make torchscript happy (cannot use tensor as tuple)

        q = q * self.scale
        attn = q @ k.transpose(-2, -1)

        # gather the learned bias for every token pair in the window
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.view(-1)
        ].view(
            self.window_size[0] * self.window_size[1],
            self.window_size[0] * self.window_size[1],
            -1,
        )  # Wh*Ww,Wh*Ww,nH
        relative_position_bias = relative_position_bias.permute(
            2, 0, 1
        ).contiguous()  # nH, Wh*Ww, Wh*Ww
        attn = attn + relative_position_bias.unsqueeze(0)

        if mask is not None:
            # broadcast the per-window shift mask over batch and heads
            nW = mask.shape[0]
            attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(
                1
            ).unsqueeze(0)
            attn = attn.view(-1, self.num_heads, N, N)
            attn = self.softmax(attn)
        else:
            attn = self.softmax(attn)

        attn = self.attn_drop(attn)

        x = (attn @ v).transpose(1, 2).reshape(B_, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class SwinTransformerBlock(nn.Module):
    """Swin Transformer Block.
    Args:
        dim (int): Number of input channels.
        num_heads (int): Number of attention heads.
        window_size (int): Window size.
        shift_size (int): Shift size for SW-MSA.
        mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
        qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True
        qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set.
        drop (float, optional): Dropout rate. Default: 0.0
        attn_drop (float, optional): Attention dropout rate.
Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert ( + 0 <= self.shift_size < self.window_size + ), "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. 
+ """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll( + x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2) + ) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn( + x_windows, mask=attn_mask + ) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2) + ) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. 
+ Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
+ """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) + else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. 
Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=patch_size + ) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(nn.Module): + """Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. + attn_drop_rate (float): Attention dropout rate. Default: 0. 
+ drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + norm_indices=None, + frozen_stages=-1, + use_checkpoint=False, + projection=False, + project_dim=256, + ): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.norm_indices = norm_indices if norm_indices is not None else out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) + ) + trunc_normal_(self.absolute_pos_embed, 
std=0.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2 ** i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + ) + self.layers.append(layer) + + num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in self.norm_indices: + if i_layer >= len(self.num_features): + continue + layer = norm_layer(num_features[i_layer]) + layer_name = f"norm{i_layer}" + self.add_module(layer_name, layer) + # add projector head + self.projection = projection + if projection: + self.project_dim = project_dim + self.norm = norm_layer(self.num_features[-1]) + self.projector = nn.Linear(self.num_features[-1], project_dim, bias=False) + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = {} + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + if i in self.norm_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + out = ( + x_out.view(-1, H, W, self.num_features[i]) + .permute(0, 3, 1, 2) + .contiguous() + ) + outs["res{}".format(i + 2)] = out + if self.projection: + x_out = self.norm(x_out) + x_out = x_out.view(-1, H, W, self.num_features[-1]).contiguous() + outs["fc"] = self.projector(x_out).permute(0, 3, 1, 2) + + return outs + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() + + +@BACKBONE_REGISTRY.register() +class D2SwinTransformer(SwinTransformer, Backbone): + def __init__(self, cfg, input_shape): + + pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE + patch_size = cfg.MODEL.SWIN.PATCH_SIZE + in_chans = 3 + embed_dim = cfg.MODEL.SWIN.EMBED_DIM + depths = cfg.MODEL.SWIN.DEPTHS + num_heads = cfg.MODEL.SWIN.NUM_HEADS + window_size = cfg.MODEL.SWIN.WINDOW_SIZE + mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO + qkv_bias = cfg.MODEL.SWIN.QKV_BIAS + qk_scale = cfg.MODEL.SWIN.QK_SCALE + drop_rate = cfg.MODEL.SWIN.DROP_RATE 
+ attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE + drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE + norm_layer = nn.LayerNorm + ape = cfg.MODEL.SWIN.APE + patch_norm = cfg.MODEL.SWIN.PATCH_NORM + norm_indices = cfg.MODEL.SWIN.NORM_INDICES + projection = cfg.MODEL.SWIN.PROJECTION + project_dim = cfg.MODEL.SWIN.PROJECT_DIM + super().__init__( + pretrain_img_size, + patch_size, + in_chans, + embed_dim, + depths, + num_heads, + window_size, + mlp_ratio, + qkv_bias, + qk_scale, + drop_rate, + attn_drop_rate, + drop_path_rate, + norm_layer, + ape, + patch_norm, + norm_indices=norm_indices, + projection=projection, + project_dim=project_dim, + ) + + self._out_features = cfg.MODEL.SWIN.OUT_FEATURES + + self._out_feature_strides = { + "res2": 4, + "res3": 8, + "res4": 16, + "res5": 32, + "fc": 32, + } + self._out_feature_channels = { + "res2": self.num_features[0], + "res3": self.num_features[1], + "res4": self.num_features[2], + "res5": self.num_features[3], + "fc": self.num_features[3], + } + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert ( + x.dim() == 4 + ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
+ outputs = {} + y = super().forward(x) + for k in y.keys(): + if k in self._out_features: + outputs[k] = y[k] + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], + stride=self._out_feature_strides[name], + ) + for name in self._out_features + } + + @property + def size_divisibility(self): + return 32 diff --git a/open_vocab_seg/modeling/clip_adapter/__init__.py b/open_vocab_seg/modeling/clip_adapter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5c880f121e329e0fc2bb31de5aa8240b44b4a25a --- /dev/null +++ b/open_vocab_seg/modeling/clip_adapter/__init__.py @@ -0,0 +1,23 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from .text_template import ( + PredefinedPromptExtractor, + ImageNetPromptExtractor, + VILDPromptExtractor, +) +from .adapter import ClipAdapter, MaskFormerClipAdapter + + +def build_text_prompt(cfg): + if cfg.TEXT_TEMPLATES == "predefined": + text_templates = PredefinedPromptExtractor(cfg.PREDEFINED_PROMPT_TEMPLATES) + elif cfg.TEXT_TEMPLATES == "imagenet": + text_templates = ImageNetPromptExtractor() + elif cfg.TEXT_TEMPLATES == "vild": + text_templates = VILDPromptExtractor() + else: + raise NotImplementedError( + "Prompt learner {} is not supported".format(cfg.TEXT_TEMPLATES) + ) + return text_templates diff --git a/open_vocab_seg/modeling/clip_adapter/adapter.py b/open_vocab_seg/modeling/clip_adapter/adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..864d20b160714865b4130fab8714f323aaae2572 --- /dev/null +++ b/open_vocab_seg/modeling/clip_adapter/adapter.py @@ -0,0 +1,206 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved
# Modified by Feng Liang from
# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/modeling/clip_adapter/adapter.py

from typing import List
import torch
from torch import nn
from torch.nn import functional as F
from detectron2.structures import BitMasks
from .utils import build_clip_model, crop_with_mask
from .text_template import PromptExtractor


# CLIP's image normalization statistics (RGB, on the [0, 1] scale).
PIXEL_MEAN = (0.48145466, 0.4578275, 0.40821073)
PIXEL_STD = (0.26862954, 0.26130258, 0.27577711)


class ClipAdapter(nn.Module):
    """Wraps a CLIP model to score images against class names.

    Class names are encoded through prompt templates into text features,
    images through CLIP's visual encoder; both are L2-normalized so their
    dot product is a cosine similarity.
    """

    def __init__(self, clip_model_name: str, mask_prompt_depth: int, text_templates: PromptExtractor):
        super().__init__()
        self.clip_model = build_clip_model(clip_model_name, mask_prompt_depth)
        self.text_templates = text_templates
        self.text_templates.init_buffer(self.clip_model)
        # cache: class name -> text embedding, so each name is encoded once
        self.text_feature_buffer = {}

    def forward(self, image: torch.Tensor, text: List[str], **kwargs):
        """Return similarity logits between ``image`` and each name in ``text``."""
        # NOTE(review): the base _preprocess_image takes no extra kwargs;
        # **kwargs appears to exist for subclass overrides — confirm callers.
        image = self._preprocess_image(image, **kwargs)
        text_feature = self.get_text_features(text)  # k,feat_dim
        image_features = self.get_image_features(image)
        return self.get_sim_logits(text_feature, image_features)

    def _preprocess_image(self, image: torch.Tensor):
        # base adapter performs no preprocessing; subclasses override this
        return image

    def _get_text_features(self, noun_list: List[str]):
        # encode only names not cached yet; everything else is reused
        left_noun_list = [
            noun for noun in noun_list if noun not in self.text_feature_buffer
        ]
        if len(left_noun_list) > 0:
            left_text_features = self.text_templates(
                left_noun_list, self.clip_model
            )
            self.text_feature_buffer.update(
                {
                    noun: text_feature
                    for noun, text_feature in zip(
                        left_noun_list, left_text_features
                    )
                }
            )
        # return features in the caller's requested order
        return torch.stack([self.text_feature_buffer[noun] for noun in noun_list])


    def get_text_features(self, noun_list: List[str]):
        return self._get_text_features(noun_list)

    def get_image_features(self, image: torch.Tensor):
        # L2-normalize so downstream dot products are cosine similarities
        image_features = self.clip_model.visual(image)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features

+ def get_sim_logits( + self, + text_features: torch.Tensor, + image_features: torch.Tensor, + temperature: float = 100, + ): + return temperature * image_features @ text_features.T + + def normalize_feature(self, feat: torch.Tensor): + return feat / feat.norm(dim=-1, keepdim=True) + + +class MaskFormerClipAdapter(ClipAdapter): + def __init__( + self, + clip_model_name: str, + text_templates: PromptExtractor, + mask_fill: str = "mean", + mask_expand_ratio: float = 1.0, + mask_thr: float = 0.5, + mask_matting: bool = False, + region_resized: bool = True, + mask_prompt_depth: int = 0, + mask_prompt_fwd: bool = False, + ): + super().__init__(clip_model_name, mask_prompt_depth, text_templates) + self.non_object_embedding = nn.Parameter( + torch.empty(1, self.clip_model.text_projection.shape[-1]) + ) + nn.init.normal_( + self.non_object_embedding.data, + std=self.clip_model.transformer.width ** -0.5, + ) + # for test + self.mask_fill = mask_fill + if self.mask_fill == "zero": + self.mask_fill = (0.0, 0.0, 0.0) + elif self.mask_fill == "mean": + self.mask_fill = [255.0 * c for c in PIXEL_MEAN] + else: + raise NotImplementedError( + "Unknown mask_fill method: {}".format(self.mask_fill) + ) + self.mask_expand_ratio = mask_expand_ratio + self.mask_thr = mask_thr + self.mask_matting = mask_matting + self.region_resized = region_resized + self.mask_prompt_fwd = mask_prompt_fwd + self.register_buffer( + "pixel_mean", torch.Tensor(PIXEL_MEAN).reshape(1, 3, 1, 1) * 255.0 + ) + self.register_buffer( + "pixel_std", torch.Tensor(PIXEL_STD).reshape(1, 3, 1, 1) * 255.0 + ) + + def forward( + self, + image: torch.Tensor, + text: List[str], + mask: torch.Tensor, + normalize: bool = True, + fwd_w_region_mask: bool = False, + ): + (regions, unnorm_regions), region_masks, valid_flag = self._preprocess_image(image, mask, normalize=normalize) + if regions is None: + return None, valid_flag + if isinstance(regions, list): + assert NotImplementedError + image_features = torch.cat( + 
[self.get_image_features(image_i) for image_i in regions], dim=0 + ) + else: + if self.mask_prompt_fwd: + image_features = self.get_image_features(regions, region_masks) + else: + image_features = self.get_image_features(regions) + text_feature = self.get_text_features(text) # k,feat_dim + return self.get_sim_logits(text_feature, image_features), unnorm_regions, valid_flag + + def get_image_features(self, image, region_masks=None): + image_features = self.clip_model.visual(image, region_masks) + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + return image_features + + def _preprocess_image( + self, image: torch.Tensor, mask: torch.Tensor, normalize: bool = True + ): + """crop, mask and normalize the image + + Args: + image ([type]): [C,H,W] + mask ([type]): [K,H,W + normalize (bool, optional): [description]. Defaults to True. + """ + dtype = mask.dtype + bin_mask = mask > self.mask_thr + valid = bin_mask.sum(dim=(-1, -2)) > 0 + bin_mask = bin_mask[valid] + mask = mask[valid] + if not self.mask_matting: + mask = bin_mask + bin_mask = BitMasks(bin_mask) + bboxes = bin_mask.get_bounding_boxes() + # crop,mask + regions = [] + region_masks = [] + for bbox, single_mask in zip(bboxes, mask): + region, region_mask = crop_with_mask( + image.type(dtype), + single_mask.type(dtype), + bbox, + fill=self.mask_fill, + expand_ratio=self.mask_expand_ratio, + ) + regions.append(region.unsqueeze(0)) + region_masks.append(region_mask.unsqueeze(0)) + if len(regions) == 0: + return None, valid + unnorm_regions = regions + if normalize: + regions = [(r - self.pixel_mean) / self.pixel_std for r in regions] + # resize + if self.region_resized: + regions = [ + F.interpolate(r, size=(224, 224), mode="bicubic") for r in regions + ] + regions = torch.cat(regions) + region_masks = [ + F.interpolate(r, size=(224, 224), mode="nearest") for r in region_masks + ] + region_masks = torch.cat(region_masks) + unnorm_regions = [ + F.interpolate(r, size=(224, 224), 
mode="bicubic") for r in unnorm_regions + ] + unnorm_regions = torch.cat(unnorm_regions) + return (regions, unnorm_regions), region_masks, valid + + def get_text_features(self, noun_list: List[str]): + object_text_features = self._get_text_features(noun_list) + non_object_text_features = ( + self.non_object_embedding + / self.non_object_embedding.norm(dim=-1, keepdim=True) + ) + return torch.cat([object_text_features, non_object_text_features], dim=0) diff --git a/open_vocab_seg/modeling/clip_adapter/text_template.py b/open_vocab_seg/modeling/clip_adapter/text_template.py new file mode 100644 index 0000000000000000000000000000000000000000..1dd085f9435650bbd982c81a1cf0d9899ce7feb2 --- /dev/null +++ b/open_vocab_seg/modeling/clip_adapter/text_template.py @@ -0,0 +1,155 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved +# Modified by Feng Liang from +# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/modeling/clip_adapter/text_prompt.py +# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/modeling/clip_adapter/utils.py + +from typing import List + +import clip +import torch +from torch import nn + +IMAGENET_PROMPT = [ + "a bad photo of a {}.", + "a photo of many {}.", + "a sculpture of a {}.", + "a photo of the hard to see {}.", + "a low resolution photo of the {}.", + "a rendering of a {}.", + "graffiti of a {}.", + "a bad photo of the {}.", + "a cropped photo of the {}.", + "a tattoo of a {}.", + "the embroidered {}.", + "a photo of a hard to see {}.", + "a bright photo of a {}.", + "a photo of a clean {}.", + "a photo of a dirty {}.", + "a dark photo of the {}.", + "a drawing of a {}.", + "a photo of my {}.", + "the plastic {}.", + "a photo of the cool {}.", + "a close-up photo of a {}.", + "a black and white photo of the {}.", + "a painting of the {}.", + "a painting of a {}.", + "a pixelated photo of the {}.", + "a sculpture of the {}.", + "a bright photo of the 
{}.", + "a cropped photo of a {}.", + "a plastic {}.", + "a photo of the dirty {}.", + "a jpeg corrupted photo of a {}.", + "a blurry photo of the {}.", + "a photo of the {}.", + "a good photo of the {}.", + "a rendering of the {}.", + "a {} in a video game.", + "a photo of one {}.", + "a doodle of a {}.", + "a close-up photo of the {}.", + "a photo of a {}.", + "the origami {}.", + "the {} in a video game.", + "a sketch of a {}.", + "a doodle of the {}.", + "a origami {}.", + "a low resolution photo of a {}.", + "the toy {}.", + "a rendition of the {}.", + "a photo of the clean {}.", + "a photo of a large {}.", + "a rendition of a {}.", + "a photo of a nice {}.", + "a photo of a weird {}.", + "a blurry photo of a {}.", + "a cartoon {}.", + "art of a {}.", + "a sketch of the {}.", + "a embroidered {}.", + "a pixelated photo of a {}.", + "itap of the {}.", + "a jpeg corrupted photo of the {}.", + "a good photo of a {}.", + "a plushie {}.", + "a photo of the nice {}.", + "a photo of the small {}.", + "a photo of the weird {}.", + "the cartoon {}.", + "art of the {}.", + "a drawing of the {}.", + "a photo of the large {}.", + "a black and white photo of a {}.", + "the plushie {}.", + "a dark photo of a {}.", + "itap of a {}.", + "graffiti of the {}.", + "a toy {}.", + "itap of my {}.", + "a photo of a cool {}.", + "a photo of a small {}.", + "a tattoo of the {}.", +] + +VILD_PROMPT = [ + "a photo of a {}.", + "This is a photo of a {}", + "There is a {} in the scene", + "There is the {} in the scene", + "a photo of a {} in the scene", + "a photo of a small {}.", + "a photo of a medium {}.", + "a photo of a large {}.", + "This is a photo of a small {}.", + "This is a photo of a medium {}.", + "This is a photo of a large {}.", + "There is a small {} in the scene.", + "There is a medium {} in the scene.", + "There is a large {} in the scene.", +] + +class PromptExtractor(nn.Module): + def __init__(self): + super().__init__() + self._buffer_init = False + + def 
init_buffer(self, clip_model): + self._buffer_init = True + + def forward(self, noun_list: List[str], clip_model: nn.Module): + raise NotImplementedError() + + +class PredefinedPromptExtractor(PromptExtractor): + def __init__(self, templates: List[str]): + super().__init__() + self.templates = templates + + def forward(self, noun_list: List[str], clip_model: nn.Module): + text_features_bucket = [] + for template in self.templates: + noun_tokens = [clip.tokenize(template.format(noun)) for noun in noun_list] + text_inputs = torch.cat(noun_tokens).to( + clip_model.text_projection.data.device + ) + text_features = clip_model.encode_text(text_inputs) + text_features /= text_features.norm(dim=-1, keepdim=True) + text_features_bucket.append(text_features) + del text_inputs + # ensemble by averaging + text_features = torch.stack(text_features_bucket).mean(dim=0) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + return text_features + + +class ImageNetPromptExtractor(PredefinedPromptExtractor): + def __init__(self): + super().__init__(IMAGENET_PROMPT) + + +class VILDPromptExtractor(PredefinedPromptExtractor): + def __init__(self): + super().__init__(VILD_PROMPT) diff --git a/open_vocab_seg/modeling/clip_adapter/utils.py b/open_vocab_seg/modeling/clip_adapter/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..276c1ed9feca77d9a37067d312aca97d132515d3 --- /dev/null +++ b/open_vocab_seg/modeling/clip_adapter/utils.py @@ -0,0 +1,81 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
# All Rights Reserved

from typing import Tuple
import numpy as np
import torch
import clip
from detectron2.utils.comm import get_local_rank, synchronize


def expand_box(
    x1: float,
    y1: float,
    x2: float,
    y2: float,
    expand_ratio: float = 1.0,
    max_h: int = None,
    max_w: int = None,
):
    """Scale the box (x1, y1, x2, y2) around its center by ``expand_ratio``.

    When ``max_h``/``max_w`` are given the result is clamped to
    [0, max_h - 1] / [0, max_w - 1]. Returns the box as a list of ints.
    """
    cx = 0.5 * (x1 + x2)
    cy = 0.5 * (y1 + y2)
    w = x2 - x1
    h = y2 - y1
    w = w * expand_ratio
    h = h * expand_ratio
    box = [cx - 0.5 * w, cy - 0.5 * h, cx + 0.5 * w, cy + 0.5 * h]
    if max_h is not None:
        box[1] = max(0, box[1])
        box[3] = min(max_h - 1, box[3])
    if max_w is not None:
        box[0] = max(0, box[0])
        box[2] = min(max_w - 1, box[2])
    return [int(b) for b in box]


def mask2box(mask: torch.Tensor):
    """Tight box (x1, y1, x2, y2) around the nonzero area of a [H, W] mask.

    Returns None for an all-zero mask. x2/y2 are exclusive (one past the last
    nonzero column/row).
    """
    # use naive way
    row = torch.nonzero(mask.sum(dim=0))[:, 0]
    if len(row) == 0:
        return None
    x1 = row.min()
    x2 = row.max()
    # BUG FIX: the original called np.nonzero on a torch.Tensor; np.nonzero
    # returns a tuple of arrays, so the following [:, 0] indexing raised a
    # TypeError. Use torch.nonzero, consistent with the row branch above.
    col = torch.nonzero(mask.sum(dim=1))[:, 0]
    y1 = col.min()
    y2 = col.max()
    return x1, y1, x2 + 1, y2 + 1


def crop_with_mask(
    image: torch.Tensor,
    mask: torch.Tensor,
    bbox: torch.Tensor,
    fill: Tuple[float, float, float] = (0, 0, 0),
    expand_ratio: float = 1.0,
):
    """Crop ``image`` to the (expanded, clamped) ``bbox`` and composite the
    ``fill`` color over pixels outside ``mask``.

    Returns the [C, h, w] composited crop and the matching [1, h, w] mask crop.
    """
    l, t, r, b = expand_box(*bbox, expand_ratio)
    _, h, w = image.shape
    l = max(l, 0)
    t = max(t, 0)
    r = min(r, w)
    b = min(b, h)
    # Per-channel constant image used for the masked-out (background) pixels.
    new_image = torch.cat(
        [image.new_full((1, b - t, r - l), fill_value=val) for val in fill]
    )
    return (
        image[:, t:b, l:r] * mask[None, t:b, l:r]
        + (1 - mask[None, t:b, l:r]) * new_image,
        mask[None, t:b, l:r],
    )


def build_clip_model(model: str, mask_prompt_depth: int = 0, frozen: bool = True):
    """Load the named CLIP model on CPU, downloading on local rank 0 first.

    The synchronize() barriers let non-zero ranks wait for rank 0's download;
    with ``frozen`` all parameters have requires_grad disabled.
    """
    rank = get_local_rank()
    if rank == 0:
        # download on rank 0 only
        model, _ = clip.load(model, mask_prompt_depth=mask_prompt_depth, device="cpu")
    synchronize()
    if rank != 0:
        model, _ = clip.load(model, mask_prompt_depth=mask_prompt_depth, device="cpu")
    synchronize()
    if frozen:
        for param in model.parameters():
            param.requires_grad = False
    return model
model diff --git a/open_vocab_seg/modeling/criterion.py b/open_vocab_seg/modeling/criterion.py new file mode 100644 index 0000000000000000000000000000000000000000..f4d5b71242f87c6f67463f9c31f873a742f3e5c7 --- /dev/null +++ b/open_vocab_seg/modeling/criterion.py @@ -0,0 +1,229 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/detr.py +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +""" +MaskFormer criterion. +""" +import torch +import torch.nn.functional as F +from torch import nn + +from detectron2.utils.comm import get_world_size + +from ..utils.misc import is_dist_avail_and_initialized, nested_tensor_from_tensor_list + + +def dice_loss(inputs, targets, num_masks): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(-1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_masks + + +def sigmoid_focal_loss( + inputs, targets, num_masks, alpha: float = 0.25, gamma: float = 2 +): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + alpha: (optional) Weighting factor in range (0,1) to balance + positive vs negative examples. Default = -1 (no weighting). 
+ gamma: Exponent of the modulating factor (1 - p_t) to + balance easy vs hard examples. + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_masks + + +class SetCriterion(nn.Module): + """This class computes the loss for DETR. + The process happens in two steps: + 1) we compute hungarian assignment between ground truth boxes and the outputs of the model + 2) we supervise each pair of matched ground-truth / prediction (supervise class and box) + """ + + def __init__(self, num_classes, matcher, weight_dict, eos_coef, losses): + """Create the criterion. + Parameters: + num_classes: number of object categories, omitting the special no-object category + matcher: module able to compute a matching between targets and proposals + weight_dict: dict containing as key the names of the losses and as values their relative weight. + eos_coef: relative classification weight applied to the no-object category + losses: list of all the losses to be applied. See get_loss for list of available losses. 
+ """ + super().__init__() + self.num_classes = num_classes + self.matcher = matcher + self.weight_dict = weight_dict + self.eos_coef = eos_coef + self.losses = losses + if eos_coef > 0: + + empty_weight = torch.ones(self.num_classes + 1) + + empty_weight[-1] = self.eos_coef + self.register_buffer("empty_weight", empty_weight) + self.use_ignore_idx = False + else: + self.use_ignore_idx = True + self.cur_target = [] + + def loss_labels(self, outputs, targets, indices, num_masks): + """Classification loss (NLL) + targets dicts must contain the key "labels" containing a tensor of dim [nb_target_boxes] + """ + assert "pred_logits" in outputs + src_logits = outputs["pred_logits"] + + idx = self._get_src_permutation_idx(indices) + target_classes_o = torch.cat( + [t["labels"][J] for t, (_, J) in zip(targets, indices)] + ) + target_classes = torch.full( + src_logits.shape[:2], + self.num_classes, + dtype=torch.int64, + device=src_logits.device, + ) + target_classes[idx] = target_classes_o + if self.use_ignore_idx: + loss_ce = F.cross_entropy( + src_logits.transpose(1, 2), + target_classes, + ignore_index=self.num_classes, + ) + else: + if "empty_weight" in outputs: + empty_weight = torch.cat( + [outputs["empty_weight"], self.empty_weight[-1:]] + ).detach() + else: + empty_weight = self.empty_weight + loss_ce = F.cross_entropy( + src_logits.transpose(1, 2), target_classes, empty_weight + ) + losses = {"loss_ce": loss_ce} + return losses + + def loss_masks(self, outputs, targets, indices, num_masks): + """Compute the losses related to the masks: the focal loss and the dice loss. 
+ targets dicts must contain the key "masks" containing a tensor of dim [nb_target_boxes, h, w] + """ + assert "pred_masks" in outputs + + src_idx = self._get_src_permutation_idx(indices) + tgt_idx = self._get_tgt_permutation_idx(indices) + src_masks = outputs["pred_masks"] + src_masks = src_masks[src_idx] + masks = [t["masks"] for t in targets] + # TODO use valid to mask invalid areas due to padding in loss + target_masks, valid = nested_tensor_from_tensor_list(masks).decompose() + target_masks = target_masks.to(src_masks) + target_masks = target_masks[tgt_idx] + + # upsample predictions to the target size + src_masks = F.interpolate( + src_masks[:, None], + size=target_masks.shape[-2:], + mode="bilinear", + align_corners=False, + ) + src_masks = src_masks[:, 0].flatten(1) + + target_masks = target_masks.flatten(1) + target_masks = target_masks.view(src_masks.shape) + losses = { + "loss_mask": sigmoid_focal_loss(src_masks, target_masks, num_masks), + "loss_dice": dice_loss(src_masks, target_masks, num_masks), + } + return losses + + def _get_src_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat( + [torch.full_like(src, i) for i, (src, _) in enumerate(indices)] + ) + src_idx = torch.cat([src for (src, _) in indices]) + return batch_idx, src_idx + + def _get_tgt_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat( + [torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)] + ) + tgt_idx = torch.cat([tgt for (_, tgt) in indices]) + return batch_idx, tgt_idx + + def get_loss(self, loss, outputs, targets, indices, num_masks): + loss_map = {"labels": self.loss_labels, "masks": self.loss_masks} + assert loss in loss_map, f"do you really want to compute {loss} loss?" + return loss_map[loss](outputs, targets, indices, num_masks) + + def forward(self, outputs, targets): + """This performs the loss computation. 
+ Parameters: + outputs: dict of tensors, see the output specification of the model for the format + targets: list of dicts, such that len(targets) == batch_size. + The expected keys in each dict depends on the losses applied, see each loss' doc + """ + outputs_without_aux = {k: v for k, v in outputs.items() if k != "aux_outputs"} + + # Retrieve the matching between the outputs of the last layer and the targets + indices = self.matcher(outputs_without_aux, targets) + + # Compute the average number of target boxes accross all nodes, for normalization purposes + num_masks = sum(len(t["labels"]) for t in targets) + num_masks = torch.as_tensor( + [num_masks], dtype=torch.float, device=next(iter(outputs.values())).device + ) + if is_dist_avail_and_initialized(): + torch.distributed.all_reduce(num_masks) + num_masks = torch.clamp(num_masks / get_world_size(), min=1).item() + + # Compute all the requested losses + losses = {} + for loss in self.losses: + losses.update(self.get_loss(loss, outputs, targets, indices, num_masks)) + + # In case of auxiliary losses, we repeat this process with the output of each intermediate layer. + if "aux_outputs" in outputs: + for i, aux_outputs in enumerate(outputs["aux_outputs"]): + indices = self.matcher(aux_outputs, targets) + for loss in self.losses: + l_dict = self.get_loss( + loss, aux_outputs, targets, indices, num_masks + ) + l_dict = {k + f"_{i}": v for k, v in l_dict.items()} + losses.update(l_dict) + + return losses + + def clean_buffer(self): + self.cur_target = [] diff --git a/open_vocab_seg/modeling/heads/__init__.py b/open_vocab_seg/modeling/heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52db7cce67b1686f7cab3698f15b8f309c897918 --- /dev/null +++ b/open_vocab_seg/modeling/heads/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved \ No newline at end of file diff --git a/open_vocab_seg/modeling/heads/mask_former_head.py b/open_vocab_seg/modeling/heads/mask_former_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5f592662f92d1b0862a3ef76304e7b28b46ecf80 --- /dev/null +++ b/open_vocab_seg/modeling/heads/mask_former_head.py @@ -0,0 +1,135 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import logging +from copy import deepcopy +from typing import Callable, Dict, List, Optional, Tuple, Union + +import fvcore.nn.weight_init as weight_init +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from ..transformer.transformer_predictor import TransformerPredictor +from .pixel_decoder import build_pixel_decoder + + +@SEM_SEG_HEADS_REGISTRY.register() +class MaskFormerHead(nn.Module): + + _version = 2 + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + if version is None or version < 2: + # Do not warn if train from scratch + scratch = True + logger = logging.getLogger(__name__) + for k in list(state_dict.keys()): + newk = k + if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): + newk = k.replace(prefix, prefix + "pixel_decoder.") + # logger.debug(f"{k} ==> {newk}") + if newk != k: + state_dict[newk] = state_dict[k] + del state_dict[k] + scratch = False + + if not scratch: + logger.warning( + f"Weight format of {self.__class__.__name__} have changed! " + "Please upgrade your models. Applying automatic conversion now ..." 
+ ) + + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + num_classes: int, + pixel_decoder: nn.Module, + loss_weight: float = 1.0, + ignore_value: int = -1, + # extra parameters + transformer_predictor: nn.Module, + transformer_in_feature: str, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + num_classes: number of classes to predict + pixel_decoder: the pixel decoder module + loss_weight: loss weight + ignore_value: category id to be ignored during training. + transformer_predictor: the transformer decoder that makes prediction + transformer_in_feature: input feature name to the transformer_predictor + """ + super().__init__() + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] + feature_strides = [v.stride for k, v in input_shape] + feature_channels = [v.channels for k, v in input_shape] + + self.ignore_value = ignore_value + self.common_stride = 4 + self.loss_weight = loss_weight + + self.pixel_decoder = pixel_decoder + self.predictor = transformer_predictor + self.transformer_in_feature = transformer_in_feature + + self.num_classes = num_classes + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + return { + "input_shape": { + k: v + for k, v in input_shape.items() + if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES + }, + "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + "pixel_decoder": build_pixel_decoder(cfg, input_shape), + "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, + "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, + "transformer_predictor": TransformerPredictor( + cfg, + cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM + if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder" + else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels, + 
mask_classification=True, + ), + } + + def forward(self, features): + return self.layers(features) + + def layers(self, features): + ( + mask_features, + transformer_encoder_features, + ) = self.pixel_decoder.forward_features(features) + if self.transformer_in_feature == "transformer_encoder": + assert ( + transformer_encoder_features is not None + ), "Please use the TransformerEncoderPixelDecoder." + predictions = self.predictor(transformer_encoder_features, mask_features) + else: + predictions = self.predictor( + features[self.transformer_in_feature], mask_features + ) + return predictions diff --git a/open_vocab_seg/modeling/heads/open_vocab_mask_former_head.py b/open_vocab_seg/modeling/heads/open_vocab_mask_former_head.py new file mode 100644 index 0000000000000000000000000000000000000000..8ed84f9a44d24415b3334fdf2ea8e1188de32de6 --- /dev/null +++ b/open_vocab_seg/modeling/heads/open_vocab_mask_former_head.py @@ -0,0 +1,145 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved +# Modified by Feng Liang from +# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/modeling/heads/zero_shot_mask_former_head.py + +import logging +from copy import deepcopy +from typing import Callable, Dict, List, Optional, Tuple, Union + +import fvcore.nn.weight_init as weight_init +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from ..transformer.open_vocab_transformer_predictor import OpenVocabTransformerPredictor +from .pixel_decoder import build_pixel_decoder + + +@SEM_SEG_HEADS_REGISTRY.register() +class OpenVocabMaskFormerHead(nn.Module): + + _version = 2 + + def _load_from_state_dict( + self, + state_dict, + prefix, + local_metadata, + strict, + missing_keys, + unexpected_keys, + error_msgs, + ): + version = local_metadata.get("version", None) + if version is None or version < 2: + # Do not warn if train from scratch + scratch = True + logger = logging.getLogger(__name__) + for k in list(state_dict.keys()): + newk = k + if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): + newk = k.replace(prefix, prefix + "pixel_decoder.") + # logger.debug(f"{k} ==> {newk}") + if newk != k: + state_dict[newk] = state_dict[k] + del state_dict[k] + scratch = False + + if not scratch: + logger.warning( + f"Weight format of {self.__class__.__name__} have changed! " + "Please upgrade your models. Applying automatic conversion now ..." + ) + + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + num_classes: int, + pixel_decoder: nn.Module, + loss_weight: float = 1.0, + ignore_value: int = -1, + # extra parameters + transformer_predictor: nn.Module, + transformer_in_feature: str, + ): + """ + NOTE: this interface is experimental. 
+ Args: + input_shape: shapes (channels and stride) of the input features + num_classes: number of classes to predict + pixel_decoder: the pixel decoder module + loss_weight: loss weight + ignore_value: category id to be ignored during training. + transformer_predictor: the transformer decoder that makes prediction + transformer_in_feature: input feature name to the transformer_predictor + """ + super().__init__() + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] + feature_strides = [v.stride for k, v in input_shape] + feature_channels = [v.channels for k, v in input_shape] + + self.ignore_value = ignore_value + self.common_stride = 4 + self.loss_weight = loss_weight + + self.pixel_decoder = pixel_decoder + self.predictor = transformer_predictor + self.transformer_in_feature = transformer_in_feature + + self.num_classes = num_classes + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + return { + "input_shape": { + k: v + for k, v in input_shape.items() + if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES + }, + "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + "pixel_decoder": build_pixel_decoder(cfg, input_shape), + "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, + "transformer_in_feature": cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE, + "transformer_predictor": OpenVocabTransformerPredictor( + cfg, + cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM + if cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder" + else input_shape[cfg.MODEL.MASK_FORMER.TRANSFORMER_IN_FEATURE].channels, + mask_classification=True, + ), + } + + def forward(self, features): + return self.layers(features) + + def layers(self, features): + ( + mask_features, + transformer_encoder_features, + ) = self.pixel_decoder.forward_features(features) + if self.transformer_in_feature == "transformer_encoder": + assert ( + transformer_encoder_features 
is not None + ), "Please use the TransformerEncoderPixelDecoder." + predictions = self.predictor(transformer_encoder_features, mask_features) + else: + predictions = self.predictor( + features[self.transformer_in_feature], mask_features + ) + return predictions + + def freeze_pretrained(self): + for name, module in self.named_children(): + if name not in ["predictor"]: + for param in module.parameters(): + param.requires_grad = False + else: + module.freeze_pretrained() diff --git a/open_vocab_seg/modeling/heads/pixel_decoder.py b/open_vocab_seg/modeling/heads/pixel_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..6b10089331785e937b79cf82af6d8fba55519082 --- /dev/null +++ b/open_vocab_seg/modeling/heads/pixel_decoder.py @@ -0,0 +1,308 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import logging +from typing import Callable, Dict, List, Optional, Tuple, Union + +import fvcore.nn.weight_init as weight_init +from torch import nn +from torch.nn import functional as F + +from detectron2.config import configurable +from detectron2.layers import Conv2d, ShapeSpec, get_norm +from detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from ..transformer.position_encoding import PositionEmbeddingSine +from ..transformer.transformer import TransformerEncoder, TransformerEncoderLayer + + +def build_pixel_decoder(cfg, input_shape): + """ + Build a pixel decoder from `cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME`. + """ + name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME + model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) + forward_features = getattr(model, "forward_features", None) + if not callable(forward_features): + raise ValueError( + "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. " + f"Please implement forward_features for {name} to only return mask features." 
+ ) + return model + + +@SEM_SEG_HEADS_REGISTRY.register() +class BasePixelDecoder(nn.Module): + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + conv_dim: int, + mask_dim: int, + norm: Optional[Union[str, Callable]] = None, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + conv_dims: number of output channels for the intermediate conv layers. + mask_dim: number of output channels for the final conv layer. + norm (str or callable): normalization for all conv layers + """ + super().__init__() + + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" + feature_channels = [v.channels for k, v in input_shape] + + lateral_convs = [] + output_convs = [] + + use_bias = norm == "" + for idx, in_channels in enumerate(feature_channels): + if idx == len(self.in_features) - 1: + output_norm = get_norm(norm, conv_dim) + output_conv = Conv2d( + in_channels, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + weight_init.c2_xavier_fill(output_conv) + self.add_module("layer_{}".format(idx + 1), output_conv) + + lateral_convs.append(None) + output_convs.append(output_conv) + else: + lateral_norm = get_norm(norm, conv_dim) + output_norm = get_norm(norm, conv_dim) + + lateral_conv = Conv2d( + in_channels, + conv_dim, + kernel_size=1, + bias=use_bias, + norm=lateral_norm, + ) + output_conv = Conv2d( + conv_dim, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + weight_init.c2_xavier_fill(lateral_conv) + weight_init.c2_xavier_fill(output_conv) + self.add_module("adapter_{}".format(idx + 1), lateral_conv) + self.add_module("layer_{}".format(idx + 1), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs 
        # into top-down order (from low to high resolution)
        # to make the top-down computation in forward clearer.
        self.lateral_convs = lateral_convs[::-1]
        self.output_convs = output_convs[::-1]

        self.mask_dim = mask_dim
        # Final 3x3 conv projecting the finest pyramid level into the mask-embedding space.
        self.mask_features = Conv2d(
            conv_dim,
            mask_dim,
            kernel_size=3,
            stride=1,
            padding=1,
        )
        weight_init.c2_xavier_fill(self.mask_features)

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        # Translate the detectron2 config node into constructor kwargs,
        # keeping only the backbone features this head actually consumes.
        ret = {}
        ret["input_shape"] = {
            k: v
            for k, v in input_shape.items()
            if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES
        }
        ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM
        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM
        ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM
        return ret

    def forward_features(self, features):
        """FPN-style top-down pass over the backbone features.

        Args:
            features: dict mapping feature name (e.g. "res2".."res5") to tensor.

        Returns:
            Tuple of (mask features, None). The second slot exists so the
            signature matches decoders that also return transformer features.
        """
        # Reverse feature maps into top-down order (from low to high resolution)
        for idx, f in enumerate(self.in_features[::-1]):
            x = features[f]
            lateral_conv = self.lateral_convs[idx]
            output_conv = self.output_convs[idx]
            if lateral_conv is None:
                # Coarsest level: no lateral connection, output conv only.
                y = output_conv(x)
            else:
                cur_fpn = lateral_conv(x)
                # Following FPN implementation, we use nearest upsampling here
                y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
                y = output_conv(y)
        return self.mask_features(y), None

    def forward(self, features, targets=None):
        # Deprecated entry point; callers should use forward_features() directly.
        logger = logging.getLogger(__name__)
        logger.warning(
            "Calling forward() may cause unpredicted behavior of PixelDecoder module."
        )
        return self.forward_features(features)


class TransformerEncoderOnly(nn.Module):
    """A DETR-style transformer reduced to its encoder stack (no decoder)."""

    def __init__(
        self,
        d_model=512,
        nhead=8,
        num_encoder_layers=6,
        dim_feedforward=2048,
        dropout=0.1,
        activation="relu",
        normalize_before=False,
    ):
        super().__init__()

        encoder_layer = TransformerEncoderLayer(
            d_model, nhead, dim_feedforward, dropout, activation, normalize_before
        )
        # A final LayerNorm is only needed for the pre-norm variant.
        encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
        self.encoder = TransformerEncoder(
            encoder_layer, num_encoder_layers, encoder_norm
        )

        self._reset_parameters()

        self.d_model = d_model
        self.nhead = nhead

    def _reset_parameters(self):
        # Xavier-initialize every weight matrix; 1-D params (biases, norms) keep defaults.
        for p in self.parameters():
            if p.dim() > 1:
                nn.init.xavier_uniform_(p)

    def forward(self, src, mask, pos_embed):
        # flatten NxCxHxW to HWxNxC
        bs, c, h, w = src.shape
        src = src.flatten(2).permute(2, 0, 1)
        pos_embed = pos_embed.flatten(2).permute(2, 0, 1)
        if mask is not None:
            mask = mask.flatten(1)

        memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed)
        # Restore the NxCxHxW layout expected by the convolutional decoder.
        return memory.permute(1, 2, 0).view(bs, c, h, w)


@SEM_SEG_HEADS_REGISTRY.register()
class TransformerEncoderPixelDecoder(BasePixelDecoder):
    """Pixel decoder that runs a transformer encoder on the coarsest feature map."""

    @configurable
    def __init__(
        self,
        input_shape: Dict[str, ShapeSpec],
        *,
        transformer_dropout: float,
        transformer_nheads: int,
        transformer_dim_feedforward: int,
        transformer_enc_layers: int,
        transformer_pre_norm: bool,
        conv_dim: int,
        mask_dim: int,
        norm: Optional[Union[str, Callable]] = None,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            input_shape: shapes (channels and stride) of the input features
            transformer_dropout: dropout probability in transformer
            transformer_nheads: number of heads in transformer
            transformer_dim_feedforward: dimension of feedforward network
            transformer_enc_layers: number of transformer encoder layers
            transformer_pre_norm: whether to use pre-layernorm or not
            conv_dim: number of output channels for the intermediate conv layers.
            mask_dim: number of output channels for the final conv layer.
            norm (str or callable): normalization for all conv layers
        """
        super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm)

        input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride)
        self.in_features = [k for k, v in input_shape]  # starting from "res2" to "res5"
        feature_strides = [v.stride for k, v in input_shape]
        feature_channels = [v.channels for k, v in input_shape]

        # The transformer consumes the coarsest (last, highest-stride) feature map.
        in_channels = feature_channels[len(self.in_features) - 1]
        self.input_proj = Conv2d(in_channels, conv_dim, kernel_size=1)
        weight_init.c2_xavier_fill(self.input_proj)
        self.transformer = TransformerEncoderOnly(
            d_model=conv_dim,
            dropout=transformer_dropout,
            nhead=transformer_nheads,
            dim_feedforward=transformer_dim_feedforward,
            num_encoder_layers=transformer_enc_layers,
            normalize_before=transformer_pre_norm,
        )
        N_steps = conv_dim // 2
        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)

        # update layer
        use_bias = norm == ""
        output_norm = get_norm(norm, conv_dim)
        output_conv = Conv2d(
            conv_dim,
            conv_dim,
            kernel_size=3,
            stride=1,
            padding=1,
            bias=use_bias,
            norm=output_norm,
            activation=F.relu,
        )
        weight_init.c2_xavier_fill(output_conv)
        # Replace the base class's coarsest-level output conv with this one,
        # re-registering it under the same module name.
        delattr(self, "layer_{}".format(len(self.in_features)))
        self.add_module("layer_{}".format(len(self.in_features)), output_conv)
        self.output_convs[0] = output_conv

    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]):
        # Reuse the base decoder's config mapping and add the transformer knobs.
        ret = super().from_config(cfg, input_shape)
        ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
        ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
        ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
        ret[
            "transformer_enc_layers"
        ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS  # a separate config
        ret["transformer_pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
        return ret

    def forward_features(self, features):
        """Top-down pass; the coarsest level is first refined by the transformer.

        Returns:
            Tuple of (mask features, transformer encoder output); the latter is
            reused as memory by the Transformer decoder.
        """
        # Reverse feature maps into top-down order (from low to high resolution)
        for idx, f in enumerate(self.in_features[::-1]):
            x = features[f]
            lateral_conv = self.lateral_convs[idx]
            output_conv = self.output_convs[idx]
            if lateral_conv is None:
                # Coarsest level (first iteration): project and run the encoder.
                transformer = self.input_proj(x)
                pos = self.pe_layer(x)
                transformer = self.transformer(transformer, None, pos)
                y = output_conv(transformer)
                # save intermediate feature as input to Transformer decoder
                transformer_encoder_features = transformer
            else:
                cur_fpn = lateral_conv(x)
                # Following FPN implementation, we use nearest upsampling here
                y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest")
                y = output_conv(y)
        return self.mask_features(y), transformer_encoder_features

    def forward(self, features, targets=None):
        # Deprecated entry point; callers should use forward_features() directly.
        logger = logging.getLogger(__name__)
        logger.warning(
            "Calling forward() may cause unpredicted behavior of PixelDecoder module."
        )
        return self.forward_features(features)

# ---- open_vocab_seg/modeling/matcher.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/models/matcher.py
# Copyright (c) Meta Platforms, Inc.
# All Rights Reserved

"""
Modules to compute the matching cost and solve the corresponding LSAP.
"""
import torch
import torch.nn.functional as F
from scipy.optimize import linear_sum_assignment
from torch import nn


def batch_dice_loss(inputs, targets):
    """
    Compute the pairwise DICE loss, similar to generalized IOU for masks.
    Args:
        inputs: A float tensor of shape [num_preds, C] holding prediction
            logits; a sigmoid is applied internally.
        targets: A float tensor of shape [num_targets, C]. Stores the binary
            classification label for each element
            (0 for the negative class and 1 for the positive class).
    Returns:
        A [num_preds, num_targets] tensor of dice losses for every
        prediction/target pair.
    """
    inputs = inputs.sigmoid()
    inputs = inputs.flatten(1)
    # Pairwise intersection via einsum; the +1 smoothing keeps empty masks finite.
    numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets)
    denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :]
    loss = 1 - (numerator + 1) / (denominator + 1)
    return loss


def batch_sigmoid_focal_loss(inputs, targets, alpha: float = 0.25, gamma: float = 2):
    """
    Pairwise focal loss as used in RetinaNet: https://arxiv.org/abs/1708.02002.
    Args:
        inputs: A float tensor of shape [num_preds, C] holding prediction logits.
        targets: A float tensor of shape [num_targets, C]. Stores the binary
            classification label for each element
            (0 for the negative class and 1 for the positive class).
        alpha: (optional) Weighting factor in range (0,1) to balance
            positive vs negative examples. Default = 0.25; pass a negative
            value to disable the weighting.
        gamma: Exponent of the modulating factor (1 - p_t) to
            balance easy vs hard examples.
    Returns:
        A [num_preds, num_targets] loss tensor, averaged over the C dimension.
    """
    hw = inputs.shape[1]

    prob = inputs.sigmoid()
    # Per-element focal terms for the "target is 1" and "target is 0" cases;
    # the einsum below mixes them according to each target mask.
    focal_pos = ((1 - prob) ** gamma) * F.binary_cross_entropy_with_logits(
        inputs, torch.ones_like(inputs), reduction="none"
    )
    focal_neg = (prob ** gamma) * F.binary_cross_entropy_with_logits(
        inputs, torch.zeros_like(inputs), reduction="none"
    )
    if alpha >= 0:
        focal_pos = focal_pos * alpha
        focal_neg = focal_neg * (1 - alpha)

    loss = torch.einsum("nc,mc->nm", focal_pos, targets) + torch.einsum(
        "nc,mc->nm", focal_neg, (1 - targets)
    )

    return loss / hw


class HungarianMatcher(nn.Module):
    """This class computes an assignment between the targets and the predictions of the network

    For efficiency reasons, the targets don't include the no_object. Because of this, in general,
    there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
    while the others are un-matched (and thus treated as non-objects).
    """

    def __init__(
        self, cost_class: float = 1, cost_mask: float = 1, cost_dice: float = 1
    ):
        """Creates the matcher

        Params:
            cost_class: This is the relative weight of the classification error in the matching cost
            cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost
            cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost
        """
        super().__init__()
        self.cost_class = cost_class
        self.cost_mask = cost_mask
        self.cost_dice = cost_dice
        assert (
            cost_class != 0 or cost_mask != 0 or cost_dice != 0
        ), "all costs cant be 0"

    @torch.no_grad()
    def memory_efficient_forward(self, outputs, targets):
        """More memory-friendly matching: processes one image at a time.

        NOTE: the original version also computed a batch-wide mask padding
        size (h_max/w_max) that was never used; that dead code has been
        removed, which additionally makes an empty target list a no-op
        instead of a ValueError.
        """
        bs, num_queries = outputs["pred_logits"].shape[:2]

        indices = []

        # Iterate through batch size
        for b in range(bs):

            out_prob = outputs["pred_logits"][b].softmax(
                -1
            )  # [num_queries, num_classes]
            out_mask = outputs["pred_masks"][b]  # [num_queries, H_pred, W_pred]

            tgt_ids = targets[b]["labels"]
            # gt masks are already padded when preparing target
            tgt_mask = targets[b]["masks"].to(out_mask)

            # Compute the classification cost. Contrary to the loss, we don't use the NLL,
            # but approximate it in 1 - proba[target class].
            # The 1 is a constant that doesn't change the matching, it can be omitted.
            cost_class = -out_prob[:, tgt_ids]

            # Downsample gt masks to save memory
            tgt_mask = F.interpolate(
                tgt_mask[:, None], size=out_mask.shape[-2:], mode="nearest"
            )

            # Flatten spatial dimension
            out_mask = out_mask.flatten(1)  # [num_queries, H*W]
            tgt_mask = tgt_mask[:, 0].flatten(1)  # [num_total_targets, H*W]

            # Compute the focal loss between masks
            cost_mask = batch_sigmoid_focal_loss(out_mask, tgt_mask)

            # Compute the dice loss between masks
            cost_dice = batch_dice_loss(out_mask, tgt_mask)

            # Final cost matrix
            C = (
                self.cost_mask * cost_mask
                + self.cost_class * cost_class
                + self.cost_dice * cost_dice
            )
            C = C.reshape(num_queries, -1).cpu()

            # Hungarian algorithm (LSAP) on the per-image cost matrix.
            indices.append(linear_sum_assignment(C))
        return [
            (
                torch.as_tensor(i, dtype=torch.int64),
                torch.as_tensor(j, dtype=torch.int64),
            )
            for i, j in indices
        ]

    @torch.no_grad()
    def forward(self, outputs, targets):
        """Performs the matching

        Params:
            outputs: This is a dict that contains at least these entries:
                 "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
                 "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks

            targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing:
                 "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth
                           objects in the target) containing the class labels
                 "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks

        Returns:
            A list of size batch_size, containing tuples of (index_i, index_j) where:
                - index_i is the indices of the selected predictions (in order)
                - index_j is the indices of the corresponding selected targets (in order)
            For each batch element, it holds:
                len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
        """
        return self.memory_efficient_forward(outputs, targets)

    def __repr__(self):
        head = "Matcher " + self.__class__.__name__
        body = [
            "cost_class: {}".format(self.cost_class),
            "cost_mask: {}".format(self.cost_mask),
            "cost_dice: {}".format(self.cost_dice),
        ]
        _repr_indent = 4
        lines = [head] + [" " * _repr_indent + line for line in body]
        return "\n".join(lines)

# ---- open_vocab_seg/modeling/transformer/__init__.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

# ---- open_vocab_seg/modeling/transformer/open_vocab_transformer_predictor.py ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
# Copyright (c) Meta Platforms, Inc.
# All Rights Reserved

from torch import nn
from detectron2.config import configurable
from .transformer_predictor import TransformerPredictor, MLP


class OpenVocabTransformerPredictor(TransformerPredictor):
    """Transformer predictor whose class head maps into a text-embedding space
    instead of a fixed closed-set classifier."""

    @configurable
    def __init__(
        self,
        in_channels,
        mask_classification=True,
        *,
        embedding_dim: int,
        embed_hidden_dim: int,
        embed_layers: int,
        hidden_dim: int,
        num_queries: int,
        nheads: int,
        dropout: float,
        dim_feedforward: int,
        enc_layers: int,
        dec_layers: int,
        pre_norm: bool,
        deep_supervision: bool,
        mask_dim: int,
        enforce_input_project: bool,
    ):
        # Pass mask_classification=False so the base class does NOT build its
        # fixed-class nn.Linear head; we attach an MLP embedding head below.
        super().__init__(
            in_channels,
            False,
            num_classes=embedding_dim,
            hidden_dim=hidden_dim,
            num_queries=num_queries,
            nheads=nheads,
            dropout=dropout,
            dim_feedforward=dim_feedforward,
            enc_layers=enc_layers,
            dec_layers=dec_layers,
            pre_norm=pre_norm,
            deep_supervision=deep_supervision,
            mask_dim=mask_dim,
            enforce_input_project=enforce_input_project,
        )
        self.mask_classification = mask_classification
        # output FFNs: project query features to the shared embedding space.
        if self.mask_classification:
            self.class_embed = MLP(
                hidden_dim, embed_hidden_dim, embedding_dim, embed_layers
            )

    def freeze_pretrained(self):
        # Freeze every submodule except the class-embedding head, which is
        # the only part trained for open-vocabulary classification.
        for name, module in self.named_children():
            if name not in ["class_embed"]:
                for param in module.parameters():
                    param.requires_grad = False

    @classmethod
    def from_config(cls, cfg, in_channels, mask_classification):
        # Map the detectron2 config node onto constructor kwargs.
        ret = {}
        ret["in_channels"] = in_channels
        ret["mask_classification"] = mask_classification

        ret["embedding_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBEDDING_DIM
        ret["embed_hidden_dim"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_HIDDEN_DIM
        ret["embed_layers"] = cfg.MODEL.SEM_SEG_HEAD.EMBED_LAYERS
        ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
        ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
        # Transformer parameters:
        ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
        ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
        ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
        ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
        ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
        ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ

        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM

        return ret

# ---- open_vocab_seg/modeling/transformer/position_encoding.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# # Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/position_encoding.py
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

"""
Various positional encodings for the transformer.
"""
import math

import torch
from torch import nn


class PositionEmbeddingSine(nn.Module):
    """
    This is a more standard version of the position embedding, very similar to the one
    used by the Attention is all you need paper, generalized to work on images.
    """

    def __init__(
        self, num_pos_feats=64, temperature=10000, normalize=False, scale=None
    ):
        super().__init__()
        self.num_pos_feats = num_pos_feats
        self.temperature = temperature
        self.normalize = normalize
        if scale is not None and normalize is False:
            raise ValueError("normalize should be True if scale is passed")
        if scale is None:
            scale = 2 * math.pi
        self.scale = scale

    def forward(self, x, mask=None):
        # mask marks padded pixels; with no mask, every position is valid.
        if mask is None:
            mask = torch.zeros(
                (x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool
            )
        not_mask = ~mask
        # Cumulative sums give 1-based row/column coordinates over valid pixels.
        y_embed = not_mask.cumsum(1, dtype=torch.float32)
        x_embed = not_mask.cumsum(2, dtype=torch.float32)
        if self.normalize:
            eps = 1e-6
            # Scale coordinates to [0, scale] using the last valid index.
            y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale
            x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale

        # Geometric frequency ladder; // 2 pairs each sin with its cos channel.
        dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device)
        dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats)

        pos_x = x_embed[:, :, :, None] / dim_t
        pos_y = y_embed[:, :, :, None] / dim_t
        # Interleave sin/cos per frequency, then flatten the channel pairs.
        pos_x = torch.stack(
            (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        pos_y = torch.stack(
            (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4
        ).flatten(3)
        # Output layout: [N, 2*num_pos_feats, H, W] (y-features first).
        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
        return pos

# ---- open_vocab_seg/modeling/transformer/transformer.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/transformer.py
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
# Module docstring begins: "Transformer class." (continues in the next chunk).
+ +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +import copy +from typing import List, Optional + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + + +class Transformer(nn.Module): + def __init__( + self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + ): + super().__init__() + + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder( + encoder_layer, num_encoder_layers, encoder_norm + ) + + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + ) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, src, mask, query_embed, pos_embed): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + if mask is not None: + mask = mask.flatten(1) + + tgt = torch.zeros_like(query_embed) + memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) + hs = self.decoder( + tgt, + memory, + memory_key_padding_mask=mask, + pos=pos_embed, + query_pos=query_embed, + ) + return hs.transpose(1, 2), memory.permute(1, 2, 
0).view(bs, c, h, w) + + +class TransformerEncoder(nn.Module): + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + output = src + + for layer in self.layers: + output = layer( + output, + src_mask=mask, + src_key_padding_mask=src_key_padding_mask, + pos=pos, + ) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, + query_pos=query_pos, + ) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + 
normalize_before=False, + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn( + q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask + )[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + src2 = self.norm1(src) + q = k = self.with_pos_embed(src2, pos) + src2 = self.self_attn( + q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask + )[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, 
src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn( + q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask + )[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + )[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = 
None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn( + q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask + )[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + )[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + if self.normalize_before: + return self.forward_pre( + tgt, + memory, + tgt_mask, + memory_mask, + tgt_key_padding_mask, + memory_key_padding_mask, + pos, + query_pos, + ) + return self.forward_post( + tgt, + memory, + tgt_mask, + memory_mask, + tgt_key_padding_mask, + memory_key_padding_mask, + pos, + query_pos, + ) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") diff --git a/open_vocab_seg/modeling/transformer/transformer_predictor.py b/open_vocab_seg/modeling/transformer/transformer_predictor.py 
# new file mode 100644, index 0000000..72378ab
# --- /dev/null
# +++ b/open_vocab_seg/modeling/transformer/transformer_predictor.py
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detr/blob/master/models/detr.py
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

import fvcore.nn.weight_init as weight_init
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.layers import Conv2d

from .position_encoding import PositionEmbeddingSine
from .transformer import Transformer


class TransformerPredictor(nn.Module):
    """MaskFormer head: a DETR-style transformer that turns per-image features
    into per-query class logits and mask embeddings."""

    @configurable
    def __init__(
        self,
        in_channels,
        mask_classification=True,
        *,
        num_classes: int,
        hidden_dim: int,
        num_queries: int,
        nheads: int,
        dropout: float,
        dim_feedforward: int,
        enc_layers: int,
        dec_layers: int,
        pre_norm: bool,
        deep_supervision: bool,
        mask_dim: int,
        enforce_input_project: bool,
    ):
        """
        NOTE: this interface is experimental.
        Args:
            in_channels: channels of the input features
            mask_classification: whether to add mask classifier or not
            num_classes: number of classes
            hidden_dim: Transformer feature dimension
            num_queries: number of queries
            nheads: number of heads
            dropout: dropout in Transformer
            dim_feedforward: feature dimension in feedforward network
            enc_layers: number of Transformer encoder layers
            dec_layers: number of Transformer decoder layers
            pre_norm: whether to use pre-LayerNorm or not
            deep_supervision: whether to add supervision to every decoder layers
            mask_dim: mask feature dimension
            enforce_input_project: add input project 1x1 conv even if input
                channels and hidden dim is identical
        """
        super().__init__()

        self.mask_classification = mask_classification

        # positional encoding
        N_steps = hidden_dim // 2
        self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True)

        transformer = Transformer(
            d_model=hidden_dim,
            dropout=dropout,
            nhead=nheads,
            dim_feedforward=dim_feedforward,
            num_encoder_layers=enc_layers,
            num_decoder_layers=dec_layers,
            normalize_before=pre_norm,
            return_intermediate_dec=deep_supervision,
        )

        self.num_queries = num_queries
        self.transformer = transformer
        hidden_dim = transformer.d_model

        # Learned object queries.
        self.query_embed = nn.Embedding(num_queries, hidden_dim)

        # 1x1 projection is only needed when channel counts differ (or forced).
        if in_channels != hidden_dim or enforce_input_project:
            self.input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1)
            weight_init.c2_xavier_fill(self.input_proj)
        else:
            self.input_proj = nn.Sequential()
        self.aux_loss = deep_supervision

        # output FFNs
        if self.mask_classification:
            # +1 output for the "no object" class.
            self.class_embed = nn.Linear(hidden_dim, num_classes + 1)
        self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3)

    @classmethod
    def from_config(cls, cfg, in_channels, mask_classification):
        # Map the detectron2 config node onto constructor kwargs.
        ret = {}
        ret["in_channels"] = in_channels
        ret["mask_classification"] = mask_classification

        ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES
        ret["hidden_dim"] = cfg.MODEL.MASK_FORMER.HIDDEN_DIM
        ret["num_queries"] = cfg.MODEL.MASK_FORMER.NUM_OBJECT_QUERIES
        # Transformer parameters:
        ret["nheads"] = cfg.MODEL.MASK_FORMER.NHEADS
        ret["dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT
        ret["dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD
        ret["enc_layers"] = cfg.MODEL.MASK_FORMER.ENC_LAYERS
        ret["dec_layers"] = cfg.MODEL.MASK_FORMER.DEC_LAYERS
        ret["pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM
        ret["deep_supervision"] = cfg.MODEL.MASK_FORMER.DEEP_SUPERVISION
        ret["enforce_input_project"] = cfg.MODEL.MASK_FORMER.ENFORCE_INPUT_PROJ

        ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM

        return ret

    def forward(self, x, mask_features):
        """
        Args:
            x: feature map fed to the transformer (e.g. transformer encoder output).
            mask_features: per-pixel embeddings used to decode masks.
        Returns:
            dict with "pred_logits" (if classifying), "pred_masks", and
            "aux_outputs" when deep supervision is on.
        """
        pos = self.pe_layer(x)

        src = x
        # No padding mask is used here; all positions are attended.
        mask = None
        hs, memory = self.transformer(
            self.input_proj(src), mask, self.query_embed.weight, pos
        )

        if self.mask_classification:
            outputs_class = self.class_embed(hs)
            out = {"pred_logits": outputs_class[-1]}
        else:
            out = {}

        if self.aux_loss:
            # [l, bs, queries, embed]
            mask_embed = self.mask_embed(hs)
            # Dot-product every query embedding with every pixel embedding.
            outputs_seg_masks = torch.einsum(
                "lbqc,bchw->lbqhw", mask_embed, mask_features
            )
            out["pred_masks"] = outputs_seg_masks[-1]
            out["aux_outputs"] = self._set_aux_loss(
                outputs_class if self.mask_classification else None, outputs_seg_masks
            )
        else:
            # FIXME h_boxes takes the last one computed, keep this in mind
            # [bs, queries, embed]
            mask_embed = self.mask_embed(hs[-1])
            outputs_seg_masks = torch.einsum(
                "bqc,bchw->bqhw", mask_embed, mask_features
            )
            out["pred_masks"] = outputs_seg_masks
        return out

    @torch.jit.unused
    def _set_aux_loss(self, outputs_class, outputs_seg_masks):
        # this is a workaround to make torchscript happy, as torchscript
        # doesn't support dictionary with non-homogeneous values, such
        # as a dict having both a Tensor and a list.
        if self.mask_classification:
            return [
                {"pred_logits": a, "pred_masks": b}
                for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1])
            ]
        else:
            return [{"pred_masks": b} for b in outputs_seg_masks[:-1]]


class MLP(nn.Module):
    """Very simple multi-layer perceptron (also called FFN)"""

    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
        super().__init__()
        self.num_layers = num_layers
        # Layer widths: input_dim -> hidden_dim * (num_layers-1) -> output_dim.
        h = [hidden_dim] * (num_layers - 1)
        self.layers = nn.ModuleList(
            nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])
        )

    def forward(self, x):
        # ReLU between layers; the final layer is linear.
        for i, layer in enumerate(self.layers):
            x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
        return x

# ---- open_vocab_seg/ovseg_model.py (new file in this diff) ----
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc.
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
# Modified by Feng Liang from
# https://github.com/MendelXu/zsseg.baseline/blob/master/mask_former/zero_shot_mask_former_model.py

import logging
from typing import Tuple

import numpy as np
import torch
from torch import nn
from torch.nn import functional as F

from detectron2.config import configurable
from detectron2.data import MetadataCatalog
from detectron2.modeling import META_ARCH_REGISTRY
from detectron2.modeling.backbone import Backbone
from detectron2.modeling.postprocessing import sem_seg_postprocess
from detectron2.structures import ImageList
from detectron2.utils.logger import log_first_n
from .modeling.clip_adapter import (
    ClipAdapter,
    MaskFormerClipAdapter,
    build_text_prompt,
)
from .mask_former_model import MaskFormer
from .utils.misc import get_gt_binary_masks


@META_ARCH_REGISTRY.register()
class OVSeg(MaskFormer):
    """
    Main class for zero-shot mask-classification semantic segmentation.

    Extends MaskFormer with a CLIP adapter: query logits are replaced by
    similarities between CLIP text embeddings of the class names and the
    (normalized) query embeddings.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        clip_adapter: nn.Module,
        criterion: nn.Module,
        num_queries: int,
        panoptic_on: bool,
        object_mask_threshold: float,
        overlap_threshold: float,
        metadata,
        size_divisibility: int,
        sem_seg_postprocess_before_inference: bool,
        clip_ensemble: bool,
        clip_ensemble_weight: float,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            criterion: a module that defines the loss
            clip_adapter: adapter for clip-based mask classification
            num_queries: int, number of queries
            panoptic_on: bool, whether to output panoptic segmentation prediction
            object_mask_threshold: float, threshold to filter query based on classification
                score for panoptic segmentation inference
            overlap_threshold: overlap threshold used in general inference for panoptic
                segmentation
            metadata: dataset meta, get `thing` and `stuff` category names for panoptic
                segmentation inference
            size_divisibility: Some backbones require the input height and width to be
                divisible by a specific integer. We can use this to override such requirement.
            sem_seg_postprocess_before_inference: whether to resize the prediction back
                to original input size before semantic segmentation inference or after.
                For high-resolution dataset like Mapillary, resizing predictions before
                inference will cause OOM error.
            clip_ensemble: whether to fuse CLIP region classification with the head's logits
            clip_ensemble_weight: geometric-mean weight given to the CLIP scores
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__(
            backbone=backbone,
            sem_seg_head=sem_seg_head,
            criterion=criterion,
            num_queries=num_queries,
            panoptic_on=panoptic_on,
            object_mask_threshold=object_mask_threshold,
            overlap_threshold=overlap_threshold,
            metadata=metadata,
            size_divisibility=size_divisibility,
            sem_seg_postprocess_before_inference=sem_seg_postprocess_before_inference,
            pixel_mean=pixel_mean,
            pixel_std=pixel_std,
        )
        self.clip_adapter: ClipAdapter = clip_adapter

        self.clip_ensemble: bool = clip_ensemble
        self.clip_ensemble_weight: float = clip_ensemble_weight

    @classmethod
    def from_config(cls, cfg):
        # Start from the plain MaskFormer kwargs and add the CLIP pieces.
        init_kwargs = MaskFormer.from_config(cfg)
        text_templates = build_text_prompt(cfg.MODEL.CLIP_ADAPTER)

        clip_adapter = MaskFormerClipAdapter(
            cfg.MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME,
            text_templates,
            mask_fill=cfg.MODEL.CLIP_ADAPTER.MASK_FILL,
            mask_expand_ratio=cfg.MODEL.CLIP_ADAPTER.MASK_EXPAND_RATIO,
            mask_thr=cfg.MODEL.CLIP_ADAPTER.MASK_THR,
            mask_matting=cfg.MODEL.CLIP_ADAPTER.MASK_MATTING,
            region_resized=cfg.MODEL.CLIP_ADAPTER.REGION_RESIZED,
            mask_prompt_depth=cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_DEPTH,
            mask_prompt_fwd=cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD,
        )
        init_kwargs["clip_adapter"] = clip_adapter
        init_kwargs["clip_ensemble"] = cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE
        init_kwargs[
            "clip_ensemble_weight"
        ] = cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT

        return init_kwargs

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: a list, batched outputs of :class:`DatasetMapper`.
                Each item contains "image" (CHW tensor), optionally "instances"
                (per-region ground truth), and "height"/"width" (desired output
                resolution, may differ from the input resolution).

        Returns:
            In training: a dict of weighted losses.
            In inference: list[dict] with a "sem_seg" KxHxW logits tensor per
            image (plus "panoptic_seg" when panoptic inference is enabled).
        """
        # All images in the batch must come from the same dataset so that a
        # single class-name list (and text-feature matrix) applies.
        dataset_name = [x["meta"]["dataset_name"] for x in batched_inputs]
        assert len(set(dataset_name)) == 1
        dataset_name = dataset_name[0]

        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.size_divisibility)

        features = self.backbone(images.tensor)
        outputs = self.sem_seg_head(features)
        class_names = self.get_class_name_list(dataset_name)
        text_features = self.clip_adapter.get_text_features(class_names)
        # Replace the head's raw logits with CLIP text-vs-query similarities.
        outputs["pred_logits"] = self.clip_adapter.get_sim_logits(
            text_features, self.clip_adapter.normalize_feature(outputs["pred_logits"])
        )
        if self.training:
            if "aux_outputs" in outputs.keys():
                # Deep-supervision branches get the same CLIP re-scoring.
                for i in range(len(outputs["aux_outputs"])):
                    outputs["aux_outputs"][i][
                        "pred_logits"
                    ] = self.clip_adapter.get_sim_logits(
                        text_features,
                        self.clip_adapter.normalize_feature(
                            outputs["aux_outputs"][i]["pred_logits"]
                        ),
                    )
            # mask classification target
            if "instances" in batched_inputs[0]:
                gt_instances = [x["instances"].to(self.device) for x in batched_inputs]
                targets = self.prepare_targets(gt_instances, images)
            else:
                targets = None

            # bipartite matching-based loss
            losses = self.criterion(outputs, targets)

            for k in list(losses.keys()):
                if k in self.criterion.weight_dict:
                    losses[k] *= self.criterion.weight_dict[k]
                else:
                    # remove this loss if not specified in `weight_dict`
                    losses.pop(k)

            return losses
        else:
            mask_cls_results = outputs["pred_logits"]
            mask_pred_results = outputs["pred_masks"]
            # upsample masks to the padded input resolution
            mask_pred_results = F.interpolate(
                mask_pred_results,
                size=(images.tensor.shape[-2], images.tensor.shape[-1]),
                mode="bilinear",
                align_corners=False,
            )

            processed_results = []
            for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
                mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
            ):
                # First crop away the padding at the un-padded image size ...
                height = image_size[0]
                width = image_size[1]
                mask_pred_result = sem_seg_postprocess(
                    mask_pred_result, image_size, height, width
                )
                image = input_per_image["image"].to(self.device)

                r, regions = self.semantic_inference(
                    mask_cls_result, mask_pred_result, image, class_names
                )

                # ... then resize to the caller-requested output resolution.
                height = input_per_image.get("height", image_size[0])
                width = input_per_image.get("width", image_size[1])
                r = sem_seg_postprocess(r, image_size, height, width)
                processed_results.append({"sem_seg": r})

                # panoptic segmentation inference
                if self.panoptic_on:
                    panoptic_r = self.panoptic_inference(
                        mask_cls_result, mask_pred_result
                    )
                    processed_results[-1]["panoptic_seg"] = panoptic_r

            return processed_results
width = image_size[1] + mask_pred_result = sem_seg_postprocess( + mask_pred_result, image_size, height, width + ) + image = input_per_image["image"].to(self.device) + + r, regions = self.semantic_inference( + mask_cls_result, mask_pred_result, image, class_names + ) + + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = sem_seg_postprocess(r, image_size, height, width) + processed_results.append({"sem_seg": r}) + + # panoptic segmentation inference + if self.panoptic_on: + panoptic_r = self.panoptic_inference( + mask_cls_result, mask_pred_result + ) + processed_results[-1]["panoptic_seg"] = panoptic_r + + return processed_results + + + def semantic_inference(self, mask_cls, mask_pred, image, class_names): + mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + + regions = None + if self.clip_ensemble: + clip_cls, regions, valid_flag = self.clip_adapter( + image, class_names, mask_pred, normalize=True + ) + if clip_cls is None: + clip_cls = torch.empty(0, mask_cls.shape[-1] + 1, device=self.device) + # softmax before index or after? + clip_cls = F.softmax(clip_cls[:, :-1], dim=-1) + if self.clip_ensemble_weight > 0: + map_back_clip_cls = mask_cls.new_ones(mask_cls.shape) + map_back_clip_cls[valid_flag] = clip_cls + mask_cls = torch.pow(mask_cls, 1 - self.clip_ensemble_weight) * \ + torch.pow(map_back_clip_cls, self.clip_ensemble_weight) + + + else: + # only clip model predictions are used + mask_cls = clip_cls + mask_pred = mask_pred[valid_flag] + semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) + return semseg, regions + + def get_class_name_list(self, dataset_name): + class_names = [ + c.strip() for c in MetadataCatalog.get(dataset_name).stuff_classes + ] + return class_names + + +@META_ARCH_REGISTRY.register() +class OVSegDEMO(MaskFormer): + """ + Main class for zero shot mask classification semantic segmentation architectures. 
    """
    Demo variant of OVSeg: classifies against a user-supplied class-name list
    instead of a registered dataset's categories. Inference only.
    """

    @configurable
    def __init__(
        self,
        *,
        backbone: Backbone,
        sem_seg_head: nn.Module,
        clip_adapter: nn.Module,
        criterion: nn.Module,
        num_queries: int,
        panoptic_on: bool,
        object_mask_threshold: float,
        overlap_threshold: float,
        metadata,
        size_divisibility: int,
        sem_seg_postprocess_before_inference: bool,
        clip_ensemble: bool,
        clip_ensemble_weight: float,
        pixel_mean: Tuple[float],
        pixel_std: Tuple[float],
    ):
        """
        Args:
            backbone: a backbone module, must follow detectron2's backbone interface
            sem_seg_head: a module that predicts semantic segmentation from backbone features
            criterion: a module that defines the loss
            clip_adapter: adapter for clip-based mask classification
            num_queries: int, number of queries
            panoptic_on: bool, whether to output panoptic segmentation prediction
            object_mask_threshold: float, threshold to filter query based on classification
                score for panoptic segmentation inference
            overlap_threshold: overlap threshold used in general inference for panoptic
                segmentation
            metadata: dataset meta, get `thing` and `stuff` category names for panoptic
                segmentation inference
            size_divisibility: Some backbones require the input height and width to be
                divisible by a specific integer. We can use this to override such requirement.
            sem_seg_postprocess_before_inference: whether to resize the prediction back
                to original input size before semantic segmentation inference or after.
                For high-resolution dataset like Mapillary, resizing predictions before
                inference will cause OOM error.
            clip_ensemble: whether to fuse CLIP region classification with the head's logits
            clip_ensemble_weight: geometric-mean weight given to the CLIP scores
            pixel_mean, pixel_std: list or tuple with #channels element, representing
                the per-channel mean and std to be used to normalize the input image
        """
        super().__init__(
            backbone=backbone,
            sem_seg_head=sem_seg_head,
            criterion=criterion,
            num_queries=num_queries,
            panoptic_on=panoptic_on,
            object_mask_threshold=object_mask_threshold,
            overlap_threshold=overlap_threshold,
            metadata=metadata,
            size_divisibility=size_divisibility,
            sem_seg_postprocess_before_inference=sem_seg_postprocess_before_inference,
            pixel_mean=pixel_mean,
            pixel_std=pixel_std,
        )
        self.clip_adapter: ClipAdapter = clip_adapter

        self.clip_ensemble: bool = clip_ensemble
        self.clip_ensemble_weight: float = clip_ensemble_weight

    @classmethod
    def from_config(cls, cfg):
        # Same wiring as OVSeg.from_config: MaskFormer kwargs + CLIP adapter.
        init_kwargs = MaskFormer.from_config(cfg)
        text_templates = build_text_prompt(cfg.MODEL.CLIP_ADAPTER)

        clip_adapter = MaskFormerClipAdapter(
            cfg.MODEL.CLIP_ADAPTER.CLIP_MODEL_NAME,
            text_templates,
            mask_fill=cfg.MODEL.CLIP_ADAPTER.MASK_FILL,
            mask_expand_ratio=cfg.MODEL.CLIP_ADAPTER.MASK_EXPAND_RATIO,
            mask_thr=cfg.MODEL.CLIP_ADAPTER.MASK_THR,
            mask_matting=cfg.MODEL.CLIP_ADAPTER.MASK_MATTING,
            region_resized=cfg.MODEL.CLIP_ADAPTER.REGION_RESIZED,
            mask_prompt_depth=cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_DEPTH,
            mask_prompt_fwd=cfg.MODEL.CLIP_ADAPTER.MASK_PROMPT_FWD,
        )
        init_kwargs["clip_adapter"] = clip_adapter
        init_kwargs["clip_ensemble"] = cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE
        init_kwargs[
            "clip_ensemble_weight"
        ] = cfg.MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT

        return init_kwargs

    def forward(self, batched_inputs):
        """
        Args:
            batched_inputs: list of dataset dicts; each must contain "image"
                (CHW tensor) and "class_names" (open-vocabulary class list),
                optionally "height"/"width" for the output resolution.

        Returns:
            list[dict] with a "sem_seg" KxHxW logits tensor per image.
        """
        images = [x["image"].to(self.device) for x in batched_inputs]
        images = [(x - self.pixel_mean) / self.pixel_std for x in images]
        images = ImageList.from_tensors(images, self.size_divisibility)

        features = self.backbone(images.tensor)
        outputs = self.sem_seg_head(features)
        class_names = batched_inputs[0]["class_names"]
        if len(class_names) == 1:
            # Because classification is performed in a 'contrastive' manner,
            # adding others to represent other concepts
            class_names.append('others')
        text_features = self.clip_adapter.get_text_features(class_names)
        outputs["pred_logits"] = self.clip_adapter.get_sim_logits(
            text_features, self.clip_adapter.normalize_feature(outputs["pred_logits"])
        )
        mask_cls_results = outputs["pred_logits"]
        mask_pred_results = outputs["pred_masks"]
        # upsample masks to the padded input resolution
        mask_pred_results = F.interpolate(
            mask_pred_results,
            size=(images.tensor.shape[-2], images.tensor.shape[-1]),
            mode="bilinear",
            align_corners=False,
        )

        processed_results = []
        for mask_cls_result, mask_pred_result, input_per_image, image_size in zip(
            mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes
        ):
            # Crop padding first, at the un-padded image size ...
            height = image_size[0]
            width = image_size[1]
            mask_pred_result = sem_seg_postprocess(
                mask_pred_result, image_size, height, width
            )
            image = input_per_image["image"].to(self.device)

            r, regions = self.demo_inference(
                mask_cls_result, mask_pred_result, image, class_names
            )

            # ... then resize to the caller-requested resolution.
            height = input_per_image.get("height", image_size[0])
            width = input_per_image.get("width", image_size[1])
            r = sem_seg_postprocess(r, image_size, height, width)
            processed_results.append({"sem_seg": r})

        return processed_results
image_size[0] + width = image_size[1] + mask_pred_result = sem_seg_postprocess( + mask_pred_result, image_size, height, width + ) + image = input_per_image["image"].to(self.device) + + r, regions = self.demo_inference(mask_cls_result, mask_pred_result, image, class_names) + + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + r = sem_seg_postprocess(r, image_size, height, width) + processed_results.append({"sem_seg": r}) + + return processed_results + + + + + def demo_inference(self, mask_cls, mask_pred, image, class_names): + mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + + regions = None + if self.clip_ensemble: + clip_cls, regions, valid_flag = self.clip_adapter( + image, class_names, mask_pred, normalize=True + ) + if clip_cls is None: + clip_cls = torch.empty(0, mask_cls.shape[-1] + 1, device=self.device) + # softmax before index or after? + clip_cls = F.softmax(clip_cls[:, :-1], dim=-1) + if self.clip_ensemble_weight > 0: + map_back_clip_cls = mask_cls.new_ones(mask_cls.shape) + map_back_clip_cls[valid_flag] = clip_cls + mask_cls = torch.pow(mask_cls, 1 - self.clip_ensemble_weight) * \ + torch.pow(map_back_clip_cls, self.clip_ensemble_weight) + + else: + # only clip model predictions are used + mask_cls = clip_cls + mask_pred = mask_pred[valid_flag] + # bin_mask = mask_pred > self.clip_adapter.mask_thr + # select_cls = torch.zeros(sum(valid_flag), mask_cls.shape[-1], device=self.device) + # select_mask = torch.argmax(mask_cls, dim=0) + # if len(class_names) == 2 and class_names[-1] == 'others': + # select_mask = select_mask[:-1] + # for idx in select_mask: + # select_cls[idx] = mask_cls[idx] + # semseg = torch.einsum("qc,qhw->chw", select_cls, bin_mask.float()) + semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) + return semseg, regions diff --git a/open_vocab_seg/test_time_augmentation.py b/open_vocab_seg/test_time_augmentation.py new file mode 100644 
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

import copy
from itertools import count
import math
import numpy as np
import torch
from fvcore.transforms import HFlipTransform
from torch import nn
from torch.nn.parallel import DistributedDataParallel

from detectron2.data.detection_utils import read_image
from detectron2.modeling import DatasetMapperTTA
from detectron2.modeling.postprocessing import sem_seg_postprocess
import logging
from detectron2.utils.logger import log_every_n, log_first_n

__all__ = [
    "SemanticSegmentorWithTTA",
]


class SemanticSegmentorWithTTA(nn.Module):
    """
    A SemanticSegmentor with test-time augmentation enabled.
    Its :meth:`__call__` method has the same interface as :meth:`SemanticSegmentor.forward`.
    """

    def __init__(self, cfg, model, tta_mapper=None, batch_size=1):
        """
        Args:
            cfg (CfgNode):
            model (SemanticSegmentor): a SemanticSegmentor to apply TTA on.
            tta_mapper (callable): takes a dataset dict and returns a list of
                augmented versions of the dataset dict. Defaults to
                `DatasetMapperTTA(cfg)`.
            batch_size (int): batch the augmented images into this batch size for inference.
        """
        super().__init__()
        if isinstance(model, DistributedDataParallel):
            # Unwrap DDP so we can call the bare model directly.
            model = model.module
        self.cfg = cfg.clone()

        self.model = model

        if tta_mapper is None:
            tta_mapper = DatasetMapperTTA(cfg)
        self.tta_mapper = tta_mapper
        self.batch_size = batch_size

    def _inference_with_model(self, inputs):
        """Run the wrapped model, optionally tiling each image into overlapping
        sliding windows (cfg.TEST.SLIDING_WINDOW) and averaging the per-tile
        predictions in the overlap regions."""
        if self.cfg.TEST.SLIDING_WINDOW:
            log_first_n(logging.INFO, "Using sliding window to test")

            outputs = []

            for input in inputs:
                image_size = input["image"].shape[1:]  # h,w
                if self.cfg.TEST.SLIDING_TILE_SIZE > 0:
                    tile_size = (
                        self.cfg.TEST.SLIDING_TILE_SIZE,
                        self.cfg.TEST.SLIDING_TILE_SIZE,
                    )
                else:
                    # Heuristic tile size keyed on the shorter image side.
                    # NOTE(review): KeyError for image sizes outside this
                    # mapping — confirm expected input resolutions.
                    selected_mapping = {256: 224, 512: 256, 768: 512, 896: 512}
                    tile_size = min(image_size)
                    tile_size = selected_mapping[tile_size]
                    tile_size = (tile_size, tile_size)
                extra_info = {
                    k: v
                    for k, v in input.items()
                    if k not in ["image", "height", "width"]
                }
                log_every_n(
                    logging.INFO, "split {} to {}".format(image_size, tile_size)
                )
                overlap = self.cfg.TEST.SLIDING_OVERLAP
                stride = math.ceil(tile_size[0] * (1 - overlap))
                tile_rows = int(
                    math.ceil((image_size[0] - tile_size[0]) / stride) + 1
                )  # strided convolution formula
                tile_cols = int(math.ceil((image_size[1] - tile_size[1]) / stride) + 1)
                full_probs = None
                count_predictions = None
                tile_counter = 0

                for row in range(tile_rows):
                    for col in range(tile_cols):
                        x1 = int(col * stride)
                        y1 = int(row * stride)
                        x2 = min(x1 + tile_size[1], image_size[1])
                        y2 = min(y1 + tile_size[0], image_size[0])
                        x1 = max(
                            int(x2 - tile_size[1]), 0
                        )  # for portrait images the x1 underflows sometimes
                        y1 = max(
                            int(y2 - tile_size[0]), 0
                        )  # for very few rows y1 underflows

                        img = input["image"][:, y1:y2, x1:x2]
                        # Pad border tiles up to the full tile size before
                        # running the model.
                        padded_img = nn.functional.pad(
                            img,
                            (
                                0,
                                tile_size[1] - img.shape[-1],
                                0,
                                tile_size[0] - img.shape[-2],
                            ),
                        )
                        tile_counter += 1
                        padded_input = {"image": padded_img}
                        padded_input.update(extra_info)
                        padded_prediction = self.model([padded_input])[0]["sem_seg"]
                        # Crop the padding back off the prediction.
                        prediction = padded_prediction[
                            :, 0 : img.shape[1], 0 : img.shape[2]
                        ]
                        if full_probs is None:
                            full_probs = prediction.new_zeros(
                                prediction.shape[0], image_size[0], image_size[1]
                            )
                        if count_predictions is None:
                            count_predictions = prediction.new_zeros(
                                prediction.shape[0], image_size[0], image_size[1]
                            )
                        count_predictions[:, y1:y2, x1:x2] += 1
                        full_probs[
                            :, y1:y2, x1:x2
                        ] += prediction  # accumulate the predictions also in the overlapping regions

                # Average where tiles overlapped.
                full_probs /= count_predictions
                full_probs = sem_seg_postprocess(
                    full_probs,
                    image_size,
                    input.get("height", image_size[0]),
                    input.get("width", image_size[1]),
                )
                outputs.append({"sem_seg": full_probs})

            return outputs
        else:
            log_first_n(logging.INFO, "Using whole image to test")
            return self.model(inputs)
+ Inputs & outputs have the same format as :meth:`SemanticSegmentor.forward` + """ + outputs = [] + inputs = [] + for idx, input in zip(count(), batched_inputs): + inputs.append(input) + if len(inputs) == self.batch_size or idx == len(batched_inputs) - 1: + with torch.no_grad(): + outputs.extend(self._inference_with_model(inputs)) + inputs = [] + return outputs + + def __call__(self, batched_inputs): + """ + Same input/output format as :meth:`SemanticSegmentor.forward` + """ + + def _maybe_read_image(dataset_dict): + ret = copy.copy(dataset_dict) + if "image" not in ret: + image = read_image(ret.pop("file_name"), self.model.input_format) + image = torch.from_numpy( + np.ascontiguousarray(image.transpose(2, 0, 1)) + ) # CHW + ret["image"] = image + if "height" not in ret and "width" not in ret: + ret["height"] = image.shape[1] + ret["width"] = image.shape[2] + return ret + + return [self._inference_one_image(_maybe_read_image(x)) for x in batched_inputs] + + def _inference_one_image(self, input): + """ + Args: + input (dict): one dataset dict with "image" field being a CHW tensor + Returns: + dict: one output dict + """ + augmented_inputs, tfms = self._get_augmented_inputs(input) + # 1: forward with all augmented images + outputs = self._batch_inference(augmented_inputs) + # Delete now useless variables to avoid being out of memory + del augmented_inputs + # 2: merge the results + # handle flip specially + # outputs = [output.detach() for output in outputs] + return self._merge_auged_output(outputs, tfms) + + def _merge_auged_output(self, outputs, tfms): + new_outputs = [] + for output, tfm in zip(outputs, tfms): + if any(isinstance(t, HFlipTransform) for t in tfm.transforms): + new_outputs.append(output["sem_seg"].flip(dims=[2])) + else: + new_outputs.append(output["sem_seg"]) + del outputs + # to avoid OOM with torch.stack + final_predictions = new_outputs[0] + for i in range(1, len(new_outputs)): + final_predictions += new_outputs[i] + final_predictions = 
final_predictions / len(new_outputs) + del new_outputs + return {"sem_seg": final_predictions} + + def _get_augmented_inputs(self, input): + augmented_inputs = self.tta_mapper(input) + tfms = [x.pop("transforms") for x in augmented_inputs] + return augmented_inputs, tfms diff --git a/open_vocab_seg/utils/__init__.py b/open_vocab_seg/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d7b02184067b3a370e2815d5dec39b9d1cdad42f --- /dev/null +++ b/open_vocab_seg/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +from .events import setup_wandb, WandbWriter +from .predictor import VisualizationDemo, VisualizationDemoIndoor \ No newline at end of file diff --git a/open_vocab_seg/utils/events.py b/open_vocab_seg/utils/events.py new file mode 100644 index 0000000000000000000000000000000000000000..cbe82ce80a7110a1018167763ba3adc90f58faa0 --- /dev/null +++ b/open_vocab_seg/utils/events.py @@ -0,0 +1,121 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import os +import wandb +from detectron2.utils import comm +from detectron2.utils.events import EventWriter, get_event_storage + + +def setup_wandb(cfg, args): + if comm.is_main_process(): + init_args = { + k.lower(): v + for k, v in cfg.WANDB.items() + if isinstance(k, str) and k not in ["config", "name"] + } + # only include most related part to avoid too big table + # TODO: add configurable params to select which part of `cfg` should be saved in config + if "config_exclude_keys" in init_args: + init_args["config"] = cfg + init_args["config"]["cfg_file"] = args.config_file + else: + init_args["config"] = { + "model": cfg.MODEL, + "solver": cfg.SOLVER, + "cfg_file": args.config_file, + } + if ("name" not in init_args) or (init_args["name"] is None): + init_args["name"] = os.path.basename(args.config_file) + wandb.init(**init_args) + + +class BaseRule(object): + def __call__(self, target): + return target + + +class IsIn(BaseRule): + def __init__(self, keyword: str): + self.keyword = keyword + + def __call__(self, target): + return self.keyword in target + + +class Prefix(BaseRule): + def __init__(self, keyword: str): + self.keyword = keyword + + def __call__(self, target): + return "/".join([self.keyword, target]) + + +class WandbWriter(EventWriter): + """ + Write all scalars to a tensorboard file. 
    """
    Event writer that forwards scalars, images, and histograms to wandb.
    """

    def __init__(self):
        """
        Set up the grouping rules that map a raw scalar name to a wandb
        panel group (e.g. bare "loss" scalars are logged under "train/").
        """
        # Iteration of the most recent successful write; scalars older than
        # this are skipped on the next write().
        self._last_write = -1
        self._group_rules = [
            (IsIn("/"), BaseRule()),        # already namespaced: keep as-is
            (IsIn("loss"), Prefix("train")),  # bare losses go under "train/"
        ]

    def write(self):

        storage = get_event_storage()

        def _group_name(scalar_name):
            # First matching rule decides the logged name.
            for (rule, op) in self._group_rules:
                if rule(scalar_name):
                    return op(scalar_name)
            return scalar_name

        # latest() maps name -> (value, iteration); only log what is newer
        # than the previous write.
        stats = {
            _group_name(name): scalars[0]
            for name, scalars in storage.latest().items()
            if scalars[1] > self._last_write
        }
        if len(stats) > 0:
            self._last_write = max([v[1] for k, v in storage.latest().items()])

        # storage.put_{image,histogram} is only meant to be used by
        # tensorboard writer. So we access its internal fields directly from here.
        if len(storage._vis_data) >= 1:
            stats["image"] = [
                wandb.Image(img, caption=img_name)
                for img_name, img, step_num in storage._vis_data
            ]
            # Storage stores all image data and rely on this writer to clear them.
            # As a result it assumes only one writer will use its image data.
            # An alternative design is to let storage store limited recent
            # data (e.g. only the most recent image) that all writers can access.
            # In that case a writer may not see all image data if its period is long.
            storage.clear_images()

        if len(storage._histograms) >= 1:

            def create_bar(tag, bucket_limits, bucket_counts, **kwargs):
                # Convert a tensorboard-style histogram into a wandb bar chart.
                data = [
                    [label, val] for (label, val) in zip(bucket_limits, bucket_counts)
                ]
                table = wandb.Table(data=data, columns=["label", "value"])
                return wandb.plot.bar(table, "label", "value", title=tag)

            stats["hist"] = [create_bar(**params) for params in storage._histograms]

            storage.clear_histograms()

        if len(stats) == 0:
            return
        wandb.log(stats, step=storage.iter)

    def close(self):
        # Flush and finish the wandb run.
        wandb.finish()
+""" +from typing import List, Optional + +import torch +import torch.distributed as dist +import torchvision +from torch import Tensor + + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("not supported") + return NestedTensor(tensor, mask) + + +# _onnx_nested_tensor_from_tensor_list() is an 
implementation of +# nested_tensor_from_tensor_list() that is supported by ONNX tracing. +@torch.jit.unused +def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: + max_size = [] + for i in range(tensor_list[0].dim()): + max_size_i = torch.max( + torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) + ).to(torch.int64) + max_size.append(max_size_i) + max_size = tuple(max_size) + + # work around for + # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # m[: img.shape[1], :img.shape[2]] = False + # which is not yet supported in onnx + padded_imgs = [] + padded_masks = [] + for img in tensor_list: + padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] + padded_img = torch.nn.functional.pad( + img, (0, padding[2], 0, padding[1], 0, padding[0]) + ) + padded_imgs.append(padded_img) + + m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) + padded_mask = torch.nn.functional.pad( + m, (0, padding[2], 0, padding[1]), "constant", 1 + ) + padded_masks.append(padded_mask.to(torch.bool)) + + tensor = torch.stack(padded_imgs) + mask = torch.stack(padded_masks) + + return NestedTensor(tensor, mask=mask) + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + +def get_gt_binary_masks(gt_semseg): + mask_ids = torch.unique(gt_semseg) + gt_masks = [] + for id in mask_ids: + if id != 255: + gt_masks.append(gt_semseg == id) + gt_masks = torch.stack(gt_masks).float() + return gt_masks diff --git a/open_vocab_seg/utils/pcd_rendering.py b/open_vocab_seg/utils/pcd_rendering.py new file mode 100644 index 0000000000000000000000000000000000000000..74c9787d5c55834b417a25227a98b4fa0ea0993e --- /dev/null +++ b/open_vocab_seg/utils/pcd_rendering.py @@ -0,0 +1,114 @@ +import torch +import torch.nn as nn + +from pytorch3d.renderer import ( + PerspectiveCameras, + PointsRasterizationSettings, + PointsRasterizer, + 
AlphaCompositor, +) + + +def homogenize_pt(coord): + return torch.cat([coord, torch.ones_like(coord[..., :1])], dim=-1) + + +def unproject_pts_pt(intrinsics, coords, depth): + if coords.shape[-1] == 2: + coords = homogenize_pt(coords) + intrinsics = intrinsics.squeeze()[:3, :3] + coords = torch.inverse(intrinsics).mm(coords.T) * depth.reshape(1, -1) + return coords.T # [n, 3] + + +def get_coord_grids_pt(h, w, device, homogeneous=False): + """ + create pxiel coordinate grid + :param h: height + :param w: weight + :param device: device + :param homogeneous: if homogeneous coordinate + :return: coordinates [h, w, 2] + """ + y = torch.arange(0, h).to(device) + x = torch.arange(0, w).to(device) + grid_y, grid_x = torch.meshgrid(y, x) + if homogeneous: + return torch.stack([grid_x, grid_y, torch.ones_like(grid_x)], dim=-1) + return torch.stack([grid_x, grid_y], dim=-1) # [h, w, 2] + + +class PointsRenderer(nn.Module): + """ + A class for rendering a batch of points. The class should + be initialized with a rasterizer and compositor class which each have a forward + function. 
+ """ + + def __init__(self, rasterizer, compositor) -> None: + super().__init__() + self.rasterizer = rasterizer + self.compositor = compositor + + def to(self, device): + self.rasterizer = self.rasterizer.to(device) + self.compositor = self.compositor.to(device) + return self + + def forward(self, point_clouds, **kwargs) -> torch.Tensor: + fragments = self.rasterizer(point_clouds, **kwargs) + + r = self.rasterizer.raster_settings.radius + + if type(r) == torch.Tensor: + if r.shape[-1] > 1: + idx = fragments.idx.clone() + idx[idx == -1] = 0 + r = r[:, idx.squeeze().long()] + r = r.permute(0, 3, 1, 2) + + dists2 = fragments.dists.permute(0, 3, 1, 2) + weights = 1 - dists2 / (r * r) + images = self.compositor( + fragments.idx.long().permute(0, 3, 1, 2), + weights, + point_clouds.features_packed().permute(1, 0), + **kwargs, + ) + + # permute so image comes at the end + images = images.permute(0, 2, 3, 1) + + return images + + +def create_pcd_renderer(h, w, intrinsics, R=None, T=None, radius=None, device="cuda"): + fx = intrinsics[0, 0] + fy = intrinsics[1, 1] + if R is None: + R = torch.eye(3)[None] # (1, 3, 3) + if T is None: + T = torch.zeros(1, 3) # (1, 3) + cameras = PerspectiveCameras(R=R, T=T, + device=device, + focal_length=((-fx, -fy),), + principal_point=(tuple(intrinsics[:2, -1]),), + image_size=((h, w),), + in_ndc=False, + ) + + if radius is None: + radius = 1.5 / min(h, w) * 2.0 + + raster_settings = PointsRasterizationSettings( + image_size=(h, w), + radius=radius, + points_per_pixel=8, + ) + + rasterizer = PointsRasterizer(cameras=cameras, raster_settings=raster_settings) + renderer = PointsRenderer( + rasterizer=rasterizer, + compositor=AlphaCompositor(background_color=(1, 1, 1)) + ) + return renderer diff --git a/open_vocab_seg/utils/post_process_utils.py b/open_vocab_seg/utils/post_process_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ed214319d90ceba0b47ef835072102b9ffec5179 --- /dev/null +++ 
b/open_vocab_seg/utils/post_process_utils.py @@ -0,0 +1,74 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import torch +from torch.nn import functional as F +import numpy as np + +try: + import pydensecrf.densecrf as dcrf + from pydensecrf.utils import ( + unary_from_softmax, + unary_from_labels, + create_pairwise_bilateral, + create_pairwise_gaussian, + ) +except: + dcrf = None + + +def dense_crf_post_process( + logits, + image, + n_labels=None, + max_iters=5, + pos_xy_std=(3, 3), + pos_w=3, + bi_xy_std=(80, 80), + bi_rgb_std=(13, 13, 13), + bi_w=10, +): + """ + logits : [C,H,W] + image : [3,H,W] + """ + if dcrf is None: + raise FileNotFoundError( + "pydensecrf is required to perform dense crf inference." + ) + if isinstance(logits, torch.Tensor): + logits = F.softmax(logits, dim=0).detach().cpu().numpy() + U = unary_from_softmax(logits) + n_labels = logits.shape[0] + elif logits.ndim == 3: + U = unary_from_softmax(logits) + n_labels = logits.shape[0] + else: + assert n_labels is not None + U = unary_from_labels(logits, n_labels, zero_unsure=False) + + d = dcrf.DenseCRF2D(image.shape[1], image.shape[0], n_labels) + + d.setUnaryEnergy(U) + + # This adds the color-independent term, features are the locations only. + d.addPairwiseGaussian( + sxy=pos_xy_std, + compat=pos_w, + kernel=dcrf.DIAG_KERNEL, + normalization=dcrf.NORMALIZE_SYMMETRIC, + ) + + # This adds the color-dependent term, i.e. features are (x,y,r,g,b). + d.addPairwiseBilateral( + sxy=bi_xy_std, + srgb=bi_rgb_std, + rgbim=image, + compat=bi_w, + kernel=dcrf.DIAG_KERNEL, + normalization=dcrf.NORMALIZE_SYMMETRIC, + ) + # Run five inference steps. 
    logits = d.inference(max_iters)
    logits = np.asarray(logits).reshape((n_labels, image.shape[0], image.shape[1]))
    return torch.from_numpy(logits)
diff --git a/open_vocab_seg/utils/predictor.py b/open_vocab_seg/utils/predictor.py
new file mode 100644
index 0000000000000000000000000000000000000000..59f5744d31f7422389c6994aa6fb01f71b298d21
--- /dev/null
+++ b/open_vocab_seg/utils/predictor.py
@@ -0,0 +1,793 @@
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved

import numpy as np
import torch
import torchvision
import imageio
from tqdm import tqdm
import os
import cv2

from pytorch3d.structures import Pointclouds
from pytorch3d.renderer import look_at_view_transform

from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.visualizer import ColorMode, Visualizer
from detectron2.data.detection_utils import read_image
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
import matplotlib.pyplot as plt
import matplotlib as mpl
from .pcd_rendering import unproject_pts_pt, get_coord_grids_pt, create_pcd_renderer


# DefaultPredictor variant whose __call__ additionally forwards `class_names`
# in the model's input dict (open-vocabulary inference).
class OVSegPredictor(DefaultPredictor):
    def __init__(self, cfg):
        super().__init__(cfg)

    def __call__(self, original_image, class_names):
        """
        Args:
            original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).

        Returns:
            predictions (dict):
                the output of the model for one image only.
                See :doc:`/tutorials/models` for details about the format.
        """
        with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
            # Apply pre-processing to image.
            if self.input_format == "RGB":
                # whether the model expects BGR inputs or RGB
                original_image = original_image[:, :, ::-1]
            height, width = original_image.shape[:2]
            image = self.aug.get_transform(original_image).apply_image(original_image)
            image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))

            # The class-name list rides along so the model can score those classes.
            inputs = {"image": image, "height": height, "width": width, "class_names": class_names}
            predictions = self.model([inputs])[0]
            return predictions

# Visualizer that labels segments with a user-supplied class-name list rather
# than (only) the metadata's stuff_classes.
class OVSegVisualizer(Visualizer):
    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE, class_names=None):
        super().__init__(img_rgb, metadata, scale, instance_mode)
        self.class_names = class_names

    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8):
        """
        Draw semantic segmentation predictions/labels.

        Args:
            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
                Each value is the integer label of the pixel.
            area_threshold (int): segments with less than `area_threshold` are not drawn.
            alpha (float): the larger it is, the more opaque the segmentations are.

        Returns:
            output (VisImage): image object with visualizations.
+ """ + if isinstance(sem_seg, torch.Tensor): + sem_seg = sem_seg.numpy() + labels, areas = np.unique(sem_seg, return_counts=True) + sorted_idxs = np.argsort(-areas).tolist() + labels = labels[sorted_idxs] + class_names = self.class_names if self.class_names is not None else self.metadata.stuff_classes + + for label in filter(lambda l: l < len(class_names), labels): + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] + except (AttributeError, IndexError): + mask_color = None + mask_color = np.random.random((1, 3)).tolist()[0] + + binary_mask = (sem_seg == label).astype(np.uint8) + text = class_names[label] + self.draw_binary_mask( + binary_mask, + color=mask_color, + edge_color=(1.0, 1.0, 240.0 / 255), + text=text, + alpha=alpha, + area_threshold=area_threshold, + ) + return self.output + + def draw_sam_seg(self, masks, area_threshold=None, alpha=0.5): + """ + Draw semantic segmentation predictions/labels. + + Args: + sem_seg (Tensor or ndarray): the segmentation of shape (H, W). + Each value is the integer label of the pixel. + area_threshold (int): segments with less than `area_threshold` are not drawn. + alpha (float): the larger it is, the more opaque the segmentations are. + + Returns: + output (VisImage): image object with visualizations. 
+ """ + plt.figure() + if len(masks) == 0: + return + sorted_anns = sorted(masks, key=(lambda x: x['area']), reverse=True) + img = np.ones((sorted_anns[0]['segmentation'].shape[0], sorted_anns[0]['segmentation'].shape[1], 3)) + class_names = self.class_names if self.class_names is not None else self.metadata.stuff_classes + for ann in sorted_anns: + m = ann['segmentation'] + mask_color = np.random.random((1, 3)).tolist()[0] + + self.draw_binary_mask( + m, + color=mask_color, + edge_color=(1.0, 1.0, 240.0 / 255), + text=class_names[ann['class']], + alpha=alpha, + area_threshold=area_threshold, + ) + return self.output + + + +class VisualizationDemo(object): + def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): + """ + Args: + cfg (CfgNode): + instance_mode (ColorMode): + parallel (bool): whether to run the model in different processes from visualization. + Useful since the visualization logic can be slow. + """ + self.metadata = MetadataCatalog.get( + cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" + ) + + self.cpu_device = torch.device("cpu") + self.instance_mode = instance_mode + + self.parallel = parallel + if parallel: + raise NotImplementedError + else: + self.predictor = OVSegPredictor(cfg) + + def run_on_image(self, image, class_names): + """ + Args: + image (np.ndarray): an image of shape (H, W, C) (in BGR order). + This is the format used by OpenCV. + Returns: + predictions (dict): the output of the model. + vis_output (VisImage): the visualized image output. + """ + predictions = self.predictor(image, class_names) + # Convert image from OpenCV BGR format to Matplotlib RGB format. 
+ image = image[:, :, ::-1] + visualizer = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names) + # if "sem_seg" in predictions: + # r = predictions["sem_seg"] + # blank_area = (r[0] == 0) + # pred_mask = r.argmax(dim=0).to('cpu') + # pred_mask[blank_area] = 255 + # pred_mask = np.array(pred_mask, dtype=np.int) + + # vis_output = visualizer.draw_sem_seg( + # pred_mask + # ) + # else: + # raise NotImplementedError + + if "sem_seg" in predictions: + r = predictions["sem_seg"] + pred_mask = r.argmax(dim=0).to('cpu') + pred_mask = np.array(pred_mask, dtype=int) + + vis_output = visualizer.draw_sem_seg( + pred_mask + ) + else: + raise NotImplementedError + + return predictions, vis_output + + def run_on_image_sam(self, path, class_names, depth_map_path, rage_matrices_path): + """ + Args: + path (str): the path of the image + Returns: + predictions (dict): the output of the model. + vis_output (VisImage): the visualized image output. + """ + image = read_image(path, format="BGR") + predictions = self.predictor(image, class_names) + # Convert image from OpenCV BGR format to Matplotlib RGB format. 
        image = image[:, :, ::-1]
        visualizer_rgb = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_rgb_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)

        # NOTE(review): checkpoint path, model type and device are hard-coded;
        # the SAM weights must sit in the working directory and CUDA must exist.
        sam_checkpoint = "sam_vit_h_4b8939.pth"
        model_type = "vit_h"
        device = "cuda"
        sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
        sam.to(device=device)

        mask_generator_2 = SamAutomaticMaskGenerator(
            model=sam,
            points_per_side=64,
            pred_iou_thresh=0.8,
            stability_score_thresh=0.8,
            crop_n_layers=0,
            crop_n_points_downscale_factor=0,
            min_mask_region_area=100,  # Requires open-cv to run post-processing
        )
        print('Using SAM to generate segments for the RGB image')
        masks_rgb = mask_generator_2.generate(image)
        masks_rgb = sorted(masks_rgb, key=(lambda x: x['area']), reverse=True)

        print('Using SAM to generate segments for the Depth map')
        # Colormap the (normalized) depth so SAM can segment it like an RGB image.
        d, world_coord = self.project_2d_to_3d(depth_map_path, rage_matrices_path)
        d = (d - np.min(d)) / (np.max(d) - np.min(d))
        image_depth = mpl.colormaps['plasma'](d)*255
        plt.figure()
        plt.imshow(image_depth.astype(np.uint8))
        plt.axis('off')
        plt.savefig('outputs/Depth_rendered.png', bbox_inches='tight', pad_inches=0.0)
        # [:,:,:-1] drops the colormap's alpha channel before feeding SAM.
        masks_depth = mask_generator_2.generate(image_depth.astype(np.uint8)[:,:,:-1])
        masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=True)

        if "sem_seg" in predictions:
            r = predictions["sem_seg"]
            pred_mask = r.argmax(dim=0).to('cpu')
            pred_mask = np.array(pred_mask, dtype=int)

            # Majority vote: each SAM segment takes the most frequent predicted
            # class inside it; the chosen class is also stored on the mask record.
            pred_mask_sam_rgb = pred_mask.copy()
            for mask in masks_rgb:
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                pred_mask_sam_rgb[mask['segmentation']] = cls_tmp[np.argmax(cls_num)]
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            vis_output_rgb = visualizer_rgb.draw_sem_seg(
                pred_mask_sam_rgb
            )
            # vis_output_rgb = visualizer_rgb.draw_sem_seg(
            #     pred_mask, alpha=1
            # )

            pred_mask_sam_depth = pred_mask.copy()
            for mask in masks_depth:
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                pred_mask_sam_depth[mask['segmentation']] = cls_tmp[np.argmax(cls_num)]
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            vis_output_depth = visualizer_depth.draw_sem_seg(
                pred_mask_sam_depth
            )

            vis_output_rgb_sam = visualizer_rgb_sam.draw_sam_seg(masks_rgb)
            vis_output_depth_sam = visualizer_depth_sam.draw_sam_seg(masks_depth)

        else:
            raise NotImplementedError

        return predictions, vis_output_rgb, vis_output_depth, vis_output_rgb_sam, vis_output_depth_sam

    def project_2d_to_3d(self, depth_map_path, rage_matrices_path):
        # Unprojects a depth map into world coordinates via the stored (View)Projection
        # matrices. NOTE(review): the fixed 1280x800 size and the "rage_matrices"
        # naming suggest a game-engine capture dataset — confirm against the data.

        H = 800
        W = 1280
        IMAGE_SIZE = (H, W)

        def pixels_to_ndcs(xx, yy, size=IMAGE_SIZE):
            # Map pixel indices into [-1, 1] NDC (y axis flipped).
            s_y, s_x = size
            s_x -= 1  # so 1 is being mapped into (n-1)th pixel
            s_y -= 1  # so 1 is being mapped into (n-1)th pixel
            x = (2 / s_x) * xx - 1
            y = (-2 / s_y) * yy + 1
            return x, y

        rage_matrices = np.load(rage_matrices_path)


        # get the (ViewProj) matrix that transform points from the world coordinate to NDC
        # (points in world coordinate) @ VP = (points in NDC)
        VP = rage_matrices['VP']
        VP_inverse = rage_matrices['VP_inv'] # NDC to world coordinate

        # get the (Proj) matrix that transform points from the camera coordinate to NDC
        # (points in camera coordinate) @ P = (points in NDC)
        P = rage_matrices['P']
        P_inverse = rage_matrices['P_inv'] # NDC to camera coordinate
        # print(VP, VP_inverse, P, P_inverse)

        d = np.load(depth_map_path)
        d = d/6.0 - 4e-5 # convert to NDC coordinate

        px = np.arange(0, W)
        py = np.arange(0, H)
        px, py = np.meshgrid(px, py, sparse=False)
        px = px.reshape(-1)
        py = py.reshape(-1)

        ndcz = d[py, px] # get the depth in NDC
        ndcx, ndcy = pixels_to_ndcs(px, py)
        ndc_coord = np.stack([ndcx, ndcy, ndcz, np.ones_like(ndcz)], axis=1)

        # Perspective divide after multiplying by the inverse matrices.
        camera_coord = ndc_coord @ P_inverse
        camera_coord = camera_coord/camera_coord[:,-1:]

        world_coord = ndc_coord @ VP_inverse
        world_coord = world_coord/world_coord[:,-1:]

        return d, world_coord

    def get_xyzrgb(self, rgb_path, depth_path, rage_matrices_path):
        # NOTE(review): duplicates project_2d_to_3d() almost verbatim and then
        # appends per-pixel RGB — consider extracting the shared unprojection.

        H = 800
        W = 1280
        IMAGE_SIZE = (H, W)

        def pixels_to_ndcs(xx, yy, size=IMAGE_SIZE):
            s_y, s_x = size
            s_x -= 1  # so 1 is being mapped into (n-1)th pixel
            s_y -= 1  # so 1 is being mapped into (n-1)th pixel
            x = (2 / s_x) * xx - 1
            y = (-2 / s_y) * yy + 1
            return x, y

        rage_matrices = np.load(rage_matrices_path)


        # get the (ViewProj) matrix that transform points from the world coordinate to NDC
        # (points in world coordinate) @ VP = (points in NDC)
        VP = rage_matrices['VP']
        VP_inverse = rage_matrices['VP_inv'] # NDC to world coordinate

        # get the (Proj) matrix that transform points from the camera coordinate to NDC
        # (points in camera coordinate) @ P = (points in NDC)
        P = rage_matrices['P']
        P_inverse = rage_matrices['P_inv'] # NDC to camera coordinate
        # print(VP, VP_inverse, P, P_inverse)

        d = np.load(depth_path)
        d = d/6.0 - 4e-5 # convert to NDC coordinate

        px = np.arange(0, W)
        py = np.arange(0, H)
        px, py = np.meshgrid(px, py, sparse=False)
        px = px.reshape(-1)
        py = py.reshape(-1)

        ndcz = d[py, px] # get the depth in NDC
        ndcx, ndcy = pixels_to_ndcs(px, py)
        ndc_coord = np.stack([ndcx, ndcy, ndcz, np.ones_like(ndcz)], axis=1)

        camera_coord = ndc_coord @ P_inverse
        camera_coord = camera_coord/camera_coord[:,-1:]

        world_coord = ndc_coord @ VP_inverse
        world_coord = world_coord/world_coord[:,-1:]

        rgb = read_image(rgb_path, format="BGR")
        rgb = rgb[:, :, ::-1]
        rgb = rgb[py, px, :]

        # Per-pixel [x, y, z, r, g, b] rows (homogeneous w dropped).
        xyzrgb = np.concatenate((world_coord[:,:-1], rgb), axis=1)

        return xyzrgb

    def render_3d_video(self, xyzrgb_path, depth_path):
        # Renders the colored point cloud from a sweep of viewpoints and writes
        # the frames out as mp4 videos under outputs/.

        device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')

        xyzrgb = np.load(xyzrgb_path)
        depth = np.load(depth_path)
        depth = torch.tensor(depth).to(device)
        depth = 1 / depth

        H = 800
        W = 1280
        radius = 1.5 / min(H, W) * 2.0
        intrinsic = np.array([[max(H, W), 0, W // 2],
                              [0, max(H, W), H // 2],
                              [0, 0, 1]])

        intrinsic = torch.from_numpy(intrinsic).float()[None].to(device)
        coord = get_coord_grids_pt(H, W, device=device).float()[None]
        pts = unproject_pts_pt(intrinsic, coord.reshape(-1, 2), depth)
        # Normalize each axis into roughly [-1, 1] (y is shifted by -0.7).
        pts[:, 0] = ((pts[:, 0] - pts[:, 0].min()) / (pts[:, 0].max() - pts[:, 0].min()) - 0.5) * 2
        pts[:, 1] = ((pts[:, 1] - pts[:, 1].min()) / (pts[:, 1].max() - pts[:, 1].min()) - 0.7) * 2
        pts[:, 2] = ((pts[:, 2] - pts[:, 2].min()) / (pts[:, 2].max() - pts[:, 2].min()) - 0.5) * 2

        num_frames = 45
        degrees = np.linspace(120, 220, num_frames)

        total = ['rgb_3d_sam', 'depth_3d_sam', 'rgb_3d_sam_mask', 'depth_3d_sam_mask']
        frames_all = {}

        for j, name in enumerate(total):
            img = torch.from_numpy(xyzrgb[name][:, 3:] / 255.).to(device).float()
            pcd = Pointclouds(points=[pts], features=[img.squeeze().reshape(-1, 3)])
            frames = []
            for i in tqdm(range(num_frames)):
                R, t = look_at_view_transform(3., -10, degrees[i])
                renderer = create_pcd_renderer(H, W, intrinsic.squeeze()[:3, :3],
                                               R=R, T=t,
                                               radius=radius, device=device)
                result = renderer(pcd)
                result = result.permute(0, 3, 1, 2)
                frame = (255.
* result.detach().cpu().squeeze().permute(1, 2, 0).numpy()).astype(np.uint8)
                frames.append(frame)

            frames_all[name] = frames

            # video_out_file = '{}.gif'.format(name)
            # imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25)

            video_out_file = '{}.mp4'.format(name)
            imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25, quality=8)

        # Concatenated mask+raw sweeps for the combined videos.
        video_out_file = '{}.mp4'.format('RGB_3D_All')
        imageio.mimwrite(os.path.join('outputs', video_out_file), frames_all['rgb_3d_sam_mask']+frames_all['rgb_3d_sam'], fps=25, quality=8)

        video_out_file = '{}.mp4'.format('Depth_3D_All')
        imageio.mimwrite(os.path.join('outputs', video_out_file), frames_all['depth_3d_sam_mask']+frames_all['depth_3d_sam'], fps=25, quality=8)

# Indoor (RGB-D, e.g. 640x480 with a depth PNG) variant of VisualizationDemo.
class VisualizationDemoIndoor(VisualizationDemo):
    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
        super().__init__(cfg, instance_mode, parallel)

    def build_pcd(self, depth_mask, coords, colors, masks, sem_map):
        # Assign each pixel a SAM group id (later masks win via reversed order),
        # keep only pixels with valid depth, and compact ids with np.unique.
        group_ids = np.full(masks[0]["segmentation"].shape, -1, dtype=int)
        num_masks = len(masks)
        group_counter = 0
        for i in reversed(range(num_masks)):
            # print(masks[i]["predicted_iou"])
            group_ids[masks[i]["segmentation"]] = group_counter
            group_counter += 1
        group_ids = np.unique(group_ids[depth_mask], return_inverse=True)[1]
        return dict(coord=coords, color=colors, group=group_ids, sem_map=sem_map)


    def run_on_pcd_ui(self, rgb_path, depth_path, class_names):
        depth = depth_path
        color = rgb_path
        #semantic_map = join(rgb_path, scene_name, 'semantic_label', color_name[0:-4] + '.pth')

        depth_img = cv2.imread(depth, -1)  # read 16bit grayscale image
        depth_mask = (depth_img != 0)
        color_image = cv2.imread(color)
        color_image = cv2.resize(color_image, (640, 480))
        predictions = self.predictor(color_image, class_names)
        # Convert image from OpenCV BGR format to Matplotlib RGB format.
        image = color_image[:, :, ::-1]
        visualizer_rgb = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_rgb_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)

        # NOTE(review): SAM checkpoint/type/device hard-coded, as in run_on_image_sam.
        sam_checkpoint = "sam_vit_h_4b8939.pth"
        model_type = "vit_h"
        device = "cuda"
        sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
        sam.to(device=device)

        mask_generator_2 = SamAutomaticMaskGenerator(
            model=sam,
            points_per_side=64,
            pred_iou_thresh=0.5,
            stability_score_thresh=0.8,
            crop_n_layers=0,
            crop_n_points_downscale_factor=0,
            min_mask_region_area=100,  # Requires open-cv to run post-processing
        )
        print('Using SAM to generate segments for the RGB image')
        masks_rgb = mask_generator_2.generate(image)
        masks_rgb = sorted(masks_rgb, key=(lambda x: x['area']), reverse=True)

        print('Using SAM to generate segments for the Depth map')
        # Inverse depth (valid pixels only), colormapped so SAM can segment it.
        d = np.full(depth_img.shape, 0, dtype=float)
        d[depth_mask] = (1 / (depth_img+1e-6))[depth_mask]
        colored_depth = (d - np.min(d)) / (np.max(d) - np.min(d))
        colored_depth = mpl.colormaps['inferno'](colored_depth)*255
        plt.figure()
        plt.imshow(colored_depth.astype(np.uint8)[:,:,:-1])
        plt.axis('off')
        plt.savefig('outputs/Depth_rendered.png')
        masks_depth = mask_generator_2.generate(colored_depth.astype(np.uint8)[:,:,:-1])
        masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=True)

        if "sem_seg" in predictions:
            r = predictions["sem_seg"]
            pred_mask = r.argmax(dim=0).to('cpu')
            pred_mask = np.array(pred_mask, dtype=int)

            output2D = {}
            # Depth fusion: smallest masks first and only unpainted (-1) pixels
            # are filled, so small segments are not clobbered by large ones.
            pred_mask_sam_depth = np.full(pred_mask.shape, -1)
            masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=False)
            for mask in masks_depth:
                to_paint = pred_mask_sam_depth == -1
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                #print(cls_tmp, cls_num)
                pred_mask_sam_depth[mask['segmentation'] & to_paint] = cls_tmp[np.argmax(cls_num)]
                #print(class_names[cls_tmp[np.argmax(cls_num)]])
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            output2D['sem_seg_on_depth'] = visualizer_depth.draw_sem_seg(
                pred_mask_sam_depth
            )

            # RGB fusion: each SAM segment takes the majority predicted class.
            pred_mask_sam_rgb = pred_mask.copy()
            for mask in masks_rgb:
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                #print(mask['segmentation'].sum(), cls_tmp, cls_num)
                pred_mask_sam_rgb[mask['segmentation']] = cls_tmp[np.argmax(cls_num)]
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            output2D['sem_seg_on_rgb'] = visualizer_rgb.draw_sem_seg(
                pred_mask_sam_rgb
            )

            output2D['sam_seg_on_rgb'] = visualizer_rgb_sam.draw_sam_seg(masks_rgb)
            output2D['sam_seg_on_depth'] = visualizer_depth_sam.draw_sam_seg(masks_depth)

        else:
            raise NotImplementedError

        color_image = np.reshape(color_image[depth_mask], [-1,3])
        #group_ids = group_ids[depth_mask]

        sem_map_color = pred_mask_sam_rgb[depth_mask]
        sem_map_depth = pred_mask_sam_depth[depth_mask]

        # BGR -> RGB channel swap for the point-cloud colors.
        colors = np.zeros_like(color_image)
        colors[:,0] = color_image[:,2]
        colors[:,1] = color_image[:,1]
        colors[:,2] = color_image[:,0]

        depth_shift = 1000.0
        x,y = np.meshgrid(np.linspace(0,depth_img.shape[1]-1,depth_img.shape[1]), np.linspace(0,depth_img.shape[0]-1,depth_img.shape[0]))
        uv_depth = np.zeros((depth_img.shape[0], depth_img.shape[1], 3))
        uv_depth[:,:,0] = x
        uv_depth[:,:,1] = y
        uv_depth[:,:,2] = depth_img/depth_shift

        output3D = {}
        # Interleave (u, v, depth) with the rendered overlays into [H, W, 6].
        output3D['rgb_3d_sem'] = np.stack((uv_depth, output2D['sem_seg_on_rgb'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['depth_3d_sem'] = np.stack((uv_depth, output2D['sem_seg_on_depth'].get_image()),
axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['rgb_3d_sam'] = np.stack((uv_depth, output2D['sam_seg_on_rgb'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['depth_3d_sam'] = np.stack((uv_depth, output2D['sam_seg_on_depth'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))

        return predictions, output2D, output3D

    def run_on_pcd(self, rgb_path, scene_name, color_name, class_names):
        # ScanNet-style folder layout: intrinsics/, pose/, depth/, color/ per scene.
        # NOTE(review): largely duplicates run_on_pcd_ui plus pose/intrinsics
        # handling — consider sharing the common body.
        intrinsic_path = os.path.join(rgb_path, scene_name, 'intrinsics', 'intrinsic_depth.txt')
        depth_intrinsic = np.loadtxt(intrinsic_path)

        pose = os.path.join(rgb_path, scene_name, 'pose', color_name[0:-4] + '.txt')
        depth = os.path.join(rgb_path, scene_name, 'depth', color_name[0:-4] + '.png')
        color = os.path.join(rgb_path, scene_name, 'color', color_name)
        #semantic_map = join(rgb_path, scene_name, 'semantic_label', color_name[0:-4] + '.pth')

        depth_img = cv2.imread(depth, -1)  # read 16bit grayscale image
        depth_mask = (depth_img != 0)
        color_image = cv2.imread(color)
        color_image = cv2.resize(color_image, (640, 480))
        predictions = self.predictor(color_image, class_names)
        # Convert image from OpenCV BGR format to Matplotlib RGB format.
        image = color_image[:, :, ::-1]
        visualizer_rgb = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_rgb_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)
        visualizer_depth_sam = OVSegVisualizer(image, self.metadata, instance_mode=self.instance_mode, class_names=class_names)

        # NOTE(review): SAM checkpoint/type/device hard-coded, as elsewhere.
        sam_checkpoint = "sam_vit_h_4b8939.pth"
        model_type = "vit_h"
        device = "cuda"
        sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
        sam.to(device=device)

        mask_generator_2 = SamAutomaticMaskGenerator(
            model=sam,
            points_per_side=64,
            pred_iou_thresh=0.5,
            stability_score_thresh=0.8,
            crop_n_layers=0,
            crop_n_points_downscale_factor=0,
            min_mask_region_area=100,  # Requires open-cv to run post-processing
        )
        print('Using SAM to generate segments for the RGB image')
        masks_rgb = mask_generator_2.generate(image)
        masks_rgb = sorted(masks_rgb, key=(lambda x: x['area']), reverse=True)

        print('Using SAM to generate segments for the Depth map')
        # Inverse depth (valid pixels only), colormapped so SAM can segment it.
        d = np.full(depth_img.shape, 0, dtype=float)
        d[depth_mask] = (1 / (depth_img+1e-6))[depth_mask]
        colored_depth = (d - np.min(d)) / (np.max(d) - np.min(d))
        colored_depth = mpl.colormaps['inferno'](colored_depth)*255
        plt.figure()
        plt.imshow(colored_depth.astype(np.uint8)[:,:,:-1])
        plt.axis('off')
        plt.savefig('outputs/Depth_rendered.png')
        masks_depth = mask_generator_2.generate(colored_depth.astype(np.uint8)[:,:,:-1])
        masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=True)

        if "sem_seg" in predictions:
            r = predictions["sem_seg"]
            pred_mask = r.argmax(dim=0).to('cpu')
            pred_mask = np.array(pred_mask, dtype=int)

            output2D = {}
            # Depth fusion: smallest masks first, filling only unpainted pixels.
            pred_mask_sam_depth = np.full(pred_mask.shape, -1)
            masks_depth = sorted(masks_depth, key=(lambda x: x['area']), reverse=False)
            for mask in masks_depth:
                to_paint = pred_mask_sam_depth == -1
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                #print(cls_tmp, cls_num)
                pred_mask_sam_depth[mask['segmentation'] & to_paint] = cls_tmp[np.argmax(cls_num)]
                #print(class_names[cls_tmp[np.argmax(cls_num)]])
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            output2D['sem_seg_on_depth'] = visualizer_depth.draw_sem_seg(
                pred_mask_sam_depth
            )

            # RGB fusion: majority predicted class per SAM segment.
            pred_mask_sam_rgb = pred_mask.copy()
            for mask in masks_rgb:
                cls_tmp, cls_num = np.unique(pred_mask[mask['segmentation']], return_counts=True)
                #print(mask['segmentation'].sum(), cls_tmp, cls_num)
                pred_mask_sam_rgb[mask['segmentation']] = cls_tmp[np.argmax(cls_num)]
                mask['class'] = cls_tmp[np.argmax(cls_num)]

            output2D['sem_seg_on_rgb'] = visualizer_rgb.draw_sem_seg(
                pred_mask_sam_rgb
            )

            output2D['sam_seg_on_rgb'] = visualizer_rgb_sam.draw_sam_seg(masks_rgb)
            output2D['sam_seg_on_depth'] = visualizer_depth_sam.draw_sam_seg(masks_depth)

        else:
            raise NotImplementedError

        color_image = np.reshape(color_image[depth_mask], [-1,3])
        #group_ids = group_ids[depth_mask]

        sem_map_color = pred_mask_sam_rgb[depth_mask]
        sem_map_depth = pred_mask_sam_depth[depth_mask]

        # BGR -> RGB channel swap for the point-cloud colors.
        colors = np.zeros_like(color_image)
        colors[:,0] = color_image[:,2]
        colors[:,1] = color_image[:,1]
        colors[:,2] = color_image[:,0]

        pose = np.loadtxt(pose)

        # Depth is stored in millimetres; shift to metres.
        depth_shift = 1000.0
        x,y = np.meshgrid(np.linspace(0,depth_img.shape[1]-1,depth_img.shape[1]), np.linspace(0,depth_img.shape[0]-1,depth_img.shape[0]))
        uv_depth = np.zeros((depth_img.shape[0], depth_img.shape[1], 3))
        uv_depth[:,:,0] = x
        uv_depth[:,:,1] = y
        uv_depth[:,:,2] = depth_img/depth_shift

        output3D = {}
        output3D['rgb_3d_sem'] = np.stack((uv_depth, output2D['sem_seg_on_rgb'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['depth_3d_sem'] = np.stack((uv_depth, output2D['sem_seg_on_depth'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['rgb_3d_sam'] = np.stack((uv_depth, output2D['sam_seg_on_rgb'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))
        output3D['depth_3d_sam'] = np.stack((uv_depth, output2D['sam_seg_on_depth'].get_image()), axis=2).reshape((depth_img.shape[0], depth_img.shape[1], 6))

        # Keep only pixels with non-zero depth, then unproject with the
        # pinhole intrinsics and transform into world space with the pose.
        uv_depth = np.reshape(uv_depth, [-1,3])
        uv_depth = uv_depth[np.where(uv_depth[:,2]!=0),:].squeeze()

        intrinsic_inv = np.linalg.inv(depth_intrinsic)
        fx = depth_intrinsic[0,0]
        fy = depth_intrinsic[1,1]
        cx = depth_intrinsic[0,2]
        cy = depth_intrinsic[1,2]
        bx = depth_intrinsic[0,3]
        by = depth_intrinsic[1,3]
        n = uv_depth.shape[0]
        points = np.ones((n,4))
        X = (uv_depth[:,0]-cx)*uv_depth[:,2]/fx + bx
        Y = (uv_depth[:,1]-cy)*uv_depth[:,2]/fy + by
        points[:,0] = X
        points[:,1] = Y
        points[:,2] = uv_depth[:,2]
        points_world = np.dot(points, np.transpose(pose))

        output3D['pcd_color'] = self.build_pcd(depth_mask, coords=points_world[:,:3], colors=colors, masks=masks_rgb, sem_map=sem_map_color)
        output3D['pcd_depth'] = self.build_pcd(depth_mask, coords=points_world[:,:3], colors=colors, masks=masks_depth, sem_map=sem_map_depth)

        return predictions, output2D, output3D


    def merge_pcd(self, pcd_list, data_path, save_path, scene_path, voxel_size, th):
        # NOTE(review): `pairwise_indices`, `cal_2_scenes`, `voxelize`,
        # `remove_small_group`, `num_to_natural` and `pointops` are not defined
        # or imported anywhere in this file — this method cannot run as-is.
        while len(pcd_list) != 1:
            print(len(pcd_list), flush=True)
            new_pcd_list = []
            for indice in pairwise_indices(len(pcd_list)):
                # print(indice)
                pcd_frame = cal_2_scenes(pcd_list, indice, voxel_size=voxel_size, voxelize=voxelize)
                if pcd_frame is not None:
                    new_pcd_list.append(pcd_frame)
            pcd_list = new_pcd_list
        seg_dict = pcd_list[0]
        seg_dict["group"] = num_to_natural(remove_small_group(seg_dict["group"], th))

        data_dict = torch.load(scene_path)
        scene_coord = torch.tensor(data_dict["coord"]).cuda().contiguous()
        new_offset = torch.tensor(scene_coord.shape[0]).cuda()
gen_coord = torch.tensor(seg_dict["coord"]).cuda().contiguous().float() + offset = torch.tensor(gen_coord.shape[0]).cuda() + gen_group = seg_dict["group"] + gen_sem = seg_dict['sem_map'] + indices, dis = pointops.knn_query(1, gen_coord, offset, scene_coord, new_offset) + indices = indices.cpu().numpy() + sem_map = gen_sem[indices.reshape(-1)].astype(np.int16) + group = gen_group[indices.reshape(-1)].astype(np.int16) + mask_dis = dis.reshape(-1).cpu().numpy() > 0.6 + group[mask_dis] = -1 + sem_map[mask_dis] = -1 + group = group.astype(np.int16) + sem_map = sem_map.astype(np.int16) + torch.save((sem_map, num_to_natural(group)), os.path.join(save_path, scene_name + ".pth")) + + def render_3d_video(self, xyzrgb_path): + xyzrgb = np.load(xyzrgb_path) + device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu') + + depth = xyzrgb['rgb_3d_sam'][:, :, 2] + depth = torch.tensor(depth).to(device).float() + + num_frames = [60, 60, 60, 90] + + h = 480 + w = 640 + + intrinsic = np.array([[max(h, w), 0, w // 2], + [0, max(h, w), h // 2], + [0, 0, 1]]) + intrinsic = torch.from_numpy(intrinsic).float()[None].to(device) + + coord = get_coord_grids_pt(h, w, device=device).float()[None] + pts = unproject_pts_pt(intrinsic, coord.reshape(-1, 2), depth) + pts[:, 0] = ((pts[:, 0] - pts[:, 0].min()) / (pts[:, 0].max() - pts[:, 0].min()) - 0.5) * 2 + pts[:, 1] = ((pts[:, 1] - pts[:, 1].min()) / (pts[:, 1].max() - pts[:, 1].min()) - 0.5) * 2 + # pts[:, 1] = ((pts[:, 1] - pts[:, 1].min()) / (pts[:, 1].max() - pts[:, 1].min()) - 0.7) * 2 + pts[:, 2] = ((pts[:, 2] - pts[:, 2].min()) / (pts[:, 2].max() - pts[:, 2].min()) - 0.5) * 2 + + radius = 1.5 / min(h, w) * 2.0 + + + total = ['rgb_3d_sam', 'depth_3d_sam', 'rgb_3d_sam_mask', 'depth_3d_sam_mask'] + num_frames = 45 + degrees = np.linspace(120, 220, num_frames) + frames_all = {} + for j, name in enumerate(total): + img = torch.from_numpy(xyzrgb[name][:, :, 3:] / 255.).to(device).float() + pcd = 
Pointclouds(points=[pts], features=[img.squeeze().reshape(-1, 3)]) + time_steps = np.linspace(0, 1, num_frames) + frames = [] + for i, t_step in tqdm(enumerate(time_steps), total=len(time_steps)): + R, t = look_at_view_transform(3., -10, degrees[i]) + renderer = create_pcd_renderer(h, w, intrinsic.squeeze()[:3, :3], + R=R, T=t, + radius=radius, device=device) + + result = renderer(pcd) + result = result.permute(0, 3, 1, 2) + frame = (255. * result.detach().cpu().squeeze().permute(1, 2, 0).numpy()).astype(np.uint8) + frames.append(frame) + + frames_all[name] = frames + + # video_out_file = '{}.mp4'.format(name) + # imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25) + + video_out_file = '{}.mp4'.format(name) + imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25, quality=8) + + video_out_file = '{}.mp4'.format('RGB_3D_All') + imageio.mimwrite(os.path.join('outputs', video_out_file), frames_all['rgb_3d_sam_mask']+frames_all['rgb_3d_sam'], fps=25, quality=8) + + video_out_file = '{}.mp4'.format('Depth_3D_All') + imageio.mimwrite(os.path.join('outputs', video_out_file), frames_all['depth_3d_sam_mask']+frames_all['depth_3d_sam'], fps=25, quality=8) diff --git a/outputs/holder.py b/outputs/holder.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ovseg_swinbase_vitL14_ft_mpt.pth b/ovseg_swinbase_vitL14_ft_mpt.pth new file mode 100644 index 0000000000000000000000000000000000000000..0d2dcc4c4e721b187574f4c3829c58236713037a --- /dev/null +++ b/ovseg_swinbase_vitL14_ft_mpt.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd3731dde48d96654aba63e5a93753dc837d6889162a18ddf0877f5463d94c90 +size 2129343629 diff --git a/read_video.py b/read_video.py new file mode 100644 index 0000000000000000000000000000000000000000..1da1027914daa99d1da9308f0d967ac7e012b49e --- /dev/null +++ b/read_video.py @@ -0,0 +1,7 @@ +import cv2 + 
+Depth_Semantic_SAM_Mask_gif = cv2.VideoCapture('outputs/depth_3d_sam_mask.mp4') + +while(Depth_Semantic_SAM_Mask_gif .isOpened()): + ret, frame = Depth_Semantic_SAM_Mask_gif.read() + print(ret, frame.shape) \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e139481f15c6045034fe33f6d0a9e1aa9a87ecaa --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +cython +scipy +shapely +timm +h5py +wandb +fire +opencv-python +pandas +imageio +fvcore +iopath +imageio[ffmpeg] +imageio[pyav] \ No newline at end of file diff --git a/sam_vit_h_4b8939.pth b/sam_vit_h_4b8939.pth new file mode 100644 index 0000000000000000000000000000000000000000..8523acce9ddab1cf7e355628a08b1aab8ce08a72 --- /dev/null +++ b/sam_vit_h_4b8939.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7bf3b02f3ebf1267aba913ff637d9a2d5c33d3173bb679e46d9f338c26f262e +size 2564550879 diff --git a/third_party/CLIP/.gitignore b/third_party/CLIP/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..fe3b563dec4e2d55b3824ef1bc7c31aed07848f0 --- /dev/null +++ b/third_party/CLIP/.gitignore @@ -0,0 +1,18 @@ +__pycache__/ +*.py[cod] +*$py.class +*.egg-info +.pytest_cache +.ipynb_checkpoints + +thumbs.db +.DS_Store +.idea +data/ +*.pkl +.theia +tmp +*/tmp +wandb/ +*/wadb +.history \ No newline at end of file diff --git a/third_party/CLIP/CLIP.png b/third_party/CLIP/CLIP.png new file mode 100644 index 0000000000000000000000000000000000000000..a1b5ec9171fd7a51e36e845a02304eb837142ba1 Binary files /dev/null and b/third_party/CLIP/CLIP.png differ diff --git a/third_party/CLIP/LICENSE b/third_party/CLIP/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..4e97f0b45803b7c04ae89548934af4f257a97501 --- /dev/null +++ b/third_party/CLIP/LICENSE @@ -0,0 +1,22 @@ +MIT License + +Copyright (c) 2021 OpenAI + +Permission is hereby granted, free of charge, to any person 
obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + diff --git a/third_party/CLIP/MANIFEST.in b/third_party/CLIP/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..effd8d995ff1842a48c69d2a0f7c8dce4423d7a2 --- /dev/null +++ b/third_party/CLIP/MANIFEST.in @@ -0,0 +1 @@ +include clip/bpe_simple_vocab_16e6.txt.gz diff --git a/third_party/CLIP/README.md b/third_party/CLIP/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5d2d20cd9e1cafcdf8bd8dfd83a0a9c47a884a39 --- /dev/null +++ b/third_party/CLIP/README.md @@ -0,0 +1,193 @@ +# CLIP + +[[Blog]](https://openai.com/blog/clip/) [[Paper]](https://arxiv.org/abs/2103.00020) [[Model Card]](model-card.md) [[Colab]](https://colab.research.google.com/github/openai/clip/blob/master/notebooks/Interacting_with_CLIP.ipynb) + +CLIP (Contrastive Language-Image Pre-Training) is a neural network trained on a variety of (image, text) pairs. 
It can be instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing for the task, similarly to the zero-shot capabilities of GPT-2 and 3. We found CLIP matches the performance of the original ResNet50 on ImageNet “zero-shot” without using any of the original 1.28M labeled examples, overcoming several major challenges in computer vision. + + + +## Approach + +![CLIP](CLIP.png) + + + +## Usage + +First, [install PyTorch 1.7.1](https://pytorch.org/get-started/locally/) and torchvision, as well as small additional dependencies, and then install this repo as a Python package. On a CUDA GPU machine, the following will do the trick: + +```bash +$ conda install --yes -c pytorch pytorch=1.7.1 torchvision cudatoolkit=11.0 +$ pip install ftfy regex tqdm +$ pip install git+https://github.com/openai/CLIP.git +``` + +Replace `cudatoolkit=11.0` above with the appropriate CUDA version on your machine or `cpuonly` when installing on a machine without a GPU. + +```python +import torch +import clip +from PIL import Image + +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = clip.load("ViT-B/32", device=device) + +image = preprocess(Image.open("CLIP.png")).unsqueeze(0).to(device) +text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) + +with torch.no_grad(): + image_features = model.encode_image(image) + text_features = model.encode_text(text) + + logits_per_image, logits_per_text = model(image, text) + probs = logits_per_image.softmax(dim=-1).cpu().numpy() + +print("Label probs:", probs) # prints: [[0.9927937 0.00421068 0.00299572]] +``` + + +## API + +The CLIP module `clip` provides the following methods: + +#### `clip.available_models()` + +Returns the names of the available CLIP models. + +#### `clip.load(name, device=..., jit=False)` + +Returns the model and the TorchVision transform needed by the model, specified by the model name returned by `clip.available_models()`. 
It will download the model as necessary. The `name` argument can also be a path to a local checkpoint. + +The device to run the model can be optionally specified, and the default is to use the first CUDA device if there is any, otherwise the CPU. When `jit` is `False`, a non-JIT version of the model will be loaded. + +#### `clip.tokenize(text: Union[str, List[str]], context_length=77)` + +Returns a LongTensor containing tokenized sequences of given text input(s). This can be used as the input to the model + +--- + +The model returned by `clip.load()` supports the following methods: + +#### `model.encode_image(image: Tensor)` + +Given a batch of images, returns the image features encoded by the vision portion of the CLIP model. + +#### `model.encode_text(text: Tensor)` + +Given a batch of text tokens, returns the text features encoded by the language portion of the CLIP model. + +#### `model(image: Tensor, text: Tensor)` + +Given a batch of images and a batch of text tokens, returns two Tensors, containing the logit scores corresponding to each image and text input. The values are cosine similarities between the corresponding image and text features, times 100. + + + +## More Examples + +### Zero-Shot Prediction + +The code below performs zero-shot prediction using CLIP, as shown in Appendix B in the paper. This example takes an image from the [CIFAR-100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html), and predicts the most likely labels among the 100 textual labels from the dataset. 
+ +```python +import os +import clip +import torch +from torchvision.datasets import CIFAR100 + +# Load the model +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = clip.load('ViT-B/32', device) + +# Download the dataset +cifar100 = CIFAR100(root=os.path.expanduser("~/.cache"), download=True, train=False) + +# Prepare the inputs +image, class_id = cifar100[3637] +image_input = preprocess(image).unsqueeze(0).to(device) +text_inputs = torch.cat([clip.tokenize(f"a photo of a {c}") for c in cifar100.classes]).to(device) + +# Calculate features +with torch.no_grad(): + image_features = model.encode_image(image_input) + text_features = model.encode_text(text_inputs) + +# Pick the top 5 most similar labels for the image +image_features /= image_features.norm(dim=-1, keepdim=True) +text_features /= text_features.norm(dim=-1, keepdim=True) +similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1) +values, indices = similarity[0].topk(5) + +# Print the result +print("\nTop predictions:\n") +for value, index in zip(values, indices): + print(f"{cifar100.classes[index]:>16s}: {100 * value.item():.2f}%") +``` + +The output will look like the following (the exact numbers may be slightly different depending on the compute device): + +``` +Top predictions: + + snake: 65.31% + turtle: 12.29% + sweet_pepper: 3.83% + lizard: 1.88% + crocodile: 1.75% +``` + +Note that this example uses the `encode_image()` and `encode_text()` methods that return the encoded features of given inputs. + + +### Linear-probe evaluation + +The example below uses [scikit-learn](https://scikit-learn.org/) to perform logistic regression on image features. 
+ +```python +import os +import clip +import torch + +import numpy as np +from sklearn.linear_model import LogisticRegression +from torch.utils.data import DataLoader +from torchvision.datasets import CIFAR100 +from tqdm import tqdm + +# Load the model +device = "cuda" if torch.cuda.is_available() else "cpu" +model, preprocess = clip.load('ViT-B/32', device) + +# Load the dataset +root = os.path.expanduser("~/.cache") +train = CIFAR100(root, download=True, train=True, transform=preprocess) +test = CIFAR100(root, download=True, train=False, transform=preprocess) + + +def get_features(dataset): + all_features = [] + all_labels = [] + + with torch.no_grad(): + for images, labels in tqdm(DataLoader(dataset, batch_size=100)): + features = model.encode_image(images.to(device)) + + all_features.append(features) + all_labels.append(labels) + + return torch.cat(all_features).cpu().numpy(), torch.cat(all_labels).cpu().numpy() + +# Calculate the image features +train_features, train_labels = get_features(train) +test_features, test_labels = get_features(test) + +# Perform logistic regression +classifier = LogisticRegression(random_state=0, C=0.316, max_iter=1000, verbose=1) +classifier.fit(train_features, train_labels) + +# Evaluate using the logistic regression classifier +predictions = classifier.predict(test_features) +accuracy = np.mean((test_labels == predictions).astype(np.float)) * 100. +print(f"Accuracy = {accuracy:.3f}") +``` + +Note that the `C` value should be determined via a hyperparameter sweep using a validation split. 
diff --git a/third_party/CLIP/clip/__init__.py b/third_party/CLIP/clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..dcc5619538c0f7c782508bdbd9587259d805e0d9 --- /dev/null +++ b/third_party/CLIP/clip/__init__.py @@ -0,0 +1 @@ +from .clip import * diff --git a/third_party/CLIP/clip/bpe_simple_vocab_16e6.txt.gz b/third_party/CLIP/clip/bpe_simple_vocab_16e6.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..36a15856e00a06a9fbed8cdd34d2393fea4a3113 --- /dev/null +++ b/third_party/CLIP/clip/bpe_simple_vocab_16e6.txt.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a +size 1356917 diff --git a/third_party/CLIP/clip/clip.py b/third_party/CLIP/clip/clip.py new file mode 100644 index 0000000000000000000000000000000000000000..6d733edfac02d81ba3e402eb7e702764728bdaa2 --- /dev/null +++ b/third_party/CLIP/clip/clip.py @@ -0,0 +1,285 @@ +import hashlib +import os +import urllib +import warnings +from collections import OrderedDict +from typing import Union, List + +import torch +from PIL import Image +from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize +from tqdm import tqdm + +from .model import build_model +from .simple_tokenizer import SimpleTokenizer as _Tokenizer + +try: + from torchvision.transforms import InterpolationMode + + BICUBIC = InterpolationMode.BICUBIC +except ImportError: + BICUBIC = Image.BICUBIC + + +if torch.__version__.split(".") < ["1", "7", "1"]: + warnings.warn("PyTorch version 1.7.1 or higher is recommended") + + +__all__ = ["available_models", "load", "tokenize"] +_tokenizer = _Tokenizer() + +_MODELS = { + "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", + "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", + 
"RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt", + "RN50x16": "https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt", + "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", + "ViT-B/16": "https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt", + "ViT-L/14": "https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt", + "ViT-L/14@336px": "https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt", +} + + +def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")): + os.makedirs(root, exist_ok=True) + filename = os.path.basename(url) + + expected_sha256 = url.split("/")[-2] + download_target = os.path.join(root, filename) + + if os.path.exists(download_target) and not os.path.isfile(download_target): + raise RuntimeError(f"{download_target} exists and is not a regular file") + + if os.path.isfile(download_target): + if ( + hashlib.sha256(open(download_target, "rb").read()).hexdigest() + == expected_sha256 + ): + return download_target + else: + warnings.warn( + f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file" + ) + + with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: + with tqdm( + total=int(source.info().get("Content-Length")), + ncols=80, + unit="iB", + unit_scale=True, + ) as loop: + while True: + buffer = source.read(8192) + if not buffer: + break + + output.write(buffer) + loop.update(len(buffer)) + + if ( + hashlib.sha256(open(download_target, "rb").read()).hexdigest() + != expected_sha256 + ): + raise RuntimeError( + f"Model has 
been downloaded but the SHA256 checksum does not not match" + ) + + return download_target + + +def _transform(n_px): + return Compose( + [ + Resize(n_px, interpolation=BICUBIC), + CenterCrop(n_px), + lambda image: image.convert("RGB"), + ToTensor(), + Normalize( + (0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711), + ), + ] + ) + + +def available_models() -> List[str]: + """Returns the names of available CLIP models""" + return list(_MODELS.keys()) + + +def load( + name: str, + mask_prompt_depth: int = 0, + device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", + jit=False, +): + """Load a CLIP model + + Parameters + ---------- + name : str + A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict + + device : Union[str, torch.device] + The device to put the loaded model + + jit : bool + Whether to load the optimized JIT model or more hackable non-JIT model (default). + + Returns + ------- + model : torch.nn.Module + The CLIP model + + preprocess : Callable[[PIL.Image], torch.Tensor] + A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input + """ + if name in _MODELS: + model_path = _download(_MODELS[name]) + elif os.path.isfile(name): + model_path = name + else: + raise RuntimeError( + f"Model {name} not found; available models = {available_models()}" + ) + + try: + # loading JIT archive + model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() + state_dict = None + except RuntimeError: + # loading saved state dict + if jit: + warnings.warn( + f"File {model_path} is not a JIT archive. 
Loading as a state dict instead" + ) + jit = False + state_dict = torch.load(model_path, map_location="cpu") + if 'state_dict' in state_dict: + new_state_dict = OrderedDict() + for k, v in state_dict['state_dict'].items(): + if k.startswith('module.'): + name = k[7:] # remove `module.` + new_state_dict[name] = v + state_dict = new_state_dict + + if not jit: + model = build_model(state_dict or model.state_dict(), mask_prompt_depth).to(device) + if str(device) == "cpu": + model.float() + return model, _transform(model.visual.input_resolution) + + # patch the device names + device_holder = torch.jit.trace( + lambda: torch.ones([]).to(torch.device(device)), example_inputs=[] + ) + device_node = [ + n + for n in device_holder.graph.findAllNodes("prim::Constant") + if "Device" in repr(n) + ][-1] + + def patch_device(module): + try: + graphs = [module.graph] if hasattr(module, "graph") else [] + except RuntimeError: + graphs = [] + + if hasattr(module, "forward1"): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes("prim::Constant"): + if "value" in node.attributeNames() and str(node["value"]).startswith( + "cuda" + ): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to float32 on CPU + if str(device) == "cpu": + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[] + ) + float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] + float_node = float_input.node() + + def patch_float(module): + try: + graphs = [module.graph] if hasattr(module, "graph") else [] + except RuntimeError: + graphs = [] + + if hasattr(module, "forward1"): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes("aten::to"): + inputs = list(node.inputs()) + for i in [ + 1, + 2, + ]: # dtype can be the second or third argument to aten::to() + if inputs[i].node()["value"] 
== 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, _transform(model.input_resolution.item()) + + +def tokenize( + texts: Union[str, List[str]], + context_length: int = 77, + truncate: bool = False, + return_length: bool = False, +) -> torch.LongTensor: + """ + Returns the tokenized representation of given input string(s) + + Parameters + ---------- + texts : Union[str, List[str]] + An input string or a list of input strings to tokenize + + context_length : int + The context length to use; all CLIP models use 77 as the context length + + truncate: bool + Whether to truncate the text in case its encoding is longer than the context length + + Returns + ------- + A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length] + """ + if isinstance(texts, str): + texts = [texts] + + sot_token = _tokenizer.encoder["<|startoftext|>"] + eot_token = _tokenizer.encoder["<|endoftext|>"] + all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + length = [] + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + if truncate: + tokens = tokens[:context_length] + tokens[-1] = eot_token + length.append(context_length) + else: + raise RuntimeError( + f"Input {texts[i]} is too long for context length {context_length}" + ) + else: + length.append(len(tokens)) + result[i, : len(tokens)] = torch.tensor(tokens) + if return_length: + return result, length + return result diff --git a/third_party/CLIP/clip/model.py b/third_party/CLIP/clip/model.py new file mode 100644 index 0000000000000000000000000000000000000000..8ea730a2cc8a992f9180428bd1fec7fc96aa89dd --- /dev/null +++ b/third_party/CLIP/clip/model.py @@ -0,0 +1,613 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Copyright (c) Meta Platforms, Inc. All Rights Reserved +# Modified by Feng Liang from https://github.com/openai/CLIP/blob/main/clip/model.py + +from collections import OrderedDict +from typing import Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + + # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict( + [ + ("-1", nn.AvgPool2d(stride)), + ( + "0", + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False, + ), + ), + ("1", nn.BatchNorm2d(planes * self.expansion)), + ] + ) + ) + + def forward(self, x: torch.Tensor): + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + def __init__( + self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None + ): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim ** 
2 + 1, embed_dim) / embed_dim ** 0.5 + ) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + self.grid_size = spacial_dim + + def forward(self, x, mask=None, return_cls=True): + b, c, gh, gw = x.shape + # remove irrelated feature + if mask is not None: + mask = F.interpolate(mask[:, None, ...], size=(gh, gw)).squeeze( + 1 + ) # [N,H,W] -> [N,grid,grid] + mask = (mask > 0.5).reshape(mask.shape[0], -1) + mask = torch.cat([mask, mask.new_ones(mask.shape[0], 1)], dim=1) + if x.size()[0] == 1: + x = x.expand(mask.shape[0], c, gh, gw) + + x = x.reshape(x.shape[0], c, gh * gw).permute(2, 0, 1) # NCHW -> (HW)NC + + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + positional_embedding = self.positional_embedding + if not (self.positional_embedding.shape[0] == x.shape[0]): + cls_pos = positional_embedding[0:1, :] + per_pos_embedding = ( + F.interpolate( + positional_embedding[1:, :] + .permute(1, 0) + .view(1, -1, self.grid_size, self.grid_size), + size=(gh, gw), + mode="bicubic", + ) + .reshape(-1, gh * gw) + .permute(1, 0) + ) + positional_embedding = torch.cat([cls_pos, per_pos_embedding]) + + x = x + positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x, + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias] + ), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False, + key_padding_mask=mask, + ) + + if return_cls: + 
return x[0] + else: + return x + + +class ModifiedResNet(nn.Module): + """ + A ResNet class that is similar to torchvision's but contains the following changes: + - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. + - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, layers, output_dim, heads, input_resolution=224, width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False + ) + self.bn1 = nn.BatchNorm2d(width // 2) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False + ) + self.bn2 = nn.BatchNorm2d(width // 2) + self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.avgpool = nn.AvgPool2d(2) + self.relu = nn.ReLU(inplace=True) + + # residual layers + self._inplanes = width # this is a *mutable* variable used during construction + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d( + input_resolution // 32, embed_dim, heads, output_dim + ) + + def _make_layer(self, planes, blocks, stride=1): + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x, mask: torch.Tensor = None, return_cls=True): + def stem(x): + for conv, bn in [ + 
(self.conv1, self.bn1), + (self.conv2, self.bn2), + (self.conv3, self.bn3), + ]: + x = self.relu(bn(conv(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) # 1/4,1/4 + x = self.layer1(x) + x = self.layer2(x) # 1/8,1/8 + x = self.layer3(x) # 1/16,1/16 + x = self.layer4(x) # 1/32,1/32 + b, c, gh, gw = x.shape + x = self.attnpool(x, mask, return_cls) + if not return_cls: + return x[1:].permute(1, 0, 2).reshape(b, gh, gw, x.shape[-1]) # N,L,C + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict( + [ + ("c_fc", nn.Linear(d_model, d_model * 4)), + ("gelu", QuickGELU()), + ("c_proj", nn.Linear(d_model * 4, d_model)), + ] + ) + ) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor, **kwargs): + self.attn_mask = ( + self.attn_mask.to(dtype=x.dtype, device=x.device) + if self.attn_mask is not None + else None + ) + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask, **kwargs + )[0] + + def forward(self, x: torch.Tensor, **kwargs): + x = x + self.attention(self.ln_1(x), **kwargs) + x = x + self.mlp(self.ln_2(x)) + return x + + +class Transformer(nn.Module): + def __init__( + self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None + ): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential( + *[ResidualAttentionBlock(width, heads, attn_mask) for 
_ in range(layers)] + ) + + def forward(self, x: torch.Tensor, **kwargs): + for block in self.resblocks: + x = block(x, **kwargs) + return x + + +class VisionTransformer(nn.Module): + def __init__( + self, + input_resolution: int, + patch_size: int, + mask_prompt_depth: int, + width: int, + layers: int, + heads: int, + output_dim: int, + ): + super().__init__() + self.input_resolution = input_resolution + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False, + ) + + scale = width ** -0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter( + scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width) + ) + self.grid_size = input_resolution // patch_size + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer(width, layers, heads) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + self.mask_pool = nn.AvgPool2d(patch_size, stride=patch_size) + self.mask_prompt_depth = mask_prompt_depth + self.mask_embedding = nn.Parameter(torch.zeros(self.mask_prompt_depth, self.grid_size * self.grid_size, width)) + + def forward(self, x: torch.Tensor, m: torch.Tensor = None): + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + if m is not None: + m = self.mask_pool(m.to(torch.float).squeeze()).reshape(m.shape[0], -1).unsqueeze(-1) + m = torch.ceil(m) + if self.mask_embedding.shape[1] == 1: + mask_embedding = self.mask_embedding.to(x.dtype).repeat(1, x.shape[1], 1) + else: + mask_embedding = self.mask_embedding.to(x.dtype) + x = x * m + mask_embedding[0].unsqueeze(0) * (1 - m) + + x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # 
shape = [*, grid ** 2 + 1, width] + x = x + self.positional_embedding.to(x.dtype) + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + if m is not None: + for i, blk in enumerate(self.transformer.resblocks): + d = i + 1 + x = blk(x) + if d < self.mask_prompt_depth: + masked_x = x[1:, :, :] * m.permute(1, 0, 2) + \ + mask_embedding[d].unsqueeze(0).permute(1, 0, 2) * (1 - m.permute(1, 0, 2)) + x = torch.cat([x[:1, :, :], masked_x], dim=0) + else: + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + + +class CLIP(nn.Module): + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + mask_prompt_depth: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int, + ): + super().__init__() + + self.context_length = context_length + + if isinstance(vision_layers, (tuple, list)): + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width, + ) + else: + vision_heads = vision_width // 64 + self.visual = VisionTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + mask_prompt_depth=mask_prompt_depth, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim, + ) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask(), + ) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width) + ) + self.ln_final = 
LayerNorm(transformer_width) + + self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07)) + + self.initialize_parameters() + + def initialize_parameters(self): + nn.init.normal_(self.token_embedding.weight, std=0.02) + nn.init.normal_(self.positional_embedding, std=0.01) + + if isinstance(self.visual, ModifiedResNet): + if self.visual.attnpool is not None: + std = self.visual.attnpool.c_proj.in_features ** -0.5 + nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std) + nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std) + + for resnet_block in [ + self.visual.layer1, + self.visual.layer2, + self.visual.layer3, + self.visual.layer4, + ]: + for name, param in resnet_block.named_parameters(): + if name.endswith("bn3.weight"): + nn.init.zeros_(param) + + proj_std = (self.transformer.width ** -0.5) * ( + (2 * self.transformer.layers) ** -0.5 + ) + attn_std = self.transformer.width ** -0.5 + fc_std = (2 * self.transformer.width) ** -0.5 + for block in self.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + if self.text_projection is not None: + nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float("-inf")) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + return self.visual.conv1.weight.dtype + + def encode_image(self, 
image, **kwargs): + return self.visual(image.type(self.dtype), **kwargs) + + def encode_text(self, text): + x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model] + + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + image_features = self.encode_image(image) + text_features = self.encode_text(text) + + # normalized features + image_features = image_features / image_features.norm(dim=-1, keepdim=True) + text_features = text_features / text_features.norm(dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_image = logit_scale * image_features @ text_features.t() + logits_per_text = logit_scale * text_features @ image_features.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_image, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16""" + + def _convert_weights_to_fp16(l): + if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)): + l.weight.data = l.weight.data.half() + if l.bias is not None: + l.bias.data = l.bias.data.half() + + if isinstance(l, nn.MultiheadAttention): + for attr in [ + *[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], + "in_proj_bias", + "bias_k", + "bias_v", + ]: + tensor = getattr(l, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ["text_projection", "proj"]: + if hasattr(l, name): + attr = getattr(l, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(state_dict: dict, 
mask_prompt_depth: int = 0): + vit = "visual.proj" in state_dict + + if vit: + vision_width = state_dict["visual.conv1.weight"].shape[0] + vision_layers = len( + [ + k + for k in state_dict.keys() + if k.startswith("visual.") and k.endswith(".attn.in_proj_weight") + ] + ) + vision_patch_size = state_dict["visual.conv1.weight"].shape[-1] + grid_size = round( + (state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5 + ) + image_resolution = vision_patch_size * grid_size + else: + assert mask_prompt_depth == 0, 'ResNets do not support mask prompt tuning' + counts: list = [ + len( + set( + k.split(".")[2] + for k in state_dict + if k.startswith(f"visual.layer{b}") + ) + ) + for b in [1, 2, 3, 4] + ] + vision_layers = tuple(counts) + vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0] + output_width = round( + (state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5 + ) + vision_patch_size = None + assert ( + output_width ** 2 + 1 + == state_dict["visual.attnpool.positional_embedding"].shape[0] + ) + image_resolution = output_width * 32 + + embed_dim = state_dict["text_projection"].shape[1] + context_length = state_dict["positional_embedding"].shape[0] + vocab_size = state_dict["token_embedding.weight"].shape[0] + transformer_width = state_dict["ln_final.weight"].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len( + set( + k.split(".")[2] + for k in state_dict + if k.startswith(f"transformer.resblocks") + ) + ) + + model = CLIP( + embed_dim, + image_resolution, + vision_layers, + vision_width, + vision_patch_size, + mask_prompt_depth, + context_length, + vocab_size, + transformer_width, + transformer_heads, + transformer_layers, + ) + + for key in ["input_resolution", "context_length", "vocab_size"]: + if key in state_dict: + del state_dict[key] + + convert_weights(model) + model.load_state_dict(state_dict, strict=False) + return model.eval() diff --git a/third_party/CLIP/clip/simple_tokenizer.py 
b/third_party/CLIP/clip/simple_tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..56d17512b06afb700e7834e4f3f6515c315ebb74 --- /dev/null +++ b/third_party/CLIP/clip/simple_tokenizer.py @@ -0,0 +1,150 @@ +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz" + ) + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2 ** 8): + if b not in bs: + bs.append(b) + cs.append(2 ** 8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + Word is represented as tuple of symbols (symbols being variable-length strings). 
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r"\s+", " ", text) + text = text.strip() + return text + + +class SimpleTokenizer(object): + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode("utf-8").split("\n") + merges = merges[1 : 49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + "" for v in vocab] + for merge in merges: + vocab.append("".join(merge)) + vocab.extend(["<|startoftext|>", "<|endoftext|>"]) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + "<|startoftext|>": "<|startoftext|>", + "<|endoftext|>": "<|endoftext|>", + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE, + ) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + "",) + pairs = get_pairs(word) + + if not pairs: + return token + "" + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 
1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = "".join(self.byte_encoder[b] for b in token.encode("utf-8")) + bpe_tokens.extend( + self.encoder[bpe_token] for bpe_token in self.bpe(token).split(" ") + ) + return bpe_tokens + + def decode(self, tokens): + text = "".join([self.decoder[token] for token in tokens]) + text = ( + bytearray([self.byte_decoder[c] for c in text]) + .decode("utf-8", errors="replace") + .replace("", " ") + ) + return text diff --git a/third_party/CLIP/model-card.md b/third_party/CLIP/model-card.md new file mode 100644 index 0000000000000000000000000000000000000000..2d22e25bea89fdbccdaa2809fbeb83e0a7cfaa07 --- /dev/null +++ b/third_party/CLIP/model-card.md @@ -0,0 +1,120 @@ +# Model Card: CLIP + +Inspired by [Model Cards for Model Reporting (Mitchell et al.)](https://arxiv.org/abs/1810.03993) and [Lessons from Archives (Jo & Gebru)](https://arxiv.org/pdf/1912.10389.pdf), we’re providing some accompanying information about the multimodal model. + +## Model Details + +The CLIP model was developed by researchers at OpenAI to learn about what contributes to robustness in computer vision tasks. The model was also developed to test the ability of models to generalize to arbitrary image classification tasks in a zero-shot manner. It was not developed for general model deployment - to deploy models like CLIP, researchers will first need to carefully study their capabilities in relation to the specific context they’re being deployed within. + +### Model Date + +January 2021 + +### Model Type + +The base model uses a ResNet50 with several modifications as an image encoder and uses a masked self-attention Transformer as a text encoder. 
These encoders are trained to maximize the similarity of (image, text) pairs via a contrastive loss. There is also a variant of the model where the ResNet image encoder is replaced with a Vision Transformer. + +### Model Versions + +Initially, we’ve released one CLIP model based on the Vision Transformer architecture equivalent to ViT-B/32, along with the RN50 model, using the architecture equivalent to ResNet-50. + +As part of the staged release process, we have also released the RN101 model, as well as RN50x4, a RN50 scaled up 4x according to the [EfficientNet](https://arxiv.org/abs/1905.11946) scaling rule. In July 2021, we additionally released the RN50x16 and ViT-B/16 models. + +Please see the paper linked below for further details about their specification. + +### Documents + +- [Blog Post](https://openai.com/blog/clip/) +- [CLIP Paper](https://arxiv.org/abs/2103.00020) + + + +## Model Use + +### Intended Use + +The model is intended as a research output for research communities. We hope that this model will enable researchers to better understand and explore zero-shot, arbitrary image classification. We also hope it can be used for interdisciplinary studies of the potential impact of such models - the CLIP paper includes a discussion of potential downstream impacts to provide an example for this sort of analysis. + +#### Primary intended uses + +The primary intended users of these models are AI researchers. + +We primarily imagine the model will be used by researchers to better understand robustness, generalization, and other capabilities, biases, and constraints of computer vision models. + +### Out-of-Scope Use Cases + +**Any** deployed use case of the model - whether commercial or not - is currently out of scope. Non-deployed use cases such as image search in a constrained environment, are also not recommended unless there is thorough in-domain testing of the model with a specific, fixed class taxonomy. 
This is because our safety assessment demonstrated a high need for task specific testing especially given the variability of CLIP’s performance with different class taxonomies. This makes untested and unconstrained deployment of the model in any use case currently potentially harmful. + +Certain use cases which would fall under the domain of surveillance and facial recognition are always out-of-scope regardless of performance of the model. This is because the use of artificial intelligence for tasks such as these can be premature currently given the lack of testing norms and checks to ensure its fair use. + +Since the model has not been purposefully trained in or evaluated on any languages other than English, its use should be limited to English language use cases. + + + +## Data + +The model was trained on publicly available image-caption data. This was done through a combination of crawling a handful of websites and using commonly-used pre-existing image datasets such as [YFCC100M](http://projects.dfki.uni-kl.de/yfcc100m/). A large portion of the data comes from our crawling of the internet. This means that the data is more representative of people and societies most connected to the internet which tend to skew towards more developed nations, and younger, male users. + +### Data Mission Statement + +Our goal with building this dataset was to test out robustness and generalizability in computer vision tasks. As a result, the focus was on gathering large quantities of data from different publicly-available internet data sources. The data was gathered in a mostly non-interventionist manner. However, we only crawled websites that had policies against excessively violent and adult images and allowed us to filter out such content. We do not intend for this dataset to be used as the basis for any commercial or deployed model and will not be releasing the dataset. 
+ + + +## Performance and Limitations + +### Performance + +We have evaluated the performance of CLIP on a wide range of benchmarks across a variety of computer vision datasets such as OCR to texture recognition to fine-grained classification. The paper describes model performance on the following datasets: + +- Food101 +- CIFAR10 +- CIFAR100 +- Birdsnap +- SUN397 +- Stanford Cars +- FGVC Aircraft +- VOC2007 +- DTD +- Oxford-IIIT Pet dataset +- Caltech101 +- Flowers102 +- MNIST +- SVHN +- IIIT5K +- Hateful Memes +- SST-2 +- UCF101 +- Kinetics700 +- Country211 +- CLEVR Counting +- KITTI Distance +- STL-10 +- RareAct +- Flickr30 +- MSCOCO +- ImageNet +- ImageNet-A +- ImageNet-R +- ImageNet Sketch +- ObjectNet (ImageNet Overlap) +- Youtube-BB +- ImageNet-Vid + +## Limitations + +CLIP and our analysis of it have a number of limitations. CLIP currently struggles with respect to certain tasks such as fine grained classification and counting objects. CLIP also poses issues with regards to fairness and bias which we discuss in the paper and briefly in the next section. Additionally, our approach to testing CLIP also has an important limitation- in many cases we have used linear probes to evaluate the performance of CLIP and there is evidence suggesting that linear probes can underestimate model performance. + +### Bias and Fairness + +We find that the performance of CLIP - and the specific biases it exhibits - can depend significantly on class design and the choices one makes for categories to include and exclude. We tested the risk of certain kinds of denigration with CLIP by classifying images of people from [Fairface](https://arxiv.org/abs/1908.04913) into crime-related and non-human animal categories. We found significant disparities with respect to race and gender. Additionally, we found that these disparities could shift based on how the classes were constructed. (Details captured in the Broader Impacts Section in the paper). 
+ +We also tested the performance of CLIP on gender, race and age classification using the Fairface dataset (We default to using race categories as they are constructed in the Fairface dataset.) in order to assess quality of performance across different demographics. We found accuracy >96% across all races for gender classification with ‘Middle Eastern’ having the highest accuracy (98.4%) and ‘White’ having the lowest (96.5%). Additionally, CLIP averaged ~93% for racial classification and ~63% for age classification. Our use of evaluations to test for gender, race and age classification as well as denigration harms is simply to evaluate performance of the model across people and surface potential risks and not to demonstrate an endorsement/enthusiasm for such tasks. + + + +## Feedback + +### Where to send questions or comments about the model + +Please use [this Google Form](https://forms.gle/Uv7afRH5dvY34ZEs9) diff --git a/third_party/CLIP/requirements.txt b/third_party/CLIP/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b98c33f3a0e09ddf982606430472de3061c6e9f --- /dev/null +++ b/third_party/CLIP/requirements.txt @@ -0,0 +1,5 @@ +ftfy +regex +tqdm +torch +torchvision diff --git a/third_party/CLIP/setup.py b/third_party/CLIP/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..1026ae8a1c4d99f7107cd2eaffb0b391e87a121f --- /dev/null +++ b/third_party/CLIP/setup.py @@ -0,0 +1,21 @@ +import os + +import pkg_resources +from setuptools import setup, find_packages + +setup( + name="clip", + py_modules=["clip"], + version="1.0", + description="", + author="OpenAI", + packages=find_packages(exclude=["tests*"]), + install_requires=[ + str(r) + for r in pkg_resources.parse_requirements( + open(os.path.join(os.path.dirname(__file__), "requirements.txt")) + ) + ], + include_package_data=True, + extras_require={"dev": ["pytest"]}, +) diff --git a/third_party/CLIP/tests/test_consistency.py 
b/third_party/CLIP/tests/test_consistency.py new file mode 100644 index 0000000000000000000000000000000000000000..27d49eaae8721b7ad82d4949f2ab2606c8875d9f --- /dev/null +++ b/third_party/CLIP/tests/test_consistency.py @@ -0,0 +1,25 @@ +import numpy as np +import pytest +import torch +from PIL import Image + +import clip + + +@pytest.mark.parametrize("model_name", clip.available_models()) +def test_consistency(model_name): + device = "cpu" + jit_model, transform = clip.load(model_name, device=device, jit=True) + py_model, _ = clip.load(model_name, device=device, jit=False) + + image = transform(Image.open("CLIP.png")).unsqueeze(0).to(device) + text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device) + + with torch.no_grad(): + logits_per_image, _ = jit_model(image, text) + jit_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + + logits_per_image, _ = py_model(image, text) + py_probs = logits_per_image.softmax(dim=-1).cpu().numpy() + + assert np.allclose(jit_probs, py_probs, atol=0.01, rtol=0.1) diff --git a/tools/__init__.py b/tools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/tools/convert-pretrained-clip-model-to-d2.py b/tools/convert-pretrained-clip-model-to-d2.py new file mode 100644 index 0000000000000000000000000000000000000000..8e0fc1a37805727d625ba40cbbf07e9426e87ad7 --- /dev/null +++ b/tools/convert-pretrained-clip-model-to-d2.py @@ -0,0 +1,69 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import pickle as pkl +import sys + +import torch + +""" +Usage: + # download pretrained swin model: + wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth + # run the conversion + ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl + # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: +MODEL: + WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" +INPUT: + FORMAT: "RGB" +""" + + +def transform(path): + model = torch.load(path, map_location="cpu") + print(f"loading {path}......") + state_dict = model["model"] + state_dict = { + k.replace("visual_model.", ""): v + for k, v in state_dict.items() + if k.startswith("visual_model") + } + source_keys = [k for k in state_dict.keys() if "relative_coords" in k] + for k in source_keys: + state_dict[ + k.replace("relative_coords", "relative_position_index") + ] = state_dict[k] + del state_dict[k] + + source_keys = [k for k in state_dict.keys() if "atten_mask_matrix" in k] + for k in source_keys: + state_dict[k.replace("atten_mask_matrix", "attn_mask")] = state_dict[k] + del state_dict[k] + + source_keys = [k for k in state_dict.keys() if "rel_pos_embed_table" in k] + for k in source_keys: + state_dict[ + k.replace("rel_pos_embed_table", "relative_position_bias_table") + ] = state_dict[k] + del state_dict[k] + + source_keys = [k for k in state_dict.keys() if "channel_reduction" in k] + for k in source_keys: + state_dict[k.replace("channel_reduction", "reduction")] = state_dict[k] + del state_dict[k] + return { + k if k.startswith("backbone.") else "backbone." 
+ k: v + for k, v in state_dict.items() + } + + +if __name__ == "__main__": + input = sys.argv[1] + res = { + "model": transform(input), + "__author__": "third_party", + "matching_heuristics": True, + } + with open(sys.argv[2], "wb") as f: + pkl.dump(res, f) diff --git a/tools/convert-pretrained-swin-model-to-d2.py b/tools/convert-pretrained-swin-model-to-d2.py new file mode 100644 index 0000000000000000000000000000000000000000..4cc9939c781a4d04dc6070a7fcac8d6c09afc8a1 --- /dev/null +++ b/tools/convert-pretrained-swin-model-to-d2.py @@ -0,0 +1,30 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved + +import pickle as pkl +import sys + +import torch + +""" +Usage: + # download pretrained swin model: + wget https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth + # run the conversion + ./convert-pretrained-model-to-d2.py swin_tiny_patch4_window7_224.pth swin_tiny_patch4_window7_224.pkl + # Then, use swin_tiny_patch4_window7_224.pkl with the following changes in config: +MODEL: + WEIGHTS: "/path/to/swin_tiny_patch4_window7_224.pkl" +INPUT: + FORMAT: "RGB" +""" + +if __name__ == "__main__": + input = sys.argv[1] + + obj = torch.load(input, map_location="cpu")["model"] + + res = {"model": obj, "__author__": "third_party", "matching_heuristics": True} + + with open(sys.argv[2], "wb") as f: + pkl.dump(res, f) diff --git a/tools/convert-torchvision-to-d2.py b/tools/convert-torchvision-to-d2.py new file mode 100644 index 0000000000000000000000000000000000000000..60b9fb88693350c75f0b69350807503c87192724 --- /dev/null +++ b/tools/convert-torchvision-to-d2.py @@ -0,0 +1,54 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import pickle as pkl +import sys + +import torch + +""" +Usage: + # download one of the ResNet{18,34,50,101,152} models from torchvision: + wget https://download.pytorch.org/models/resnet50-19c8e357.pth -O r50.pth + # run the conversion + ./convert-torchvision-to-d2.py r50.pth r50.pkl + # Then, use r50.pkl with the following changes in config: +MODEL: + WEIGHTS: "/path/to/r50.pkl" + PIXEL_MEAN: [123.675, 116.280, 103.530] + PIXEL_STD: [58.395, 57.120, 57.375] + RESNETS: + DEPTH: 50 + STRIDE_IN_1X1: False +INPUT: + FORMAT: "RGB" + These models typically produce slightly worse results than the + pre-trained ResNets we use in official configs, which are the + original ResNet models released by MSRA. +""" + +if __name__ == "__main__": + input = sys.argv[1] + + obj = torch.load(input, map_location="cpu") + + newmodel = {} + for k in list(obj.keys()): + old_k = k + if "layer" not in k: + k = "stem." + k + for t in [1, 2, 3, 4]: + k = k.replace("layer{}".format(t), "res{}".format(t + 1)) + for t in [1, 2, 3]: + k = k.replace("bn{}".format(t), "conv{}.norm".format(t)) + k = k.replace("downsample.0", "shortcut") + k = k.replace("downsample.1", "shortcut.norm") + print(old_k, "->", k) + newmodel[k] = obj.pop(old_k).detach().numpy() + + res = {"model": newmodel, "__author__": "torchvision", "matching_heuristics": True} + + with open(sys.argv[2], "wb") as f: + pkl.dump(res, f) + if obj: + print("Unconverted keys:", obj.keys()) diff --git a/tools/ovseg_replace_clip.py b/tools/ovseg_replace_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..0781c5910d8bd7dec25aeb468514849dfe68e9e4 --- /dev/null +++ b/tools/ovseg_replace_clip.py @@ -0,0 +1,30 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. 
All Rights Reserved + +import torch +from collections import OrderedDict + + +# PATH to new clip model +clip_ckpt = torch.load('xx/open_clip/src/logs/2022_xx/checkpoints/epoch_x.pt') + +new_model = OrderedDict() +state_dict = clip_ckpt['state_dict'] + +for k, v in state_dict.items(): + new_key = k.replace('module.','') + new_model[new_key] = v + +# PATH to trained ovseg model +ovseg_model = torch.load('xx/ovseg/output/model_final.pth', 'cpu') + +for k, v in new_model.items(): + new_k = 'clip_adapter.clip_model.' + k + if new_k in ovseg_model['model'].keys(): + ovseg_model['model'][new_k] = v + else: + print(f'{new_k} does not exist in ckpt') + +# ovseg_model['model']['clip_adapter.clip_model.visual.mask_embedding'] = new_model['visual.mask_embedding'] + +torch.save(ovseg_model, 'xx/ovseg/output/ovseg_ft_mpt.pth') diff --git a/tools/search_thr_ensemble_w.sh b/tools/search_thr_ensemble_w.sh new file mode 100644 index 0000000000000000000000000000000000000000..efdbd72dd1a6a9da96868688b0fd5530e956498a --- /dev/null +++ b/tools/search_thr_ensemble_w.sh @@ -0,0 +1,11 @@ +or MASK_THR in 0.35 0.4 0.45 +o + for ENSEMBLE_WEIGHT in 0.6 0.65 0.7 0.75 0.8 + do + python train_net.py --num-gpu 8 --eval-only --config-file configs/ovseg_swinB_vitL_bs32_120k.yaml \ + MODEL.WEIGHTS #PATH_of_ovseg_swinbase_vitL14_ft_mpt.pth DATASETS.TEST \(\"ade20k_sem_seg_val\"\) \ + MODEL.CLIP_ADAPTER.CLIP_ENSEMBLE_WEIGHT $ENSEMBLE_WEIGHT MODEL.CLIP_ADAPTER.MASK_THR $MASK_THR + done +one + + diff --git a/tools/util.py b/tools/util.py new file mode 100644 index 0000000000000000000000000000000000000000..e9d6caf994cf4bf79156e0d95ae6df7fd0142bf2 --- /dev/null +++ b/tools/util.py @@ -0,0 +1,296 @@ +import numpy as np +import torch +import os +import copy +from PIL import Image +import json +import imageio +# import clip + + +SCANNET_COLOR_MAP_20 = {-1: (0., 0., 0.), 0: (174., 199., 232.), 1: (152., 223., 138.), 2: (31., 119., 180.), 3: (255., 187., 120.), 4: (188., 189., 34.), 5: (140., 86., 75.), + 6: 
(255., 152., 150.), 7: (214., 39., 40.), 8: (197., 176., 213.), 9: (148., 103., 189.), 10: (196., 156., 148.), 11: (23., 190., 207.), 12: (247., 182., 210.),
+                        13: (219., 219., 141.), 14: (255., 127., 14.), 15: (158., 218., 229.), 16: (44., 160., 44.), 17: (112., 128., 144.), 18: (227., 119., 194.), 19: (82., 84., 163.)}
+
+class Voxelize(object):
+    def __init__(self,
+                 voxel_size=0.05,
+                 hash_type="fnv",
+                 mode='train',
+                 keys=("coord", "normal", "color", "label"),
+                 return_discrete_coord=False,
+                 return_min_coord=False):
+        self.voxel_size = voxel_size
+        self.hash = self.fnv_hash_vec if hash_type == "fnv" else self.ravel_hash_vec
+        assert mode in ["train", "test"]
+        self.mode = mode
+        self.keys = keys
+        self.return_discrete_coord = return_discrete_coord
+        self.return_min_coord = return_min_coord
+
+    def __call__(self, data_dict):
+        assert "coord" in data_dict.keys()
+        # np.int was removed in NumPy 1.24; the documented replacement is the builtin int.
+        discrete_coord = np.floor(data_dict["coord"] / np.array(self.voxel_size)).astype(int)
+        min_coord = discrete_coord.min(0) * np.array(self.voxel_size)
+        discrete_coord -= discrete_coord.min(0)
+        key = self.hash(discrete_coord)
+        idx_sort = np.argsort(key)
+        key_sort = key[idx_sort]
+        _, inverse, count = np.unique(key_sort, return_inverse=True, return_counts=True)
+        if self.mode == 'train':  # train mode
+            # idx_select = np.cumsum(np.insert(count, 0, 0)[0:-1]) + np.random.randint(0, count.max(), count.size) % count
+            idx_select = np.cumsum(np.insert(count, 0, 0)[0:-1])
+            idx_unique = idx_sort[idx_select]
+            if self.return_discrete_coord:
+                data_dict["discrete_coord"] = discrete_coord[idx_unique]
+            if self.return_min_coord:
+                data_dict["min_coord"] = min_coord.reshape([1, 3])
+            for key in self.keys:
+                data_dict[key] = data_dict[key][idx_unique]
+            return data_dict
+
+        elif self.mode == 'test':  # test mode
+            data_part_list = []
+            for i in range(count.max()):
+                idx_select = np.cumsum(np.insert(count, 0, 0)[0:-1]) + i % count
+                idx_part = idx_sort[idx_select]
+                data_part = dict(index=idx_part)
+                for key in
data_dict.keys():
+                    if key in self.keys:
+                        data_part[key] = data_dict[key][idx_part]
+                    else:
+                        data_part[key] = data_dict[key]
+                if self.return_discrete_coord:
+                    data_part["discrete_coord"] = discrete_coord[idx_part]
+                if self.return_min_coord:
+                    data_part["min_coord"] = min_coord.reshape([1, 3])
+                data_part_list.append(data_part)
+            return data_part_list
+        else:
+            raise NotImplementedError
+
+    @staticmethod
+    def ravel_hash_vec(arr):
+        """
+        Ravel the coordinates after subtracting the min coordinates.
+        """
+        assert arr.ndim == 2
+        arr = arr.copy()
+        arr -= arr.min(0)
+        arr = arr.astype(np.uint64, copy=False)
+        arr_max = arr.max(0).astype(np.uint64) + 1
+
+        keys = np.zeros(arr.shape[0], dtype=np.uint64)
+        # Fortran style indexing
+        for j in range(arr.shape[1] - 1):
+            keys += arr[:, j]
+            keys *= arr_max[j + 1]
+        keys += arr[:, -1]
+        return keys
+
+    @staticmethod
+    def fnv_hash_vec(arr):
+        """
+        FNV64-1A
+        """
+        assert arr.ndim == 2
+        # Floor first for negative coordinates
+        arr = arr.copy()
+        arr = arr.astype(np.uint64, copy=False)
+        hashed_arr = np.uint64(14695981039346656037) * np.ones(arr.shape[0], dtype=np.uint64)
+        for j in range(arr.shape[1]):
+            hashed_arr *= np.uint64(1099511628211)
+            hashed_arr = np.bitwise_xor(hashed_arr, arr[:, j])
+        return hashed_arr
+
+
+def overlap_percentage(mask1, mask2):
+    # Intersection area divided by the *smaller* mask's area (not IoU).
+    intersection = np.logical_and(mask1, mask2)
+    area_intersection = np.sum(intersection)
+
+    area_mask1 = np.sum(mask1)
+    area_mask2 = np.sum(mask2)
+
+    smaller_area = min(area_mask1, area_mask2)
+
+    return area_intersection / smaller_area
+
+
+def remove_samll_masks(masks, ratio=0.8):
+    # NOTE(review): name typo ("samll") kept as-is to avoid breaking callers.
+    # Keeps the larger of any two masks whose overlap (relative to the
+    # smaller one) exceeds `ratio`.
+    filtered_masks = []
+    skip_masks = set()
+
+    for i, mask1_dict in enumerate(masks):
+        if i in skip_masks:
+            continue
+
+        should_keep = True
+        for j, mask2_dict in enumerate(masks):
+            if i == j or j in skip_masks:
+                continue
+            mask1 = mask1_dict["segmentation"]
+            mask2 = mask2_dict["segmentation"]
+            overlap = overlap_percentage(mask1, mask2)
+            if overlap > ratio:
+                if np.sum(mask1) <
np.sum(mask2):
+                        should_keep = False
+                        break
+                    else:
+                        skip_masks.add(j)
+
+        if should_keep:
+            filtered_masks.append(mask1)
+
+    return filtered_masks
+
+
+def to_numpy(x):
+    """Convert a torch.Tensor to a numpy array; pass numpy arrays through."""
+    if isinstance(x, torch.Tensor):
+        x = x.clone().detach().cpu().numpy()
+    assert isinstance(x, np.ndarray)
+    return x
+
+
+def save_point_cloud(coord, color=None, file_path="pc.ply", logger=None):
+    """Write an (N, 3) point cloud (optionally colored) to `file_path`."""
+    # Bug fix: o3d was referenced but never imported anywhere in this module,
+    # so every call raised NameError. Import open3d locally at the use site.
+    import open3d as o3d
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+    coord = to_numpy(coord)
+    if color is not None:
+        color = to_numpy(color)
+    pcd = o3d.geometry.PointCloud()
+    pcd.points = o3d.utility.Vector3dVector(coord)
+    pcd.colors = o3d.utility.Vector3dVector(np.ones_like(coord) if color is None else color)
+    o3d.io.write_point_cloud(file_path, pcd)
+    if logger is not None:
+        logger.info(f"Save Point Cloud to: {file_path}")
+
+
+def remove_small_group(group_ids, th):
+    # Relabel any group with fewer than `th` members as -1 (unassigned).
+    unique_elements, counts = np.unique(group_ids, return_counts=True)
+    result = group_ids.copy()
+    for i, count in enumerate(counts):
+        if count < th:
+            result[group_ids == unique_elements[i]] = -1
+
+    return result
+
+
+def pairwise_indices(length):
+    return [[i, i + 1] if i + 1 < length else [i] for i in range(0, length, 2)]
+
+
+def num_to_natural(group_ids):
+    '''
+    Change the group number to natural number arrangement
+    '''
+    if np.all(group_ids == -1):
+        return group_ids
+    array = copy.deepcopy(group_ids)
+    unique_values = np.unique(array[array != -1])
+    mapping = np.full(np.max(unique_values) + 2, -1)
+    mapping[unique_values + 1] = np.arange(len(unique_values))
+    array = mapping[array + 1]
+    return array
+
+
+def get_matching_indices(source, pcd_tree, search_voxel_size, K=None):
+    match_inds = []
+    for i, point in enumerate(source.points):
+        [_, idx, _] = pcd_tree.search_radius_vector_3d(point, search_voxel_size)
+        if K is not None:
+            idx = idx[:K]
+        for j in idx:
+            # match_inds[i, j] = 1
+            match_inds.append((i, j))
+    return match_inds
+
+
+def visualize_3d(data_dict, text_feat_path, save_path):
+    text_feat =
torch.load(text_feat_path)
+    # Classify each group feature against the text features, then color points.
+    group_logits = np.einsum('nc,mc->nm', data_dict["group_feat"], text_feat)
+    group_labels = np.argmax(group_logits, axis=-1)
+    labels = group_labels[data_dict["group"]]
+    labels[data_dict["group"] == -1] = -1
+    visualize_pcd(data_dict["coord"], data_dict["color"], labels, save_path)
+
+
+def visualize_pcd(coord, pcd_color, labels, save_path):
+    # alpha = 0.5
+    label_color = np.array([SCANNET_COLOR_MAP_20[label] for label in labels])
+    # overlay = (pcd_color * (1-alpha) + label_color * alpha).astype(np.uint8) / 255
+    label_color = label_color / 255
+    save_point_cloud(coord, label_color, save_path)
+
+
+def visualize_2d(img_color, labels, img_size, save_path):
+    import matplotlib.pyplot as plt
+    # from skimage.segmentation import mark_boundaries
+    # from skimage.color import label2rgb
+    label_names = ["wall", "floor", "cabinet", "bed", "chair",
+                   "sofa", "table", "door", "window", "bookshelf",
+                   "picture", "counter", "desk", "curtain", "refridgerator",
+                   "shower curtain", "toilet", "sink", "bathtub", "other"]
+    colors = np.array(list(SCANNET_COLOR_MAP_20.values()))[1:]
+    segmentation_color = np.zeros((img_size[0], img_size[1], 3))
+    for i, color in enumerate(colors):
+        segmentation_color[labels == i] = color
+    # alpha = 1 means the saved image shows only the label colors, not the photo.
+    alpha = 1
+    overlay = (img_color * (1-alpha) + segmentation_color * alpha).astype(np.uint8)
+    fig, ax = plt.subplots()
+    ax.imshow(overlay)
+    patches = [plt.plot([], [], 's', color=np.array(color)/255, label=label)[0] for label, color in zip(label_names, colors)]
+    plt.legend(handles=patches, bbox_to_anchor=(0.5, -0.1), loc='upper center', ncol=4, fontsize='small')
+    plt.savefig(save_path, bbox_inches='tight')
+    plt.show()
+
+
+def visualize_partition(coord, group_id, save_path):
+    # One random color per group; index -1 wraps to the appended black row.
+    group_id = group_id.reshape(-1)
+    num_groups = group_id.max() + 1
+    group_colors = np.random.rand(num_groups, 3)
+    group_colors = np.vstack((group_colors, np.array([0,0,0])))
+    color = group_colors[group_id]
+    save_point_cloud(coord, color, save_path)
+
+
+def delete_invalid_group(group, group_feat):
+    # Drop features of groups no longer present, then renumber groups densely.
+    indices = np.unique(group[group != -1])
+    group = num_to_natural(group)
+    group_feat = group_feat[indices]
+    return group, group_feat
+
+def group_sem_voting(semantic_label, seg_result, instance_num=0):
+    # Majority vote: assign each segment the most frequent semantic label it covers.
+    if instance_num == 0:
+        instance_num = seg_result.max() + 1
+    seg_labels = []
+    sem_map = -1 * torch.ones_like(semantic_label)
+    for n in range(instance_num):
+        mask = (seg_result == n)
+        if mask.sum() == 0:
+            sem_map[mask] = -1
+            seg_labels.append(-1)
+            continue
+        seg_label_n_cover, seg_label_n_nums = torch.unique(semantic_label[mask], return_counts=True)
+        seg_label_n = seg_label_n_cover[seg_label_n_nums.max(-1)[1]]
+        seg_labels.append(seg_label_n)
+        sem_map[mask] = seg_label_n
+
+    return sem_map
+
+def two_image_to_gif(image_1, image_2, name):
+    # Hold image_1, linearly cross-fade to image_2, hold image_2; saved as mp4.
+    num_begin = 30
+    num_frames = 30
+    num_end = 30
+    frames = []
+    for i in range(num_begin):
+        frames.append(image_1)
+    for i in range(num_frames):
+        image_tmp = image_1 + (image_2 - image_1) * (i / (num_frames - 1))
+        frames.append(image_tmp.astype(np.uint8))
+    for i in range(num_end):
+        frames.append(image_2)
+
+    # video_out_file = '{}.gif'.format(name)
+    # imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25)
+
+    video_out_file = '{}.mp4'.format(name)
+    imageio.mimwrite(os.path.join('outputs', video_out_file), frames, fps=25, quality=8)
\ No newline at end of file
diff --git a/tools/web_demo.py b/tools/web_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..027b8ca4d656e3d94379c014ae505d2dc57c9225
--- /dev/null
+++ b/tools/web_demo.py
@@ -0,0 +1,76 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) Meta Platforms, Inc.
All Rights Reserved
+
+import multiprocessing as mp
+
+import numpy as np
+from PIL import Image
+
+from detectron2.config import get_cfg
+
+from detectron2.projects.deeplab import add_deeplab_config
+from detectron2.data.detection_utils import read_image
+from open_vocab_seg import add_ovseg_config
+from open_vocab_seg.utils import VisualizationDemo
+
+import gradio as gr
+
+def setup_cfg(config_file):
+    # load config from file and command-line arguments
+    cfg = get_cfg()
+    add_deeplab_config(cfg)
+    add_ovseg_config(cfg)
+    cfg.merge_from_file(config_file)
+    cfg.freeze()
+    return cfg
+
+
+def inference(class_names, input_img):
+    # Run OVSeg on one image with a user-supplied comma-separated class list.
+    mp.set_start_method("spawn", force=True)
+    config_file = './configs/ovseg_swinB_vitL_demo.yaml'
+    cfg = setup_cfg(config_file)
+
+    demo = VisualizationDemo(cfg)
+
+    class_names = class_names.split(',')
+    img = read_image(input_img, format="BGR")
+    _, visualized_output = demo.run_on_image(img, class_names)
+
+    return Image.fromarray(np.uint8(visualized_output.get_image())).convert('RGB')
+
+# demo = gr.Interface(fn=greet, inputs="text", outputs="text")
+# demo.launch()
+
+
+examples = [['Oculus, Ukulele', './resources/demo_samples/sample_03.jpeg'],]
+output_labels = ['segmentation map']
+
+title = 'OVSeg'
+
+description = """
+Gradio Demo for Open-Vocabulary Semantic Segmentation with Mask-adapted CLIP \n
+You may click one of the examples or upload your own image. \n
+OVSeg could perform open vocabulary segmentation, you may input more classes (separate by comma).
+"""
+
+article = """
+

+ +Open-Vocabulary Semantic Segmentation with Mask-adapted CLIP + +| +Github Repo

+""" + +gr.Interface( + inference, + inputs=[ + gr.inputs.Textbox( + lines=1, placeholder=None, default='', label='class names'), + gr.inputs.Image(type='filepath') + ], + outputs=gr.outputs.Image(label='segmentation map'), + title=title, + description=description, + article=article, + examples=examples).launch(enable_queue=True) diff --git a/train_net.py b/train_net.py new file mode 100644 index 0000000000000000000000000000000000000000..8f544a17aa30b99ef64f783d5e55e6b786fe18c7 --- /dev/null +++ b/train_net.py @@ -0,0 +1,309 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copyright (c) Meta Platforms, Inc. All Rights Reserved +# Modified by Feng Liang from https://github.com/MendelXu/zsseg.baseline/blob/master/train_net.py + +""" +OVSeg Training Script. + +This script is a simplified version of the training script in detectron2/tools. +""" +import copy +import itertools +import logging +import os +from collections import OrderedDict +from typing import Any, Dict, List, Set + +import detectron2.utils.comm as comm +import torch +from detectron2.checkpoint import DetectionCheckpointer +from detectron2.config import get_cfg +from detectron2.data import MetadataCatalog +from detectron2.engine import ( + DefaultTrainer, + default_argument_parser, + default_setup, + launch, +) +from detectron2.evaluation import ( + DatasetEvaluator, + CityscapesSemSegEvaluator, + COCOEvaluator, + DatasetEvaluators, + verify_results, +) +from detectron2.projects.deeplab import add_deeplab_config, build_lr_scheduler +from detectron2.solver.build import maybe_add_gradient_clipping +from detectron2.utils.logger import setup_logger +from detectron2.utils.events import CommonMetricPrinter, JSONWriter + +# OVSeg +from open_vocab_seg import SemanticSegmentorWithTTA, add_ovseg_config +from open_vocab_seg.data import ( + MaskFormerSemanticDatasetMapper, +) + +from open_vocab_seg.data import ( + build_detection_test_loader, + build_detection_train_loader, +) +from open_vocab_seg.evaluation 
import (
+    GeneralizedSemSegEvaluator,
+)
+from open_vocab_seg.utils.events import WandbWriter, setup_wandb
+from open_vocab_seg.utils.post_process_utils import dense_crf_post_process
+
+
+class Trainer(DefaultTrainer):
+    """
+    Extension of the Trainer class adapted to DETR.
+    """
+
+    @classmethod
+    def build_evaluator(cls, cfg, dataset_name, output_folder=None):
+        """
+        Create evaluator(s) for a given dataset.
+        This uses the special metadata "evaluator_type" associated with each
+        builtin dataset. For your own dataset, you can simply create an
+        evaluator manually in your script and do not have to worry about the
+        hacky if-else logic here.
+        """
+        if output_folder is None:
+            output_folder = os.path.join(cfg.OUTPUT_DIR, "inference")
+        evaluator_list = []
+        evaluator_type = MetadataCatalog.get(dataset_name).evaluator_type
+        if evaluator_type in ["sem_seg"]:
+            evaluator = GeneralizedSemSegEvaluator
+            evaluator_list.append(
+                evaluator(
+                    dataset_name,
+                    distributed=True,
+                    output_dir=output_folder,
+                    post_process_func=dense_crf_post_process
+                    if cfg.TEST.DENSE_CRF
+                    else None,
+                )
+            )
+
+        if len(evaluator_list) == 0:
+            raise NotImplementedError(
+                "no Evaluator for the dataset {} with the type {}".format(
+                    dataset_name, evaluator_type
+                )
+            )
+        elif len(evaluator_list) == 1:
+            return evaluator_list[0]
+        return DatasetEvaluators(evaluator_list)
+
+    @classmethod
+    def build_train_loader(cls, cfg):
+        dataset = None
+        # Semantic segmentation dataset mapper
+        if cfg.INPUT.DATASET_MAPPER_NAME == "mask_former_semantic":
+            mapper = MaskFormerSemanticDatasetMapper(cfg, True)
+        else:
+            raise NotImplementedError
+        return build_detection_train_loader(cfg, mapper=mapper, dataset=dataset)
+
+    @classmethod
+    def build_test_loader(cls, cfg, dataset_name):
+        """
+        Returns:
+            iterable
+        It now calls :func:`detectron2.data.build_detection_test_loader`.
+        Overwrite it if you'd like a different data loader.
+ """ + return build_detection_test_loader(cfg, dataset_name, mapper=None) + + def build_writers(self): + """ + Build a list of writers to be used. By default it contains + writers that write metrics to the screen, + a json file, and a tensorboard event file respectively. + If you'd like a different list of writers, you can overwrite it in + your trainer. + + Returns: + list[EventWriter]: a list of :class:`EventWriter` objects. + + It is now implemented by: + :: + return [ + CommonMetricPrinter(self.max_iter), + JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), + TensorboardXWriter(self.cfg.OUTPUT_DIR), + ] + + """ + # Here the default print/log frequency of each writer is used. + return [ + # It may not always print what you want to see, since it prints "common" metrics only. + CommonMetricPrinter(self.max_iter), + JSONWriter(os.path.join(self.cfg.OUTPUT_DIR, "metrics.json")), + WandbWriter(), + ] + + @classmethod + def build_lr_scheduler(cls, cfg, optimizer): + """ + It now calls :func:`detectron2.solver.build_lr_scheduler`. + Overwrite it if you'd like a different scheduler. 
+ """ + return build_lr_scheduler(cfg, optimizer) + + @classmethod + def build_optimizer(cls, cfg, model): + weight_decay_norm = cfg.SOLVER.WEIGHT_DECAY_NORM + weight_decay_embed = cfg.SOLVER.WEIGHT_DECAY_EMBED + + defaults = {} + defaults["lr"] = cfg.SOLVER.BASE_LR + defaults["weight_decay"] = cfg.SOLVER.WEIGHT_DECAY + + norm_module_types = ( + torch.nn.BatchNorm1d, + torch.nn.BatchNorm2d, + torch.nn.BatchNorm3d, + torch.nn.SyncBatchNorm, + # NaiveSyncBatchNorm inherits from BatchNorm2d + torch.nn.GroupNorm, + torch.nn.InstanceNorm1d, + torch.nn.InstanceNorm2d, + torch.nn.InstanceNorm3d, + torch.nn.LayerNorm, + torch.nn.LocalResponseNorm, + ) + + params: List[Dict[str, Any]] = [] + memo: Set[torch.nn.parameter.Parameter] = set() + for module_name, module in model.named_modules(): + for module_param_name, value in module.named_parameters(recurse=False): + if not value.requires_grad: + continue + # Avoid duplicating parameters + if value in memo: + continue + memo.add(value) + + hyperparams = copy.copy(defaults) + if "backbone" in module_name: + hyperparams["lr"] = ( + hyperparams["lr"] * cfg.SOLVER.BACKBONE_MULTIPLIER + ) + if ( + "relative_position_bias_table" in module_param_name + or "absolute_pos_embed" in module_param_name + ): + print(module_param_name) + hyperparams["weight_decay"] = 0.0 + if isinstance(module, norm_module_types): + hyperparams["weight_decay"] = weight_decay_norm + if isinstance(module, torch.nn.Embedding): + hyperparams["weight_decay"] = weight_decay_embed + params.append({"params": [value], **hyperparams}) + + def maybe_add_full_model_gradient_clipping(optim): + # detectron2 doesn't have full model gradient clipping now + clip_norm_val = cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE + enable = ( + cfg.SOLVER.CLIP_GRADIENTS.ENABLED + and cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model" + and clip_norm_val > 0.0 + ) + + class FullModelGradientClippingOptimizer(optim): + def step(self, closure=None): + all_params = itertools.chain( + *[x["params"] 
for x in self.param_groups]
+                    )
+                    torch.nn.utils.clip_grad_norm_(all_params, clip_norm_val)
+                    super().step(closure=closure)
+
+            return FullModelGradientClippingOptimizer if enable else optim
+
+        optimizer_type = cfg.SOLVER.OPTIMIZER
+        if optimizer_type == "SGD":
+            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.SGD)(
+                params, cfg.SOLVER.BASE_LR, momentum=cfg.SOLVER.MOMENTUM
+            )
+        elif optimizer_type == "ADAMW":
+            optimizer = maybe_add_full_model_gradient_clipping(torch.optim.AdamW)(
+                params, cfg.SOLVER.BASE_LR
+            )
+        else:
+            raise NotImplementedError(f"no optimizer type {optimizer_type}")
+        if not cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE == "full_model":
+            optimizer = maybe_add_gradient_clipping(cfg, optimizer)
+        return optimizer
+
+    @classmethod
+    def test_with_TTA(cls, cfg, model):
+        logger = logging.getLogger("detectron2.trainer")
+        # In the end of training, run an evaluation with TTA.
+        logger.info("Running inference with test-time augmentation ...")
+        model = SemanticSegmentorWithTTA(cfg, model)
+        evaluators = [
+            cls.build_evaluator(
+                cfg, name, output_folder=os.path.join(cfg.OUTPUT_DIR, "inference_TTA")
+            )
+            for name in cfg.DATASETS.TEST
+        ]
+        res = cls.test(cfg, model, evaluators)
+        res = OrderedDict({k + "_TTA": v for k, v in res.items()})
+        return res
+
+
+def setup(args):
+    """
+    Create configs and perform basic setups.
+ """ + cfg = get_cfg() + # for poly lr schedule + add_deeplab_config(cfg) + add_ovseg_config(cfg) + cfg.merge_from_file(args.config_file) + cfg.merge_from_list(args.opts) + cfg.freeze() + default_setup(cfg, args) + # Setup logger for "ovseg" module + if not args.eval_only: + setup_wandb(cfg, args) + setup_logger( + output=cfg.OUTPUT_DIR, distributed_rank=comm.get_rank(), name="ovseg" + ) + return cfg + + +def main(args): + cfg = setup(args) + + if args.eval_only: + model = Trainer.build_model(cfg) + DetectionCheckpointer(model, save_dir=cfg.OUTPUT_DIR).resume_or_load( + cfg.MODEL.WEIGHTS, resume=args.resume + ) + + if cfg.TEST.AUG.ENABLED: + res = Trainer.test_with_TTA(cfg, model) + else: + res = Trainer.test(cfg, model) + if comm.is_main_process(): + verify_results(cfg, res) + return res + + trainer = Trainer(cfg) + trainer.resume_or_load(resume=args.resume) + return trainer.train() + + +if __name__ == "__main__": + args = default_argument_parser().parse_args() + print("Command Line Args:", args) + launch( + main, + args.num_gpus, + num_machines=args.num_machines, + machine_rank=args.machine_rank, + dist_url=args.dist_url, + args=(args,), + )