diff --git a/custom_midas_repo/LICENSE b/custom_midas_repo/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..277b5c11be103f028a8d10985139f1da10c2f08e --- /dev/null +++ b/custom_midas_repo/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Intel ISL (Intel Intelligent Systems Lab) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/custom_midas_repo/README.md b/custom_midas_repo/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9568ea71c755b6938ee5482ba9f09be722e75943 --- /dev/null +++ b/custom_midas_repo/README.md @@ -0,0 +1,259 @@ +## Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer + +This repository contains code to compute depth from a single image. It accompanies our [paper](https://arxiv.org/abs/1907.01341v3): + +>Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-shot Cross-dataset Transfer +René Ranftl, Katrin Lasinger, David Hafner, Konrad Schindler, Vladlen Koltun + + +and our [preprint](https://arxiv.org/abs/2103.13413): + +> Vision Transformers for Dense Prediction +> René Ranftl, Alexey Bochkovskiy, Vladlen Koltun + + +MiDaS was trained on up to 12 datasets (ReDWeb, DIML, Movies, MegaDepth, WSVD, TartanAir, HRWSI, ApolloScape, BlendedMVS, IRS, KITTI, NYU Depth V2) with +multi-objective optimization. +The original model that was trained on 5 datasets (`MIX 5` in the paper) can be found [here](https://github.com/isl-org/MiDaS/releases/tag/v2). +The figure below shows an overview of the different MiDaS models; the bubble size scales with number of parameters. 
+ +![](figures/Improvement_vs_FPS.png) + +### Setup + +1) Pick one or more models and download the corresponding weights to the `weights` folder: + +MiDaS 3.1 +- For highest quality: [dpt_beit_large_512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) +- For moderately less quality, but better speed-performance trade-off: [dpt_swin2_large_384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt) +- For embedded devices: [dpt_swin2_tiny_256](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt), [dpt_levit_224](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt) +- For inference on Intel CPUs, OpenVINO may be used for the small legacy model: openvino_midas_v21_small [.xml](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.xml), [.bin](https://github.com/isl-org/MiDaS/releases/download/v3_1/openvino_midas_v21_small_256.bin) + +MiDaS 3.0: Legacy transformer models [dpt_large_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) and [dpt_hybrid_384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt) + +MiDaS 2.1: Legacy convolutional models [midas_v21_384](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) and [midas_v21_small_256](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt) + +1) Set up dependencies: + + ```shell + conda env create -f environment.yaml + conda activate midas-py310 + ``` + +#### optional + +For the Next-ViT model, execute + +```shell +git submodule add https://github.com/isl-org/Next-ViT midas/external/next_vit +``` + +For the OpenVINO model, install + +```shell +pip install openvino +``` + +### Usage + +1) Place one or more input images in the folder `input`. + +2) Run the model with + + ```shell + python run.py --model_type --input_path input --output_path output + ``` + where `````` is chosen from [dpt_beit_large_512](#model_type), [dpt_beit_large_384](#model_type), + [dpt_beit_base_384](#model_type), [dpt_swin2_large_384](#model_type), [dpt_swin2_base_384](#model_type), + [dpt_swin2_tiny_256](#model_type), [dpt_swin_large_384](#model_type), [dpt_next_vit_large_384](#model_type), + [dpt_levit_224](#model_type), [dpt_large_384](#model_type), [dpt_hybrid_384](#model_type), + [midas_v21_384](#model_type), [midas_v21_small_256](#model_type), [openvino_midas_v21_small_256](#model_type). + +3) The resulting depth maps are written to the `output` folder. + +#### optional + +1) By default, the inference resizes the height of input images to the size of a model to fit into the encoder. This + size is given by the numbers in the model names of the [accuracy table](#accuracy). Some models do not only support a single + inference height but a range of different heights. Feel free to explore different heights by appending the extra + command line argument `--height`. Unsupported height values will throw an error. Note that using this argument may + decrease the model accuracy. +2) By default, the inference keeps the aspect ratio of input images when feeding them into the encoder if this is + supported by a model (all models except for Swin, Swin2, LeViT). In order to resize to a square resolution, + disregarding the aspect ratio while preserving the height, use the command line argument `--square`. 
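For example, the two optional arguments above can be combined with a regular run. This is a possible invocation (not the only one), assuming the `dpt_beit_large_512` weights have already been downloaded to the `weights` folder; the set of supported heights depends on the chosen model, and unsupported values raise an error as noted above:

```shell
# run the BEiT-L 512 model at a reduced inference height of 384
python run.py --model_type dpt_beit_large_512 --input_path input --output_path output --height 384

# run the same model at a square resolution instead of keeping the aspect ratio
python run.py --model_type dpt_beit_large_512 --input_path input --output_path output --square
```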
+ +#### via Camera + + If you want the input images to be grabbed from the camera and shown in a window, leave the input and output paths + away and choose a model type as shown above: + + ```shell + python run.py --model_type --side + ``` + + The argument `--side` is optional and causes both the input RGB image and the output depth map to be shown + side-by-side for comparison. + +#### via Docker + +1) Make sure you have installed Docker and the + [NVIDIA Docker runtime](https://github.com/NVIDIA/nvidia-docker/wiki/Installation-\(Native-GPU-Support\)). + +2) Build the Docker image: + + ```shell + docker build -t midas . + ``` + +3) Run inference: + + ```shell + docker run --rm --gpus all -v $PWD/input:/opt/MiDaS/input -v $PWD/output:/opt/MiDaS/output -v $PWD/weights:/opt/MiDaS/weights midas + ``` + + This command passes through all of your NVIDIA GPUs to the container, mounts the + `input` and `output` directories and then runs the inference. + +#### via PyTorch Hub + +The pretrained model is also available on [PyTorch Hub](https://pytorch.org/hub/intelisl_midas_v2/) + +#### via TensorFlow or ONNX + +See [README](https://github.com/isl-org/MiDaS/tree/master/tf) in the `tf` subdirectory. + +Currently only supports MiDaS v2.1. + + +#### via Mobile (iOS / Android) + +See [README](https://github.com/isl-org/MiDaS/tree/master/mobile) in the `mobile` subdirectory. + +#### via ROS1 (Robot Operating System) + +See [README](https://github.com/isl-org/MiDaS/tree/master/ros) in the `ros` subdirectory. + +Currently only supports MiDaS v2.1. DPT-based models to be added. + + +### Accuracy + +We provide a **zero-shot error** $\epsilon_d$ which is evaluated for 6 different datasets +(see [paper](https://arxiv.org/abs/1907.01341v3)). **Lower error values are better**. +$\color{green}{\textsf{Overall model quality is represented by the improvement}}$ ([Imp.](#improvement)) with respect to +MiDaS 3.0 DPTL-384. The models are grouped by the height used for inference, whereas the square training resolution is given by +the numbers in the model names. The table also shows the **number of parameters** (in millions) and the +**frames per second** for inference at the training resolution (for GPU RTX 3090): + +| MiDaS Model | DIW
WHDR | Eth3d AbsRel | Sintel AbsRel | TUM δ1 | KITTI δ1 | NYUv2 δ1 | $\color{green}{\textsf{Imp.}}$ % | Par. M | FPS
  | +|-----------------------------------------------------------------------------------------------------------------------|-------------------------:|-----------------------------:|------------------------------:|-------------------------:|-------------------------:|-------------------------:|-------------------------------------------------:|----------------------:|--------------------------:| +| **Inference height 512** | | | | | | | | | | +| [v3.1 BEiTL-512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1137 | 0.0659 | 0.2366 | **6.13** | 11.56* | **1.86*** | $\color{green}{\textsf{19}}$ | **345** | **5.7** | +| [v3.1 BEiTL-512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt)$\tiny{\square}$ | **0.1121** | **0.0614** | **0.2090** | 6.46 | **5.00*** | 1.90* | $\color{green}{\textsf{34}}$ | **345** | **5.7** | +| | | | | | | | | | | +| **Inference height 384** | | | | | | | | | | +| [v3.1 BEiTL-512](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt) | 0.1245 | 0.0681 | **0.2176** | **6.13** | 6.28* | **2.16*** | $\color{green}{\textsf{28}}$ | 345 | 12 | +| [v3.1 Swin2L-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt)$\tiny{\square}$ | 0.1106 | 0.0732 | 0.2442 | 8.87 | **5.84*** | 2.92* | $\color{green}{\textsf{22}}$ | 213 | 41 | +| [v3.1 Swin2B-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt)$\tiny{\square}$ | 0.1095 | 0.0790 | 0.2404 | 8.93 | 5.97* | 3.28* | $\color{green}{\textsf{22}}$ | 102 | 39 | +| [v3.1 SwinL-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt)$\tiny{\square}$ | 0.1126 | 0.0853 | 0.2428 | 8.74 | 6.60* | 3.34* | $\color{green}{\textsf{17}}$ | 213 | 49 | +| [v3.1 BEiTL-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt) | 0.1239 | **0.0667** | 0.2545 | 7.17 | 9.84* | 2.21* | $\color{green}{\textsf{17}}$ | 344 | 13 | +| [v3.1 Next-ViTL-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt) | **0.1031** | 0.0954 | 0.2295 | 9.21 | 6.89* | 3.47* | $\color{green}{\textsf{16}}$ | **72** | 30 | +| [v3.1 BEiTB-384](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt) | 0.1159 | 0.0967 | 0.2901 | 9.88 | 26.60* | 3.91* | $\color{green}{\textsf{-31}}$ | 112 | 31 | +| [v3.0 DPTL-384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt) | 0.1082 | 0.0888 | 0.2697 | 9.97 | 8.46 | 8.32 | $\color{green}{\textsf{0}}$ | 344 | **61** | +| [v3.0 DPTH-384](https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt) | 0.1106 | 0.0934 | 0.2741 | 10.89 | 11.56 | 8.69 | $\color{green}{\textsf{-10}}$ | 123 | 50 | +| [v2.1 Large384](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt) | 0.1295 | 0.1155 | 0.3285 | 12.51 | 16.08 | 8.71 | $\color{green}{\textsf{-32}}$ | 105 | 47 | +| | | | | | | | | | | +| **Inference height 256** | | | | | | | | | | +| [v3.1 Swin2T-256](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt)$\tiny{\square}$ | **0.1211** | **0.1106** | **0.2868** | **13.43** | **10.13*** | **5.55*** | $\color{green}{\textsf{-11}}$ | 42 | 64 | +| [v2.1 Small256](https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt) | 0.1344 | 0.1344 | 0.3370 | 14.53 | 29.27 | 13.43 | $\color{green}{\textsf{-76}}$ | **21** | **90** | +| | | | | | | | | | | +| **Inference height 224** | | | | | | | | | | +| [v3.1 
LeViT224](https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt)$\tiny{\square}$ | **0.1314** | **0.1206** | **0.3148** | **18.21** | **15.27*** | **8.64*** | $\color{green}{\textsf{-40}}$ | **51** | **73** | + +* No zero-shot error, because models are also trained on KITTI and NYU Depth V2\ +$\square$ Validation performed at **square resolution**, either because the transformer encoder backbone of a model +does not support non-square resolutions (Swin, Swin2, LeViT) or for comparison with these models. All other +validations keep the aspect ratio. A difference in resolution limits the comparability of the zero-shot error and the +improvement, because these quantities are averages over the pixels of an image and do not take into account the +advantage of more details due to a higher resolution.\ +Best values per column and same validation height in bold + +#### Improvement + +The improvement in the above table is defined as the relative zero-shot error with respect to MiDaS v3.0 +DPTL-384 and averaging over the datasets. So, if $\epsilon_d$ is the zero-shot error for dataset $d$, then +the $\color{green}{\textsf{improvement}}$ is given by $100(1-(1/6)\sum_d\epsilon_d/\epsilon_{d,\rm{DPT_{L-384}}})$%. + +Note that the improvements of 10% for MiDaS v2.0 → v2.1 and 21% for MiDaS v2.1 → v3.0 are not visible from the +improvement column (Imp.) in the table but would require an evaluation with respect to MiDaS v2.1 Large384 +and v2.0 Large384 respectively instead of v3.0 DPTL-384. + +### Depth map comparison + +Zoom in for better visibility +![](figures/Comparison.png) + +### Speed on Camera Feed + +Test configuration +- Windows 10 +- 11th Gen Intel Core i7-1185G7 3.00GHz +- 16GB RAM +- Camera resolution 640x480 +- openvino_midas_v21_small_256 + +Speed: 22 FPS + +### Changelog + +* [Dec 2022] Released MiDaS v3.1: + - New models based on 5 different types of transformers ([BEiT](https://arxiv.org/pdf/2106.08254.pdf), [Swin2](https://arxiv.org/pdf/2111.09883.pdf), [Swin](https://arxiv.org/pdf/2103.14030.pdf), [Next-ViT](https://arxiv.org/pdf/2207.05501.pdf), [LeViT](https://arxiv.org/pdf/2104.01136.pdf)) + - Training datasets extended from 10 to 12, including also KITTI and NYU Depth V2 using [BTS](https://github.com/cleinc/bts) split + - Best model, BEiTLarge 512, with resolution 512x512, is on average about [28% more accurate](#Accuracy) than MiDaS v3.0 + - Integrated live depth estimation from camera feed +* [Sep 2021] Integrated to [Huggingface Spaces](https://huggingface.co/spaces) with [Gradio](https://github.com/gradio-app/gradio). See [Gradio Web Demo](https://huggingface.co/spaces/akhaliq/DPT-Large). +* [Apr 2021] Released MiDaS v3.0: + - New models based on [Dense Prediction Transformers](https://arxiv.org/abs/2103.13413) are on average [21% more accurate](#Accuracy) than MiDaS v2.1 + - Additional models can be found [here](https://github.com/isl-org/DPT) +* [Nov 2020] Released MiDaS v2.1: + - New model that was trained on 10 datasets and is on average about [10% more accurate](#Accuracy) than [MiDaS v2.0](https://github.com/isl-org/MiDaS/releases/tag/v2) + - New light-weight model that achieves [real-time performance](https://github.com/isl-org/MiDaS/tree/master/mobile) on mobile platforms. 
+ - Sample applications for [iOS](https://github.com/isl-org/MiDaS/tree/master/mobile/ios) and [Android](https://github.com/isl-org/MiDaS/tree/master/mobile/android) + - [ROS package](https://github.com/isl-org/MiDaS/tree/master/ros) for easy deployment on robots +* [Jul 2020] Added TensorFlow and ONNX code. Added [online demo](http://35.202.76.57/). +* [Dec 2019] Released new version of MiDaS - the new model is significantly more accurate and robust +* [Jul 2019] Initial release of MiDaS ([Link](https://github.com/isl-org/MiDaS/releases/tag/v1)) + +### Citation + +Please cite our paper if you use this code or any of the models: +``` +@ARTICLE {Ranftl2022, + author = "Ren\'{e} Ranftl and Katrin Lasinger and David Hafner and Konrad Schindler and Vladlen Koltun", + title = "Towards Robust Monocular Depth Estimation: Mixing Datasets for Zero-Shot Cross-Dataset Transfer", + journal = "IEEE Transactions on Pattern Analysis and Machine Intelligence", + year = "2022", + volume = "44", + number = "3" +} +``` + +If you use a DPT-based model, please also cite: + +``` +@article{Ranftl2021, + author = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun}, + title = {Vision Transformers for Dense Prediction}, + journal = {ICCV}, + year = {2021}, +} +``` + +### Acknowledgements + +Our work builds on and uses code from [timm](https://github.com/rwightman/pytorch-image-models) and [Next-ViT](https://github.com/bytedance/Next-ViT). +We'd like to thank the authors for making these libraries available. + +### License + +MIT License diff --git a/custom_midas_repo/__init__.py b/custom_midas_repo/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/custom_midas_repo/hubconf.py b/custom_midas_repo/hubconf.py new file mode 100644 index 0000000000000000000000000000000000000000..fcfbeb59ecadeb0976bb6faac243ae485bc60280 --- /dev/null +++ b/custom_midas_repo/hubconf.py @@ -0,0 +1,435 @@ +dependencies = ["torch"] + +import torch + +from custom_midas_repo.midas.dpt_depth import DPTDepthModel +from custom_midas_repo.midas.midas_net import MidasNet +from custom_midas_repo.midas.midas_net_custom import MidasNet_small + +def DPT_BEiT_L_512(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_BEiT_L_512 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="beitl16_512", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_BEiT_L_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_BEiT_L_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="beitl16_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_BEiT_B_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS 
DPT_BEiT_B_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="beitb16_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_SwinV2_L_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_SwinV2_L_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="swin2l24_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_SwinV2_B_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_SwinV2_B_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="swin2b24_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_base_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_SwinV2_T_256(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_SwinV2_T_256 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="swin2t16_256", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin2_tiny_256.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_Swin_L_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_Swin_L_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="swinl12_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_swin_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_Next_ViT_L_384(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_Next_ViT_L_384 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="next_vit_large_6m", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_next_vit_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), 
progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_LeViT_224(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT_LeViT_224 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="levit_384", + non_negative=True, + head_features_1=64, + head_features_2=8, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_levit_224.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_Large(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT-Large model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="vitl16_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_large_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def DPT_Hybrid(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS DPT-Hybrid model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = DPTDepthModel( + path=None, + backbone="vitb_rn50_384", + non_negative=True, + ) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v3/dpt_hybrid_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def MiDaS(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS v2.1 model for monocular depth estimation + pretrained (bool): load pretrained weights into model + """ + + model = MidasNet() + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_384.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + +def MiDaS_small(pretrained=True, **kwargs): + """ # This docstring shows up in hub.help() + MiDaS v2.1 small model for monocular depth estimation on resource-constrained devices + pretrained (bool): load pretrained weights into model + """ + + model = MidasNet_small(None, features=64, backbone="efficientnet_lite3", exportable=True, non_negative=True, blocks={'expand': True}) + + if pretrained: + checkpoint = ( + "https://github.com/isl-org/MiDaS/releases/download/v2_1/midas_v21_small_256.pt" + ) + state_dict = torch.hub.load_state_dict_from_url( + checkpoint, map_location=torch.device('cpu'), progress=True, check_hash=True + ) + model.load_state_dict(state_dict) + + return model + + +def transforms(): + import cv2 + from torchvision.transforms import Compose + from custom_midas_repo.midas.transforms import Resize, NormalizeImage, PrepareForNet + from custom_midas_repo.midas import transforms + + transforms.default_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 384, + 384, + resize_target=None, + keep_aspect_ratio=True, + 
ensure_multiple_of=32, + resize_method="upper_bound", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.small_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 256, + 256, + resize_target=None, + keep_aspect_ratio=True, + ensure_multiple_of=32, + resize_method="upper_bound", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.dpt_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 384, + 384, + resize_target=None, + keep_aspect_ratio=True, + ensure_multiple_of=32, + resize_method="minimal", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.beit512_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 512, + 512, + resize_target=None, + keep_aspect_ratio=True, + ensure_multiple_of=32, + resize_method="minimal", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.swin384_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 384, + 384, + resize_target=None, + keep_aspect_ratio=False, + ensure_multiple_of=32, + resize_method="minimal", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.swin256_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 256, + 256, + resize_target=None, + keep_aspect_ratio=False, + ensure_multiple_of=32, + resize_method="minimal", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + transforms.levit_transform = Compose( + [ + lambda img: {"image": img / 255.0}, + Resize( + 224, + 224, + resize_target=None, + keep_aspect_ratio=False, + ensure_multiple_of=32, + resize_method="minimal", + image_interpolation_method=cv2.INTER_CUBIC, + ), + NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]), + PrepareForNet(), + lambda sample: torch.from_numpy(sample["image"]).unsqueeze(0), + ] + ) + + return transforms diff --git a/custom_midas_repo/midas/__init__.py b/custom_midas_repo/midas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/custom_midas_repo/midas/backbones/__init__.py b/custom_midas_repo/midas/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/custom_midas_repo/midas/backbones/beit.py b/custom_midas_repo/midas/backbones/beit.py new file mode 100644 index 0000000000000000000000000000000000000000..3f15f657d21dd7e90cdb57e7dfb6bdcb7eb0921a --- /dev/null +++ b/custom_midas_repo/midas/backbones/beit.py @@ -0,0 +1,196 @@ +import custom_timm as timm +import torch +import types + 
+import numpy as np +import torch.nn.functional as F + +from .utils import forward_adapted_unflatten, make_backbone_default +from custom_timm.models.beit import gen_relative_position_index +from torch.utils.checkpoint import checkpoint +from typing import Optional + + +def forward_beit(pretrained, x): + return forward_adapted_unflatten(pretrained, x, "forward_features") + + +def patch_embed_forward(self, x): + """ + Modification of timm.models.layers.patch_embed.py: PatchEmbed.forward to support arbitrary window sizes. + """ + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + return x + + +def _get_rel_pos_bias(self, window_size): + """ + Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes. + """ + old_height = 2 * self.window_size[0] - 1 + old_width = 2 * self.window_size[1] - 1 + + new_height = 2 * window_size[0] - 1 + new_width = 2 * window_size[1] - 1 + + old_relative_position_bias_table = self.relative_position_bias_table + + old_num_relative_distance = self.num_relative_distance + new_num_relative_distance = new_height * new_width + 3 + + old_sub_table = old_relative_position_bias_table[:old_num_relative_distance - 3] + + old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2) + new_sub_table = F.interpolate(old_sub_table, size=(int(new_height), int(new_width)), mode="bilinear") + new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1) + + new_relative_position_bias_table = torch.cat( + [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3:]]) + + key = str(window_size[1]) + "," + str(window_size[0]) + if key not in self.relative_position_indices.keys(): + self.relative_position_indices[key] = gen_relative_position_index(window_size) + + relative_position_bias = new_relative_position_bias_table[ + self.relative_position_indices[key].view(-1)].view( + window_size[0] * window_size[1] + 1, + window_size[0] * window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + return relative_position_bias.unsqueeze(0) + + +def attention_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None): + """ + Modification of timm.models.beit.py: Attention.forward to support arbitrary window sizes. + """ + B, N, C = x.shape + + qkv_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias)) if self.q_bias is not None else None + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + window_size = tuple(np.array(resolution) // 16) + attn = attn + self._get_rel_pos_bias(window_size) + if shared_rel_pos_bias is not None: + attn = attn + shared_rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +def block_forward(self, x, resolution, shared_rel_pos_bias: Optional[torch.Tensor] = None): + """ + Modification of timm.models.beit.py: Block.forward to support arbitrary window sizes. 
+ """ + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x), resolution, shared_rel_pos_bias=shared_rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), resolution, + shared_rel_pos_bias=shared_rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +def beit_forward_features(self, x): + """ + Modification of timm.models.beit.py: Beit.forward_features to support arbitrary window sizes. + """ + resolution = x.shape[2:] + + x = self.patch_embed(x) + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for blk in self.blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias) + else: + x = blk(x, resolution, shared_rel_pos_bias=rel_pos_bias) + x = self.norm(x) + return x + + +def _make_beit_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[0, 4, 8, 11], + vit_features=768, + use_readout="ignore", + start_index=1, + start_index_readout=1, +): + backbone = make_backbone_default(model, features, size, hooks, vit_features, use_readout, start_index, + start_index_readout) + + backbone.model.patch_embed.forward = types.MethodType(patch_embed_forward, backbone.model.patch_embed) + backbone.model.forward_features = types.MethodType(beit_forward_features, backbone.model) + + for block in backbone.model.blocks: + attn = block.attn + attn._get_rel_pos_bias = types.MethodType(_get_rel_pos_bias, attn) + attn.forward = types.MethodType(attention_forward, attn) + attn.relative_position_indices = {} + + block.forward = types.MethodType(block_forward, block) + + return backbone + + +def _make_pretrained_beitl16_512(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("beit_large_patch16_512", pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks is None else hooks + + features = [256, 512, 1024, 1024] + + return _make_beit_backbone( + model, + features=features, + size=[512, 512], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + ) + + +def _make_pretrained_beitl16_384(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("beit_large_patch16_384", pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks is None else hooks + return _make_beit_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + ) + + +def _make_pretrained_beitb16_384(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("beit_base_patch16_384", pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks is None else hooks + return _make_beit_backbone( + model, + features=[96, 192, 384, 768], + hooks=hooks, + use_readout=use_readout, + ) diff --git a/custom_midas_repo/midas/backbones/levit.py b/custom_midas_repo/midas/backbones/levit.py new file mode 100644 index 0000000000000000000000000000000000000000..a5fb6934cf76c0680dd71cfe66f90374cfadf5a0 --- /dev/null +++ b/custom_midas_repo/midas/backbones/levit.py @@ -0,0 +1,106 @@ +import custom_timm as timm +import torch +import torch.nn as nn +import numpy as np + +from .utils import activations, get_activation, Transpose + + +def forward_levit(pretrained, x): + pretrained.model.forward_features(x) + + layer_1 = 
pretrained.activations["1"] + layer_2 = pretrained.activations["2"] + layer_3 = pretrained.activations["3"] + + layer_1 = pretrained.act_postprocess1(layer_1) + layer_2 = pretrained.act_postprocess2(layer_2) + layer_3 = pretrained.act_postprocess3(layer_3) + + return layer_1, layer_2, layer_3 + + +def _make_levit_backbone( + model, + hooks=[3, 11, 21], + patch_grid=[14, 14] +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) + + pretrained.activations = activations + + patch_grid_size = np.array(patch_grid, dtype=int) + + pretrained.act_postprocess1 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) + ) + pretrained.act_postprocess2 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 2).astype(int)).tolist())) + ) + pretrained.act_postprocess3 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((np.ceil(patch_grid_size / 4).astype(int)).tolist())) + ) + + return pretrained + + +class ConvTransposeNorm(nn.Sequential): + """ + Modification of + https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: ConvNorm + such that ConvTranspose2d is used instead of Conv2d. + """ + + def __init__( + self, in_chs, out_chs, kernel_size=1, stride=1, pad=0, dilation=1, + groups=1, bn_weight_init=1): + super().__init__() + self.add_module('c', + nn.ConvTranspose2d(in_chs, out_chs, kernel_size, stride, pad, dilation, groups, bias=False)) + self.add_module('bn', nn.BatchNorm2d(out_chs)) + + nn.init.constant_(self.bn.weight, bn_weight_init) + + @torch.no_grad() + def fuse(self): + c, bn = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = c.weight * w[:, None, None, None] + b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 + m = nn.ConvTranspose2d( + w.size(1), w.size(0), w.shape[2:], stride=self.c.stride, + padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +def stem_b4_transpose(in_chs, out_chs, activation): + """ + Modification of + https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/levit.py: stem_b16 + such that ConvTranspose2d is used instead of Conv2d and stem is also reduced to the half. 
+ """ + return nn.Sequential( + ConvTransposeNorm(in_chs, out_chs, 3, 2, 1), + activation(), + ConvTransposeNorm(out_chs, out_chs // 2, 3, 2, 1), + activation()) + + +def _make_pretrained_levit_384(pretrained, hooks=None): + model = timm.create_model("levit_384", pretrained=pretrained) + + hooks = [3, 11, 21] if hooks == None else hooks + return _make_levit_backbone( + model, + hooks=hooks + ) diff --git a/custom_midas_repo/midas/backbones/next_vit.py b/custom_midas_repo/midas/backbones/next_vit.py new file mode 100644 index 0000000000000000000000000000000000000000..a3ffae5930e1dd5288999f9a8dbee4f723a01ed5 --- /dev/null +++ b/custom_midas_repo/midas/backbones/next_vit.py @@ -0,0 +1,39 @@ +import custom_timm as timm + +import torch.nn as nn + +from pathlib import Path +from .utils import activations, forward_default, get_activation + +from ..external.next_vit.classification.nextvit import * + + +def forward_next_vit(pretrained, x): + return forward_default(pretrained, x, "forward") + + +def _make_next_vit_backbone( + model, + hooks=[2, 6, 36, 39], +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.features[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.features[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.features[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.features[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + return pretrained + + +def _make_pretrained_next_vit_large_6m(hooks=None): + model = timm.create_model("nextvit_large") + + hooks = [2, 6, 36, 39] if hooks == None else hooks + return _make_next_vit_backbone( + model, + hooks=hooks, + ) diff --git a/custom_midas_repo/midas/backbones/swin.py b/custom_midas_repo/midas/backbones/swin.py new file mode 100644 index 0000000000000000000000000000000000000000..d08ce41f7b8fe8900a1022e24cc5b47d32bf93e8 --- /dev/null +++ b/custom_midas_repo/midas/backbones/swin.py @@ -0,0 +1,13 @@ +import custom_timm as timm + +from .swin_common import _make_swin_backbone + + +def _make_pretrained_swinl12_384(pretrained, hooks=None): + model = timm.create_model("swin_large_patch4_window12_384", pretrained=pretrained) + + hooks = [1, 1, 17, 1] if hooks == None else hooks + return _make_swin_backbone( + model, + hooks=hooks + ) diff --git a/custom_midas_repo/midas/backbones/swin2.py b/custom_midas_repo/midas/backbones/swin2.py new file mode 100644 index 0000000000000000000000000000000000000000..2687b58732d86d7854b6edc435841e38a2e2958b --- /dev/null +++ b/custom_midas_repo/midas/backbones/swin2.py @@ -0,0 +1,34 @@ +import custom_timm as timm + +from .swin_common import _make_swin_backbone + + +def _make_pretrained_swin2l24_384(pretrained, hooks=None): + model = timm.create_model("swinv2_large_window12to24_192to384_22kft1k", pretrained=pretrained) + + hooks = [1, 1, 17, 1] if hooks == None else hooks + return _make_swin_backbone( + model, + hooks=hooks + ) + + +def _make_pretrained_swin2b24_384(pretrained, hooks=None): + model = timm.create_model("swinv2_base_window12to24_192to384_22kft1k", pretrained=pretrained) + + hooks = [1, 1, 17, 1] if hooks == None else hooks + return _make_swin_backbone( + model, + hooks=hooks + ) + + +def _make_pretrained_swin2t16_256(pretrained, hooks=None): + model = timm.create_model("swinv2_tiny_window16_256", pretrained=pretrained) + + hooks = [1, 1, 5, 1] if hooks == None else hooks + return _make_swin_backbone( + model, + hooks=hooks, + patch_grid=[64, 64] + ) diff --git 
a/custom_midas_repo/midas/backbones/swin_common.py b/custom_midas_repo/midas/backbones/swin_common.py new file mode 100644 index 0000000000000000000000000000000000000000..94d63d408f18511179d90b3ac6f697385d1e556d --- /dev/null +++ b/custom_midas_repo/midas/backbones/swin_common.py @@ -0,0 +1,52 @@ +import torch + +import torch.nn as nn +import numpy as np + +from .utils import activations, forward_default, get_activation, Transpose + + +def forward_swin(pretrained, x): + return forward_default(pretrained, x) + + +def _make_swin_backbone( + model, + hooks=[1, 1, 17, 1], + patch_grid=[96, 96] +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.layers[0].blocks[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.layers[1].blocks[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.layers[2].blocks[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.layers[3].blocks[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + if hasattr(model, "patch_grid"): + used_patch_grid = model.patch_grid + else: + used_patch_grid = patch_grid + + patch_grid_size = np.array(used_patch_grid, dtype=int) + + pretrained.act_postprocess1 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size(patch_grid_size.tolist())) + ) + pretrained.act_postprocess2 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((patch_grid_size // 2).tolist())) + ) + pretrained.act_postprocess3 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((patch_grid_size // 4).tolist())) + ) + pretrained.act_postprocess4 = nn.Sequential( + Transpose(1, 2), + nn.Unflatten(2, torch.Size((patch_grid_size // 8).tolist())) + ) + + return pretrained diff --git a/custom_midas_repo/midas/backbones/utils.py b/custom_midas_repo/midas/backbones/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0558899dddcfccec5f01a764d4f21738eb612149 --- /dev/null +++ b/custom_midas_repo/midas/backbones/utils.py @@ -0,0 +1,249 @@ +import torch + +import torch.nn as nn + + +class Slice(nn.Module): + def __init__(self, start_index=1): + super(Slice, self).__init__() + self.start_index = start_index + + def forward(self, x): + return x[:, self.start_index:] + + +class AddReadout(nn.Module): + def __init__(self, start_index=1): + super(AddReadout, self).__init__() + self.start_index = start_index + + def forward(self, x): + if self.start_index == 2: + readout = (x[:, 0] + x[:, 1]) / 2 + else: + readout = x[:, 0] + return x[:, self.start_index:] + readout.unsqueeze(1) + + +class ProjectReadout(nn.Module): + def __init__(self, in_features, start_index=1): + super(ProjectReadout, self).__init__() + self.start_index = start_index + + self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU()) + + def forward(self, x): + readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index:]) + features = torch.cat((x[:, self.start_index:], readout), -1) + + return self.project(features) + + +class Transpose(nn.Module): + def __init__(self, dim0, dim1): + super(Transpose, self).__init__() + self.dim0 = dim0 + self.dim1 = dim1 + + def forward(self, x): + x = x.transpose(self.dim0, self.dim1) + return x + + +activations = {} + + +def get_activation(name): + def hook(model, input, output): + activations[name] = output + + return hook + + +def forward_default(pretrained, x, function_name="forward_features"): + exec(f"pretrained.model.{function_name}(x)") + + layer_1 = 
pretrained.activations["1"] + layer_2 = pretrained.activations["2"] + layer_3 = pretrained.activations["3"] + layer_4 = pretrained.activations["4"] + + if hasattr(pretrained, "act_postprocess1"): + layer_1 = pretrained.act_postprocess1(layer_1) + if hasattr(pretrained, "act_postprocess2"): + layer_2 = pretrained.act_postprocess2(layer_2) + if hasattr(pretrained, "act_postprocess3"): + layer_3 = pretrained.act_postprocess3(layer_3) + if hasattr(pretrained, "act_postprocess4"): + layer_4 = pretrained.act_postprocess4(layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def forward_adapted_unflatten(pretrained, x, function_name="forward_features"): + b, c, h, w = x.shape + + exec(f"glob = pretrained.model.{function_name}(x)") + + layer_1 = pretrained.activations["1"] + layer_2 = pretrained.activations["2"] + layer_3 = pretrained.activations["3"] + layer_4 = pretrained.activations["4"] + + layer_1 = pretrained.act_postprocess1[0:2](layer_1) + layer_2 = pretrained.act_postprocess2[0:2](layer_2) + layer_3 = pretrained.act_postprocess3[0:2](layer_3) + layer_4 = pretrained.act_postprocess4[0:2](layer_4) + + unflatten = nn.Sequential( + nn.Unflatten( + 2, + torch.Size( + [ + h // pretrained.model.patch_size[1], + w // pretrained.model.patch_size[0], + ] + ), + ) + ) + + if layer_1.ndim == 3: + layer_1 = unflatten(layer_1) + if layer_2.ndim == 3: + layer_2 = unflatten(layer_2) + if layer_3.ndim == 3: + layer_3 = unflatten(layer_3) + if layer_4.ndim == 3: + layer_4 = unflatten(layer_4) + + layer_1 = pretrained.act_postprocess1[3: len(pretrained.act_postprocess1)](layer_1) + layer_2 = pretrained.act_postprocess2[3: len(pretrained.act_postprocess2)](layer_2) + layer_3 = pretrained.act_postprocess3[3: len(pretrained.act_postprocess3)](layer_3) + layer_4 = pretrained.act_postprocess4[3: len(pretrained.act_postprocess4)](layer_4) + + return layer_1, layer_2, layer_3, layer_4 + + +def get_readout_oper(vit_features, features, use_readout, start_index=1): + if use_readout == "ignore": + readout_oper = [Slice(start_index)] * len(features) + elif use_readout == "add": + readout_oper = [AddReadout(start_index)] * len(features) + elif use_readout == "project": + readout_oper = [ + ProjectReadout(vit_features, start_index) for out_feat in features + ] + else: + assert ( + False + ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'" + + return readout_oper + + +def make_backbone_default( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + vit_features=768, + use_readout="ignore", + start_index=1, + start_index_readout=1, +): + pretrained = nn.Module() + + pretrained.model = model + pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1")) + pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2")) + pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3")) + pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4")) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, start_index_readout) + + # 32, 48, 136, 384 + pretrained.act_postprocess1 = nn.Sequential( + readout_oper[0], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[0], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[0], + out_channels=features[0], + kernel_size=4, + stride=4, + padding=0, + bias=True, + 
dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess2 = nn.Sequential( + readout_oper[1], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[1], + kernel_size=1, + stride=1, + padding=0, + ), + nn.ConvTranspose2d( + in_channels=features[1], + out_channels=features[1], + kernel_size=2, + stride=2, + padding=0, + bias=True, + dilation=1, + groups=1, + ), + ) + + pretrained.act_postprocess3 = nn.Sequential( + readout_oper[2], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[2], + kernel_size=1, + stride=1, + padding=0, + ), + ) + + pretrained.act_postprocess4 = nn.Sequential( + readout_oper[3], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[3], + kernel_size=1, + stride=1, + padding=0, + ), + nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ), + ) + + pretrained.model.start_index = start_index + pretrained.model.patch_size = [16, 16] + + return pretrained diff --git a/custom_midas_repo/midas/backbones/vit.py b/custom_midas_repo/midas/backbones/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..6be3e316208cf82b3f02efc6fde5ddd4791fecc6 --- /dev/null +++ b/custom_midas_repo/midas/backbones/vit.py @@ -0,0 +1,221 @@ +import torch +import torch.nn as nn +import custom_timm as timm +import types +import math +import torch.nn.functional as F + +from .utils import (activations, forward_adapted_unflatten, get_activation, get_readout_oper, + make_backbone_default, Transpose) + + +def forward_vit(pretrained, x): + return forward_adapted_unflatten(pretrained, x, "forward_flex") + + +def _resize_pos_embed(self, posemb, gs_h, gs_w): + posemb_tok, posemb_grid = ( + posemb[:, : self.start_index], + posemb[0, self.start_index:], + ) + + gs_old = int(math.sqrt(len(posemb_grid))) + + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear") + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1) + + posemb = torch.cat([posemb_tok, posemb_grid], dim=1) + + return posemb + + +def forward_flex(self, x): + b, c, h, w = x.shape + + pos_embed = self._resize_pos_embed( + self.pos_embed, h // self.patch_size[1], w // self.patch_size[0] + ) + + B = x.shape[0] + + if hasattr(self.patch_embed, "backbone"): + x = self.patch_embed.backbone(x) + if isinstance(x, (list, tuple)): + x = x[-1] # last feature if backbone outputs list/tuple of features + + x = self.patch_embed.proj(x).flatten(2).transpose(1, 2) + + if getattr(self, "dist_token", None) is not None: + cls_tokens = self.cls_token.expand( + B, -1, -1 + ) # stole cls_tokens impl from Phil Wang, thanks + dist_token = self.dist_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, dist_token, x), dim=1) + else: + if self.no_embed_class: + x = x + pos_embed + cls_tokens = self.cls_token.expand( + B, -1, -1 + ) # stole cls_tokens impl from Phil Wang, thanks + x = torch.cat((cls_tokens, x), dim=1) + + if not self.no_embed_class: + x = x + pos_embed + x = self.pos_drop(x) + + for blk in self.blocks: + x = blk(x) + + x = self.norm(x) + + return x + + +def _make_vit_b16_backbone( + model, + features=[96, 192, 384, 768], + size=[384, 384], + hooks=[2, 5, 8, 11], + 
vit_features=768, + use_readout="ignore", + start_index=1, + start_index_readout=1, +): + pretrained = make_backbone_default(model, features, size, hooks, vit_features, use_readout, start_index, + start_index_readout) + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) + pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model + ) + + return pretrained + + +def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("vit_large_patch16_384", pretrained=pretrained) + + hooks = [5, 11, 17, 23] if hooks == None else hooks + return _make_vit_b16_backbone( + model, + features=[256, 512, 1024, 1024], + hooks=hooks, + vit_features=1024, + use_readout=use_readout, + ) + + +def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None): + model = timm.create_model("vit_base_patch16_384", pretrained=pretrained) + + hooks = [2, 5, 8, 11] if hooks == None else hooks + return _make_vit_b16_backbone( + model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout + ) + + +def _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=[0, 1, 8, 11], + vit_features=768, + patch_size=[16, 16], + number_stages=2, + use_vit_only=False, + use_readout="ignore", + start_index=1, +): + pretrained = nn.Module() + + pretrained.model = model + + used_number_stages = 0 if use_vit_only else number_stages + for s in range(used_number_stages): + pretrained.model.patch_embed.backbone.stages[s].register_forward_hook( + get_activation(str(s + 1)) + ) + for s in range(used_number_stages, 4): + pretrained.model.blocks[hooks[s]].register_forward_hook(get_activation(str(s + 1))) + + pretrained.activations = activations + + readout_oper = get_readout_oper(vit_features, features, use_readout, start_index) + + for s in range(used_number_stages): + value = nn.Sequential(nn.Identity(), nn.Identity(), nn.Identity()) + exec(f"pretrained.act_postprocess{s + 1}=value") + for s in range(used_number_stages, 4): + if s < number_stages: + final_layer = nn.ConvTranspose2d( + in_channels=features[s], + out_channels=features[s], + kernel_size=4 // (2 ** s), + stride=4 // (2 ** s), + padding=0, + bias=True, + dilation=1, + groups=1, + ) + elif s > number_stages: + final_layer = nn.Conv2d( + in_channels=features[3], + out_channels=features[3], + kernel_size=3, + stride=2, + padding=1, + ) + else: + final_layer = None + + layers = [ + readout_oper[s], + Transpose(1, 2), + nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])), + nn.Conv2d( + in_channels=vit_features, + out_channels=features[s], + kernel_size=1, + stride=1, + padding=0, + ), + ] + if final_layer is not None: + layers.append(final_layer) + + value = nn.Sequential(*layers) + exec(f"pretrained.act_postprocess{s + 1}=value") + + pretrained.model.start_index = start_index + pretrained.model.patch_size = patch_size + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. + pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model) + + # We inject this function into the VisionTransformer instances so that + # we can use it with interpolated position embeddings without modifying the library source. 
+ pretrained.model._resize_pos_embed = types.MethodType( + _resize_pos_embed, pretrained.model + ) + + return pretrained + + +def _make_pretrained_vitb_rn50_384( + pretrained, use_readout="ignore", hooks=None, use_vit_only=False +): + model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained) + + hooks = [0, 1, 8, 11] if hooks == None else hooks + return _make_vit_b_rn50_backbone( + model, + features=[256, 512, 768, 768], + size=[384, 384], + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) diff --git a/custom_midas_repo/midas/base_model.py b/custom_midas_repo/midas/base_model.py new file mode 100644 index 0000000000000000000000000000000000000000..5cf430239b47ec5ec07531263f26f5c24a2311cd --- /dev/null +++ b/custom_midas_repo/midas/base_model.py @@ -0,0 +1,16 @@ +import torch + + +class BaseModel(torch.nn.Module): + def load(self, path): + """Load model from file. + + Args: + path (str): file path + """ + parameters = torch.load(path, map_location=torch.device('cpu')) + + if "optimizer" in parameters: + parameters = parameters["model"] + + self.load_state_dict(parameters) diff --git a/custom_midas_repo/midas/blocks.py b/custom_midas_repo/midas/blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..6d87a00680bb6ed9a6d7c3043ea30a1e90361794 --- /dev/null +++ b/custom_midas_repo/midas/blocks.py @@ -0,0 +1,439 @@ +import torch +import torch.nn as nn + +from .backbones.beit import ( + _make_pretrained_beitl16_512, + _make_pretrained_beitl16_384, + _make_pretrained_beitb16_384, + forward_beit, +) +from .backbones.swin_common import ( + forward_swin, +) +from .backbones.swin2 import ( + _make_pretrained_swin2l24_384, + _make_pretrained_swin2b24_384, + _make_pretrained_swin2t16_256, +) +from .backbones.swin import ( + _make_pretrained_swinl12_384, +) +from .backbones.levit import ( + _make_pretrained_levit_384, + forward_levit, +) +from .backbones.vit import ( + _make_pretrained_vitb_rn50_384, + _make_pretrained_vitl16_384, + _make_pretrained_vitb16_384, + forward_vit, +) + +def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, + use_vit_only=False, use_readout="ignore", in_features=[96, 256, 512, 1024]): + if backbone == "beitl16_512": + pretrained = _make_pretrained_beitl16_512( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, expand=expand + ) # BEiT_512-L (backbone) + elif backbone == "beitl16_384": + pretrained = _make_pretrained_beitl16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, expand=expand + ) # BEiT_384-L (backbone) + elif backbone == "beitb16_384": + pretrained = _make_pretrained_beitb16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, expand=expand + ) # BEiT_384-B (backbone) + elif backbone == "swin2l24_384": + pretrained = _make_pretrained_swin2l24_384( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [192, 384, 768, 1536], features, groups=groups, expand=expand + ) # Swin2-L/12to24 (backbone) + elif backbone == "swin2b24_384": + pretrained = _make_pretrained_swin2b24_384( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [128, 256, 512, 1024], features, groups=groups, expand=expand + ) # Swin2-B/12to24 (backbone) + elif backbone == "swin2t16_256": + pretrained = 
_make_pretrained_swin2t16_256( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, expand=expand + ) # Swin2-T/16 (backbone) + elif backbone == "swinl12_384": + pretrained = _make_pretrained_swinl12_384( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [192, 384, 768, 1536], features, groups=groups, expand=expand + ) # Swin-L/12 (backbone) + elif backbone == "next_vit_large_6m": + from .backbones.next_vit import _make_pretrained_next_vit_large_6m + pretrained = _make_pretrained_next_vit_large_6m(hooks=hooks) + scratch = _make_scratch( + in_features, features, groups=groups, expand=expand + ) # Next-ViT-L on ImageNet-1K-6M (backbone) + elif backbone == "levit_384": + pretrained = _make_pretrained_levit_384( + use_pretrained, hooks=hooks + ) + scratch = _make_scratch( + [384, 512, 768], features, groups=groups, expand=expand + ) # LeViT 384 (backbone) + elif backbone == "vitl16_384": + pretrained = _make_pretrained_vitl16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [256, 512, 1024, 1024], features, groups=groups, expand=expand + ) # ViT-L/16 - 85.0% Top1 (backbone) + elif backbone == "vitb_rn50_384": + pretrained = _make_pretrained_vitb_rn50_384( + use_pretrained, + hooks=hooks, + use_vit_only=use_vit_only, + use_readout=use_readout, + ) + scratch = _make_scratch( + [256, 512, 768, 768], features, groups=groups, expand=expand + ) # ViT-H/16 - 85.0% Top1 (backbone) + elif backbone == "vitb16_384": + pretrained = _make_pretrained_vitb16_384( + use_pretrained, hooks=hooks, use_readout=use_readout + ) + scratch = _make_scratch( + [96, 192, 384, 768], features, groups=groups, expand=expand + ) # ViT-B/16 - 84.6% Top1 (backbone) + elif backbone == "resnext101_wsl": + pretrained = _make_pretrained_resnext101_wsl(use_pretrained) + scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand) # efficientnet_lite3 + elif backbone == "efficientnet_lite3": + pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable) + scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand) # efficientnet_lite3 + else: + print(f"Backbone '{backbone}' not implemented") + assert False + + return pretrained, scratch + + +def _make_scratch(in_shape, out_shape, groups=1, expand=False): + scratch = nn.Module() + + out_shape1 = out_shape + out_shape2 = out_shape + out_shape3 = out_shape + if len(in_shape) >= 4: + out_shape4 = out_shape + + if expand: + out_shape1 = out_shape + out_shape2 = out_shape*2 + out_shape3 = out_shape*4 + if len(in_shape) >= 4: + out_shape4 = out_shape*8 + + scratch.layer1_rn = nn.Conv2d( + in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer2_rn = nn.Conv2d( + in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + scratch.layer3_rn = nn.Conv2d( + in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + if len(in_shape) >= 4: + scratch.layer4_rn = nn.Conv2d( + in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups + ) + + return scratch + + +def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False): + efficientnet = torch.hub.load( + "rwightman/gen-efficientnet-pytorch", + "tf_efficientnet_lite3", + pretrained=use_pretrained, + exportable=exportable + ) + return _make_efficientnet_backbone(efficientnet) + + +def 
_make_efficientnet_backbone(effnet): + pretrained = nn.Module() + + pretrained.layer1 = nn.Sequential( + effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2] + ) + pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3]) + pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5]) + pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9]) + + return pretrained + + +def _make_resnet_backbone(resnet): + pretrained = nn.Module() + pretrained.layer1 = nn.Sequential( + resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1 + ) + + pretrained.layer2 = resnet.layer2 + pretrained.layer3 = resnet.layer3 + pretrained.layer4 = resnet.layer4 + + return pretrained + + +def _make_pretrained_resnext101_wsl(use_pretrained): + resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl") + return _make_resnet_backbone(resnet) + + + +class Interpolate(nn.Module): + """Interpolation module. + """ + + def __init__(self, scale_factor, mode, align_corners=False): + """Init. + + Args: + scale_factor (float): scaling + mode (str): interpolation mode + """ + super(Interpolate, self).__init__() + + self.interp = nn.functional.interpolate + self.scale_factor = scale_factor + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: interpolated data + """ + + x = self.interp( + x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners + ) + + return x + + +class ResidualConvUnit(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True + ) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True + ) + + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + """Forward pass. + + Args: + x (tensor): input + + Returns: + tensor: output + """ + out = self.relu(x) + out = self.conv1(out) + out = self.relu(out) + out = self.conv2(out) + + return out + x + + +class FeatureFusionBlock(nn.Module): + """Feature fusion block. + """ + + def __init__(self, features): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock, self).__init__() + + self.resConfUnit1 = ResidualConvUnit(features) + self.resConfUnit2 = ResidualConvUnit(features) + + def forward(self, *xs): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + output += self.resConfUnit1(xs[1]) + + output = self.resConfUnit2(output) + + output = nn.functional.interpolate( + output, scale_factor=2, mode="bilinear", align_corners=True + ) + + return output + + + + +class ResidualConvUnit_custom(nn.Module): + """Residual convolution module. + """ + + def __init__(self, features, activation, bn): + """Init. + + Args: + features (int): number of features + """ + super().__init__() + + self.bn = bn + + self.groups=1 + + self.conv1 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups + ) + + self.conv2 = nn.Conv2d( + features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups + ) + + if self.bn==True: + self.bn1 = nn.BatchNorm2d(features) + self.bn2 = nn.BatchNorm2d(features) + + self.activation = activation + + self.skip_add = nn.quantized.FloatFunctional() + + def forward(self, x): + """Forward pass. 
+ + Args: + x (tensor): input + + Returns: + tensor: output + """ + + out = self.activation(x) + out = self.conv1(out) + if self.bn==True: + out = self.bn1(out) + + out = self.activation(out) + out = self.conv2(out) + if self.bn==True: + out = self.bn2(out) + + if self.groups > 1: + out = self.conv_merge(out) + + return self.skip_add.add(out, x) + + # return out + x + + +class FeatureFusionBlock_custom(nn.Module): + """Feature fusion block. + """ + + def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, size=None): + """Init. + + Args: + features (int): number of features + """ + super(FeatureFusionBlock_custom, self).__init__() + + self.deconv = deconv + self.align_corners = align_corners + + self.groups=1 + + self.expand = expand + out_features = features + if self.expand==True: + out_features = features//2 + + self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1) + + self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn) + self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn) + + self.skip_add = nn.quantized.FloatFunctional() + + self.size=size + + def forward(self, *xs, size=None): + """Forward pass. + + Returns: + tensor: output + """ + output = xs[0] + + if len(xs) == 2: + res = self.resConfUnit1(xs[1]) + output = self.skip_add.add(output, res) + # output += res + + output = self.resConfUnit2(output) + + if (size is None) and (self.size is None): + modifier = {"scale_factor": 2} + elif size is None: + modifier = {"size": self.size} + else: + modifier = {"size": size} + + output = nn.functional.interpolate( + output, **modifier, mode="bilinear", align_corners=self.align_corners + ) + + output = self.out_conv(output) + + return output + diff --git a/custom_midas_repo/midas/dpt_depth.py b/custom_midas_repo/midas/dpt_depth.py new file mode 100644 index 0000000000000000000000000000000000000000..993d296405dcb1d2bf58a9f0258cef4587b75b47 --- /dev/null +++ b/custom_midas_repo/midas/dpt_depth.py @@ -0,0 +1,166 @@ +import torch +import torch.nn as nn + +from .base_model import BaseModel +from .blocks import ( + FeatureFusionBlock_custom, + Interpolate, + _make_encoder, + forward_beit, + forward_swin, + forward_levit, + forward_vit, +) +from .backbones.levit import stem_b4_transpose +from custom_timm.models.layers import get_act_layer + + +def _make_fusion_block(features, use_bn, size = None): + return FeatureFusionBlock_custom( + features, + nn.ReLU(False), + deconv=False, + bn=use_bn, + expand=False, + align_corners=True, + size=size, + ) + + +class DPT(BaseModel): + def __init__( + self, + head, + features=256, + backbone="vitb_rn50_384", + readout="project", + channels_last=False, + use_bn=False, + **kwargs + ): + + super(DPT, self).__init__() + + self.channels_last = channels_last + + # For the Swin, Swin 2, LeViT and Next-ViT Transformers, the hierarchical architectures prevent setting the + # hooks freely. Instead, the hooks have to be chosen according to the ranges specified in the comments. 
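+        # For example, the plain ViT/BEiT backbones expose a single list of transformer
+        # blocks (24 for the large variants, 12 for the base variants), so their hooks
+        # simply tap four roughly evenly spaced blocks. The hierarchical backbones index
+        # into much shorter per-stage block lists, hence the tighter allowed ranges below.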
+ hooks = { + "beitl16_512": [5, 11, 17, 23], + "beitl16_384": [5, 11, 17, 23], + "beitb16_384": [2, 5, 8, 11], + "swin2l24_384": [1, 1, 17, 1], # Allowed ranges: [0, 1], [0, 1], [ 0, 17], [ 0, 1] + "swin2b24_384": [1, 1, 17, 1], # [0, 1], [0, 1], [ 0, 17], [ 0, 1] + "swin2t16_256": [1, 1, 5, 1], # [0, 1], [0, 1], [ 0, 5], [ 0, 1] + "swinl12_384": [1, 1, 17, 1], # [0, 1], [0, 1], [ 0, 17], [ 0, 1] + "next_vit_large_6m": [2, 6, 36, 39], # [0, 2], [3, 6], [ 7, 36], [37, 39] + "levit_384": [3, 11, 21], # [0, 3], [6, 11], [14, 21] + "vitb_rn50_384": [0, 1, 8, 11], + "vitb16_384": [2, 5, 8, 11], + "vitl16_384": [5, 11, 17, 23], + }[backbone] + + if "next_vit" in backbone: + in_features = { + "next_vit_large_6m": [96, 256, 512, 1024], + }[backbone] + else: + in_features = None + + # Instantiate backbone and reassemble blocks + self.pretrained, self.scratch = _make_encoder( + backbone, + features, + False, # Set to true of you want to train from scratch, uses ImageNet weights + groups=1, + expand=False, + exportable=False, + hooks=hooks, + use_readout=readout, + in_features=in_features, + ) + + self.number_layers = len(hooks) if hooks is not None else 4 + size_refinenet3 = None + self.scratch.stem_transpose = None + + if "beit" in backbone: + self.forward_transformer = forward_beit + elif "swin" in backbone: + self.forward_transformer = forward_swin + elif "next_vit" in backbone: + from .backbones.next_vit import forward_next_vit + self.forward_transformer = forward_next_vit + elif "levit" in backbone: + self.forward_transformer = forward_levit + size_refinenet3 = 7 + self.scratch.stem_transpose = stem_b4_transpose(256, 128, get_act_layer("hard_swish")) + else: + self.forward_transformer = forward_vit + + self.scratch.refinenet1 = _make_fusion_block(features, use_bn) + self.scratch.refinenet2 = _make_fusion_block(features, use_bn) + self.scratch.refinenet3 = _make_fusion_block(features, use_bn, size_refinenet3) + if self.number_layers >= 4: + self.scratch.refinenet4 = _make_fusion_block(features, use_bn) + + self.scratch.output_conv = head + + + def forward(self, x): + if self.channels_last == True: + x.contiguous(memory_format=torch.channels_last) + + layers = self.forward_transformer(self.pretrained, x) + if self.number_layers == 3: + layer_1, layer_2, layer_3 = layers + else: + layer_1, layer_2, layer_3, layer_4 = layers + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + if self.number_layers >= 4: + layer_4_rn = self.scratch.layer4_rn(layer_4) + + if self.number_layers == 3: + path_3 = self.scratch.refinenet3(layer_3_rn, size=layer_2_rn.shape[2:]) + else: + path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:]) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:]) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:]) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + if self.scratch.stem_transpose is not None: + path_1 = self.scratch.stem_transpose(path_1) + + out = self.scratch.output_conv(path_1) + + return out + + +class DPTDepthModel(DPT): + def __init__(self, path=None, non_negative=True, **kwargs): + features = kwargs["features"] if "features" in kwargs else 256 + head_features_1 = kwargs["head_features_1"] if "head_features_1" in kwargs else features + head_features_2 = kwargs["head_features_2"] if "head_features_2" in kwargs else 32 + kwargs.pop("head_features_1", None) + kwargs.pop("head_features_2", None) + + 
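+        # Depth head: a 3x3 convolution that halves the channels, 2x bilinear upsampling,
+        # a second 3x3 convolution down to head_features_2, and a final 1x1 convolution
+        # producing a single channel; the trailing ReLU keeps the prediction non-negative
+        # when non_negative=True.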
head = nn.Sequential( + nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode="bilinear", align_corners=True), + nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True) if non_negative else nn.Identity(), + nn.Identity(), + ) + + super().__init__(head, **kwargs) + + if path is not None: + self.load(path) + + def forward(self, x): + return super().forward(x).squeeze(dim=1) diff --git a/custom_midas_repo/midas/midas_net.py b/custom_midas_repo/midas/midas_net.py new file mode 100644 index 0000000000000000000000000000000000000000..8a954977800b0a0f48807e80fa63041910e33c1f --- /dev/null +++ b/custom_midas_repo/midas/midas_net.py @@ -0,0 +1,76 @@ +"""MidashNet: Network for monocular depth estimation trained by mixing several datasets. +This file contains code that is adapted from +https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py +""" +import torch +import torch.nn as nn + +from .base_model import BaseModel +from .blocks import FeatureFusionBlock, Interpolate, _make_encoder + + +class MidasNet(BaseModel): + """Network for monocular depth estimation. + """ + + def __init__(self, path=None, features=256, non_negative=True): + """Init. + + Args: + path (str, optional): Path to saved model. Defaults to None. + features (int, optional): Number of features. Defaults to 256. + backbone (str, optional): Backbone network for encoder. Defaults to resnet50 + """ + print("Loading weights: ", path) + + super(MidasNet, self).__init__() + + use_pretrained = False if path is None else True + + self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained) + + self.scratch.refinenet4 = FeatureFusionBlock(features) + self.scratch.refinenet3 = FeatureFusionBlock(features) + self.scratch.refinenet2 = FeatureFusionBlock(features) + self.scratch.refinenet1 = FeatureFusionBlock(features) + + self.scratch.output_conv = nn.Sequential( + nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1), + Interpolate(scale_factor=2, mode="bilinear"), + nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1), + nn.ReLU(True), + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True) if non_negative else nn.Identity(), + ) + + if path: + self.load(path) + + def forward(self, x): + """Forward pass. 
+ + Args: + x (tensor): input data (image) + + Returns: + tensor: depth + """ + + layer_1 = self.pretrained.layer1(x) + layer_2 = self.pretrained.layer2(layer_1) + layer_3 = self.pretrained.layer3(layer_2) + layer_4 = self.pretrained.layer4(layer_3) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv(path_1) + + return torch.squeeze(out, dim=1) diff --git a/custom_midas_repo/midas/midas_net_custom.py b/custom_midas_repo/midas/midas_net_custom.py new file mode 100644 index 0000000000000000000000000000000000000000..50e4acb5e53d5fabefe3dde16ab49c33c2b7797c --- /dev/null +++ b/custom_midas_repo/midas/midas_net_custom.py @@ -0,0 +1,128 @@ +"""MidashNet: Network for monocular depth estimation trained by mixing several datasets. +This file contains code that is adapted from +https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py +""" +import torch +import torch.nn as nn + +from .base_model import BaseModel +from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder + + +class MidasNet_small(BaseModel): + """Network for monocular depth estimation. + """ + + def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True, + blocks={'expand': True}): + """Init. + + Args: + path (str, optional): Path to saved model. Defaults to None. + features (int, optional): Number of features. Defaults to 256. + backbone (str, optional): Backbone network for encoder. 
Defaults to resnet50 + """ + print("Loading weights: ", path) + + super(MidasNet_small, self).__init__() + + use_pretrained = False if path else True + + self.channels_last = channels_last + self.blocks = blocks + self.backbone = backbone + + self.groups = 1 + + features1=features + features2=features + features3=features + features4=features + self.expand = False + if "expand" in self.blocks and self.blocks['expand'] == True: + self.expand = True + features1=features + features2=features*2 + features3=features*4 + features4=features*8 + + self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable) + + self.scratch.activation = nn.ReLU(False) + + self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) + self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) + self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners) + self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners) + + + self.scratch.output_conv = nn.Sequential( + nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups), + Interpolate(scale_factor=2, mode="bilinear"), + nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1), + self.scratch.activation, + nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0), + nn.ReLU(True) if non_negative else nn.Identity(), + nn.Identity(), + ) + + if path: + self.load(path) + + + def forward(self, x): + """Forward pass. 
+ + Args: + x (tensor): input data (image) + + Returns: + tensor: depth + """ + if self.channels_last==True: + print("self.channels_last = ", self.channels_last) + x.contiguous(memory_format=torch.channels_last) + + + layer_1 = self.pretrained.layer1(x) + layer_2 = self.pretrained.layer2(layer_1) + layer_3 = self.pretrained.layer3(layer_2) + layer_4 = self.pretrained.layer4(layer_3) + + layer_1_rn = self.scratch.layer1_rn(layer_1) + layer_2_rn = self.scratch.layer2_rn(layer_2) + layer_3_rn = self.scratch.layer3_rn(layer_3) + layer_4_rn = self.scratch.layer4_rn(layer_4) + + + path_4 = self.scratch.refinenet4(layer_4_rn) + path_3 = self.scratch.refinenet3(path_4, layer_3_rn) + path_2 = self.scratch.refinenet2(path_3, layer_2_rn) + path_1 = self.scratch.refinenet1(path_2, layer_1_rn) + + out = self.scratch.output_conv(path_1) + + return torch.squeeze(out, dim=1) + + + +def fuse_model(m): + prev_previous_type = nn.Identity() + prev_previous_name = '' + previous_type = nn.Identity() + previous_name = '' + for name, module in m.named_modules(): + if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU: + # print("FUSED ", prev_previous_name, previous_name, name) + torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True) + elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d: + # print("FUSED ", prev_previous_name, previous_name) + torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True) + # elif previous_type == nn.Conv2d and type(module) == nn.ReLU: + # print("FUSED ", previous_name, name) + # torch.quantization.fuse_modules(m, [previous_name, name], inplace=True) + + prev_previous_type = previous_type + prev_previous_name = previous_name + previous_type = type(module) + previous_name = name \ No newline at end of file diff --git a/custom_midas_repo/midas/model_loader.py b/custom_midas_repo/midas/model_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..5119d8a200eaecd50ab0c8da8ea7e4141e47c634 --- /dev/null +++ b/custom_midas_repo/midas/model_loader.py @@ -0,0 +1,242 @@ +import cv2 +import torch + +from custom_midas_repo.midas.dpt_depth import DPTDepthModel +from custom_midas_repo.midas.midas_net import MidasNet +from custom_midas_repo.midas.midas_net_custom import MidasNet_small +from custom_midas_repo.midas.transforms import Resize, NormalizeImage, PrepareForNet + +from torchvision.transforms import Compose + +default_models = { + "dpt_beit_large_512": "weights/dpt_beit_large_512.pt", + "dpt_beit_large_384": "weights/dpt_beit_large_384.pt", + "dpt_beit_base_384": "weights/dpt_beit_base_384.pt", + "dpt_swin2_large_384": "weights/dpt_swin2_large_384.pt", + "dpt_swin2_base_384": "weights/dpt_swin2_base_384.pt", + "dpt_swin2_tiny_256": "weights/dpt_swin2_tiny_256.pt", + "dpt_swin_large_384": "weights/dpt_swin_large_384.pt", + "dpt_next_vit_large_384": "weights/dpt_next_vit_large_384.pt", + "dpt_levit_224": "weights/dpt_levit_224.pt", + "dpt_large_384": "weights/dpt_large_384.pt", + "dpt_hybrid_384": "weights/dpt_hybrid_384.pt", + "midas_v21_384": "weights/midas_v21_384.pt", + "midas_v21_small_256": "weights/midas_v21_small_256.pt", + "openvino_midas_v21_small_256": "weights/openvino_midas_v21_small_256.xml", +} + + +def load_model(device, model_path, model_type="dpt_large_384", optimize=True, height=None, square=False): + """Load the specified network. 
+ + Args: + device (device): the torch device used + model_path (str): path to saved model + model_type (str): the type of the model to be loaded + optimize (bool): optimize the model to half-integer on CUDA? + height (int): inference encoder image height + square (bool): resize to a square resolution? + + Returns: + The loaded network, the transform which prepares images as input to the network and the dimensions of the + network input + """ + if "openvino" in model_type: + from openvino.runtime import Core + + keep_aspect_ratio = not square + + if model_type == "dpt_beit_large_512": + model = DPTDepthModel( + path=model_path, + backbone="beitl16_512", + non_negative=True, + ) + net_w, net_h = 512, 512 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_beit_large_384": + model = DPTDepthModel( + path=model_path, + backbone="beitl16_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_beit_base_384": + model = DPTDepthModel( + path=model_path, + backbone="beitb16_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_swin2_large_384": + model = DPTDepthModel( + path=model_path, + backbone="swin2l24_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_swin2_base_384": + model = DPTDepthModel( + path=model_path, + backbone="swin2b24_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_swin2_tiny_256": + model = DPTDepthModel( + path=model_path, + backbone="swin2t16_256", + non_negative=True, + ) + net_w, net_h = 256, 256 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_swin_large_384": + model = DPTDepthModel( + path=model_path, + backbone="swinl12_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_next_vit_large_384": + model = DPTDepthModel( + path=model_path, + backbone="next_vit_large_6m", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + # We change the notation from dpt_levit_224 (MiDaS notation) to levit_384 (timm notation) here, where the 224 refers + # to the resolution 224x224 used by LeViT and 384 is the first entry of the embed_dim, see _cfg and model_cfgs of + # https://github.com/rwightman/pytorch-image-models/blob/main/timm/models/levit.py + # (commit id: 927f031293a30afb940fff0bee34b85d9c059b0e) + elif model_type == "dpt_levit_224": + model = DPTDepthModel( + path=model_path, + backbone="levit_384", + non_negative=True, + head_features_1=64, + head_features_2=8, + ) + net_w, net_h = 224, 224 + keep_aspect_ratio = False + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == 
"dpt_large_384": + model = DPTDepthModel( + path=model_path, + backbone="vitl16_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "dpt_hybrid_384": + model = DPTDepthModel( + path=model_path, + backbone="vitb_rn50_384", + non_negative=True, + ) + net_w, net_h = 384, 384 + resize_mode = "minimal" + normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + + elif model_type == "midas_v21_384": + model = MidasNet(model_path, non_negative=True) + net_w, net_h = 384, 384 + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + elif model_type == "midas_v21_small_256": + model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True, + non_negative=True, blocks={'expand': True}) + net_w, net_h = 256, 256 + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + elif model_type == "openvino_midas_v21_small_256": + ie = Core() + uncompiled_model = ie.read_model(model=model_path) + model = ie.compile_model(uncompiled_model, "CPU") + net_w, net_h = 256, 256 + resize_mode = "upper_bound" + normalization = NormalizeImage( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ) + + else: + print(f"model_type '{model_type}' not implemented, use: --model_type large") + assert False + + if not "openvino" in model_type: + print("Model loaded, number of parameters = {:.0f}M".format(sum(p.numel() for p in model.parameters()) / 1e6)) + else: + print("Model loaded, optimized with OpenVINO") + + if "openvino" in model_type: + keep_aspect_ratio = False + + if height is not None: + net_w, net_h = height, height + + transform = Compose( + [ + Resize( + net_w, + net_h, + resize_target=None, + keep_aspect_ratio=keep_aspect_ratio, + ensure_multiple_of=32, + resize_method=resize_mode, + image_interpolation_method=cv2.INTER_CUBIC, + ), + normalization, + PrepareForNet(), + ] + ) + + if not "openvino" in model_type: + model.eval() + + if optimize and (device == torch.device("cuda")): + if not "openvino" in model_type: + model = model.to(memory_format=torch.channels_last) + model = model.half() + else: + print("Error: OpenVINO models are already optimized. No optimization to half-float possible.") + exit() + + if not "openvino" in model_type: + model.to(device) + + return model, transform, net_w, net_h diff --git a/custom_midas_repo/midas/transforms.py b/custom_midas_repo/midas/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..350cbc11662633ad7f8968eb10be2e7de6e384e9 --- /dev/null +++ b/custom_midas_repo/midas/transforms.py @@ -0,0 +1,234 @@ +import numpy as np +import cv2 +import math + + +def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA): + """Rezise the sample to ensure the given size. Keeps aspect ratio. 
+ + Args: + sample (dict): sample + size (tuple): image size + + Returns: + tuple: new size + """ + shape = list(sample["disparity"].shape) + + if shape[0] >= size[0] and shape[1] >= size[1]: + return sample + + scale = [0, 0] + scale[0] = size[0] / shape[0] + scale[1] = size[1] / shape[1] + + scale = max(scale) + + shape[0] = math.ceil(scale * shape[0]) + shape[1] = math.ceil(scale * shape[1]) + + # resize + sample["image"] = cv2.resize( + sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method + ) + + sample["disparity"] = cv2.resize( + sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST + ) + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + tuple(shape[::-1]), + interpolation=cv2.INTER_NEAREST, + ) + sample["mask"] = sample["mask"].astype(bool) + + return tuple(shape) + + +class Resize(object): + """Resize sample to given size (width, height). + """ + + def __init__( + self, + width, + height, + resize_target=True, + keep_aspect_ratio=False, + ensure_multiple_of=1, + resize_method="lower_bound", + image_interpolation_method=cv2.INTER_AREA, + ): + """Init. + + Args: + width (int): desired output width + height (int): desired output height + resize_target (bool, optional): + True: Resize the full sample (image, mask, target). + False: Resize image only. + Defaults to True. + keep_aspect_ratio (bool, optional): + True: Keep the aspect ratio of the input sample. + Output sample might not have the given width and height, and + resize behaviour depends on the parameter 'resize_method'. + Defaults to False. + ensure_multiple_of (int, optional): + Output width and height is constrained to be multiple of this parameter. + Defaults to 1. + resize_method (str, optional): + "lower_bound": Output will be at least as large as the given size. + "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.) + "minimal": Scale as least as possible. (Output size might be smaller than given size.) + Defaults to "lower_bound". 
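+            image_interpolation_method (int, optional):
+                OpenCV interpolation flag used to resize the image.
+                Defaults to cv2.INTER_AREA.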
+ """ + self.__width = width + self.__height = height + + self.__resize_target = resize_target + self.__keep_aspect_ratio = keep_aspect_ratio + self.__multiple_of = ensure_multiple_of + self.__resize_method = resize_method + self.__image_interpolation_method = image_interpolation_method + + def constrain_to_multiple_of(self, x, min_val=0, max_val=None): + y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if max_val is not None and y > max_val: + y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int) + + if y < min_val: + y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int) + + return y + + def get_size(self, width, height): + # determine new height and width + scale_height = self.__height / height + scale_width = self.__width / width + + if self.__keep_aspect_ratio: + if self.__resize_method == "lower_bound": + # scale such that output size is lower bound + if scale_width > scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "upper_bound": + # scale such that output size is upper bound + if scale_width < scale_height: + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + elif self.__resize_method == "minimal": + # scale as least as possbile + if abs(1 - scale_width) < abs(1 - scale_height): + # fit width + scale_height = scale_width + else: + # fit height + scale_width = scale_height + else: + raise ValueError( + f"resize_method {self.__resize_method} not implemented" + ) + + if self.__resize_method == "lower_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, min_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, min_val=self.__width + ) + elif self.__resize_method == "upper_bound": + new_height = self.constrain_to_multiple_of( + scale_height * height, max_val=self.__height + ) + new_width = self.constrain_to_multiple_of( + scale_width * width, max_val=self.__width + ) + elif self.__resize_method == "minimal": + new_height = self.constrain_to_multiple_of(scale_height * height) + new_width = self.constrain_to_multiple_of(scale_width * width) + else: + raise ValueError(f"resize_method {self.__resize_method} not implemented") + + return (new_width, new_height) + + def __call__(self, sample): + width, height = self.get_size( + sample["image"].shape[1], sample["image"].shape[0] + ) + + # resize sample + sample["image"] = cv2.resize( + sample["image"], + (width, height), + interpolation=self.__image_interpolation_method, + ) + + if self.__resize_target: + if "disparity" in sample: + sample["disparity"] = cv2.resize( + sample["disparity"], + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + + if "depth" in sample: + sample["depth"] = cv2.resize( + sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST + ) + + sample["mask"] = cv2.resize( + sample["mask"].astype(np.float32), + (width, height), + interpolation=cv2.INTER_NEAREST, + ) + sample["mask"] = sample["mask"].astype(bool) + + return sample + + +class NormalizeImage(object): + """Normlize image by given mean and std. + """ + + def __init__(self, mean, std): + self.__mean = mean + self.__std = std + + def __call__(self, sample): + sample["image"] = (sample["image"] - self.__mean) / self.__std + + return sample + + +class PrepareForNet(object): + """Prepare sample for usage as network input. 
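+
+    Transposes the image from HWC to CHW layout and converts the image, mask,
+    disparity and depth entries to contiguous float32 arrays.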
+ """ + + def __init__(self): + pass + + def __call__(self, sample): + image = np.transpose(sample["image"], (2, 0, 1)) + sample["image"] = np.ascontiguousarray(image).astype(np.float32) + + if "mask" in sample: + sample["mask"] = sample["mask"].astype(np.float32) + sample["mask"] = np.ascontiguousarray(sample["mask"]) + + if "disparity" in sample: + disparity = sample["disparity"].astype(np.float32) + sample["disparity"] = np.ascontiguousarray(disparity) + + if "depth" in sample: + depth = sample["depth"].astype(np.float32) + sample["depth"] = np.ascontiguousarray(depth) + + return sample diff --git a/custom_mmpkg/__init__.py b/custom_mmpkg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..33e7a7f594ef441479257c788e4c0d6e08657fc8 --- /dev/null +++ b/custom_mmpkg/__init__.py @@ -0,0 +1 @@ +#Dummy file ensuring this package will be recognized \ No newline at end of file diff --git a/custom_mmpkg/custom_mmcv/__init__.py b/custom_mmpkg/custom_mmcv/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..210a2989138380559f23045b568d0fbbeb918c03 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# flake8: noqa +from .arraymisc import * +from .fileio import * +from .image import * +from .utils import * +from .version import * +from .video import * +from .visualization import * + +# The following modules are not imported to this level, so mmcv may be used +# without PyTorch. +# - runner +# - parallel +# - op diff --git a/custom_mmpkg/custom_mmcv/arraymisc/__init__.py b/custom_mmpkg/custom_mmcv/arraymisc/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4b4700d6139ae3d604ff6e542468cce4200c020c --- /dev/null +++ b/custom_mmpkg/custom_mmcv/arraymisc/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .quantization import dequantize, quantize + +__all__ = ['quantize', 'dequantize'] diff --git a/custom_mmpkg/custom_mmcv/arraymisc/quantization.py b/custom_mmpkg/custom_mmcv/arraymisc/quantization.py new file mode 100644 index 0000000000000000000000000000000000000000..8e47a3545780cf071a1ef8195efb0b7b662c8186 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/arraymisc/quantization.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + + +def quantize(arr, min_val, max_val, levels, dtype=np.int64): + """Quantize an array of (-inf, inf) to [0, levels-1]. + + Args: + arr (ndarray): Input array. + min_val (scalar): Minimum value to be clipped. + max_val (scalar): Maximum value to be clipped. + levels (int): Quantization levels. + dtype (np.type): The type of the quantized array. + + Returns: + tuple: Quantized array. + """ + if not (isinstance(levels, int) and levels > 1): + raise ValueError( + f'levels must be a positive integer, but got {levels}') + if min_val >= max_val: + raise ValueError( + f'min_val ({min_val}) must be smaller than max_val ({max_val})') + + arr = np.clip(arr, min_val, max_val) - min_val + quantized_arr = np.minimum( + np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1) + + return quantized_arr + + +def dequantize(arr, min_val, max_val, levels, dtype=np.float64): + """Dequantize an array. + + Args: + arr (ndarray): Input array. + min_val (scalar): Minimum value to be clipped. + max_val (scalar): Maximum value to be clipped. + levels (int): Quantization levels. + dtype (np.type): The type of the dequantized array. 
+ + Returns: + tuple: Dequantized array. + """ + if not (isinstance(levels, int) and levels > 1): + raise ValueError( + f'levels must be a positive integer, but got {levels}') + if min_val >= max_val: + raise ValueError( + f'min_val ({min_val}) must be smaller than max_val ({max_val})') + + dequantized_arr = (arr + 0.5).astype(dtype) * (max_val - + min_val) / levels + min_val + + return dequantized_arr diff --git a/custom_mmpkg/custom_mmcv/cnn/__init__.py b/custom_mmpkg/custom_mmcv/cnn/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7246c897430f0cc7ce12719ad8608824fc734446 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/__init__.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .alexnet import AlexNet +# yapf: disable +from .bricks import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, + PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS, + ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule, + ConvTranspose2d, ConvTranspose3d, ConvWS2d, + DepthwiseSeparableConvModule, GeneralizedAttention, + HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d, + NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish, + build_activation_layer, build_conv_layer, + build_norm_layer, build_padding_layer, build_plugin_layer, + build_upsample_layer, conv_ws_2d, is_norm) +from .builder import MODELS, build_model_from_cfg +# yapf: enable +from .resnet import ResNet, make_res_layer +from .utils import (INITIALIZERS, Caffe2XavierInit, ConstantInit, KaimingInit, + NormalInit, PretrainedInit, TruncNormalInit, UniformInit, + XavierInit, bias_init_with_prob, caffe2_xavier_init, + constant_init, fuse_conv_bn, get_model_complexity_info, + initialize, kaiming_init, normal_init, trunc_normal_init, + uniform_init, xavier_init) +from .vgg import VGG, make_vgg_layer + +__all__ = [ + 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer', + 'constant_init', 'xavier_init', 'normal_init', 'trunc_normal_init', + 'uniform_init', 'kaiming_init', 'caffe2_xavier_init', + 'bias_init_with_prob', 'ConvModule', 'build_activation_layer', + 'build_conv_layer', 'build_norm_layer', 'build_padding_layer', + 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'NonLocal1d', + 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish', + 'GeneralizedAttention', 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', + 'PADDING_LAYERS', 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', + 'get_model_complexity_info', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d', + 'fuse_conv_bn', 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d', + 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', + 'initialize', 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', + 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', + 'Caffe2XavierInit', 'MODELS', 'build_model_from_cfg' +] diff --git a/custom_mmpkg/custom_mmcv/cnn/alexnet.py b/custom_mmpkg/custom_mmcv/cnn/alexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..89e36b8c7851f895d9ae7f07149f0e707456aab0 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/alexnet.py @@ -0,0 +1,61 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +import torch.nn as nn + + +class AlexNet(nn.Module): + """AlexNet backbone. + + Args: + num_classes (int): number of classes for classification. 
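+            If ``num_classes <= 0`` (the default is -1), the classifier head is not
+            built and ``forward`` returns the convolutional feature map instead of
+            class logits.
+
+    Example::
+
+        >>> import torch
+        >>> model = AlexNet(num_classes=10)
+        >>> out = model(torch.randn(1, 3, 224, 224))
+        >>> out.shape
+        torch.Size([1, 10])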
+ """ + + def __init__(self, num_classes=-1): + super(AlexNet, self).__init__() + self.num_classes = num_classes + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Dropout(), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + from ..runner import load_checkpoint + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + # use default initializer + pass + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + + x = self.features(x) + if self.num_classes > 0: + x = x.view(x.size(0), 256 * 6 * 6) + x = self.classifier(x) + + return x diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/__init__.py b/custom_mmpkg/custom_mmcv/cnn/bricks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0f33124ed23fc6f27119a37bcb5ab004d3572be0 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/__init__.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .activation import build_activation_layer +from .context_block import ContextBlock +from .conv import build_conv_layer +from .conv2d_adaptive_padding import Conv2dAdaptivePadding +from .conv_module import ConvModule +from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d +from .depthwise_separable_conv_module import DepthwiseSeparableConvModule +from .drop import Dropout, DropPath +from .generalized_attention import GeneralizedAttention +from .hsigmoid import HSigmoid +from .hswish import HSwish +from .non_local import NonLocal1d, NonLocal2d, NonLocal3d +from .norm import build_norm_layer, is_norm +from .padding import build_padding_layer +from .plugin import build_plugin_layer +from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS, + PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS) +from .scale import Scale +from .swish import Swish +from .upsample import build_upsample_layer +from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d, + Linear, MaxPool2d, MaxPool3d) + +__all__ = [ + 'ConvModule', 'build_activation_layer', 'build_conv_layer', + 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer', + 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d', + 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention', + 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS', + 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d', + 'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear', + 'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d', + 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath' +] diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/activation.py b/custom_mmpkg/custom_mmcv/cnn/bricks/activation.py new file mode 100644 index 
0000000000000000000000000000000000000000..0881d7201de63ea47c9e585eead35f5c12c1881f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/activation.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_mmpkg.custom_mmcv.utils import TORCH_VERSION, build_from_cfg, digit_version +from .registry import ACTIVATION_LAYERS + +for module in [ + nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU, + nn.Sigmoid, nn.Tanh +]: + ACTIVATION_LAYERS.register_module(module=module) + + +@ACTIVATION_LAYERS.register_module(name='Clip') +@ACTIVATION_LAYERS.register_module() +class Clamp(nn.Module): + """Clamp activation layer. + + This activation function is to clamp the feature map value within + :math:`[min, max]`. More details can be found in ``torch.clamp()``. + + Args: + min (Number | optional): Lower-bound of the range to be clamped to. + Default to -1. + max (Number | optional): Upper-bound of the range to be clamped to. + Default to 1. + """ + + def __init__(self, min=-1., max=1.): + super(Clamp, self).__init__() + self.min = min + self.max = max + + def forward(self, x): + """Forward function. + + Args: + x (torch.Tensor): The input tensor. + + Returns: + torch.Tensor: Clamped tensor. + """ + return torch.clamp(x, min=self.min, max=self.max) + + +class GELU(nn.Module): + r"""Applies the Gaussian Error Linear Units function: + + .. math:: + \text{GELU}(x) = x * \Phi(x) + where :math:`\Phi(x)` is the Cumulative Distribution Function for + Gaussian Distribution. + + Shape: + - Input: :math:`(N, *)` where `*` means, any number of additional + dimensions + - Output: :math:`(N, *)`, same shape as the input + + .. image:: scripts/activation_images/GELU.png + + Examples:: + + >>> m = nn.GELU() + >>> input = torch.randn(2) + >>> output = m(input) + """ + + def forward(self, input): + return F.gelu(input) + + +if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.4')): + ACTIVATION_LAYERS.register_module(module=GELU) +else: + ACTIVATION_LAYERS.register_module(module=nn.GELU) + + +def build_activation_layer(cfg): + """Build activation layer. + + Args: + cfg (dict): The activation layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate an activation layer. + + Returns: + nn.Module: Created activation layer. + """ + return build_from_cfg(cfg, ACTIVATION_LAYERS) diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/context_block.py b/custom_mmpkg/custom_mmcv/cnn/bricks/context_block.py new file mode 100644 index 0000000000000000000000000000000000000000..d60fdb904c749ce3b251510dff3cc63cea70d42e --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/context_block.py @@ -0,0 +1,125 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn + +from ..utils import constant_init, kaiming_init +from .registry import PLUGIN_LAYERS + + +def last_zero_init(m): + if isinstance(m, nn.Sequential): + constant_init(m[-1], val=0) + else: + constant_init(m, val=0) + + +@PLUGIN_LAYERS.register_module() +class ContextBlock(nn.Module): + """ContextBlock module in GCNet. + + See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + (https://arxiv.org/abs/1904.11492) for details. + + Args: + in_channels (int): Channels of the input feature map. + ratio (float): Ratio of channels of transform bottleneck + pooling_type (str): Pooling method for context modeling. 
+ Options are 'att' and 'avg', stand for attention pooling and + average pooling respectively. Default: 'att'. + fusion_types (Sequence[str]): Fusion method for feature fusion, + Options are 'channels_add', 'channel_mul', stand for channelwise + addition and multiplication respectively. Default: ('channel_add',) + """ + + _abbr_ = 'context_block' + + def __init__(self, + in_channels, + ratio, + pooling_type='att', + fusion_types=('channel_add', )): + super(ContextBlock, self).__init__() + assert pooling_type in ['avg', 'att'] + assert isinstance(fusion_types, (list, tuple)) + valid_fusion_types = ['channel_add', 'channel_mul'] + assert all([f in valid_fusion_types for f in fusion_types]) + assert len(fusion_types) > 0, 'at least one fusion should be used' + self.in_channels = in_channels + self.ratio = ratio + self.planes = int(in_channels * ratio) + self.pooling_type = pooling_type + self.fusion_types = fusion_types + if pooling_type == 'att': + self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1) + self.softmax = nn.Softmax(dim=2) + else: + self.avg_pool = nn.AdaptiveAvgPool2d(1) + if 'channel_add' in fusion_types: + self.channel_add_conv = nn.Sequential( + nn.Conv2d(self.in_channels, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) + else: + self.channel_add_conv = None + if 'channel_mul' in fusion_types: + self.channel_mul_conv = nn.Sequential( + nn.Conv2d(self.in_channels, self.planes, kernel_size=1), + nn.LayerNorm([self.planes, 1, 1]), + nn.ReLU(inplace=True), # yapf: disable + nn.Conv2d(self.planes, self.in_channels, kernel_size=1)) + else: + self.channel_mul_conv = None + self.reset_parameters() + + def reset_parameters(self): + if self.pooling_type == 'att': + kaiming_init(self.conv_mask, mode='fan_in') + self.conv_mask.inited = True + + if self.channel_add_conv is not None: + last_zero_init(self.channel_add_conv) + if self.channel_mul_conv is not None: + last_zero_init(self.channel_mul_conv) + + def spatial_pool(self, x): + batch, channel, height, width = x.size() + if self.pooling_type == 'att': + input_x = x + # [N, C, H * W] + input_x = input_x.view(batch, channel, height * width) + # [N, 1, C, H * W] + input_x = input_x.unsqueeze(1) + # [N, 1, H, W] + context_mask = self.conv_mask(x) + # [N, 1, H * W] + context_mask = context_mask.view(batch, 1, height * width) + # [N, 1, H * W] + context_mask = self.softmax(context_mask) + # [N, 1, H * W, 1] + context_mask = context_mask.unsqueeze(-1) + # [N, 1, C, 1] + context = torch.matmul(input_x, context_mask) + # [N, C, 1, 1] + context = context.view(batch, channel, 1, 1) + else: + # [N, C, 1, 1] + context = self.avg_pool(x) + + return context + + def forward(self, x): + # [N, C, 1, 1] + context = self.spatial_pool(x) + + out = x + if self.channel_mul_conv is not None: + # [N, C, 1, 1] + channel_mul_term = torch.sigmoid(self.channel_mul_conv(context)) + out = out * channel_mul_term + if self.channel_add_conv is not None: + # [N, C, 1, 1] + channel_add_term = self.channel_add_conv(context) + out = out + channel_add_term + + return out diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/conv.py b/custom_mmpkg/custom_mmcv/cnn/bricks/conv.py new file mode 100644 index 0000000000000000000000000000000000000000..cf54491997a48ac3e7fadc4183ab7bf3e831024c --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/conv.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
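+# The registrations below expose the standard PyTorch convolution layers under
+# string keys so that build_conv_layer() can create them from a config dict,
+# e.g. build_conv_layer(dict(type='Conv2d'), 16, 32, kernel_size=3, padding=1).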
+from torch import nn + +from .registry import CONV_LAYERS + +CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d) +CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d) +CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d) +CONV_LAYERS.register_module('Conv', module=nn.Conv2d) + + +def build_conv_layer(cfg, *args, **kwargs): + """Build convolution layer. + + Args: + cfg (None or dict): The conv layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate an conv layer. + args (argument list): Arguments passed to the `__init__` + method of the corresponding conv layer. + kwargs (keyword arguments): Keyword arguments passed to the `__init__` + method of the corresponding conv layer. + + Returns: + nn.Module: Created conv layer. + """ + if cfg is None: + cfg_ = dict(type='Conv2d') + else: + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in CONV_LAYERS: + raise KeyError(f'Unrecognized norm type {layer_type}') + else: + conv_layer = CONV_LAYERS.get(layer_type) + + layer = conv_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/conv2d_adaptive_padding.py b/custom_mmpkg/custom_mmcv/cnn/bricks/conv2d_adaptive_padding.py new file mode 100644 index 0000000000000000000000000000000000000000..b45e758ac6cf8dfb0382d072fe09125bc7e9b888 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/conv2d_adaptive_padding.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +from torch import nn +from torch.nn import functional as F + +from .registry import CONV_LAYERS + + +@CONV_LAYERS.register_module() +class Conv2dAdaptivePadding(nn.Conv2d): + """Implementation of 2D convolution in tensorflow with `padding` as "same", + which applies padding to input (if needed) so that input image gets fully + covered by filter and stride you specified. For stride 1, this will ensure + that output image size is same as input. For stride of 2, output dimensions + will be half, for example. + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 0 + dilation (int or tuple, optional): Spacing between kernel elements. + Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the + output. 
Default: ``True`` + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): + super().__init__(in_channels, out_channels, kernel_size, stride, 0, + dilation, groups, bias) + + def forward(self, x): + img_h, img_w = x.size()[-2:] + kernel_h, kernel_w = self.weight.size()[-2:] + stride_h, stride_w = self.stride + output_h = math.ceil(img_h / stride_h) + output_w = math.ceil(img_w / stride_w) + pad_h = ( + max((output_h - 1) * self.stride[0] + + (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0)) + pad_w = ( + max((output_w - 1) * self.stride[1] + + (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0)) + if pad_h > 0 or pad_w > 0: + x = F.pad(x, [ + pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2 + ]) + return F.conv2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/conv_module.py b/custom_mmpkg/custom_mmcv/cnn/bricks/conv_module.py new file mode 100644 index 0000000000000000000000000000000000000000..f9b82b6b35939be7031462d3febb6561e42854ea --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/conv_module.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import torch.nn as nn + +from custom_mmpkg.custom_mmcv.utils import _BatchNorm, _InstanceNorm +from ..utils import constant_init, kaiming_init +from .activation import build_activation_layer +from .conv import build_conv_layer +from .norm import build_norm_layer +from .padding import build_padding_layer +from .registry import PLUGIN_LAYERS + + +@PLUGIN_LAYERS.register_module() +class ConvModule(nn.Module): + """A conv block that bundles conv/norm/activation layers. + + This block simplifies the usage of convolution layers, which are commonly + used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). + It is based upon three build methods: `build_conv_layer()`, + `build_norm_layer()` and `build_activation_layer()`. + + Besides, we add some additional features in this module. + 1. Automatically set `bias` of the conv layer. + 2. Spectral norm is supported. + 3. More padding modes are supported. Before PyTorch 1.5, nn.Conv2d only + supports zero and circular padding, and we add "reflect" padding mode. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. + groups (int): Number of blocked connections from input channels to + output channels. Same as that in ``nn._ConvNd``. + bias (bool | str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise + False. Default: "auto". + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + inplace (bool): Whether to use inplace mode for activation. + Default: True. 
+ with_spectral_norm (bool): Whether use spectral norm in conv module. + Default: False. + padding_mode (str): If the `padding_mode` has not been supported by + current `Conv2d` in PyTorch, we will use our own padding layer + instead. Currently, we support ['zeros', 'circular'] with official + implementation and ['reflect'] with our own implementation. + Default: 'zeros'. + order (tuple[str]): The order of conv/norm/activation layers. It is a + sequence of "conv", "norm" and "act". Common examples are + ("conv", "norm", "act") and ("act", "conv", "norm"). + Default: ('conv', 'norm', 'act'). + """ + + _abbr_ = 'conv_block' + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias='auto', + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + inplace=True, + with_spectral_norm=False, + padding_mode='zeros', + order=('conv', 'norm', 'act')): + super(ConvModule, self).__init__() + assert conv_cfg is None or isinstance(conv_cfg, dict) + assert norm_cfg is None or isinstance(norm_cfg, dict) + assert act_cfg is None or isinstance(act_cfg, dict) + official_padding_mode = ['zeros', 'circular'] + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.inplace = inplace + self.with_spectral_norm = with_spectral_norm + self.with_explicit_padding = padding_mode not in official_padding_mode + self.order = order + assert isinstance(self.order, tuple) and len(self.order) == 3 + assert set(order) == set(['conv', 'norm', 'act']) + + self.with_norm = norm_cfg is not None + self.with_activation = act_cfg is not None + # if the conv layer is before a norm layer, bias is unnecessary. + if bias == 'auto': + bias = not self.with_norm + self.with_bias = bias + + if self.with_explicit_padding: + pad_cfg = dict(type=padding_mode) + self.padding_layer = build_padding_layer(pad_cfg, padding) + + # reset padding to 0 for conv module + conv_padding = 0 if self.with_explicit_padding else padding + # build convolution layer + self.conv = build_conv_layer( + conv_cfg, + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=conv_padding, + dilation=dilation, + groups=groups, + bias=bias) + # export the attributes of self.conv to a higher level for convenience + self.in_channels = self.conv.in_channels + self.out_channels = self.conv.out_channels + self.kernel_size = self.conv.kernel_size + self.stride = self.conv.stride + self.padding = padding + self.dilation = self.conv.dilation + self.transposed = self.conv.transposed + self.output_padding = self.conv.output_padding + self.groups = self.conv.groups + + if self.with_spectral_norm: + self.conv = nn.utils.spectral_norm(self.conv) + + # build normalization layers + if self.with_norm: + # norm layer is after conv layer + if order.index('norm') > order.index('conv'): + norm_channels = out_channels + else: + norm_channels = in_channels + self.norm_name, norm = build_norm_layer(norm_cfg, norm_channels) + self.add_module(self.norm_name, norm) + if self.with_bias: + if isinstance(norm, (_BatchNorm, _InstanceNorm)): + warnings.warn( + 'Unnecessary conv bias before batch/instance norm') + else: + self.norm_name = None + + # build activation layer + if self.with_activation: + act_cfg_ = act_cfg.copy() + # nn.Tanh has no 'inplace' argument + if act_cfg_['type'] not in [ + 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish' + ]: + act_cfg_.setdefault('inplace', inplace) + self.activate = build_activation_layer(act_cfg_) + + # Use msra init by default + 
self.init_weights() + + @property + def norm(self): + if self.norm_name: + return getattr(self, self.norm_name) + else: + return None + + def init_weights(self): + # 1. It is mainly for customized conv layers with their own + # initialization manners by calling their own ``init_weights()``, + # and we do not want ConvModule to override the initialization. + # 2. For customized conv layers without their own initialization + # manners (that is, they don't have their own ``init_weights()``) + # and PyTorch's conv layers, they will be initialized by + # this method with default ``kaiming_init``. + # Note: For PyTorch's conv layers, they will be overwritten by our + # initialization implementation using default ``kaiming_init``. + if not hasattr(self.conv, 'init_weights'): + if self.with_activation and self.act_cfg['type'] == 'LeakyReLU': + nonlinearity = 'leaky_relu' + a = self.act_cfg.get('negative_slope', 0.01) + else: + nonlinearity = 'relu' + a = 0 + kaiming_init(self.conv, a=a, nonlinearity=nonlinearity) + if self.with_norm: + constant_init(self.norm, 1, bias=0) + + def forward(self, x, activate=True, norm=True): + for layer in self.order: + if layer == 'conv': + if self.with_explicit_padding: + x = self.padding_layer(x) + x = self.conv(x) + elif layer == 'norm' and norm and self.with_norm: + x = self.norm(x) + elif layer == 'act' and activate and self.with_activation: + x = self.activate(x) + return x diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/conv_ws.py b/custom_mmpkg/custom_mmcv/cnn/bricks/conv_ws.py new file mode 100644 index 0000000000000000000000000000000000000000..a3941e27874993418b3b5708d5a7485f175ff9c8 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/conv_ws.py @@ -0,0 +1,148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .registry import CONV_LAYERS + + +def conv_ws_2d(input, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + eps=1e-5): + c_in = weight.size(0) + weight_flat = weight.view(c_in, -1) + mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1) + std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1) + weight = (weight - mean) / (std + eps) + return F.conv2d(input, weight, bias, stride, padding, dilation, groups) + + +@CONV_LAYERS.register_module('ConvWS') +class ConvWS2d(nn.Conv2d): + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + eps=1e-5): + super(ConvWS2d, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + self.eps = eps + + def forward(self, x): + return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding, + self.dilation, self.groups, self.eps) + + +@CONV_LAYERS.register_module(name='ConvAWS') +class ConvAWS2d(nn.Conv2d): + """AWS (Adaptive Weight Standardization) + + This is a variant of Weight Standardization + (https://arxiv.org/pdf/1903.10520.pdf) + It is used in DetectoRS to avoid NaN + (https://arxiv.org/pdf/2006.02334.pdf) + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the conv kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. 
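The sketch below shows how `ConvModule` is typically configured (illustrative only; it assumes the norm and activation bricks from this package import cleanly):

```python
import torch
from custom_mmpkg.custom_mmcv.cnn.bricks.conv_module import ConvModule

# conv -> BN -> ReLU; bias='auto' resolves to False because a norm
# layer follows the convolution.
block = ConvModule(16, 32, 3, padding=1,
                   norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'))
x = torch.randn(4, 16, 32, 32)
print(block(x).shape)           # torch.Size([4, 32, 32, 32])
print(block.conv.bias is None)  # True: the redundant bias is dropped automatically
```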
Default: 0 + dilation (int or tuple, optional): Spacing between kernel elements. + Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. Default: 1 + bias (bool, optional): If set True, adds a learnable bias to the + output. Default: True + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + self.register_buffer('weight_gamma', + torch.ones(self.out_channels, 1, 1, 1)) + self.register_buffer('weight_beta', + torch.zeros(self.out_channels, 1, 1, 1)) + + def _get_weight(self, weight): + weight_flat = weight.view(weight.size(0), -1) + mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) + std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) + weight = (weight - mean) / std + weight = self.weight_gamma * weight + self.weight_beta + return weight + + def forward(self, x): + weight = self._get_weight(self.weight) + return F.conv2d(x, weight, self.bias, self.stride, self.padding, + self.dilation, self.groups) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + """Override default load function. + + AWS overrides the function _load_from_state_dict to recover + weight_gamma and weight_beta if they are missing. If weight_gamma and + weight_beta are found in the checkpoint, this function will return + after super()._load_from_state_dict. Otherwise, it will compute the + mean and std of the pretrained weights and store them in weight_beta + and weight_gamma. + """ + + self.weight_gamma.data.fill_(-1) + local_missing_keys = [] + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, local_missing_keys, + unexpected_keys, error_msgs) + if self.weight_gamma.data.mean() > 0: + for k in local_missing_keys: + missing_keys.append(k) + return + weight = self.weight.data + weight_flat = weight.view(weight.size(0), -1) + mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1) + std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1) + self.weight_beta.data.copy_(mean) + self.weight_gamma.data.copy_(std) + missing_gamma_beta = [ + k for k in local_missing_keys + if k.endswith('weight_gamma') or k.endswith('weight_beta') + ] + for k in missing_gamma_beta: + local_missing_keys.remove(k) + for k in local_missing_keys: + missing_keys.append(k) diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/depthwise_separable_conv_module.py b/custom_mmpkg/custom_mmcv/cnn/bricks/depthwise_separable_conv_module.py new file mode 100644 index 0000000000000000000000000000000000000000..722d5d8d71f75486e2db3008907c4eadfca41d63 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/depthwise_separable_conv_module.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from .conv_module import ConvModule + + +class DepthwiseSeparableConvModule(nn.Module): + """Depthwise separable convolution module. + + See https://arxiv.org/pdf/1704.04861.pdf for details. + + This module can replace a ConvModule with the conv block replaced by two + conv block: depthwise conv block and pointwise conv block. The depthwise + conv block contains depthwise-conv/norm/activation layers. The pointwise + conv block contains pointwise-conv/norm/activation layers. 
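A quick check of the weight-standardized convolution defined above (a sketch; the comment describes what `conv_ws_2d` does on the fly, not a stored attribute):

```python
import torch
from custom_mmpkg.custom_mmcv.cnn.bricks.conv_ws import ConvWS2d

conv = ConvWS2d(8, 16, kernel_size=3, padding=1)
x = torch.randn(2, 8, 20, 20)
print(conv(x).shape)  # torch.Size([2, 16, 20, 20])

# conv_ws_2d standardizes each output-channel filter to zero mean and
# (roughly) unit std before calling F.conv2d; conv.weight itself is untouched.
w = conv.weight.view(conv.weight.size(0), -1)
print(w.mean(dim=1).abs().max())  # generally non-zero: standardization happens per forward
```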
It should be + noted that there will be norm/activation layer in the depthwise conv block + if `norm_cfg` and `act_cfg` are specified. + + Args: + in_channels (int): Number of channels in the input feature map. + Same as that in ``nn._ConvNd``. + out_channels (int): Number of channels produced by the convolution. + Same as that in ``nn._ConvNd``. + kernel_size (int | tuple[int]): Size of the convolving kernel. + Same as that in ``nn._ConvNd``. + stride (int | tuple[int]): Stride of the convolution. + Same as that in ``nn._ConvNd``. Default: 1. + padding (int | tuple[int]): Zero-padding added to both sides of + the input. Same as that in ``nn._ConvNd``. Default: 0. + dilation (int | tuple[int]): Spacing between kernel elements. + Same as that in ``nn._ConvNd``. Default: 1. + norm_cfg (dict): Default norm config for both depthwise ConvModule and + pointwise ConvModule. Default: None. + act_cfg (dict): Default activation config for both depthwise ConvModule + and pointwise ConvModule. Default: dict(type='ReLU'). + dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is + 'default', it will be the same as `norm_cfg`. Default: 'default'. + dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is + 'default', it will be the same as `act_cfg`. Default: 'default'. + pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is + 'default', it will be the same as `norm_cfg`. Default: 'default'. + pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is + 'default', it will be the same as `act_cfg`. Default: 'default'. + kwargs (optional): Other shared arguments for depthwise and pointwise + ConvModule. See ConvModule for ref. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + dw_norm_cfg='default', + dw_act_cfg='default', + pw_norm_cfg='default', + pw_act_cfg='default', + **kwargs): + super(DepthwiseSeparableConvModule, self).__init__() + assert 'groups' not in kwargs, 'groups should not be specified' + + # if norm/activation config of depthwise/pointwise ConvModule is not + # specified, use default config. + dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg + dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg + pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg + pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg + + # depthwise convolution + self.depthwise_conv = ConvModule( + in_channels, + in_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=in_channels, + norm_cfg=dw_norm_cfg, + act_cfg=dw_act_cfg, + **kwargs) + + self.pointwise_conv = ConvModule( + in_channels, + out_channels, + 1, + norm_cfg=pw_norm_cfg, + act_cfg=pw_act_cfg, + **kwargs) + + def forward(self, x): + x = self.depthwise_conv(x) + x = self.pointwise_conv(x) + return x diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/drop.py b/custom_mmpkg/custom_mmcv/cnn/bricks/drop.py new file mode 100644 index 0000000000000000000000000000000000000000..90d192e3d3855d432bab5575406a09d5ff1aa94c --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/drop.py @@ -0,0 +1,65 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from custom_mmpkg.custom_mmcv import build_from_cfg +from .registry import DROPOUT_LAYERS + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). 
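To make the parameter saving of the depthwise/pointwise factorization concrete, here is a small comparison (sketch; exact counts depend on the bias terms):

```python
import torch
from custom_mmpkg.custom_mmcv.cnn.bricks.conv_module import ConvModule
from custom_mmpkg.custom_mmcv.cnn.bricks.depthwise_separable_conv_module import \
    DepthwiseSeparableConvModule

def n_params(m):
    return sum(p.numel() for p in m.parameters())

regular = ConvModule(64, 128, 3, padding=1)
separable = DepthwiseSeparableConvModule(64, 128, 3, padding=1)

x = torch.randn(1, 64, 32, 32)
print(regular(x).shape, separable(x).shape)    # both torch.Size([1, 128, 32, 32])
print(n_params(regular), n_params(separable))  # roughly 74k vs roughly 9k parameters
```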
+ + We follow the implementation + https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + # handle tensors with different dimensions, not just 4D tensors. + shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + output = x.div(keep_prob) * random_tensor.floor() + return output + + +@DROPOUT_LAYERS.register_module() +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). + + We follow the implementation + https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501 + + Args: + drop_prob (float): Probability of the path to be zeroed. Default: 0.1 + """ + + def __init__(self, drop_prob=0.1): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +@DROPOUT_LAYERS.register_module() +class Dropout(nn.Dropout): + """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of + ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with + ``DropPath`` + + Args: + drop_prob (float): Probability of the elements to be + zeroed. Default: 0.5. + inplace (bool): Do the operation inplace or not. Default: False. + """ + + def __init__(self, drop_prob=0.5, inplace=False): + super().__init__(p=drop_prob, inplace=inplace) + + +def build_dropout(cfg, default_args=None): + """Builder for drop out layers.""" + return build_from_cfg(cfg, DROPOUT_LAYERS, default_args) diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/generalized_attention.py b/custom_mmpkg/custom_mmcv/cnn/bricks/generalized_attention.py new file mode 100644 index 0000000000000000000000000000000000000000..988d9adf2f289ef223bd1c680a5ae1d3387f0269 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/generalized_attention.py @@ -0,0 +1,412 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import kaiming_init +from .registry import PLUGIN_LAYERS + + +@PLUGIN_LAYERS.register_module() +class GeneralizedAttention(nn.Module): + """GeneralizedAttention module. + + See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks' + (https://arxiv.org/abs/1711.07971) for details. + + Args: + in_channels (int): Channels of the input feature map. + spatial_range (int): The spatial range. -1 indicates no spatial range + constraint. Default: -1. + num_heads (int): The head number of empirical_attention module. + Default: 9. + position_embedding_dim (int): The position embedding dimension. + Default: -1. + position_magnitude (int): A multiplier acting on coord difference. + Default: 1. + kv_stride (int): The feature stride acting on key/value feature map. + Default: 2. + q_stride (int): The feature stride acting on query feature map. + Default: 1. + attention_type (str): A binary indicator string for indicating which + items in generalized empirical_attention module are used. + Default: '1111'. 
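A minimal behavioural sketch of `DropPath` built through `build_dropout` (assumes the registry imports work as above):

```python
import torch
from custom_mmpkg.custom_mmcv.cnn.bricks.drop import build_dropout

dp = build_dropout(dict(type='DropPath', drop_prob=0.2))
x = torch.ones(8, 16, 4, 4)

dp.train()
out = dp(x)  # ~20% of the samples in the batch are zeroed, the rest rescaled by 1/(1-0.2)

dp.eval()
print(torch.equal(dp(x), x))  # True: DropPath is the identity at eval time
```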
+ + - '1000' indicates 'query and key content' (appr - appr) item, + - '0100' indicates 'query content and relative position' + (appr - position) item, + - '0010' indicates 'key content only' (bias - appr) item, + - '0001' indicates 'relative position only' (bias - position) item. + """ + + _abbr_ = 'gen_attention_block' + + def __init__(self, + in_channels, + spatial_range=-1, + num_heads=9, + position_embedding_dim=-1, + position_magnitude=1, + kv_stride=2, + q_stride=1, + attention_type='1111'): + + super(GeneralizedAttention, self).__init__() + + # hard range means local range for non-local operation + self.position_embedding_dim = ( + position_embedding_dim + if position_embedding_dim > 0 else in_channels) + + self.position_magnitude = position_magnitude + self.num_heads = num_heads + self.in_channels = in_channels + self.spatial_range = spatial_range + self.kv_stride = kv_stride + self.q_stride = q_stride + self.attention_type = [bool(int(_)) for _ in attention_type] + self.qk_embed_dim = in_channels // num_heads + out_c = self.qk_embed_dim * num_heads + + if self.attention_type[0] or self.attention_type[1]: + self.query_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_c, + kernel_size=1, + bias=False) + self.query_conv.kaiming_init = True + + if self.attention_type[0] or self.attention_type[2]: + self.key_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_c, + kernel_size=1, + bias=False) + self.key_conv.kaiming_init = True + + self.v_dim = in_channels // num_heads + self.value_conv = nn.Conv2d( + in_channels=in_channels, + out_channels=self.v_dim * num_heads, + kernel_size=1, + bias=False) + self.value_conv.kaiming_init = True + + if self.attention_type[1] or self.attention_type[3]: + self.appr_geom_fc_x = nn.Linear( + self.position_embedding_dim // 2, out_c, bias=False) + self.appr_geom_fc_x.kaiming_init = True + + self.appr_geom_fc_y = nn.Linear( + self.position_embedding_dim // 2, out_c, bias=False) + self.appr_geom_fc_y.kaiming_init = True + + if self.attention_type[2]: + stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) + appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv + self.appr_bias = nn.Parameter(appr_bias_value) + + if self.attention_type[3]: + stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2) + geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv + self.geom_bias = nn.Parameter(geom_bias_value) + + self.proj_conv = nn.Conv2d( + in_channels=self.v_dim * num_heads, + out_channels=in_channels, + kernel_size=1, + bias=True) + self.proj_conv.kaiming_init = True + self.gamma = nn.Parameter(torch.zeros(1)) + + if self.spatial_range >= 0: + # only works when non local is after 3*3 conv + if in_channels == 256: + max_len = 84 + elif in_channels == 512: + max_len = 42 + + max_len_kv = int((max_len - 1.0) / self.kv_stride + 1) + local_constraint_map = np.ones( + (max_len, max_len, max_len_kv, max_len_kv), dtype=np.int) + for iy in range(max_len): + for ix in range(max_len): + local_constraint_map[ + iy, ix, + max((iy - self.spatial_range) // + self.kv_stride, 0):min((iy + self.spatial_range + + 1) // self.kv_stride + + 1, max_len), + max((ix - self.spatial_range) // + self.kv_stride, 0):min((ix + self.spatial_range + + 1) // self.kv_stride + + 1, max_len)] = 0 + + self.local_constraint_map = nn.Parameter( + torch.from_numpy(local_constraint_map).byte(), + requires_grad=False) + + if self.q_stride > 1: + self.q_downsample = nn.AvgPool2d( + kernel_size=1, stride=self.q_stride) + else: + self.q_downsample = None + + if self.kv_stride > 1: + 
self.kv_downsample = nn.AvgPool2d( + kernel_size=1, stride=self.kv_stride) + else: + self.kv_downsample = None + + self.init_weights() + + def get_position_embedding(self, + h, + w, + h_kv, + w_kv, + q_stride, + kv_stride, + device, + dtype, + feat_dim, + wave_length=1000): + # the default type of Tensor is float32, leading to type mismatch + # in fp16 mode. Cast it to support fp16 mode. + h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype) + h_idxs = h_idxs.view((h, 1)) * q_stride + + w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype) + w_idxs = w_idxs.view((w, 1)) * q_stride + + h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to( + device=device, dtype=dtype) + h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride + + w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to( + device=device, dtype=dtype) + w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride + + # (h, h_kv, 1) + h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0) + h_diff *= self.position_magnitude + + # (w, w_kv, 1) + w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0) + w_diff *= self.position_magnitude + + feat_range = torch.arange(0, feat_dim / 4).to( + device=device, dtype=dtype) + + dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype) + dim_mat = dim_mat**((4. / feat_dim) * feat_range) + dim_mat = dim_mat.view((1, 1, -1)) + + embedding_x = torch.cat( + ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2) + + embedding_y = torch.cat( + ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2) + + return embedding_x, embedding_y + + def forward(self, x_input): + num_heads = self.num_heads + + # use empirical_attention + if self.q_downsample is not None: + x_q = self.q_downsample(x_input) + else: + x_q = x_input + n, _, h, w = x_q.shape + + if self.kv_downsample is not None: + x_kv = self.kv_downsample(x_input) + else: + x_kv = x_input + _, _, h_kv, w_kv = x_kv.shape + + if self.attention_type[0] or self.attention_type[1]: + proj_query = self.query_conv(x_q).view( + (n, num_heads, self.qk_embed_dim, h * w)) + proj_query = proj_query.permute(0, 1, 3, 2) + + if self.attention_type[0] or self.attention_type[2]: + proj_key = self.key_conv(x_kv).view( + (n, num_heads, self.qk_embed_dim, h_kv * w_kv)) + + if self.attention_type[1] or self.attention_type[3]: + position_embed_x, position_embed_y = self.get_position_embedding( + h, w, h_kv, w_kv, self.q_stride, self.kv_stride, + x_input.device, x_input.dtype, self.position_embedding_dim) + # (n, num_heads, w, w_kv, dim) + position_feat_x = self.appr_geom_fc_x(position_embed_x).\ + view(1, w, w_kv, num_heads, self.qk_embed_dim).\ + permute(0, 3, 1, 2, 4).\ + repeat(n, 1, 1, 1, 1) + + # (n, num_heads, h, h_kv, dim) + position_feat_y = self.appr_geom_fc_y(position_embed_y).\ + view(1, h, h_kv, num_heads, self.qk_embed_dim).\ + permute(0, 3, 1, 2, 4).\ + repeat(n, 1, 1, 1, 1) + + position_feat_x /= math.sqrt(2) + position_feat_y /= math.sqrt(2) + + # accelerate for saliency only + if (np.sum(self.attention_type) == 1) and self.attention_type[2]: + appr_bias = self.appr_bias.\ + view(1, num_heads, 1, self.qk_embed_dim).\ + repeat(n, 1, 1, 1) + + energy = torch.matmul(appr_bias, proj_key).\ + view(n, num_heads, 1, h_kv * w_kv) + + h = 1 + w = 1 + else: + # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for + if not self.attention_type[0]: + energy = torch.zeros( + n, + num_heads, + h, + w, + h_kv, + w_kv, + dtype=x_input.dtype, + device=x_input.device) + + # attention_type[0]: appr - appr + # attention_type[1]: appr - position 
+ # attention_type[2]: bias - appr + # attention_type[3]: bias - position + if self.attention_type[0] or self.attention_type[2]: + if self.attention_type[0] and self.attention_type[2]: + appr_bias = self.appr_bias.\ + view(1, num_heads, 1, self.qk_embed_dim) + energy = torch.matmul(proj_query + appr_bias, proj_key).\ + view(n, num_heads, h, w, h_kv, w_kv) + + elif self.attention_type[0]: + energy = torch.matmul(proj_query, proj_key).\ + view(n, num_heads, h, w, h_kv, w_kv) + + elif self.attention_type[2]: + appr_bias = self.appr_bias.\ + view(1, num_heads, 1, self.qk_embed_dim).\ + repeat(n, 1, 1, 1) + + energy += torch.matmul(appr_bias, proj_key).\ + view(n, num_heads, 1, 1, h_kv, w_kv) + + if self.attention_type[1] or self.attention_type[3]: + if self.attention_type[1] and self.attention_type[3]: + geom_bias = self.geom_bias.\ + view(1, num_heads, 1, self.qk_embed_dim) + + proj_query_reshape = (proj_query + geom_bias).\ + view(n, num_heads, h, w, self.qk_embed_dim) + + energy_x = torch.matmul( + proj_query_reshape.permute(0, 1, 3, 2, 4), + position_feat_x.permute(0, 1, 2, 4, 3)) + energy_x = energy_x.\ + permute(0, 1, 3, 2, 4).unsqueeze(4) + + energy_y = torch.matmul( + proj_query_reshape, + position_feat_y.permute(0, 1, 2, 4, 3)) + energy_y = energy_y.unsqueeze(5) + + energy += energy_x + energy_y + + elif self.attention_type[1]: + proj_query_reshape = proj_query.\ + view(n, num_heads, h, w, self.qk_embed_dim) + proj_query_reshape = proj_query_reshape.\ + permute(0, 1, 3, 2, 4) + position_feat_x_reshape = position_feat_x.\ + permute(0, 1, 2, 4, 3) + position_feat_y_reshape = position_feat_y.\ + permute(0, 1, 2, 4, 3) + + energy_x = torch.matmul(proj_query_reshape, + position_feat_x_reshape) + energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4) + + energy_y = torch.matmul(proj_query_reshape, + position_feat_y_reshape) + energy_y = energy_y.unsqueeze(5) + + energy += energy_x + energy_y + + elif self.attention_type[3]: + geom_bias = self.geom_bias.\ + view(1, num_heads, self.qk_embed_dim, 1).\ + repeat(n, 1, 1, 1) + + position_feat_x_reshape = position_feat_x.\ + view(n, num_heads, w*w_kv, self.qk_embed_dim) + + position_feat_y_reshape = position_feat_y.\ + view(n, num_heads, h * h_kv, self.qk_embed_dim) + + energy_x = torch.matmul(position_feat_x_reshape, geom_bias) + energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv) + + energy_y = torch.matmul(position_feat_y_reshape, geom_bias) + energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1) + + energy += energy_x + energy_y + + energy = energy.view(n, num_heads, h * w, h_kv * w_kv) + + if self.spatial_range >= 0: + cur_local_constraint_map = \ + self.local_constraint_map[:h, :w, :h_kv, :w_kv].\ + contiguous().\ + view(1, 1, h*w, h_kv*w_kv) + + energy = energy.masked_fill_(cur_local_constraint_map, + float('-inf')) + + attention = F.softmax(energy, 3) + + proj_value = self.value_conv(x_kv) + proj_value_reshape = proj_value.\ + view((n, num_heads, self.v_dim, h_kv * w_kv)).\ + permute(0, 1, 3, 2) + + out = torch.matmul(attention, proj_value_reshape).\ + permute(0, 1, 3, 2).\ + contiguous().\ + view(n, self.v_dim * self.num_heads, h, w) + + out = self.proj_conv(out) + + # output is downsampled, upsample back to input size + if self.q_downsample is not None: + out = F.interpolate( + out, + size=x_input.shape[2:], + mode='bilinear', + align_corners=False) + + out = self.gamma * out + x_input + return out + + def init_weights(self): + for m in self.modules(): + if hasattr(m, 'kaiming_init') and m.kaiming_init: + kaiming_init( + m, + 
mode='fan_in', + nonlinearity='leaky_relu', + bias=0, + distribution='uniform', + a=1) diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/hsigmoid.py b/custom_mmpkg/custom_mmcv/cnn/bricks/hsigmoid.py new file mode 100644 index 0000000000000000000000000000000000000000..30b1a3d6580cf0360710426fbea1f05acdf07b4b --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/hsigmoid.py @@ -0,0 +1,34 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from .registry import ACTIVATION_LAYERS + + +@ACTIVATION_LAYERS.register_module() +class HSigmoid(nn.Module): + """Hard Sigmoid Module. Apply the hard sigmoid function: + Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) + Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1) + + Args: + bias (float): Bias of the input feature map. Default: 1.0. + divisor (float): Divisor of the input feature map. Default: 2.0. + min_value (float): Lower bound value. Default: 0.0. + max_value (float): Upper bound value. Default: 1.0. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0): + super(HSigmoid, self).__init__() + self.bias = bias + self.divisor = divisor + assert self.divisor != 0 + self.min_value = min_value + self.max_value = max_value + + def forward(self, x): + x = (x + self.bias) / self.divisor + + return x.clamp_(self.min_value, self.max_value) diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/hswish.py b/custom_mmpkg/custom_mmcv/cnn/bricks/hswish.py new file mode 100644 index 0000000000000000000000000000000000000000..7e0c090ff037c99ee6c5c84c4592e87beae02208 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/hswish.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from .registry import ACTIVATION_LAYERS + + +@ACTIVATION_LAYERS.register_module() +class HSwish(nn.Module): + """Hard Swish Module. + + This module applies the hard swish function: + + .. math:: + Hswish(x) = x * ReLU6(x + 3) / 6 + + Args: + inplace (bool): can optionally do the operation in-place. + Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, inplace=False): + super(HSwish, self).__init__() + self.act = nn.ReLU6(inplace) + + def forward(self, x): + return x * self.act(x + 3) / 6 diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/non_local.py b/custom_mmpkg/custom_mmcv/cnn/bricks/non_local.py new file mode 100644 index 0000000000000000000000000000000000000000..92d00155ef275c1201ea66bba30470a1785cc5d7 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/non_local.py @@ -0,0 +1,306 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta + +import torch +import torch.nn as nn + +from ..utils import constant_init, normal_init +from .conv_module import ConvModule +from .registry import PLUGIN_LAYERS + + +class _NonLocalNd(nn.Module, metaclass=ABCMeta): + """Basic Non-local module. + + This module is proposed in + "Non-local Neural Networks" + Paper reference: https://arxiv.org/abs/1711.07971 + Code reference: https://github.com/AlexHex7/Non-local_pytorch + + Args: + in_channels (int): Channels of the input feature map. + reduction (int): Channel reduction ratio. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`. + Default: True. + conv_cfg (None | dict): The config dict for convolution layers. + If not specified, it will use `nn.Conv2d` for convolution layers. + Default: None. 
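The two hard activations above are easiest to understand numerically; a quick sketch (values rounded):

```python
import torch
from custom_mmpkg.custom_mmcv.cnn.bricks.hsigmoid import HSigmoid
from custom_mmpkg.custom_mmcv.cnn.bricks.hswish import HSwish

x = torch.tensor([-4.0, -1.0, 0.0, 1.0, 4.0])
print(HSigmoid()(x))  # clamp((x + 1) / 2, 0, 1)  -> [0.00, 0.00, 0.50, 1.00, 1.00]
print(HSwish()(x))    # x * relu6(x + 3) / 6      -> [0.00, -0.33, 0.00, 0.67, 4.00]
```

Since both classes are registered in `ACTIVATION_LAYERS`, they can presumably also be selected by name through an activation config such as `act_cfg=dict(type='HSwish')`, assuming `build_activation_layer` dispatches on that registry.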
+ norm_cfg (None | dict): The config dict for normalization layers. + Default: None. (This parameter is only applicable to conv_out.) + mode (str): Options are `gaussian`, `concatenation`, + `embedded_gaussian` and `dot_product`. Default: embedded_gaussian. + """ + + def __init__(self, + in_channels, + reduction=2, + use_scale=True, + conv_cfg=None, + norm_cfg=None, + mode='embedded_gaussian', + **kwargs): + super(_NonLocalNd, self).__init__() + self.in_channels = in_channels + self.reduction = reduction + self.use_scale = use_scale + self.inter_channels = max(in_channels // reduction, 1) + self.mode = mode + + if mode not in [ + 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation' + ]: + raise ValueError("Mode should be in 'gaussian', 'concatenation', " + f"'embedded_gaussian' or 'dot_product', but got " + f'{mode} instead.') + + # g, theta, phi are defaulted as `nn.ConvNd`. + # Here we use ConvModule for potential usage. + self.g = ConvModule( + self.in_channels, + self.inter_channels, + kernel_size=1, + conv_cfg=conv_cfg, + act_cfg=None) + self.conv_out = ConvModule( + self.inter_channels, + self.in_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + if self.mode != 'gaussian': + self.theta = ConvModule( + self.in_channels, + self.inter_channels, + kernel_size=1, + conv_cfg=conv_cfg, + act_cfg=None) + self.phi = ConvModule( + self.in_channels, + self.inter_channels, + kernel_size=1, + conv_cfg=conv_cfg, + act_cfg=None) + + if self.mode == 'concatenation': + self.concat_project = ConvModule( + self.inter_channels * 2, + 1, + kernel_size=1, + stride=1, + padding=0, + bias=False, + act_cfg=dict(type='ReLU')) + + self.init_weights(**kwargs) + + def init_weights(self, std=0.01, zeros_init=True): + if self.mode != 'gaussian': + for m in [self.g, self.theta, self.phi]: + normal_init(m.conv, std=std) + else: + normal_init(self.g.conv, std=std) + if zeros_init: + if self.conv_out.norm_cfg is None: + constant_init(self.conv_out.conv, 0) + else: + constant_init(self.conv_out.norm, 0) + else: + if self.conv_out.norm_cfg is None: + normal_init(self.conv_out.conv, std=std) + else: + normal_init(self.conv_out.norm, std=std) + + def gaussian(self, theta_x, phi_x): + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + pairwise_weight = pairwise_weight.softmax(dim=-1) + return pairwise_weight + + def embedded_gaussian(self, theta_x, phi_x): + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + if self.use_scale: + # theta_x.shape[-1] is `self.inter_channels` + pairwise_weight /= theta_x.shape[-1]**0.5 + pairwise_weight = pairwise_weight.softmax(dim=-1) + return pairwise_weight + + def dot_product(self, theta_x, phi_x): + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + pairwise_weight /= pairwise_weight.shape[-1] + return pairwise_weight + + def concatenation(self, theta_x, phi_x): + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + h = theta_x.size(2) + w = phi_x.size(3) + theta_x = theta_x.repeat(1, 1, 1, w) + phi_x = phi_x.repeat(1, 1, h, 1) + + concat_feature = 
torch.cat([theta_x, phi_x], dim=1) + pairwise_weight = self.concat_project(concat_feature) + n, _, h, w = pairwise_weight.size() + pairwise_weight = pairwise_weight.view(n, h, w) + pairwise_weight /= pairwise_weight.shape[-1] + + return pairwise_weight + + def forward(self, x): + # Assume `reduction = 1`, then `inter_channels = C` + # or `inter_channels = C` when `mode="gaussian"` + + # NonLocal1d x: [N, C, H] + # NonLocal2d x: [N, C, H, W] + # NonLocal3d x: [N, C, T, H, W] + n = x.size(0) + + # NonLocal1d g_x: [N, H, C] + # NonLocal2d g_x: [N, HxW, C] + # NonLocal3d g_x: [N, TxHxW, C] + g_x = self.g(x).view(n, self.inter_channels, -1) + g_x = g_x.permute(0, 2, 1) + + # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H] + # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW] + # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW] + if self.mode == 'gaussian': + theta_x = x.view(n, self.in_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + if self.sub_sample: + phi_x = self.phi(x).view(n, self.in_channels, -1) + else: + phi_x = x.view(n, self.in_channels, -1) + elif self.mode == 'concatenation': + theta_x = self.theta(x).view(n, self.inter_channels, -1, 1) + phi_x = self.phi(x).view(n, self.inter_channels, 1, -1) + else: + theta_x = self.theta(x).view(n, self.inter_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + phi_x = self.phi(x).view(n, self.inter_channels, -1) + + pairwise_func = getattr(self, self.mode) + # NonLocal1d pairwise_weight: [N, H, H] + # NonLocal2d pairwise_weight: [N, HxW, HxW] + # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW] + pairwise_weight = pairwise_func(theta_x, phi_x) + + # NonLocal1d y: [N, H, C] + # NonLocal2d y: [N, HxW, C] + # NonLocal3d y: [N, TxHxW, C] + y = torch.matmul(pairwise_weight, g_x) + # NonLocal1d y: [N, C, H] + # NonLocal2d y: [N, C, H, W] + # NonLocal3d y: [N, C, T, H, W] + y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels, + *x.size()[2:]) + + output = x + self.conv_out(y) + + return output + + +class NonLocal1d(_NonLocalNd): + """1D Non-local module. + + Args: + in_channels (int): Same as `NonLocalND`. + sub_sample (bool): Whether to apply max pooling after pairwise + function (Note that the `sub_sample` is applied on spatial only). + Default: False. + conv_cfg (None | dict): Same as `NonLocalND`. + Default: dict(type='Conv1d'). + """ + + def __init__(self, + in_channels, + sub_sample=False, + conv_cfg=dict(type='Conv1d'), + **kwargs): + super(NonLocal1d, self).__init__( + in_channels, conv_cfg=conv_cfg, **kwargs) + + self.sub_sample = sub_sample + + if sub_sample: + max_pool_layer = nn.MaxPool1d(kernel_size=2) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer + + +@PLUGIN_LAYERS.register_module() +class NonLocal2d(_NonLocalNd): + """2D Non-local module. + + Args: + in_channels (int): Same as `NonLocalND`. + sub_sample (bool): Whether to apply max pooling after pairwise + function (Note that the `sub_sample` is applied on spatial only). + Default: False. + conv_cfg (None | dict): Same as `NonLocalND`. + Default: dict(type='Conv2d'). 
+ """ + + _abbr_ = 'nonlocal_block' + + def __init__(self, + in_channels, + sub_sample=False, + conv_cfg=dict(type='Conv2d'), + **kwargs): + super(NonLocal2d, self).__init__( + in_channels, conv_cfg=conv_cfg, **kwargs) + + self.sub_sample = sub_sample + + if sub_sample: + max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2)) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer + + +class NonLocal3d(_NonLocalNd): + """3D Non-local module. + + Args: + in_channels (int): Same as `NonLocalND`. + sub_sample (bool): Whether to apply max pooling after pairwise + function (Note that the `sub_sample` is applied on spatial only). + Default: False. + conv_cfg (None | dict): Same as `NonLocalND`. + Default: dict(type='Conv3d'). + """ + + def __init__(self, + in_channels, + sub_sample=False, + conv_cfg=dict(type='Conv3d'), + **kwargs): + super(NonLocal3d, self).__init__( + in_channels, conv_cfg=conv_cfg, **kwargs) + self.sub_sample = sub_sample + + if sub_sample: + max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2)) + self.g = nn.Sequential(self.g, max_pool_layer) + if self.mode != 'gaussian': + self.phi = nn.Sequential(self.phi, max_pool_layer) + else: + self.phi = max_pool_layer diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/norm.py b/custom_mmpkg/custom_mmcv/cnn/bricks/norm.py new file mode 100644 index 0000000000000000000000000000000000000000..da7a4d5d1ec957e885c48afb2dac772b6f792fd2 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/norm.py @@ -0,0 +1,144 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect + +import torch.nn as nn + +from custom_mmpkg.custom_mmcv.utils import is_tuple_of +from custom_mmpkg.custom_mmcv.utils.parrots_wrapper import SyncBatchNorm, _BatchNorm, _InstanceNorm +from .registry import NORM_LAYERS + +NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d) +NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d) +NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d) +NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d) +NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm) +NORM_LAYERS.register_module('GN', module=nn.GroupNorm) +NORM_LAYERS.register_module('LN', module=nn.LayerNorm) +NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d) +NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d) +NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d) +NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d) + + +def infer_abbr(class_type): + """Infer abbreviation from the class name. + + When we build a norm layer with `build_norm_layer()`, we want to preserve + the norm type in variable names, e.g, self.bn1, self.gn. This method will + infer the abbreviation to map class types to abbreviations. + + Rule 1: If the class has the property "_abbr_", return the property. + Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or + InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and + "in" respectively. + Rule 3: If the class name contains "batch", "group", "layer" or "instance", + the abbreviation of this layer will be "bn", "gn", "ln" and "in" + respectively. + Rule 4: Otherwise, the abbreviation falls back to "norm". + + Args: + class_type (type): The norm layer type. + + Returns: + str: The inferred abbreviation. 
+ """ + if not inspect.isclass(class_type): + raise TypeError( + f'class_type must be a type, but got {type(class_type)}') + if hasattr(class_type, '_abbr_'): + return class_type._abbr_ + if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN + return 'in' + elif issubclass(class_type, _BatchNorm): + return 'bn' + elif issubclass(class_type, nn.GroupNorm): + return 'gn' + elif issubclass(class_type, nn.LayerNorm): + return 'ln' + else: + class_name = class_type.__name__.lower() + if 'batch' in class_name: + return 'bn' + elif 'group' in class_name: + return 'gn' + elif 'layer' in class_name: + return 'ln' + elif 'instance' in class_name: + return 'in' + else: + return 'norm_layer' + + +def build_norm_layer(cfg, num_features, postfix=''): + """Build normalization layer. + + Args: + cfg (dict): The norm layer config, which should contain: + + - type (str): Layer type. + - layer args: Args needed to instantiate a norm layer. + - requires_grad (bool, optional): Whether stop gradient updates. + num_features (int): Number of input channels. + postfix (int | str): The postfix to be appended into norm abbreviation + to create named layer. + + Returns: + (str, nn.Module): The first element is the layer name consisting of + abbreviation and postfix, e.g., bn1, gn. The second element is the + created norm layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in NORM_LAYERS: + raise KeyError(f'Unrecognized norm type {layer_type}') + + norm_layer = NORM_LAYERS.get(layer_type) + abbr = infer_abbr(norm_layer) + + assert isinstance(postfix, (int, str)) + name = abbr + str(postfix) + + requires_grad = cfg_.pop('requires_grad', True) + cfg_.setdefault('eps', 1e-5) + if layer_type != 'GN': + layer = norm_layer(num_features, **cfg_) + if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'): + layer._specify_ddp_gpu_num(1) + else: + assert 'num_groups' in cfg_ + layer = norm_layer(num_channels=num_features, **cfg_) + + for param in layer.parameters(): + param.requires_grad = requires_grad + + return name, layer + + +def is_norm(layer, exclude=None): + """Check if a layer is a normalization layer. + + Args: + layer (nn.Module): The layer to be checked. + exclude (type | tuple[type]): Types to be excluded. + + Returns: + bool: Whether the layer is a norm layer. + """ + if exclude is not None: + if not isinstance(exclude, tuple): + exclude = (exclude, ) + if not is_tuple_of(exclude, type): + raise TypeError( + f'"exclude" must be either None or type or a tuple of types, ' + f'but got {type(exclude)}: {exclude}') + + if exclude and isinstance(layer, exclude): + return False + + all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm) + return isinstance(layer, all_norm_bases) diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/padding.py b/custom_mmpkg/custom_mmcv/cnn/bricks/padding.py new file mode 100644 index 0000000000000000000000000000000000000000..e4ac6b28a1789bd551c613a7d3e7b622433ac7ec --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/padding.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch.nn as nn + +from .registry import PADDING_LAYERS + +PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d) +PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d) +PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d) + + +def build_padding_layer(cfg, *args, **kwargs): + """Build padding layer. + + Args: + cfg (None or dict): The padding layer config, which should contain: + - type (str): Layer type. + - layer args: Args needed to instantiate a padding layer. + + Returns: + nn.Module: Created padding layer. + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + + cfg_ = cfg.copy() + padding_type = cfg_.pop('type') + if padding_type not in PADDING_LAYERS: + raise KeyError(f'Unrecognized padding type {padding_type}.') + else: + padding_layer = PADDING_LAYERS.get(padding_type) + + layer = padding_layer(*args, **kwargs, **cfg_) + + return layer diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/plugin.py b/custom_mmpkg/custom_mmcv/cnn/bricks/plugin.py new file mode 100644 index 0000000000000000000000000000000000000000..07c010d4053174dd41107aa654ea67e82b46a25c --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/plugin.py @@ -0,0 +1,88 @@ +import inspect +import platform + +from .registry import PLUGIN_LAYERS + +if platform.system() == 'Windows': + import regex as re +else: + import re + + +def infer_abbr(class_type): + """Infer abbreviation from the class name. + + This method will infer the abbreviation to map class types to + abbreviations. + + Rule 1: If the class has the property "abbr", return the property. + Rule 2: Otherwise, the abbreviation falls back to snake case of class + name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``. + + Args: + class_type (type): The norm layer type. + + Returns: + str: The inferred abbreviation. + """ + + def camel2snack(word): + """Convert camel case word into snack case. + + Modified from `inflection lib + `_. + + Example:: + + >>> camel2snack("FancyBlock") + 'fancy_block' + """ + + word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word) + word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word) + word = word.replace('-', '_') + return word.lower() + + if not inspect.isclass(class_type): + raise TypeError( + f'class_type must be a type, but got {type(class_type)}') + if hasattr(class_type, '_abbr_'): + return class_type._abbr_ + else: + return camel2snack(class_type.__name__) + + +def build_plugin_layer(cfg, postfix='', **kwargs): + """Build plugin layer. + + Args: + cfg (None or dict): cfg should contain: + type (str): identify plugin layer type. + layer args: args needed to instantiate a plugin layer. + postfix (int, str): appended into norm abbreviation to + create named layer. Default: ''. 
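A small sketch of `build_padding_layer`; this is the helper `ConvModule` falls back to when `padding_mode` is not natively supported by `nn.Conv2d`:

```python
import torch
from custom_mmpkg.custom_mmcv.cnn.bricks.padding import build_padding_layer

pad = build_padding_layer(dict(type='reflect'), 1)  # -> nn.ReflectionPad2d(1)
x = torch.randn(1, 3, 8, 8)
print(pad(x).shape)  # torch.Size([1, 3, 10, 10])
```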
+ + Returns: + tuple[str, nn.Module]: + name (str): abbreviation + postfix + layer (nn.Module): created plugin layer + """ + if not isinstance(cfg, dict): + raise TypeError('cfg must be a dict') + if 'type' not in cfg: + raise KeyError('the cfg dict must contain the key "type"') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in PLUGIN_LAYERS: + raise KeyError(f'Unrecognized plugin type {layer_type}') + + plugin_layer = PLUGIN_LAYERS.get(layer_type) + abbr = infer_abbr(plugin_layer) + + assert isinstance(postfix, (int, str)) + name = abbr + str(postfix) + + layer = plugin_layer(**kwargs, **cfg_) + + return name, layer diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/registry.py b/custom_mmpkg/custom_mmcv/cnn/bricks/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..496c18796f08a9de159b489fbef278ded22749d8 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/registry.py @@ -0,0 +1,16 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from custom_mmpkg.custom_mmcv.utils import Registry + +CONV_LAYERS = Registry('conv layer') +NORM_LAYERS = Registry('norm layer') +ACTIVATION_LAYERS = Registry('activation layer') +PADDING_LAYERS = Registry('padding layer') +UPSAMPLE_LAYERS = Registry('upsample layer') +PLUGIN_LAYERS = Registry('plugin layer') + +DROPOUT_LAYERS = Registry('drop out layers') +POSITIONAL_ENCODING = Registry('position encoding') +ATTENTION = Registry('attention') +FEEDFORWARD_NETWORK = Registry('feed-forward Network') +TRANSFORMER_LAYER = Registry('transformerLayer') +TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence') diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/scale.py b/custom_mmpkg/custom_mmcv/cnn/bricks/scale.py new file mode 100644 index 0000000000000000000000000000000000000000..c905fffcc8bf998d18d94f927591963c428025e2 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/scale.py @@ -0,0 +1,21 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + + +class Scale(nn.Module): + """A learnable scale parameter. + + This layer scales the input by a learnable factor. It multiplies a + learnable scale parameter of shape (1,) with input of any shape. + + Args: + scale (float): Initial value of scale factor. Default: 1.0 + """ + + def __init__(self, scale=1.0): + super(Scale, self).__init__() + self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) + + def forward(self, x): + return x * self.scale diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/swish.py b/custom_mmpkg/custom_mmcv/cnn/bricks/swish.py new file mode 100644 index 0000000000000000000000000000000000000000..e2ca8ed7b749413f011ae54aac0cab27e6f0b51f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/swish.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn + +from .registry import ACTIVATION_LAYERS + + +@ACTIVATION_LAYERS.register_module() +class Swish(nn.Module): + """Swish Module. + + This module applies the swish function: + + .. math:: + Swish(x) = x * Sigmoid(x) + + Returns: + Tensor: The output tensor. 
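`build_plugin_layer` can be sketched with one of the plugins registered earlier in this diff; `NonLocal2d` defines `_abbr_ = 'nonlocal_block'`, so the returned name combines that abbreviation with the postfix:

```python
import torch
from custom_mmpkg.custom_mmcv.cnn.bricks.plugin import build_plugin_layer

name, layer = build_plugin_layer(dict(type='NonLocal2d', in_channels=16),
                                 postfix='_1')
print(name)  # 'nonlocal_block_1'
print(layer(torch.randn(1, 16, 8, 8)).shape)  # torch.Size([1, 16, 8, 8])
```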
+ """ + + def __init__(self): + super(Swish, self).__init__() + + def forward(self, x): + return x * torch.sigmoid(x) diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/transformer.py b/custom_mmpkg/custom_mmcv/cnn/bricks/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d4cd4655d30aef5cecb65522bc6b854fb60eca8d --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/transformer.py @@ -0,0 +1,595 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings + +import torch +import torch.nn as nn + +from custom_mmpkg.custom_mmcv import ConfigDict, deprecated_api_warning +from custom_mmpkg.custom_mmcv.cnn import Linear, build_activation_layer, build_norm_layer +from custom_mmpkg.custom_mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from custom_mmpkg.custom_mmcv.utils import build_from_cfg +from .drop import build_dropout +from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING, + TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE) + +# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file +try: + from custom_mmpkg.custom_mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention # noqa F401 + warnings.warn( + ImportWarning( + '``MultiScaleDeformableAttention`` has been moved to ' + '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501 + '``from custom_mmpkg.custom_mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501 + 'to ``from custom_mmpkg.custom_mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501 + )) + +except ImportError: + warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from ' + '``mmcv.ops.multi_scale_deform_attn``, ' + 'You should install ``mmcv-full`` if you need this module. ') + + +def build_positional_encoding(cfg, default_args=None): + """Builder for Position Encoding.""" + return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args) + + +def build_attention(cfg, default_args=None): + """Builder for attention.""" + return build_from_cfg(cfg, ATTENTION, default_args) + + +def build_feedforward_network(cfg, default_args=None): + """Builder for feed-forward network (FFN).""" + return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args) + + +def build_transformer_layer(cfg, default_args=None): + """Builder for transformer layer.""" + return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args) + + +def build_transformer_layer_sequence(cfg, default_args=None): + """Builder for transformer encoder and transformer decoder.""" + return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args) + + +@ATTENTION.register_module() +class MultiheadAttention(BaseModule): + """A wrapper for ``torch.nn.MultiheadAttention``. + + This module implements MultiheadAttention with identity connection, + and positional encoding is also passed as input. + + Args: + embed_dims (int): The embedding dimension. + num_heads (int): Parallel attention heads. + attn_drop (float): A Dropout layer on attn_output_weights. + Default: 0.0. + proj_drop (float): A Dropout layer after `nn.MultiheadAttention`. + Default: 0.0. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): When it is True, Key, Query and Value are shape of + (batch, n, embed_dim), otherwise (n, batch, embed_dim). + Default to False. 
+ """ + + def __init__(self, + embed_dims, + num_heads, + attn_drop=0., + proj_drop=0., + dropout_layer=dict(type='Dropout', drop_prob=0.), + init_cfg=None, + batch_first=False, + **kwargs): + super(MultiheadAttention, self).__init__(init_cfg) + if 'dropout' in kwargs: + warnings.warn('The arguments `dropout` in MultiheadAttention ' + 'has been deprecated, now you can separately ' + 'set `attn_drop`(float), proj_drop(float), ' + 'and `dropout_layer`(dict) ') + attn_drop = kwargs['dropout'] + dropout_layer['drop_prob'] = kwargs.pop('dropout') + + self.embed_dims = embed_dims + self.num_heads = num_heads + self.batch_first = batch_first + + self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop, + **kwargs) + + self.proj_drop = nn.Dropout(proj_drop) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else nn.Identity() + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiheadAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_pos=None, + attn_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `MultiheadAttention`. + + **kwargs allow passing a more general data flow when combining + with other operations in `transformerlayer`. + + Args: + query (Tensor): The input query with shape [num_queries, bs, + embed_dims] if self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + If None, the ``query`` will be used. Defaults to None. + value (Tensor): The value tensor with same shape as `key`. + Same in `nn.MultiheadAttention.forward`. Defaults to None. + If None, the `key` will be used. + identity (Tensor): This tensor, with the same shape as x, + will be used for the identity link. + If None, `x` will be used. Defaults to None. + query_pos (Tensor): The positional encoding for query, with + the same shape as `x`. If not None, it will + be added to `x` before forward function. Defaults to None. + key_pos (Tensor): The positional encoding for `key`, with the + same shape as `key`. Defaults to None. If not None, it will + be added to `key` before forward function. If None, and + `query_pos` has the same shape as `key`, then `query_pos` + will be used for `key_pos`. Defaults to None. + attn_mask (Tensor): ByteTensor mask with shape [num_queries, + num_keys]. Same in `nn.MultiheadAttention.forward`. + Defaults to None. + key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys]. + Defaults to None. + + Returns: + Tensor: forwarded results with shape + [num_queries, bs, embed_dims] + if self.batch_first is False, else + [bs, num_queries embed_dims]. 
+ """ + + if key is None: + key = query + if value is None: + value = key + if identity is None: + identity = query + if key_pos is None: + if query_pos is not None: + # use query_pos if key_pos is not available + if query_pos.shape == key.shape: + key_pos = query_pos + else: + warnings.warn(f'position encoding of key is' + f'missing in {self.__class__.__name__}.') + if query_pos is not None: + query = query + query_pos + if key_pos is not None: + key = key + key_pos + + # Because the dataflow('key', 'query', 'value') of + # ``torch.nn.MultiheadAttention`` is (num_query, batch, + # embed_dims), We should adjust the shape of dataflow from + # batch_first (batch, num_query, embed_dims) to num_query_first + # (num_query ,batch, embed_dims), and recover ``attn_output`` + # from num_query_first to batch_first. + if self.batch_first: + query = query.transpose(0, 1) + key = key.transpose(0, 1) + value = value.transpose(0, 1) + + out = self.attn( + query=query, + key=key, + value=value, + attn_mask=attn_mask, + key_padding_mask=key_padding_mask)[0] + + if self.batch_first: + out = out.transpose(0, 1) + + return identity + self.dropout_layer(self.proj_drop(out)) + + +@FEEDFORWARD_NETWORK.register_module() +class FFN(BaseModule): + """Implements feed-forward networks (FFNs) with identity connection. + + Args: + embed_dims (int): The feature dimension. Same as + `MultiheadAttention`. Defaults: 256. + feedforward_channels (int): The hidden dimension of FFNs. + Defaults: 1024. + num_fcs (int, optional): The number of fully-connected layers in + FFNs. Default: 2. + act_cfg (dict, optional): The activation config for FFNs. + Default: dict(type='ReLU') + ffn_drop (float, optional): Probability of an element to be + zeroed in FFN. Default 0.0. + add_identity (bool, optional): Whether to add the + identity connection. Default: `True`. + dropout_layer (obj:`ConfigDict`): The dropout_layer used + when adding the shortcut. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + @deprecated_api_warning( + { + 'dropout': 'ffn_drop', + 'add_residual': 'add_identity' + }, + cls_name='FFN') + def __init__(self, + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0., + dropout_layer=None, + add_identity=True, + init_cfg=None, + **kwargs): + super(FFN, self).__init__(init_cfg) + assert num_fcs >= 2, 'num_fcs should be no less ' \ + f'than 2. got {num_fcs}.' + self.embed_dims = embed_dims + self.feedforward_channels = feedforward_channels + self.num_fcs = num_fcs + self.act_cfg = act_cfg + self.activate = build_activation_layer(act_cfg) + + layers = [] + in_channels = embed_dims + for _ in range(num_fcs - 1): + layers.append( + Sequential( + Linear(in_channels, feedforward_channels), self.activate, + nn.Dropout(ffn_drop))) + in_channels = feedforward_channels + layers.append(Linear(feedforward_channels, embed_dims)) + layers.append(nn.Dropout(ffn_drop)) + self.layers = Sequential(*layers) + self.dropout_layer = build_dropout( + dropout_layer) if dropout_layer else torch.nn.Identity() + self.add_identity = add_identity + + @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN') + def forward(self, x, identity=None): + """Forward function for `FFN`. + + The function would add x to the output tensor if residue is None. 
+ """ + out = self.layers(x) + if not self.add_identity: + return self.dropout_layer(out) + if identity is None: + identity = x + return identity + self.dropout_layer(out) + + +@TRANSFORMER_LAYER.register_module() +class BaseTransformerLayer(BaseModule): + """Base `TransformerLayer` for vision transformer. + + It can be built from `mmcv.ConfigDict` and support more flexible + customization, for example, using any number of `FFN or LN ` and + use different kinds of `attention` by specifying a list of `ConfigDict` + named `attn_cfgs`. It is worth mentioning that it supports `prenorm` + when you specifying `norm` as the first element of `operation_order`. + More details about the `prenorm`: `On Layer Normalization in the + Transformer Architecture `_ . + + Args: + attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for `self_attention` or `cross_attention` modules, + The order of the configs in the list should be consistent with + corresponding attentions in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. Default: None. + ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )): + Configs for FFN, The order of the configs in the list should be + consistent with corresponding ffn in operation_order. + If it is a dict, all of the attention modules in operation_order + will be built with this config. + operation_order (tuple[str]): The execution order of operation + in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm'). + Support `prenorm` when you specifying first element as `norm`. + Default:None. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + batch_first (bool): Key, Query and Value are shape + of (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + """ + + def __init__(self, + attn_cfgs=None, + ffn_cfgs=dict( + type='FFN', + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0., + act_cfg=dict(type='ReLU', inplace=True), + ), + operation_order=None, + norm_cfg=dict(type='LN'), + init_cfg=None, + batch_first=False, + **kwargs): + + deprecated_args = dict( + feedforward_channels='feedforward_channels', + ffn_dropout='ffn_drop', + ffn_num_fcs='num_fcs') + for ori_name, new_name in deprecated_args.items(): + if ori_name in kwargs: + warnings.warn( + f'The arguments `{ori_name}` in BaseTransformerLayer ' + f'has been deprecated, now you should set `{new_name}` ' + f'and other FFN related arguments ' + f'to a dict named `ffn_cfgs`. ') + ffn_cfgs[new_name] = kwargs[ori_name] + + super(BaseTransformerLayer, self).__init__(init_cfg) + + self.batch_first = batch_first + + assert set(operation_order) & set( + ['self_attn', 'norm', 'ffn', 'cross_attn']) == \ + set(operation_order), f'The operation_order of' \ + f' {self.__class__.__name__} should ' \ + f'contains all four operation type ' \ + f"{['self_attn', 'norm', 'ffn', 'cross_attn']}" + + num_attn = operation_order.count('self_attn') + operation_order.count( + 'cross_attn') + if isinstance(attn_cfgs, dict): + attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)] + else: + assert num_attn == len(attn_cfgs), f'The length ' \ + f'of attn_cfg {num_attn} is ' \ + f'not consistent with the number of attention' \ + f'in operation_order {operation_order}.' 
+ + self.num_attn = num_attn + self.operation_order = operation_order + self.norm_cfg = norm_cfg + self.pre_norm = operation_order[0] == 'norm' + self.attentions = ModuleList() + + index = 0 + for operation_name in operation_order: + if operation_name in ['self_attn', 'cross_attn']: + if 'batch_first' in attn_cfgs[index]: + assert self.batch_first == attn_cfgs[index]['batch_first'] + else: + attn_cfgs[index]['batch_first'] = self.batch_first + attention = build_attention(attn_cfgs[index]) + # Some custom attentions used as `self_attn` + # or `cross_attn` can have different behavior. + attention.operation_name = operation_name + self.attentions.append(attention) + index += 1 + + self.embed_dims = self.attentions[0].embed_dims + + self.ffns = ModuleList() + num_ffns = operation_order.count('ffn') + if isinstance(ffn_cfgs, dict): + ffn_cfgs = ConfigDict(ffn_cfgs) + if isinstance(ffn_cfgs, dict): + ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)] + assert len(ffn_cfgs) == num_ffns + for ffn_index in range(num_ffns): + if 'embed_dims' not in ffn_cfgs[ffn_index]: + ffn_cfgs['embed_dims'] = self.embed_dims + else: + assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims + self.ffns.append( + build_feedforward_network(ffn_cfgs[ffn_index], + dict(type='FFN'))) + + self.norms = ModuleList() + num_norms = operation_order.count('norm') + for _ in range(num_norms): + self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1]) + + def forward(self, + query, + key=None, + value=None, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerDecoderLayer`. + + **kwargs contains some specific arguments of attentions. + + Args: + query (Tensor): The input query with shape + [num_queries, bs, embed_dims] if + self.batch_first is False, else + [bs, num_queries embed_dims]. + key (Tensor): The key tensor with shape [num_keys, bs, + embed_dims] if self.batch_first is False, else + [bs, num_keys, embed_dims] . + value (Tensor): The value tensor with same shape as `key`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. + attn_masks (List[Tensor] | None): 2D Tensor used in + calculation of corresponding attention. The length of + it should equal to the number of `attention` in + `operation_order`. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in `self_attn` layer. + Defaults to None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: forwarded results with shape [num_queries, bs, embed_dims]. 
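+
+        Example:
+            An illustrative sketch of a post-norm encoder-style layer;
+            the config values below are assumptions, not requirements:
+
+            >>> layer = BaseTransformerLayer(
+            ...     attn_cfgs=dict(
+            ...         type='MultiheadAttention', embed_dims=256, num_heads=8),
+            ...     ffn_cfgs=dict(
+            ...         type='FFN', embed_dims=256, feedforward_channels=1024),
+            ...     operation_order=('self_attn', 'norm', 'ffn', 'norm'))
+            >>> query = torch.rand(100, 2, 256)
+            >>> layer(query).shape
+            torch.Size([100, 2, 256])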
+ """ + + norm_index = 0 + attn_index = 0 + ffn_index = 0 + identity = query + if attn_masks is None: + attn_masks = [None for _ in range(self.num_attn)] + elif isinstance(attn_masks, torch.Tensor): + attn_masks = [ + copy.deepcopy(attn_masks) for _ in range(self.num_attn) + ] + warnings.warn(f'Use same attn_mask in all attentions in ' + f'{self.__class__.__name__} ') + else: + assert len(attn_masks) == self.num_attn, f'The length of ' \ + f'attn_masks {len(attn_masks)} must be equal ' \ + f'to the number of attention in ' \ + f'operation_order {self.num_attn}' + + for layer in self.operation_order: + if layer == 'self_attn': + temp_key = temp_value = query + query = self.attentions[attn_index]( + query, + temp_key, + temp_value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=query_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=query_key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'norm': + query = self.norms[norm_index](query) + norm_index += 1 + + elif layer == 'cross_attn': + query = self.attentions[attn_index]( + query, + key, + value, + identity if self.pre_norm else None, + query_pos=query_pos, + key_pos=key_pos, + attn_mask=attn_masks[attn_index], + key_padding_mask=key_padding_mask, + **kwargs) + attn_index += 1 + identity = query + + elif layer == 'ffn': + query = self.ffns[ffn_index]( + query, identity if self.pre_norm else None) + ffn_index += 1 + + return query + + +@TRANSFORMER_LAYER_SEQUENCE.register_module() +class TransformerLayerSequence(BaseModule): + """Base class for TransformerEncoder and TransformerDecoder in vision + transformer. + + As base-class of Encoder and Decoder in vision transformer. + Support customization such as specifying different kind + of `transformer_layer` in `transformer_coder`. + + Args: + transformerlayer (list[obj:`mmcv.ConfigDict`] | + obj:`mmcv.ConfigDict`): Config of transformerlayer + in TransformerCoder. If it is obj:`mmcv.ConfigDict`, + it would be repeated `num_layer` times to a + list[`mmcv.ConfigDict`]. Default: None. + num_layers (int): The number of `TransformerLayer`. Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. + """ + + def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None): + super(TransformerLayerSequence, self).__init__(init_cfg) + if isinstance(transformerlayers, dict): + transformerlayers = [ + copy.deepcopy(transformerlayers) for _ in range(num_layers) + ] + else: + assert isinstance(transformerlayers, list) and \ + len(transformerlayers) == num_layers + self.num_layers = num_layers + self.layers = ModuleList() + for i in range(num_layers): + self.layers.append(build_transformer_layer(transformerlayers[i])) + self.embed_dims = self.layers[0].embed_dims + self.pre_norm = self.layers[0].pre_norm + + def forward(self, + query, + key, + value, + query_pos=None, + key_pos=None, + attn_masks=None, + query_key_padding_mask=None, + key_padding_mask=None, + **kwargs): + """Forward function for `TransformerCoder`. + + Args: + query (Tensor): Input query with shape + `(num_queries, bs, embed_dims)`. + key (Tensor): The key tensor with shape + `(num_keys, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_keys, bs, embed_dims)`. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. + Default: None. 
+ attn_masks (List[Tensor], optional): Each element is 2D Tensor + which is used in calculation of corresponding attention in + operation_order. Default: None. + query_key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_queries]. Only used in self-attention + Default: None. + key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_keys]. Default: None. + + Returns: + Tensor: results with shape [num_queries, bs, embed_dims]. + """ + for layer in self.layers: + query = layer( + query, + key, + value, + query_pos=query_pos, + key_pos=key_pos, + attn_masks=attn_masks, + query_key_padding_mask=query_key_padding_mask, + key_padding_mask=key_padding_mask, + **kwargs) + return query diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/upsample.py b/custom_mmpkg/custom_mmcv/cnn/bricks/upsample.py new file mode 100644 index 0000000000000000000000000000000000000000..a1a353767d0ce8518f0d7289bed10dba0178ed12 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/upsample.py @@ -0,0 +1,84 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +import torch.nn.functional as F + +from ..utils import xavier_init +from .registry import UPSAMPLE_LAYERS + +UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample) +UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample) + + +@UPSAMPLE_LAYERS.register_module(name='pixel_shuffle') +class PixelShufflePack(nn.Module): + """Pixel Shuffle upsample layer. + + This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to + achieve a simple upsampling with pixel shuffle. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + scale_factor (int): Upsample ratio. + upsample_kernel (int): Kernel size of the conv layer to expand the + channels. + """ + + def __init__(self, in_channels, out_channels, scale_factor, + upsample_kernel): + super(PixelShufflePack, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.scale_factor = scale_factor + self.upsample_kernel = upsample_kernel + self.upsample_conv = nn.Conv2d( + self.in_channels, + self.out_channels * scale_factor * scale_factor, + self.upsample_kernel, + padding=(self.upsample_kernel - 1) // 2) + self.init_weights() + + def init_weights(self): + xavier_init(self.upsample_conv, distribution='uniform') + + def forward(self, x): + x = self.upsample_conv(x) + x = F.pixel_shuffle(x, self.scale_factor) + return x + + +def build_upsample_layer(cfg, *args, **kwargs): + """Build upsample layer. + + Args: + cfg (dict): The upsample layer config, which should contain: + + - type (str): Layer type. + - scale_factor (int): Upsample ratio, which is not applicable to + deconv. + - layer args: Args needed to instantiate a upsample layer. + args (argument list): Arguments passed to the ``__init__`` + method of the corresponding conv layer. + kwargs (keyword arguments): Keyword arguments passed to the + ``__init__`` method of the corresponding conv layer. + + Returns: + nn.Module: Created upsample layer. 
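+
+    Example:
+        An illustrative sketch (the channel counts are arbitrary):
+
+        >>> # 2x upsampling via pixel shuffle
+        >>> layer = build_upsample_layer(
+        ...     dict(type='pixel_shuffle', in_channels=16, out_channels=16,
+        ...          scale_factor=2, upsample_kernel=3))
+        >>> # plain bilinear nn.Upsample
+        >>> layer = build_upsample_layer(dict(type='bilinear', scale_factor=2))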
+ """ + if not isinstance(cfg, dict): + raise TypeError(f'cfg must be a dict, but got {type(cfg)}') + if 'type' not in cfg: + raise KeyError( + f'the cfg dict must contain the key "type", but got {cfg}') + cfg_ = cfg.copy() + + layer_type = cfg_.pop('type') + if layer_type not in UPSAMPLE_LAYERS: + raise KeyError(f'Unrecognized upsample type {layer_type}') + else: + upsample = UPSAMPLE_LAYERS.get(layer_type) + + if upsample is nn.Upsample: + cfg_['mode'] = layer_type + layer = upsample(*args, **kwargs, **cfg_) + return layer diff --git a/custom_mmpkg/custom_mmcv/cnn/bricks/wrappers.py b/custom_mmpkg/custom_mmcv/cnn/bricks/wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..8aebf67bf52355a513f21756ee74fe510902d075 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/bricks/wrappers.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501 + +Wrap some nn modules to support empty tensor input. Currently, these wrappers +are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask +heads are trained on only positive RoIs. +""" +import math + +import torch +import torch.nn as nn +from torch.nn.modules.utils import _pair, _triple + +from .registry import CONV_LAYERS, UPSAMPLE_LAYERS + +if torch.__version__ == 'parrots': + TORCH_VERSION = torch.__version__ +else: + # torch.__version__ could be 1.3.1+cu92, we only need the first two + # for comparison + TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2]) + + +def obsolete_torch_version(torch_version, version_threshold): + return torch_version == 'parrots' or torch_version <= version_threshold + + +class NewEmptyTensorOp(torch.autograd.Function): + + @staticmethod + def forward(ctx, x, new_shape): + ctx.shape = x.shape + return x.new_empty(new_shape) + + @staticmethod + def backward(ctx, grad): + shape = ctx.shape + return NewEmptyTensorOp.apply(grad, shape), None + + +@CONV_LAYERS.register_module('Conv', force=True) +class Conv2d(nn.Conv2d): + + def forward(self, x): + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size, + self.padding, self.stride, self.dilation): + o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@CONV_LAYERS.register_module('Conv3d', force=True) +class Conv3d(nn.Conv3d): + + def forward(self, x): + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size, + self.padding, self.stride, self.dilation): + o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1 + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. 
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@CONV_LAYERS.register_module() +@CONV_LAYERS.register_module('deconv') +@UPSAMPLE_LAYERS.register_module('deconv', force=True) +class ConvTranspose2d(nn.ConvTranspose2d): + + def forward(self, x): + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size, + self.padding, self.stride, + self.dilation, self.output_padding): + out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +@CONV_LAYERS.register_module() +@CONV_LAYERS.register_module('deconv3d') +@UPSAMPLE_LAYERS.register_module('deconv3d', force=True) +class ConvTranspose3d(nn.ConvTranspose3d): + + def forward(self, x): + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)): + out_shape = [x.shape[0], self.out_channels] + for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size, + self.padding, self.stride, + self.dilation, self.output_padding): + out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op) + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. + dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) + + +class MaxPool2d(nn.MaxPool2d): + + def forward(self, x): + # PyTorch 1.9 does not support empty tensor inference yet + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): + out_shape = list(x.shape[:2]) + for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size), + _pair(self.padding), _pair(self.stride), + _pair(self.dilation)): + o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 + o = math.ceil(o) if self.ceil_mode else math.floor(o) + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + return empty + + return super().forward(x) + + +class MaxPool3d(nn.MaxPool3d): + + def forward(self, x): + # PyTorch 1.9 does not support empty tensor inference yet + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)): + out_shape = list(x.shape[:2]) + for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size), + _triple(self.padding), + _triple(self.stride), + _triple(self.dilation)): + o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1 + o = math.ceil(o) if self.ceil_mode else math.floor(o) + out_shape.append(o) + empty = NewEmptyTensorOp.apply(x, out_shape) + return empty + + return super().forward(x) + + +class Linear(torch.nn.Linear): + + def forward(self, x): + # empty tensor forward of Linear layer is supported in Pytorch 1.6 + if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)): + out_shape = [x.shape[0], self.out_features] + empty = NewEmptyTensorOp.apply(x, out_shape) + if self.training: + # produce dummy gradient to avoid DDP warning. 
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0 + return empty + dummy + else: + return empty + + return super().forward(x) diff --git a/custom_mmpkg/custom_mmcv/cnn/builder.py b/custom_mmpkg/custom_mmcv/cnn/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..7567316c566bd3aca6d8f65a84b00e9e890948a7 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/builder.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ..runner import Sequential +from ..utils import Registry, build_from_cfg + + +def build_model_from_cfg(cfg, registry, default_args=None): + """Build a PyTorch model from config dict(s). Different from + ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built. + + Args: + cfg (dict, list[dict]): The config of modules, is is either a config + dict or a list of config dicts. If cfg is a list, a + the built modules will be wrapped with ``nn.Sequential``. + registry (:obj:`Registry`): A registry the module belongs to. + default_args (dict, optional): Default arguments to build the module. + Defaults to None. + + Returns: + nn.Module: A built nn module. + """ + if isinstance(cfg, list): + modules = [ + build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg + ] + return Sequential(*modules) + else: + return build_from_cfg(cfg, registry, default_args) + + +MODELS = Registry('model', build_func=build_model_from_cfg) diff --git a/custom_mmpkg/custom_mmcv/cnn/resnet.py b/custom_mmpkg/custom_mmcv/cnn/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..1cb3ac057ee2d52c46fc94685b5d4e698aad8d5f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/resnet.py @@ -0,0 +1,316 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +import torch.nn as nn +import torch.utils.checkpoint as cp + +from .utils import constant_init, kaiming_init + + +def conv3x3(in_planes, out_planes, stride=1, dilation=1): + """3x3 convolution with padding.""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False): + super(BasicBlock, self).__init__() + assert style in ['pytorch', 'caffe'] + self.conv1 = conv3x3(inplanes, planes, stride, dilation) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = conv3x3(planes, planes) + self.bn2 = nn.BatchNorm2d(planes) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + assert not with_cp + + def forward(self, x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False): + """Bottleneck block. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if + it is "caffe", the stride-two layer is the first 1x1 conv layer. 
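+
+        Example:
+            An illustrative sketch; without a ``downsample`` module,
+            ``inplanes`` must equal ``planes * expansion``:
+
+            >>> block = Bottleneck(256, 64)
+            >>> x = torch.rand(1, 256, 56, 56)
+            >>> block(x).shape
+            torch.Size([1, 256, 56, 56])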
+ """ + super(Bottleneck, self).__init__() + assert style in ['pytorch', 'caffe'] + if style == 'pytorch': + conv1_stride = 1 + conv2_stride = stride + else: + conv1_stride = stride + conv2_stride = 1 + self.conv1 = nn.Conv2d( + inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False) + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.bn1 = nn.BatchNorm2d(planes) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d( + planes, planes * self.expansion, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + def forward(self, x): + + def _inner_forward(x): + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + residual = self.downsample(x) + + out += residual + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +def make_res_layer(block, + inplanes, + planes, + blocks, + stride=1, + dilation=1, + style='pytorch', + with_cp=False): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + dilation, + downsample, + style=style, + with_cp=with_cp)) + inplanes = planes * block.expansion + for _ in range(1, blocks): + layers.append( + block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp)) + + return nn.Sequential(*layers) + + +class ResNet(nn.Module): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + num_stages (int): Resnet stages, normally 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze + running stats (mean and var). + bn_frozen (bool): Whether to freeze weight and bias of BN layers. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. 
+ """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + frozen_stages=-1, + bn_eval=True, + bn_frozen=False, + with_cp=False): + super(ResNet, self).__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + assert num_stages >= 1 and num_stages <= 4 + block, stage_blocks = self.arch_settings[depth] + stage_blocks = stage_blocks[:num_stages] + assert len(strides) == len(dilations) == num_stages + assert max(out_indices) < num_stages + + self.out_indices = out_indices + self.style = style + self.frozen_stages = frozen_stages + self.bn_eval = bn_eval + self.bn_frozen = bn_frozen + self.with_cp = with_cp + + self.inplanes = 64 + self.conv1 = nn.Conv2d( + 3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + self.res_layers = [] + for i, num_blocks in enumerate(stage_blocks): + stride = strides[i] + dilation = dilations[i] + planes = 64 * 2**i + res_layer = make_res_layer( + block, + self.inplanes, + planes, + num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + with_cp=with_cp) + self.inplanes = planes * block.expansion + layer_name = f'layer{i + 1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) + + self.feat_dim = block.expansion * 64 * 2**(len(stage_blocks) - 1) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + from ..runner import load_checkpoint + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def train(self, mode=True): + super(ResNet, self).train(mode) + if self.bn_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + if self.bn_frozen: + for params in m.parameters(): + params.requires_grad = False + if mode and self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for param in self.bn1.parameters(): + param.requires_grad = False + self.bn1.eval() + self.bn1.weight.requires_grad = False + self.bn1.bias.requires_grad = False + for i in range(1, self.frozen_stages + 1): + mod = getattr(self, f'layer{i}') + mod.eval() + for param in mod.parameters(): + param.requires_grad = False diff --git a/custom_mmpkg/custom_mmcv/cnn/utils/__init__.py b/custom_mmpkg/custom_mmcv/cnn/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a263e31c1e3977712827ca229bbc04910b4e928e --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/utils/__init__.py @@ -0,0 +1,19 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .flops_counter import get_model_complexity_info +from .fuse_conv_bn import fuse_conv_bn +from .sync_bn import revert_sync_batchnorm +from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, + KaimingInit, NormalInit, PretrainedInit, + TruncNormalInit, UniformInit, XavierInit, + bias_init_with_prob, caffe2_xavier_init, + constant_init, initialize, kaiming_init, normal_init, + trunc_normal_init, uniform_init, xavier_init) + +__all__ = [ + 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init', + 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init', + 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize', + 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', + 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', + 'Caffe2XavierInit', 'revert_sync_batchnorm' +] diff --git a/custom_mmpkg/custom_mmcv/cnn/utils/flops_counter.py b/custom_mmpkg/custom_mmcv/cnn/utils/flops_counter.py new file mode 100644 index 0000000000000000000000000000000000000000..a445d7a0ef90b371c74476c2b50b7b66eabc6d80 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/utils/flops_counter.py @@ -0,0 +1,599 @@ +# Modified from flops-counter.pytorch by Vladislav Sovrasov +# original repo: https://github.com/sovrasov/flops-counter.pytorch + +# MIT License + +# Copyright (c) 2018 Vladislav Sovrasov + +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: + +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. + +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +import sys +from functools import partial + +import numpy as np +import torch +import torch.nn as nn + +import custom_mmpkg.custom_mmcv as mmcv + + +def get_model_complexity_info(model, + input_shape, + print_per_layer_stat=True, + as_strings=True, + input_constructor=None, + flush=False, + ost=sys.stdout): + """Get complexity information of a model. + + This method can calculate FLOPs and parameter counts of a model with + corresponding input shape. It can also print complexity information for + each layer in a model. + + Supported layers are listed as below: + - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``. + - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``, ``nn.LeakyReLU``, + ``nn.ReLU6``. + - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``, + ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``, + ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``, + ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``, + ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``. 
+ - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``, + ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``, + ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``. + - Linear: ``nn.Linear``. + - Deconvolution: ``nn.ConvTranspose2d``. + - Upsample: ``nn.Upsample``. + + Args: + model (nn.Module): The model for complexity calculation. + input_shape (tuple): Input shape used for calculation. + print_per_layer_stat (bool): Whether to print complexity information + for each layer in a model. Default: True. + as_strings (bool): Output FLOPs and params counts in a string form. + Default: True. + input_constructor (None | callable): If specified, it takes a callable + method that generates input. otherwise, it will generate a random + tensor with input shape to calculate FLOPs. Default: None. + flush (bool): same as that in :func:`print`. Default: False. + ost (stream): same as ``file`` param in :func:`print`. + Default: sys.stdout. + + Returns: + tuple[float | str]: If ``as_strings`` is set to True, it will return + FLOPs and parameter counts in a string format. otherwise, it will + return those in a float number format. + """ + assert type(input_shape) is tuple + assert len(input_shape) >= 1 + assert isinstance(model, nn.Module) + flops_model = add_flops_counting_methods(model) + flops_model.eval() + flops_model.start_flops_count() + if input_constructor: + input = input_constructor(input_shape) + _ = flops_model(**input) + else: + try: + batch = torch.ones(()).new_empty( + (1, *input_shape), + dtype=next(flops_model.parameters()).dtype, + device=next(flops_model.parameters()).device) + except StopIteration: + # Avoid StopIteration for models which have no parameters, + # like `nn.Relu()`, `nn.AvgPool2d`, etc. + batch = torch.ones(()).new_empty((1, *input_shape)) + + _ = flops_model(batch) + + flops_count, params_count = flops_model.compute_average_flops_cost() + if print_per_layer_stat: + print_model_with_flops( + flops_model, flops_count, params_count, ost=ost, flush=flush) + flops_model.stop_flops_count() + + if as_strings: + return flops_to_string(flops_count), params_to_string(params_count) + + return flops_count, params_count + + +def flops_to_string(flops, units='GFLOPs', precision=2): + """Convert FLOPs number into a string. + + Note that Here we take a multiply-add counts as one FLOP. + + Args: + flops (float): FLOPs number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'GFLOPs', + 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically + choose the most suitable unit for FLOPs. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted FLOPs number with units. 
+ + Examples: + >>> flops_to_string(1e9) + '1.0 GFLOPs' + >>> flops_to_string(2e5, 'MFLOPs') + '0.2 MFLOPs' + >>> flops_to_string(3e-9, None) + '3e-09 FLOPs' + """ + if units is None: + if flops // 10**9 > 0: + return str(round(flops / 10.**9, precision)) + ' GFLOPs' + elif flops // 10**6 > 0: + return str(round(flops / 10.**6, precision)) + ' MFLOPs' + elif flops // 10**3 > 0: + return str(round(flops / 10.**3, precision)) + ' KFLOPs' + else: + return str(flops) + ' FLOPs' + else: + if units == 'GFLOPs': + return str(round(flops / 10.**9, precision)) + ' ' + units + elif units == 'MFLOPs': + return str(round(flops / 10.**6, precision)) + ' ' + units + elif units == 'KFLOPs': + return str(round(flops / 10.**3, precision)) + ' ' + units + else: + return str(flops) + ' FLOPs' + + +def params_to_string(num_params, units=None, precision=2): + """Convert parameter number into a string. + + Args: + num_params (float): Parameter number to be converted. + units (str | None): Converted FLOPs units. Options are None, 'M', + 'K' and ''. If set to None, it will automatically choose the most + suitable unit for Parameter number. Default: None. + precision (int): Digit number after the decimal point. Default: 2. + + Returns: + str: The converted parameter number with units. + + Examples: + >>> params_to_string(1e9) + '1000.0 M' + >>> params_to_string(2e5) + '200.0 k' + >>> params_to_string(3e-9) + '3e-09' + """ + if units is None: + if num_params // 10**6 > 0: + return str(round(num_params / 10**6, precision)) + ' M' + elif num_params // 10**3: + return str(round(num_params / 10**3, precision)) + ' k' + else: + return str(num_params) + else: + if units == 'M': + return str(round(num_params / 10.**6, precision)) + ' ' + units + elif units == 'K': + return str(round(num_params / 10.**3, precision)) + ' ' + units + else: + return str(num_params) + + +def print_model_with_flops(model, + total_flops, + total_params, + units='GFLOPs', + precision=3, + ost=sys.stdout, + flush=False): + """Print a model with FLOPs for each layer. + + Args: + model (nn.Module): The model to be printed. + total_flops (float): Total FLOPs of the model. + total_params (float): Total parameter counts of the model. + units (str | None): Converted FLOPs units. Default: 'GFLOPs'. + precision (int): Digit number after the decimal point. Default: 3. + ost (stream): same as `file` param in :func:`print`. + Default: sys.stdout. + flush (bool): same as that in :func:`print`. Default: False. 
+ + Example: + >>> class ExampleModel(nn.Module): + + >>> def __init__(self): + >>> super().__init__() + >>> self.conv1 = nn.Conv2d(3, 8, 3) + >>> self.conv2 = nn.Conv2d(8, 256, 3) + >>> self.conv3 = nn.Conv2d(256, 8, 3) + >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1)) + >>> self.flatten = nn.Flatten() + >>> self.fc = nn.Linear(8, 1) + + >>> def forward(self, x): + >>> x = self.conv1(x) + >>> x = self.conv2(x) + >>> x = self.conv3(x) + >>> x = self.avg_pool(x) + >>> x = self.flatten(x) + >>> x = self.fc(x) + >>> return x + + >>> model = ExampleModel() + >>> x = (3, 16, 16) + to print the complexity information state for each layer, you can use + >>> get_model_complexity_info(model, x) + or directly use + >>> print_model_with_flops(model, 4579784.0, 37361) + ExampleModel( + 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs, + (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501 + (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1)) + (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1)) + (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1)) + (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, ) + (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True) + ) + """ + + def accumulate_params(self): + if is_supported_instance(self): + return self.__params__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_params() + return sum + + def accumulate_flops(self): + if is_supported_instance(self): + return self.__flops__ / model.__batch_counter__ + else: + sum = 0 + for m in self.children(): + sum += m.accumulate_flops() + return sum + + def flops_repr(self): + accumulated_num_params = self.accumulate_params() + accumulated_flops_cost = self.accumulate_flops() + return ', '.join([ + params_to_string( + accumulated_num_params, units='M', precision=precision), + '{:.3%} Params'.format(accumulated_num_params / total_params), + flops_to_string( + accumulated_flops_cost, units=units, precision=precision), + '{:.3%} FLOPs'.format(accumulated_flops_cost / total_flops), + self.original_extra_repr() + ]) + + def add_extra_repr(m): + m.accumulate_flops = accumulate_flops.__get__(m) + m.accumulate_params = accumulate_params.__get__(m) + flops_extra_repr = flops_repr.__get__(m) + if m.extra_repr != flops_extra_repr: + m.original_extra_repr = m.extra_repr + m.extra_repr = flops_extra_repr + assert m.extra_repr != m.original_extra_repr + + def del_extra_repr(m): + if hasattr(m, 'original_extra_repr'): + m.extra_repr = m.original_extra_repr + del m.original_extra_repr + if hasattr(m, 'accumulate_flops'): + del m.accumulate_flops + + model.apply(add_extra_repr) + print(model, file=ost, flush=flush) + model.apply(del_extra_repr) + + +def get_model_parameters_number(model): + """Calculate parameter number of a model. + + Args: + model (nn.module): The model for parameter number calculation. + + Returns: + float: Parameter number of the model. 
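+
+    Example:
+        An illustrative sketch (224 = 3 * 8 * 3 * 3 weights + 8 biases):
+
+        >>> get_model_parameters_number(nn.Conv2d(3, 8, 3))
+        224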
+ """ + num_params = sum(p.numel() for p in model.parameters() if p.requires_grad) + return num_params + + +def add_flops_counting_methods(net_main_module): + # adding additional methods to the existing module object, + # this is done this way so that each function has access to self object + net_main_module.start_flops_count = start_flops_count.__get__( + net_main_module) + net_main_module.stop_flops_count = stop_flops_count.__get__( + net_main_module) + net_main_module.reset_flops_count = reset_flops_count.__get__( + net_main_module) + net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # noqa: E501 + net_main_module) + + net_main_module.reset_flops_count() + + return net_main_module + + +def compute_average_flops_cost(self): + """Compute average FLOPs cost. + + A method to compute average FLOPs cost, which will be available after + `add_flops_counting_methods()` is called on a desired net object. + + Returns: + float: Current mean flops consumption per image. + """ + batches_count = self.__batch_counter__ + flops_sum = 0 + for module in self.modules(): + if is_supported_instance(module): + flops_sum += module.__flops__ + params_sum = get_model_parameters_number(self) + return flops_sum / batches_count, params_sum + + +def start_flops_count(self): + """Activate the computation of mean flops consumption per image. + + A method to activate the computation of mean flops consumption per image. + which will be available after ``add_flops_counting_methods()`` is called on + a desired net object. It should be called before running the network. + """ + add_batch_counter_hook_function(self) + + def add_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + return + + else: + handle = module.register_forward_hook( + get_modules_mapping()[type(module)]) + + module.__flops_handle__ = handle + + self.apply(partial(add_flops_counter_hook_function)) + + +def stop_flops_count(self): + """Stop computing the mean flops consumption per image. + + A method to stop computing the mean flops consumption per image, which will + be available after ``add_flops_counting_methods()`` is called on a desired + net object. It can be called to pause the computation whenever. + """ + remove_batch_counter_hook_function(self) + self.apply(remove_flops_counter_hook_function) + + +def reset_flops_count(self): + """Reset statistics computed so far. + + A method to Reset computed statistics, which will be available after + `add_flops_counting_methods()` is called on a desired net object. 
+ """ + add_batch_counter_variables_or_reset(self) + self.apply(add_flops_counter_variable_or_reset) + + +# ---- Internal functions +def empty_flops_counter_hook(module, input, output): + module.__flops__ += 0 + + +def upsample_flops_counter_hook(module, input, output): + output_size = output[0] + batch_size = output_size.shape[0] + output_elements_count = batch_size + for val in output_size.shape[1:]: + output_elements_count *= val + module.__flops__ += int(output_elements_count) + + +def relu_flops_counter_hook(module, input, output): + active_elements_count = output.numel() + module.__flops__ += int(active_elements_count) + + +def linear_flops_counter_hook(module, input, output): + input = input[0] + output_last_dim = output.shape[ + -1] # pytorch checks dimensions, so here we don't care much + module.__flops__ += int(np.prod(input.shape) * output_last_dim) + + +def pool_flops_counter_hook(module, input, output): + input = input[0] + module.__flops__ += int(np.prod(input.shape)) + + +def norm_flops_counter_hook(module, input, output): + input = input[0] + + batch_flops = np.prod(input.shape) + if (getattr(module, 'affine', False) + or getattr(module, 'elementwise_affine', False)): + batch_flops *= 2 + module.__flops__ += int(batch_flops) + + +def deconv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + input_height, input_width = input.shape[2:] + + kernel_height, kernel_width = conv_module.kernel_size + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = ( + kernel_height * kernel_width * in_channels * filters_per_channel) + + active_elements_count = batch_size * input_height * input_width + overall_conv_flops = conv_per_position_flops * active_elements_count + bias_flops = 0 + if conv_module.bias is not None: + output_height, output_width = output.shape[2:] + bias_flops = out_channels * batch_size * output_height * output_height + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def conv_flops_counter_hook(conv_module, input, output): + # Can have multiple inputs, getting the first one + input = input[0] + + batch_size = input.shape[0] + output_dims = list(output.shape[2:]) + + kernel_dims = list(conv_module.kernel_size) + in_channels = conv_module.in_channels + out_channels = conv_module.out_channels + groups = conv_module.groups + + filters_per_channel = out_channels // groups + conv_per_position_flops = int( + np.prod(kernel_dims)) * in_channels * filters_per_channel + + active_elements_count = batch_size * int(np.prod(output_dims)) + + overall_conv_flops = conv_per_position_flops * active_elements_count + + bias_flops = 0 + + if conv_module.bias is not None: + + bias_flops = out_channels * active_elements_count + + overall_flops = overall_conv_flops + bias_flops + + conv_module.__flops__ += int(overall_flops) + + +def batch_counter_hook(module, input, output): + batch_size = 1 + if len(input) > 0: + # Can have multiple inputs, getting the first one + input = input[0] + batch_size = len(input) + else: + pass + print('Warning! 
No positional inputs found for a module, ' + 'assuming batch size is 1.') + module.__batch_counter__ += batch_size + + +def add_batch_counter_variables_or_reset(module): + + module.__batch_counter__ = 0 + + +def add_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + return + + handle = module.register_forward_hook(batch_counter_hook) + module.__batch_counter_handle__ = handle + + +def remove_batch_counter_hook_function(module): + if hasattr(module, '__batch_counter_handle__'): + module.__batch_counter_handle__.remove() + del module.__batch_counter_handle__ + + +def add_flops_counter_variable_or_reset(module): + if is_supported_instance(module): + if hasattr(module, '__flops__') or hasattr(module, '__params__'): + print('Warning: variables __flops__ or __params__ are already ' + 'defined for the module' + type(module).__name__ + + ' ptflops can affect your code!') + module.__flops__ = 0 + module.__params__ = get_model_parameters_number(module) + + +def is_supported_instance(module): + if type(module) in get_modules_mapping(): + return True + return False + + +def remove_flops_counter_hook_function(module): + if is_supported_instance(module): + if hasattr(module, '__flops_handle__'): + module.__flops_handle__.remove() + del module.__flops_handle__ + + +def get_modules_mapping(): + return { + # convolutions + nn.Conv1d: conv_flops_counter_hook, + nn.Conv2d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook, + nn.Conv3d: conv_flops_counter_hook, + mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook, + # activations + nn.ReLU: relu_flops_counter_hook, + nn.PReLU: relu_flops_counter_hook, + nn.ELU: relu_flops_counter_hook, + nn.LeakyReLU: relu_flops_counter_hook, + nn.ReLU6: relu_flops_counter_hook, + # poolings + nn.MaxPool1d: pool_flops_counter_hook, + nn.AvgPool1d: pool_flops_counter_hook, + nn.AvgPool2d: pool_flops_counter_hook, + nn.MaxPool2d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook, + nn.MaxPool3d: pool_flops_counter_hook, + mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook, + nn.AvgPool3d: pool_flops_counter_hook, + nn.AdaptiveMaxPool1d: pool_flops_counter_hook, + nn.AdaptiveAvgPool1d: pool_flops_counter_hook, + nn.AdaptiveMaxPool2d: pool_flops_counter_hook, + nn.AdaptiveAvgPool2d: pool_flops_counter_hook, + nn.AdaptiveMaxPool3d: pool_flops_counter_hook, + nn.AdaptiveAvgPool3d: pool_flops_counter_hook, + # normalizations + nn.BatchNorm1d: norm_flops_counter_hook, + nn.BatchNorm2d: norm_flops_counter_hook, + nn.BatchNorm3d: norm_flops_counter_hook, + nn.GroupNorm: norm_flops_counter_hook, + nn.InstanceNorm1d: norm_flops_counter_hook, + nn.InstanceNorm2d: norm_flops_counter_hook, + nn.InstanceNorm3d: norm_flops_counter_hook, + nn.LayerNorm: norm_flops_counter_hook, + # FC + nn.Linear: linear_flops_counter_hook, + mmcv.cnn.bricks.Linear: linear_flops_counter_hook, + # Upscale + nn.Upsample: upsample_flops_counter_hook, + # Deconvolution + nn.ConvTranspose2d: deconv_flops_counter_hook, + mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook, + } diff --git a/custom_mmpkg/custom_mmcv/cnn/utils/fuse_conv_bn.py b/custom_mmpkg/custom_mmcv/cnn/utils/fuse_conv_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..cb7076f80bf37f7931185bf0293ffcc1ce19c8ef --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/utils/fuse_conv_bn.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
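+# Usage sketch (illustrative; fusion is meant for models in eval mode):
+#   >>> import torch.nn as nn
+#   >>> m = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8)).eval()
+#   >>> fused = fuse_conv_bn(m)  # the BN child is replaced by nn.Identity()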
+import torch +import torch.nn as nn + + +def _fuse_conv_bn(conv, bn): + """Fuse conv and bn into one module. + + Args: + conv (nn.Module): Conv to be fused. + bn (nn.Module): BN to be fused. + + Returns: + nn.Module: Fused module. + """ + conv_w = conv.weight + conv_b = conv.bias if conv.bias is not None else torch.zeros_like( + bn.running_mean) + + factor = bn.weight / torch.sqrt(bn.running_var + bn.eps) + conv.weight = nn.Parameter(conv_w * + factor.reshape([conv.out_channels, 1, 1, 1])) + conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias) + return conv + + +def fuse_conv_bn(module): + """Recursively fuse conv and bn in a module. + + During inference, the functionary of batch norm layers is turned off + but only the mean and var alone channels are used, which exposes the + chance to fuse it with the preceding conv layers to save computations and + simplify network structures. + + Args: + module (nn.Module): Module to be fused. + + Returns: + nn.Module: Fused module. + """ + last_conv = None + last_conv_name = None + + for name, child in module.named_children(): + if isinstance(child, + (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)): + if last_conv is None: # only fuse BN that is after Conv + continue + fused_conv = _fuse_conv_bn(last_conv, child) + module._modules[last_conv_name] = fused_conv + # To reduce changes, set BN as Identity instead of deleting it. + module._modules[name] = nn.Identity() + last_conv = None + elif isinstance(child, nn.Conv2d): + last_conv = child + last_conv_name = name + else: + fuse_conv_bn(child) + return module diff --git a/custom_mmpkg/custom_mmcv/cnn/utils/sync_bn.py b/custom_mmpkg/custom_mmcv/cnn/utils/sync_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..f75291daab5cfbf367621cef62b0067aed9fbd0d --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/utils/sync_bn.py @@ -0,0 +1,59 @@ +import torch + +import custom_mmpkg.custom_mmcv as mmcv + + +class _BatchNormXd(torch.nn.modules.batchnorm._BatchNorm): + """A general BatchNorm layer without input dimension check. + + Reproduced from @kapily's work: + (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) + The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc + is `_check_input_dim` that is designed for tensor sanity checks. + The check has been bypassed in this class for the convenience of converting + SyncBatchNorm. + """ + + def _check_input_dim(self, input): + return + + +def revert_sync_batchnorm(module): + """Helper function to convert all `SyncBatchNorm` (SyncBN) and + `mmcv.ops.sync_bn.SyncBatchNorm`(MMSyncBN) layers in the model to + `BatchNormXd` layers. + + Adapted from @kapily's work: + (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547) + + Args: + module (nn.Module): The module containing `SyncBatchNorm` layers. + + Returns: + module_output: The converted module with `BatchNormXd` layers. 
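+
+    Example:
+        An illustrative sketch:
+
+        >>> sync_bn = torch.nn.SyncBatchNorm(16)
+        >>> bn = revert_sync_batchnorm(sync_bn)
+        >>> type(bn).__name__
+        '_BatchNormXd'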
+ """ + module_output = module + module_checklist = [torch.nn.modules.batchnorm.SyncBatchNorm] + if hasattr(mmcv, 'ops'): + module_checklist.append(mmcv.ops.SyncBatchNorm) + if isinstance(module, tuple(module_checklist)): + module_output = _BatchNormXd(module.num_features, module.eps, + module.momentum, module.affine, + module.track_running_stats) + if module.affine: + # no_grad() may not be needed here but + # just to be consistent with `convert_sync_batchnorm()` + with torch.no_grad(): + module_output.weight = module.weight + module_output.bias = module.bias + module_output.running_mean = module.running_mean + module_output.running_var = module.running_var + module_output.num_batches_tracked = module.num_batches_tracked + module_output.training = module.training + # qconfig exists in quantized models + if hasattr(module, 'qconfig'): + module_output.qconfig = module.qconfig + for name, child in module.named_children(): + module_output.add_module(name, revert_sync_batchnorm(child)) + del module + return module_output diff --git a/custom_mmpkg/custom_mmcv/cnn/utils/weight_init.py b/custom_mmpkg/custom_mmcv/cnn/utils/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..7a5bb1755d2269829c113b98026aa0310a3d70cb --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/utils/weight_init.py @@ -0,0 +1,684 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import math +import warnings + +import numpy as np +import torch +import torch.nn as nn +from torch import Tensor + +from custom_mmpkg.custom_mmcv.utils import Registry, build_from_cfg, get_logger, print_log + +INITIALIZERS = Registry('initializer') + + +def update_init_info(module, init_info): + """Update the `_params_init_info` in the module if the value of parameters + are changed. + + Args: + module (obj:`nn.Module`): The module of PyTorch with a user-defined + attribute `_params_init_info` which records the initialization + information. + init_info (str): The string that describes the initialization. + """ + assert hasattr( + module, + '_params_init_info'), f'Can not find `_params_init_info` in {module}' + for name, param in module.named_parameters(): + + assert param in module._params_init_info, ( + f'Find a new :obj:`Parameter` ' + f'named `{name}` during executing the ' + f'`init_weights` of ' + f'`{module.__class__.__name__}`. ' + f'Please do not add or ' + f'replace parameters during executing ' + f'the `init_weights`. 
') + + # The parameter has been changed during executing the + # `init_weights` of module + mean_value = param.data.mean() + if module._params_init_info[param]['tmp_mean_value'] != mean_value: + module._params_init_info[param]['init_info'] = init_info + module._params_init_info[param]['tmp_mean_value'] = mean_value + + +def constant_init(module, val, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.constant_(module.weight, val) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def xavier_init(module, gain=1, bias=0, distribution='normal'): + assert distribution in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.xavier_uniform_(module.weight, gain=gain) + else: + nn.init.xavier_normal_(module.weight, gain=gain) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def normal_init(module, mean=0, std=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.normal_(module.weight, mean, std) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def trunc_normal_init(module: nn.Module, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + bias: float = 0) -> None: + if hasattr(module, 'weight') and module.weight is not None: + trunc_normal_(module.weight, mean, std, a, b) # type: ignore + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) # type: ignore + + +def uniform_init(module, a=0, b=1, bias=0): + if hasattr(module, 'weight') and module.weight is not None: + nn.init.uniform_(module.weight, a, b) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def kaiming_init(module, + a=0, + mode='fan_out', + nonlinearity='relu', + bias=0, + distribution='normal'): + assert distribution in ['uniform', 'normal'] + if hasattr(module, 'weight') and module.weight is not None: + if distribution == 'uniform': + nn.init.kaiming_uniform_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + else: + nn.init.kaiming_normal_( + module.weight, a=a, mode=mode, nonlinearity=nonlinearity) + if hasattr(module, 'bias') and module.bias is not None: + nn.init.constant_(module.bias, bias) + + +def caffe2_xavier_init(module, bias=0): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + kaiming_init( + module, + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + bias=bias, + distribution='uniform') + + +def bias_init_with_prob(prior_prob): + """initialize conv/fc bias value according to a given probability value.""" + bias_init = float(-np.log((1 - prior_prob) / prior_prob)) + return bias_init + + +def _get_bases_name(m): + return [b.__name__ for b in m.__class__.__bases__] + + +class BaseInit(object): + + def __init__(self, *, bias=0, bias_prob=None, layer=None): + self.wholemodule = False + if not isinstance(bias, (int, float)): + raise TypeError(f'bias must be a number, but got a {type(bias)}') + + if bias_prob is not None: + if not isinstance(bias_prob, float): + raise TypeError(f'bias_prob type must be float, \ + but got {type(bias_prob)}') + + if layer is not None: + if not isinstance(layer, (str, list)): + raise TypeError(f'layer must be a str or a list of str, \ + but got a {type(layer)}') + else: + layer = [] + + if bias_prob is not None: + self.bias 
= bias_init_with_prob(bias_prob) + else: + self.bias = bias + self.layer = [layer] if isinstance(layer, str) else layer + + def _get_init_info(self): + info = f'{self.__class__.__name__}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Constant') +class ConstantInit(BaseInit): + """Initialize module parameters with constant values. + + Args: + val (int | float): the value to fill the weights in the module with + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, val, **kwargs): + super().__init__(**kwargs) + self.val = val + + def __call__(self, module): + + def init(m): + if self.wholemodule: + constant_init(m, self.val, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + constant_init(m, self.val, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Xavier') +class XavierInit(BaseInit): + r"""Initialize module parameters with values according to the method + described in `Understanding the difficulty of training deep feedforward + neural networks - Glorot, X. & Bengio, Y. (2010). + `_ + + Args: + gain (int | float): an optional scaling factor. Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` + or ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, gain=1, distribution='normal', **kwargs): + super().__init__(**kwargs) + self.gain = gain + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + xavier_init(m, self.gain, self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + xavier_init(m, self.gain, self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: gain={self.gain}, ' \ + f'distribution={self.distribution}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Normal') +class NormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`. + + Args: + mean (int | float):the mean of the normal distribution. Defaults to 0. + std (int | float): the standard deviation of the normal distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
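+
+    Example:
+        A minimal sketch; the ``nn.Conv2d`` below is only an illustration.
+
+        >>> import torch.nn as nn
+        >>> conv = nn.Conv2d(3, 16, 3)
+        >>> NormalInit(mean=0.0, std=0.02, layer='Conv2d')(conv)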
+ + """ + + def __init__(self, mean=0, std=1, **kwargs): + super().__init__(**kwargs) + self.mean = mean + self.std = std + + def __call__(self, module): + + def init(m): + if self.wholemodule: + normal_init(m, self.mean, self.std, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + normal_init(m, self.mean, self.std, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: mean={self.mean},' \ + f' std={self.std}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='TruncNormal') +class TruncNormalInit(BaseInit): + r"""Initialize module parameters with the values drawn from the normal + distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values + outside :math:`[a, b]`. + + Args: + mean (float): the mean of the normal distribution. Defaults to 0. + std (float): the standard deviation of the normal distribution. + Defaults to 1. + a (float): The minimum cutoff value. + b ( float): The maximum cutoff value. + bias (float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + + """ + + def __init__(self, + mean: float = 0, + std: float = 1, + a: float = -2, + b: float = 2, + **kwargs) -> None: + super().__init__(**kwargs) + self.mean = mean + self.std = std + self.a = a + self.b = b + + def __call__(self, module: nn.Module) -> None: + + def init(m): + if self.wholemodule: + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + trunc_normal_init(m, self.mean, self.std, self.a, self.b, + self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, b={self.b},' \ + f' mean={self.mean}, std={self.std}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Uniform') +class UniformInit(BaseInit): + r"""Initialize module parameters with values drawn from the uniform + distribution :math:`\mathcal{U}(a, b)`. + + Args: + a (int | float): the lower bound of the uniform distribution. + Defaults to 0. + b (int | float): the upper bound of the uniform distribution. + Defaults to 1. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. 
+ """ + + def __init__(self, a=0, b=1, **kwargs): + super().__init__(**kwargs) + self.a = a + self.b = b + + def __call__(self, module): + + def init(m): + if self.wholemodule: + uniform_init(m, self.a, self.b, self.bias) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + uniform_init(m, self.a, self.b, self.bias) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a},' \ + f' b={self.b}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Kaiming') +class KaimingInit(BaseInit): + r"""Initialize module parameters with the values according to the method + described in `Delving deep into rectifiers: Surpassing human-level + performance on ImageNet classification - He, K. et al. (2015). + `_ + + Args: + a (int | float): the negative slope of the rectifier used after this + layer (only used with ``'leaky_relu'``). Defaults to 0. + mode (str): either ``'fan_in'`` or ``'fan_out'``. Choosing + ``'fan_in'`` preserves the magnitude of the variance of the weights + in the forward pass. Choosing ``'fan_out'`` preserves the + magnitudes in the backwards pass. Defaults to ``'fan_out'``. + nonlinearity (str): the non-linear function (`nn.functional` name), + recommended to use only with ``'relu'`` or ``'leaky_relu'`` . + Defaults to 'relu'. + bias (int | float): the value to fill the bias. Defaults to 0. + bias_prob (float, optional): the probability for bias initialization. + Defaults to None. + distribution (str): distribution either be ``'normal'`` or + ``'uniform'``. Defaults to ``'normal'``. + layer (str | list[str], optional): the layer will be initialized. + Defaults to None. + """ + + def __init__(self, + a=0, + mode='fan_out', + nonlinearity='relu', + distribution='normal', + **kwargs): + super().__init__(**kwargs) + self.a = a + self.mode = mode + self.nonlinearity = nonlinearity + self.distribution = distribution + + def __call__(self, module): + + def init(m): + if self.wholemodule: + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + else: + layername = m.__class__.__name__ + basesname = _get_bases_name(m) + if len(set(self.layer) & set([layername] + basesname)): + kaiming_init(m, self.a, self.mode, self.nonlinearity, + self.bias, self.distribution) + + module.apply(init) + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \ + f'nonlinearity={self.nonlinearity}, ' \ + f'distribution ={self.distribution}, bias={self.bias}' + return info + + +@INITIALIZERS.register_module(name='Caffe2Xavier') +class Caffe2XavierInit(KaimingInit): + # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch + # Acknowledgment to FAIR's internal code + def __init__(self, **kwargs): + super().__init__( + a=1, + mode='fan_in', + nonlinearity='leaky_relu', + distribution='uniform', + **kwargs) + + def __call__(self, module): + super().__call__(module) + + +@INITIALIZERS.register_module(name='Pretrained') +class PretrainedInit(object): + """Initialize module by loading a pretrained model. + + Args: + checkpoint (str): the checkpoint file of the pretrained model should + be load. + prefix (str, optional): the prefix of a sub-module in the pretrained + model. 
it is for loading a part of the pretrained model to + initialize. For example, if we would like to only load the + backbone of a detector model, we can set ``prefix='backbone.'``. + Defaults to None. + map_location (str): map tensors into proper locations. + """ + + def __init__(self, checkpoint, prefix=None, map_location=None): + self.checkpoint = checkpoint + self.prefix = prefix + self.map_location = map_location + + def __call__(self, module): + from custom_mmpkg.custom_mmcv.runner import (_load_checkpoint_with_prefix, load_checkpoint, + load_state_dict) + logger = get_logger('mmcv') + if self.prefix is None: + print_log(f'load model from: {self.checkpoint}', logger=logger) + load_checkpoint( + module, + self.checkpoint, + map_location=self.map_location, + strict=False, + logger=logger) + else: + print_log( + f'load {self.prefix} in model from: {self.checkpoint}', + logger=logger) + state_dict = _load_checkpoint_with_prefix( + self.prefix, self.checkpoint, map_location=self.map_location) + load_state_dict(module, state_dict, strict=False, logger=logger) + + if hasattr(module, '_params_init_info'): + update_init_info(module, init_info=self._get_init_info()) + + def _get_init_info(self): + info = f'{self.__class__.__name__}: load from {self.checkpoint}' + return info + + +def _initialize(module, cfg, wholemodule=False): + func = build_from_cfg(cfg, INITIALIZERS) + # wholemodule flag is for override mode, there is no layer key in override + # and initializer will give init values for the whole module with the name + # in override. + func.wholemodule = wholemodule + func(module) + + +def _initialize_override(module, override, cfg): + if not isinstance(override, (dict, list)): + raise TypeError(f'override must be a dict or a list of dict, \ + but got {type(override)}') + + override = [override] if isinstance(override, dict) else override + + for override_ in override: + + cp_override = copy.deepcopy(override_) + name = cp_override.pop('name', None) + if name is None: + raise ValueError('`override` must contain the key "name",' + f'but got {cp_override}') + # if override only has name key, it means use args in init_cfg + if not cp_override: + cp_override.update(cfg) + # if override has name key and other args except type key, it will + # raise error + elif 'type' not in cp_override.keys(): + raise ValueError( + f'`override` need "type" key, but got {cp_override}') + + if hasattr(module, name): + _initialize(getattr(module, name), cp_override, wholemodule=True) + else: + raise RuntimeError(f'module did not have attribute {name}, ' + f'but init_cfg is {cp_override}.') + + +def initialize(module, init_cfg): + """Initialize a module. + + Args: + module (``torch.nn.Module``): the module will be initialized. + init_cfg (dict | list[dict]): initialization configuration dict to + define initializer. OpenMMLab has implemented 6 initializers + including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``, + ``Kaiming``, and ``Pretrained``. 
+ Example: + >>> module = nn.Linear(2, 3, bias=True) + >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2) + >>> initialize(module, init_cfg) + + >>> module = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2)) + >>> # define key ``'layer'`` for initializing layer with different + >>> # configuration + >>> init_cfg = [dict(type='Constant', layer='Conv1d', val=1), + dict(type='Constant', layer='Linear', val=2)] + >>> initialize(module, init_cfg) + + >>> # define key``'override'`` to initialize some specific part in + >>> # module + >>> class FooNet(nn.Module): + >>> def __init__(self): + >>> super().__init__() + >>> self.feat = nn.Conv2d(3, 16, 3) + >>> self.reg = nn.Conv2d(16, 10, 3) + >>> self.cls = nn.Conv2d(16, 5, 3) + >>> model = FooNet() + >>> init_cfg = dict(type='Constant', val=1, bias=2, layer='Conv2d', + >>> override=dict(type='Constant', name='reg', val=3, bias=4)) + >>> initialize(model, init_cfg) + + >>> model = ResNet(depth=50) + >>> # Initialize weights with the pretrained model. + >>> init_cfg = dict(type='Pretrained', + checkpoint='torchvision://resnet50') + >>> initialize(model, init_cfg) + + >>> # Initialize weights of a sub-module with the specific part of + >>> # a pretrained model by using "prefix". + >>> url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\ + >>> 'retinanet_r50_fpn_1x_coco/'\ + >>> 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth' + >>> init_cfg = dict(type='Pretrained', + checkpoint=url, prefix='backbone.') + """ + if not isinstance(init_cfg, (dict, list)): + raise TypeError(f'init_cfg must be a dict or a list of dict, \ + but got {type(init_cfg)}') + + if isinstance(init_cfg, dict): + init_cfg = [init_cfg] + + for cfg in init_cfg: + # should deeply copy the original config because cfg may be used by + # other modules, e.g., one init_cfg shared by multiple bottleneck + # blocks, the expected cfg will be changed after pop and will change + # the initialization behavior of other modules + cp_cfg = copy.deepcopy(cfg) + override = cp_cfg.pop('override', None) + _initialize(module, cp_cfg) + + if override is not None: + cp_cfg.pop('layer', None) + _initialize_override(module, override, cp_cfg) + else: + # All attributes in module have same initialization. + pass + + +def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float, + b: float) -> Tensor: + # Method based on + # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + # Modified from + # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' + 'The distribution of values may be incorrect.', + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + lower = norm_cdf((a - mean) / std) + upper = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [lower, upper], then translate + # to [2lower-1, 2upper-1]. 
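+        # erfinv is the inverse of erf, so together with the
+        # mul_(std * sqrt(2)) and add_(mean) steps below this applies the
+        # inverse normal CDF, yielding samples from N(mean, std) truncated
+        # to [a, b].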
+ tensor.uniform_(2 * lower - 1, 2 * upper - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor: Tensor, + mean: float = 0., + std: float = 1., + a: float = -2., + b: float = 2.) -> Tensor: + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + Modified from + https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py + + Args: + tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`. + mean (float): the mean of the normal distribution. + std (float): the standard deviation of the normal distribution. + a (float): the minimum cutoff value. + b (float): the maximum cutoff value. + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) diff --git a/custom_mmpkg/custom_mmcv/cnn/vgg.py b/custom_mmpkg/custom_mmcv/cnn/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..8778b649561a45a9652b1a15a26c2d171e58f3e1 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/cnn/vgg.py @@ -0,0 +1,175 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +import torch.nn as nn + +from .utils import constant_init, kaiming_init, normal_init + + +def conv3x3(in_planes, out_planes, dilation=1): + """3x3 convolution with padding.""" + return nn.Conv2d( + in_planes, + out_planes, + kernel_size=3, + padding=dilation, + dilation=dilation) + + +def make_vgg_layer(inplanes, + planes, + num_blocks, + dilation=1, + with_bn=False, + ceil_mode=False): + layers = [] + for _ in range(num_blocks): + layers.append(conv3x3(inplanes, planes, dilation)) + if with_bn: + layers.append(nn.BatchNorm2d(planes)) + layers.append(nn.ReLU(inplace=True)) + inplanes = planes + layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode)) + + return layers + + +class VGG(nn.Module): + """VGG backbone. + + Args: + depth (int): Depth of vgg, from {11, 13, 16, 19}. + with_bn (bool): Use BatchNorm or not. + num_classes (int): number of classes for classification. + num_stages (int): VGG stages, normally 5. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze + running stats (mean and var). + bn_frozen (bool): Whether to freeze weight and bias of BN layers. 
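+
+    Example:
+        A minimal sketch; the input size below is only an illustration.
+
+        >>> import torch
+        >>> model = VGG(depth=11, out_indices=(4, ))
+        >>> x = torch.rand(1, 3, 32, 32)
+        >>> feat = model(x)
+        >>> feat.shape
+        torch.Size([1, 512, 1, 1])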
+ """ + + arch_settings = { + 11: (1, 1, 2, 2, 2), + 13: (2, 2, 2, 2, 2), + 16: (2, 2, 3, 3, 3), + 19: (2, 2, 4, 4, 4) + } + + def __init__(self, + depth, + with_bn=False, + num_classes=-1, + num_stages=5, + dilations=(1, 1, 1, 1, 1), + out_indices=(0, 1, 2, 3, 4), + frozen_stages=-1, + bn_eval=True, + bn_frozen=False, + ceil_mode=False, + with_last_pool=True): + super(VGG, self).__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for vgg') + assert num_stages >= 1 and num_stages <= 5 + stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + assert len(dilations) == num_stages + assert max(out_indices) <= num_stages + + self.num_classes = num_classes + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.bn_eval = bn_eval + self.bn_frozen = bn_frozen + + self.inplanes = 3 + start_idx = 0 + vgg_layers = [] + self.range_sub_modules = [] + for i, num_blocks in enumerate(self.stage_blocks): + num_modules = num_blocks * (2 + with_bn) + 1 + end_idx = start_idx + num_modules + dilation = dilations[i] + planes = 64 * 2**i if i < 4 else 512 + vgg_layer = make_vgg_layer( + self.inplanes, + planes, + num_blocks, + dilation=dilation, + with_bn=with_bn, + ceil_mode=ceil_mode) + vgg_layers.extend(vgg_layer) + self.inplanes = planes + self.range_sub_modules.append([start_idx, end_idx]) + start_idx = end_idx + if not with_last_pool: + vgg_layers.pop(-1) + self.range_sub_modules[-1][1] -= 1 + self.module_name = 'features' + self.add_module(self.module_name, nn.Sequential(*vgg_layers)) + + if self.num_classes > 0: + self.classifier = nn.Sequential( + nn.Linear(512 * 7 * 7, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(True), + nn.Dropout(), + nn.Linear(4096, num_classes), + ) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + from ..runner import load_checkpoint + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + elif isinstance(m, nn.Linear): + normal_init(m, std=0.01) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + outs = [] + vgg_layers = getattr(self, self.module_name) + for i in range(len(self.stage_blocks)): + for j in range(*self.range_sub_modules[i]): + vgg_layer = vgg_layers[j] + x = vgg_layer(x) + if i in self.out_indices: + outs.append(x) + if self.num_classes > 0: + x = x.view(x.size(0), -1) + x = self.classifier(x) + outs.append(x) + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def train(self, mode=True): + super(VGG, self).train(mode) + if self.bn_eval: + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + m.eval() + if self.bn_frozen: + for params in m.parameters(): + params.requires_grad = False + vgg_layers = getattr(self, self.module_name) + if mode and self.frozen_stages >= 0: + for i in range(self.frozen_stages): + for j in range(*self.range_sub_modules[i]): + mod = vgg_layers[j] + mod.eval() + for param in mod.parameters(): + param.requires_grad = False diff --git a/custom_mmpkg/custom_mmcv/engine/__init__.py b/custom_mmpkg/custom_mmcv/engine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3193b7f664e19ce2458d81c836597fa22e4bb082 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/engine/__init__.py @@ -0,0 +1,8 @@ +# Copyright 
(c) OpenMMLab. All rights reserved. +from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test, + single_gpu_test) + +__all__ = [ + 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test', + 'single_gpu_test' +] diff --git a/custom_mmpkg/custom_mmcv/engine/test.py b/custom_mmpkg/custom_mmcv/engine/test.py new file mode 100644 index 0000000000000000000000000000000000000000..ac64007f1784b8999b969b9fe4baca393c44d257 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/engine/test.py @@ -0,0 +1,202 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import pickle +import shutil +import tempfile +import time + +import torch +import torch.distributed as dist + +import custom_mmpkg.custom_mmcv as mmcv +from custom_mmpkg.custom_mmcv.runner import get_dist_info + + +def single_gpu_test(model, data_loader): + """Test model with a single gpu. + + This method tests model with a single gpu and displays test progress bar. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for data in data_loader: + with torch.no_grad(): + result = model(return_loss=False, **data) + results.extend(result) + + # Assume result has the same length of batch_size + # refer to https://github.com/open-mmlab/mmcv/issues/985 + batch_size = len(result) + for _ in range(batch_size): + prog_bar.update() + return results + + +def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting + ``gpu_collect=True``, it encodes results to gpu tensors and use gpu + communication for results collection. On cpu mode it saves the results on + different gpus to ``tmpdir`` and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (nn.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + + Returns: + list: The prediction results. + """ + model.eval() + results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + time.sleep(2) # This line can prevent deadlock problem in some cases. + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, **data) + results.extend(result) + + if rank == 0: + batch_size = len(result) + batch_size_all = batch_size * world_size + if batch_size_all + prog_bar.completed > len(dataset): + batch_size_all = len(dataset) - prog_bar.completed + for _ in range(batch_size_all): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + """Collect results under cpu mode. + + On cpu mode, this function will save the results on different gpus to + ``tmpdir`` and collect them by the rank 0 worker. + + Args: + result_part (list): Result list containing result parts + to be collected. + size (int): Size of the results, commonly equal to length of + the results. 
+ tmpdir (str | None): temporal directory for collected results to + store. If set to None, it will create a random temporal directory + for it. + + Returns: + list: The collected results. + """ + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + mmcv.mkdir_or_exist('.dist_test') + tmpdir = tempfile.mkdtemp(dir='.dist_test') + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl')) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, f'part_{i}.pkl') + part_result = mmcv.load(part_file) + # When data is severely insufficient, an empty part_result + # on a certain gpu could makes the overall outputs empty. + if part_result: + part_list.append(part_result) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + """Collect results under gpu mode. + + On gpu mode, this function will encode results to gpu tensors and use gpu + communication for results collection. + + Args: + result_part (list): Result list containing result parts + to be collected. + size (int): Size of the results, commonly equal to length of + the results. + + Returns: + list: The collected results. + """ + rank, world_size = get_dist_info() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()) + # When data is severely insufficient, an empty part_result + # on a certain gpu could makes the overall outputs empty. 
+ if part_result: + part_list.append(part_result) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results diff --git a/custom_mmpkg/custom_mmcv/fileio/__init__.py b/custom_mmpkg/custom_mmcv/fileio/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2051b85f7e59bff7bdbaa131849ce8cd31f059a4 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/fileio/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .file_client import BaseStorageBackend, FileClient +from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler +from .io import dump, load, register_handler +from .parse import dict_from_file, list_from_file + +__all__ = [ + 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler', + 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', + 'list_from_file', 'dict_from_file' +] diff --git a/custom_mmpkg/custom_mmcv/fileio/file_client.py b/custom_mmpkg/custom_mmcv/fileio/file_client.py new file mode 100644 index 0000000000000000000000000000000000000000..c060e6e88cce26d13b297d7aeca83e7b2be119bc --- /dev/null +++ b/custom_mmpkg/custom_mmcv/fileio/file_client.py @@ -0,0 +1,1148 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import os +import os.path as osp +import re +import tempfile +import warnings +from abc import ABCMeta, abstractmethod +from contextlib import contextmanager +from pathlib import Path +from typing import Iterable, Iterator, Optional, Tuple, Union +from urllib.request import urlopen + +import custom_mmpkg.custom_mmcv as mmcv +from custom_mmpkg.custom_mmcv.utils.misc import has_method +from custom_mmpkg.custom_mmcv.utils.path import is_filepath + + +class BaseStorageBackend(metaclass=ABCMeta): + """Abstract class of storage backends. + + All backends need to implement two apis: ``get()`` and ``get_text()``. + ``get()`` reads the file as a byte stream and ``get_text()`` reads the file + as texts. + """ + + # a flag to indicate whether the backend can create a symlink for a file + _allow_symlink = False + + @property + def name(self): + return self.__class__.__name__ + + @property + def allow_symlink(self): + return self._allow_symlink + + @abstractmethod + def get(self, filepath): + pass + + @abstractmethod + def get_text(self, filepath): + pass + + +class CephBackend(BaseStorageBackend): + """Ceph storage backend (for internal use). + + Args: + path_mapping (dict|None): path mapping dict from local path to Petrel + path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath`` + will be replaced by ``dst``. Default: None. + + .. warning:: + :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, + please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. 
+ """ + + def __init__(self, path_mapping=None): + try: + import ceph + except ImportError: + raise ImportError('Please install ceph to enable CephBackend.') + + warnings.warn( + 'CephBackend will be deprecated, please use PetrelBackend instead') + self._client = ceph.S3Client() + assert isinstance(path_mapping, dict) or path_mapping is None + self.path_mapping = path_mapping + + def get(self, filepath): + filepath = str(filepath) + if self.path_mapping is not None: + for k, v in self.path_mapping.items(): + filepath = filepath.replace(k, v) + value = self._client.Get(filepath) + value_buf = memoryview(value) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class PetrelBackend(BaseStorageBackend): + """Petrel storage backend (for internal use). + + PetrelBackend supports reading and writing data to multiple clusters. + If the file path contains the cluster name, PetrelBackend will read data + from specified cluster or write data to it. Otherwise, PetrelBackend will + access the default cluster. + + Args: + path_mapping (dict, optional): Path mapping dict from local path to + Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in + ``filepath`` will be replaced by ``dst``. Default: None. + enable_mc (bool, optional): Whether to enable memcached support. + Default: True. + + Examples: + >>> filepath1 = 's3://path/of/file' + >>> filepath2 = 'cluster-name:s3://path/of/file' + >>> client = PetrelBackend() + >>> client.get(filepath1) # get data from default cluster + >>> client.get(filepath2) # get data from 'cluster-name' cluster + """ + + def __init__(self, + path_mapping: Optional[dict] = None, + enable_mc: bool = True): + try: + from petrel_client import client + except ImportError: + raise ImportError('Please install petrel_client to enable ' + 'PetrelBackend.') + + self._client = client.Client(enable_mc=enable_mc) + assert isinstance(path_mapping, dict) or path_mapping is None + self.path_mapping = path_mapping + + def _map_path(self, filepath: Union[str, Path]) -> str: + """Map ``filepath`` to a string path whose prefix will be replaced by + :attr:`self.path_mapping`. + + Args: + filepath (str): Path to be mapped. + """ + filepath = str(filepath) + if self.path_mapping is not None: + for k, v in self.path_mapping.items(): + filepath = filepath.replace(k, v) + return filepath + + def _format_path(self, filepath: str) -> str: + """Convert a ``filepath`` to standard format of petrel oss. + + If the ``filepath`` is concatenated by ``os.path.join``, in a Windows + environment, the ``filepath`` will be the format of + 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the + above ``filepath`` will be converted to 's3://bucket_name/image.jpg'. + + Args: + filepath (str): Path to be formatted. + """ + return re.sub(r'\\+', '/', filepath) + + def get(self, filepath: Union[str, Path]) -> memoryview: + """Read data from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + + Returns: + memoryview: A memory view of expected bytes object to avoid + copying. The memoryview object can be converted to bytes by + ``value_buf.tobytes()``. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + value = self._client.Get(filepath) + value_buf = memoryview(value) + return value_buf + + def get_text(self, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. 
+ + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + return str(self.get(filepath), encoding=encoding) + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Save data to a given ``filepath``. + + Args: + obj (bytes): Data to be saved. + filepath (str or Path): Path to write data. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + self._client.put(filepath, obj) + + def put_text(self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> None: + """Save data to a given ``filepath``. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str): The encoding format used to encode the ``obj``. + Default: 'utf-8'. + """ + self.put(bytes(obj, encoding=encoding), filepath) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + """ + if not has_method(self._client, 'delete'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `delete` method, please use a higher version or dev' + ' branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + self._client.delete(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + if not (has_method(self._client, 'contains') + and has_method(self._client, 'isdir')): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `contains` and `isdir` methods, please use a higher' + 'version or dev branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.contains(filepath) or self._client.isdir(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + if not has_method(self._client, 'isdir'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `isdir` method, please use a higher version or dev' + ' branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + if not has_method(self._client, 'contains'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `contains` method, please use a higher version or ' + 'dev branch instead.')) + + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + return self._client.contains(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Args: + filepath (str or Path): Path to be concatenated. 
+ + Returns: + str: The result after concatenation. + """ + filepath = self._format_path(self._map_path(filepath)) + if filepath.endswith('/'): + filepath = filepath[:-1] + formatted_paths = [filepath] + for path in filepaths: + formatted_paths.append(self._format_path(self._map_path(path))) + return '/'.join(formatted_paths) + + @contextmanager + def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + """Download a file from ``filepath`` and return a temporary path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Args: + filepath (str | Path): Download a file from ``filepath``. + + Examples: + >>> client = PetrelBackend() + >>> # After existing from the ``with`` clause, + >>> # the path will be removed + >>> with client.get_local_path('s3://path/of/your/file') as path: + ... # do something here + + Yields: + Iterable[str]: Only yield one temporary path. + """ + filepath = self._map_path(filepath) + filepath = self._format_path(filepath) + assert self.isfile(filepath) + try: + f = tempfile.NamedTemporaryFile(delete=False) + f.write(self.get(filepath)) + f.close() + yield f.name + finally: + os.remove(f.name) + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + Petrel has no concept of directories but it simulates the directory + hierarchy in the filesystem through public prefixes. In addition, + if the returned path ends with '/', it means the path is a public + prefix which is a logical directory. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + In addition, the returned path of directory will not contains the + suffix '/' which is consistent with other backends. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. 
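+
+        Example:
+            A minimal sketch; the bucket path is hypothetical.
+
+            >>> client = PetrelBackend()
+            >>> for path in client.list_dir_or_file('s3://bucket/dir'):
+            ...     print(path)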
+ """ + if not has_method(self._client, 'list'): + raise NotImplementedError( + ('Current version of Petrel Python SDK has not supported ' + 'the `list` method, please use a higher version or dev' + ' branch instead.')) + + dir_path = self._map_path(dir_path) + dir_path = self._format_path(dir_path) + if list_dir and suffix is not None: + raise TypeError( + '`list_dir` should be False when `suffix` is not None') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('`suffix` must be a string or tuple of strings') + + # Petrel's simulated directory hierarchy assumes that directory paths + # should end with `/` + if not dir_path.endswith('/'): + dir_path += '/' + + root = dir_path + + def _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive): + for path in self._client.list(dir_path): + # the `self.isdir` is not used here to determine whether path + # is a directory, because `self.isdir` relies on + # `self._client.list` + if path.endswith('/'): # a directory path + next_dir_path = self.join_path(dir_path, path) + if list_dir: + # get the relative path and exclude the last + # character '/' + rel_dir = next_dir_path[len(root):-1] + yield rel_dir + if recursive: + yield from _list_dir_or_file(next_dir_path, list_dir, + list_file, suffix, + recursive) + else: # a file path + absolute_path = self.join_path(dir_path, path) + rel_path = absolute_path[len(root):] + if (suffix is None + or rel_path.endswith(suffix)) and list_file: + yield rel_path + + return _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) + + +class MemcachedBackend(BaseStorageBackend): + """Memcached storage backend. + + Attributes: + server_list_cfg (str): Config file for memcached server list. + client_cfg (str): Config file for memcached client. + sys_path (str | None): Additional path to be appended to `sys.path`. + Default: None. + """ + + def __init__(self, server_list_cfg, client_cfg, sys_path=None): + if sys_path is not None: + import sys + sys.path.append(sys_path) + try: + import mc + except ImportError: + raise ImportError( + 'Please install memcached to enable MemcachedBackend.') + + self.server_list_cfg = server_list_cfg + self.client_cfg = client_cfg + self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg, + self.client_cfg) + # mc.pyvector servers as a point which points to a memory cache + self._mc_buffer = mc.pyvector() + + def get(self, filepath): + filepath = str(filepath) + import mc + self._client.Get(filepath, self._mc_buffer) + value_buf = mc.ConvertBuffer(self._mc_buffer) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class LmdbBackend(BaseStorageBackend): + """Lmdb storage backend. + + Args: + db_path (str): Lmdb database path. + readonly (bool, optional): Lmdb environment parameter. If True, + disallow any write operations. Default: True. + lock (bool, optional): Lmdb environment parameter. If False, when + concurrent access occurs, do not lock the database. Default: False. + readahead (bool, optional): Lmdb environment parameter. If False, + disable the OS filesystem readahead mechanism, which may improve + random read performance when a database is larger than RAM. + Default: False. + + Attributes: + db_path (str): Lmdb database path. 
+ """ + + def __init__(self, + db_path, + readonly=True, + lock=False, + readahead=False, + **kwargs): + try: + import lmdb + except ImportError: + raise ImportError('Please install lmdb to enable LmdbBackend.') + + self.db_path = str(db_path) + self._client = lmdb.open( + self.db_path, + readonly=readonly, + lock=lock, + readahead=readahead, + **kwargs) + + def get(self, filepath): + """Get values according to the filepath. + + Args: + filepath (str | obj:`Path`): Here, filepath is the lmdb key. + """ + filepath = str(filepath) + with self._client.begin(write=False) as txn: + value_buf = txn.get(filepath.encode('ascii')) + return value_buf + + def get_text(self, filepath, encoding=None): + raise NotImplementedError + + +class HardDiskBackend(BaseStorageBackend): + """Raw hard disks storage backend.""" + + _allow_symlink = True + + def get(self, filepath: Union[str, Path]) -> bytes: + """Read data from a given ``filepath`` with 'rb' mode. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes: Expected bytes object. + """ + with open(filepath, 'rb') as f: + value_buf = f.read() + return value_buf + + def get_text(self, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + with open(filepath, 'r', encoding=encoding) as f: + value_buf = f.read() + return value_buf + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'wb' mode. + + Note: + ``put`` will create a directory if the directory of ``filepath`` + does not exist. + + Args: + obj (bytes): Data to be written. + filepath (str or Path): Path to write data. + """ + mmcv.mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'wb') as f: + f.write(obj) + + def put_text(self, + obj: str, + filepath: Union[str, Path], + encoding: str = 'utf-8') -> None: + """Write data to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` will create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + """ + mmcv.mkdir_or_exist(osp.dirname(filepath)) + with open(filepath, 'w', encoding=encoding) as f: + f.write(obj) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str or Path): Path to be removed. + """ + os.remove(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + return osp.exists(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + return osp.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. 
+ + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + return osp.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of *filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + """ + return osp.join(filepath, *filepaths) + + @contextmanager + def get_local_path( + self, filepath: Union[str, Path]) -> Iterable[Union[str, Path]]: + """Only for unified API and do nothing.""" + yield filepath + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + """ + if list_dir and suffix is not None: + raise TypeError('`suffix` should be None when `list_dir` is True') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('`suffix` must be a string or tuple of strings') + + root = dir_path + + def _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + rel_path = osp.relpath(entry.path, root) + if (suffix is None + or rel_path.endswith(suffix)) and list_file: + yield rel_path + elif osp.isdir(entry.path): + if list_dir: + rel_dir = osp.relpath(entry.path, root) + yield rel_dir + if recursive: + yield from _list_dir_or_file(entry.path, list_dir, + list_file, suffix, + recursive) + + return _list_dir_or_file(dir_path, list_dir, list_file, suffix, + recursive) + + +class HTTPBackend(BaseStorageBackend): + """HTTP and HTTPS storage bachend.""" + + def get(self, filepath): + value_buf = urlopen(filepath).read() + return value_buf + + def get_text(self, filepath, encoding='utf-8'): + value_buf = urlopen(filepath).read() + return value_buf.decode(encoding) + + @contextmanager + def get_local_path(self, filepath: str) -> Iterable[str]: + """Download a file from ``filepath``. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Args: + filepath (str): Download a file from ``filepath``. + + Examples: + >>> client = HTTPBackend() + >>> # After existing from the ``with`` clause, + >>> # the path will be removed + >>> with client.get_local_path('http://path/of/your/file') as path: + ... # do something here + """ + try: + f = tempfile.NamedTemporaryFile(delete=False) + f.write(self.get(filepath)) + f.close() + yield f.name + finally: + os.remove(f.name) + + +class FileClient: + """A general file client to access files in different backends. 
+ + The client loads a file or text in a specified backend from its path + and returns it as a binary or text file. There are two ways to choose a + backend, the name of backend and the prefix of path. Although both of them + can be used to choose a storage backend, ``backend`` has a higher priority + that is if they are all set, the storage backend will be chosen by the + backend argument. If they are all `None`, the disk backend will be chosen. + Note that It can also register other backend accessor with a given name, + prefixes, and backend class. In addition, We use the singleton pattern to + avoid repeated object creation. If the arguments are the same, the same + object will be returned. + + Args: + backend (str, optional): The storage backend type. Options are "disk", + "ceph", "memcached", "lmdb", "http" and "petrel". Default: None. + prefix (str, optional): The prefix of the registered storage backend. + Options are "s3", "http", "https". Default: None. + + Examples: + >>> # only set backend + >>> file_client = FileClient(backend='petrel') + >>> # only set prefix + >>> file_client = FileClient(prefix='s3') + >>> # set both backend and prefix but use backend to choose client + >>> file_client = FileClient(backend='petrel', prefix='s3') + >>> # if the arguments are the same, the same object is returned + >>> file_client1 = FileClient(backend='petrel') + >>> file_client1 is file_client + True + + Attributes: + client (:obj:`BaseStorageBackend`): The backend object. + """ + + _backends = { + 'disk': HardDiskBackend, + 'ceph': CephBackend, + 'memcached': MemcachedBackend, + 'lmdb': LmdbBackend, + 'petrel': PetrelBackend, + 'http': HTTPBackend, + } + # This collection is used to record the overridden backends, and when a + # backend appears in the collection, the singleton pattern is disabled for + # that backend, because if the singleton pattern is used, then the object + # returned will be the backend before overwriting + _overridden_backends = set() + _prefix_to_backends = { + 's3': PetrelBackend, + 'http': HTTPBackend, + 'https': HTTPBackend, + } + _overridden_prefixes = set() + + _instances = {} + + def __new__(cls, backend=None, prefix=None, **kwargs): + if backend is None and prefix is None: + backend = 'disk' + if backend is not None and backend not in cls._backends: + raise ValueError( + f'Backend {backend} is not supported. Currently supported ones' + f' are {list(cls._backends.keys())}') + if prefix is not None and prefix not in cls._prefix_to_backends: + raise ValueError( + f'prefix {prefix} is not supported. 
Currently supported ones ' + f'are {list(cls._prefix_to_backends.keys())}') + + # concatenate the arguments to a unique key for determining whether + # objects with the same arguments were created + arg_key = f'{backend}:{prefix}' + for key, value in kwargs.items(): + arg_key += f':{key}:{value}' + + # if a backend was overridden, it will create a new object + if (arg_key in cls._instances + and backend not in cls._overridden_backends + and prefix not in cls._overridden_prefixes): + _instance = cls._instances[arg_key] + else: + # create a new object and put it to _instance + _instance = super().__new__(cls) + if backend is not None: + _instance.client = cls._backends[backend](**kwargs) + else: + _instance.client = cls._prefix_to_backends[prefix](**kwargs) + + cls._instances[arg_key] = _instance + + return _instance + + @property + def name(self): + return self.client.name + + @property + def allow_symlink(self): + return self.client.allow_symlink + + @staticmethod + def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]: + """Parse the prefix of a uri. + + Args: + uri (str | Path): Uri to be parsed that contains the file prefix. + + Examples: + >>> FileClient.parse_uri_prefix('s3://path/of/your/file') + 's3' + + Returns: + str | None: Return the prefix of uri if the uri contains '://' + else ``None``. + """ + assert is_filepath(uri) + uri = str(uri) + if '://' not in uri: + return None + else: + prefix, _ = uri.split('://') + # In the case of PetrelBackend, the prefix may contains the cluster + # name like clusterName:s3 + if ':' in prefix: + _, prefix = prefix.split(':') + return prefix + + @classmethod + def infer_client(cls, + file_client_args: Optional[dict] = None, + uri: Optional[Union[str, Path]] = None) -> 'FileClient': + """Infer a suitable file client based on the URI and arguments. + + Args: + file_client_args (dict, optional): Arguments to instantiate a + FileClient. Default: None. + uri (str | Path, optional): Uri to be parsed that contains the file + prefix. Default: None. + + Examples: + >>> uri = 's3://path/of/your/file' + >>> file_client = FileClient.infer_client(uri=uri) + >>> file_client_args = {'backend': 'petrel'} + >>> file_client = FileClient.infer_client(file_client_args) + + Returns: + FileClient: Instantiated FileClient object. 
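# --- Editor's sketch (not part of the patch above): how the FileClient
# factory is meant to be used. Assumes FileClient is importable from
# custom_mmpkg.custom_mmcv.fileio.file_client and only exercises the
# built-in 'disk' backend.
from custom_mmpkg.custom_mmcv.fileio.file_client import FileClient

client_a = FileClient(backend='disk')
client_b = FileClient(backend='disk')
assert client_a is client_b                       # same args -> same cached instance

assert FileClient.parse_uri_prefix('s3://bucket/key.bin') == 's3'
assert FileClient.parse_uri_prefix('/tmp/key.bin') is None

# with no recognised prefix, infer_client falls back to the disk backend
assert FileClient.infer_client(uri='/tmp/key.bin') is client_a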
+ """ + assert file_client_args is not None or uri is not None + if file_client_args is None: + file_prefix = cls.parse_uri_prefix(uri) # type: ignore + return cls(prefix=file_prefix) + else: + return cls(**file_client_args) + + @classmethod + def _register_backend(cls, name, backend, force=False, prefixes=None): + if not isinstance(name, str): + raise TypeError('the backend name should be a string, ' + f'but got {type(name)}') + if not inspect.isclass(backend): + raise TypeError( + f'backend should be a class but got {type(backend)}') + if not issubclass(backend, BaseStorageBackend): + raise TypeError( + f'backend {backend} is not a subclass of BaseStorageBackend') + if not force and name in cls._backends: + raise KeyError( + f'{name} is already registered as a storage backend, ' + 'add "force=True" if you want to override it') + + if name in cls._backends and force: + cls._overridden_backends.add(name) + cls._backends[name] = backend + + if prefixes is not None: + if isinstance(prefixes, str): + prefixes = [prefixes] + else: + assert isinstance(prefixes, (list, tuple)) + for prefix in prefixes: + if prefix not in cls._prefix_to_backends: + cls._prefix_to_backends[prefix] = backend + elif (prefix in cls._prefix_to_backends) and force: + cls._overridden_prefixes.add(prefix) + cls._prefix_to_backends[prefix] = backend + else: + raise KeyError( + f'{prefix} is already registered as a storage backend,' + ' add "force=True" if you want to override it') + + @classmethod + def register_backend(cls, name, backend=None, force=False, prefixes=None): + """Register a backend to FileClient. + + This method can be used as a normal class method or a decorator. + + .. code-block:: python + + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + FileClient.register_backend('new', NewBackend) + + or + + .. code-block:: python + + @FileClient.register_backend('new') + class NewBackend(BaseStorageBackend): + + def get(self, filepath): + return filepath + + def get_text(self, filepath): + return filepath + + Args: + name (str): The name of the registered backend. + backend (class, optional): The backend class to be registered, + which must be a subclass of :class:`BaseStorageBackend`. + When this method is used as a decorator, backend is None. + Defaults to None. + force (bool, optional): Whether to override the backend if the name + has already been registered. Defaults to False. + prefixes (str or list[str] or tuple[str], optional): The prefixes + of the registered storage backend. Default: None. + `New in version 1.3.15.` + """ + if backend is not None: + cls._register_backend( + name, backend, force=force, prefixes=prefixes) + return + + def _register(backend_cls): + cls._register_backend( + name, backend_cls, force=force, prefixes=prefixes) + return backend_cls + + return _register + + def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]: + """Read data from a given ``filepath`` with 'rb' mode. + + Note: + There are two types of return values for ``get``, one is ``bytes`` + and the other is ``memoryview``. The advantage of using memoryview + is that you can avoid copying, and if you want to convert it to + ``bytes``, you can use ``.tobytes()``. + + Args: + filepath (str or Path): Path to read data. + + Returns: + bytes | memoryview: Expected bytes object or a memory view of the + bytes object. 
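# --- Editor's sketch (not part of the patch above): the decorator form of
# register_backend. Assumes BaseStorageBackend is defined next to FileClient
# in fileio/file_client.py, mirroring upstream mmcv.
from custom_mmpkg.custom_mmcv.fileio.file_client import (BaseStorageBackend,
                                                         FileClient)


@FileClient.register_backend('memory', prefixes='mem')
class MemoryBackend(BaseStorageBackend):
    """Toy backend serving bytes from an in-process dict."""

    _store = {'mem://demo.txt': b'hello world'}

    def get(self, filepath):
        return self._store[str(filepath)]

    def get_text(self, filepath, encoding='utf-8'):
        return self.get(filepath).decode(encoding)


client = FileClient(backend='memory')    # FileClient(prefix='mem') also works now
assert client.get_text('mem://demo.txt') == 'hello world'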
+ """ + return self.client.get(filepath) + + def get_text(self, filepath: Union[str, Path], encoding='utf-8') -> str: + """Read data from a given ``filepath`` with 'r' mode. + + Args: + filepath (str or Path): Path to read data. + encoding (str): The encoding format used to open the ``filepath``. + Default: 'utf-8'. + + Returns: + str: Expected text reading from ``filepath``. + """ + return self.client.get_text(filepath, encoding) + + def put(self, obj: bytes, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'wb' mode. + + Note: + ``put`` should create a directory if the directory of ``filepath`` + does not exist. + + Args: + obj (bytes): Data to be written. + filepath (str or Path): Path to write data. + """ + self.client.put(obj, filepath) + + def put_text(self, obj: str, filepath: Union[str, Path]) -> None: + """Write data to a given ``filepath`` with 'w' mode. + + Note: + ``put_text`` should create a directory if the directory of + ``filepath`` does not exist. + + Args: + obj (str): Data to be written. + filepath (str or Path): Path to write data. + encoding (str, optional): The encoding format used to open the + `filepath`. Default: 'utf-8'. + """ + self.client.put_text(obj, filepath) + + def remove(self, filepath: Union[str, Path]) -> None: + """Remove a file. + + Args: + filepath (str, Path): Path to be removed. + """ + self.client.remove(filepath) + + def exists(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path exists. + + Args: + filepath (str or Path): Path to be checked whether exists. + + Returns: + bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise. + """ + return self.client.exists(filepath) + + def isdir(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a directory. + + Args: + filepath (str or Path): Path to be checked whether it is a + directory. + + Returns: + bool: Return ``True`` if ``filepath`` points to a directory, + ``False`` otherwise. + """ + return self.client.isdir(filepath) + + def isfile(self, filepath: Union[str, Path]) -> bool: + """Check whether a file path is a file. + + Args: + filepath (str or Path): Path to be checked whether it is a file. + + Returns: + bool: Return ``True`` if ``filepath`` points to a file, ``False`` + otherwise. + """ + return self.client.isfile(filepath) + + def join_path(self, filepath: Union[str, Path], + *filepaths: Union[str, Path]) -> str: + """Concatenate all file paths. + + Join one or more filepath components intelligently. The return value + is the concatenation of filepath and any members of *filepaths. + + Args: + filepath (str or Path): Path to be concatenated. + + Returns: + str: The result of concatenation. + """ + return self.client.join_path(filepath, *filepaths) + + @contextmanager + def get_local_path(self, filepath: Union[str, Path]) -> Iterable[str]: + """Download data from ``filepath`` and write the data to local path. + + ``get_local_path`` is decorated by :meth:`contxtlib.contextmanager`. It + can be called with ``with`` statement, and when exists from the + ``with`` statement, the temporary path will be released. + + Note: + If the ``filepath`` is a local path, just return itself. + + .. warning:: + ``get_local_path`` is an experimental interface that may change in + the future. + + Args: + filepath (str or Path): Path to be read data. + + Examples: + >>> file_client = FileClient(prefix='s3') + >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path: + ... 
# do something here + + Yields: + Iterable[str]: Only yield one path. + """ + with self.client.get_local_path(str(filepath)) as local_path: + yield local_path + + def list_dir_or_file(self, + dir_path: Union[str, Path], + list_dir: bool = True, + list_file: bool = True, + suffix: Optional[Union[str, Tuple[str]]] = None, + recursive: bool = False) -> Iterator[str]: + """Scan a directory to find the interested directories or files in + arbitrary order. + + Note: + :meth:`list_dir_or_file` returns the path relative to ``dir_path``. + + Args: + dir_path (str | Path): Path of the directory. + list_dir (bool): List the directories. Default: True. + list_file (bool): List the path of files. Default: True. + suffix (str or tuple[str], optional): File suffix + that we are interested in. Default: None. + recursive (bool): If set to True, recursively scan the + directory. Default: False. + + Yields: + Iterable[str]: A relative path to ``dir_path``. + """ + yield from self.client.list_dir_or_file(dir_path, list_dir, list_file, + suffix, recursive) diff --git a/custom_mmpkg/custom_mmcv/fileio/handlers/__init__.py b/custom_mmpkg/custom_mmcv/fileio/handlers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa24d91972837b8756b225f4879bac20436eb72a --- /dev/null +++ b/custom_mmpkg/custom_mmcv/fileio/handlers/__init__.py @@ -0,0 +1,7 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base import BaseFileHandler +from .json_handler import JsonHandler +from .pickle_handler import PickleHandler +from .yaml_handler import YamlHandler + +__all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler'] diff --git a/custom_mmpkg/custom_mmcv/fileio/handlers/base.py b/custom_mmpkg/custom_mmcv/fileio/handlers/base.py new file mode 100644 index 0000000000000000000000000000000000000000..288878bc57282fbb2f12b32290152ca8e9d3cab0 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/fileio/handlers/base.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import ABCMeta, abstractmethod + + +class BaseFileHandler(metaclass=ABCMeta): + # `str_like` is a flag to indicate whether the type of file object is + # str-like object or bytes-like object. Pickle only processes bytes-like + # objects but json only processes str-like object. If it is str-like + # object, `StringIO` will be used to process the buffer. + str_like = True + + @abstractmethod + def load_from_fileobj(self, file, **kwargs): + pass + + @abstractmethod + def dump_to_fileobj(self, obj, file, **kwargs): + pass + + @abstractmethod + def dump_to_str(self, obj, **kwargs): + pass + + def load_from_path(self, filepath, mode='r', **kwargs): + with open(filepath, mode) as f: + return self.load_from_fileobj(f, **kwargs) + + def dump_to_path(self, obj, filepath, mode='w', **kwargs): + with open(filepath, mode) as f: + self.dump_to_fileobj(obj, f, **kwargs) diff --git a/custom_mmpkg/custom_mmcv/fileio/handlers/json_handler.py b/custom_mmpkg/custom_mmcv/fileio/handlers/json_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..18d4f15f74139d20adff18b20be5529c592a66b6 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/fileio/handlers/json_handler.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json + +import numpy as np + +from .base import BaseFileHandler + + +def set_default(obj): + """Set default json values for non-serializable values. + + It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. 
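# --- Editor's sketch (not part of the patch above): a disk-backend round trip
# through the FileClient facade defined above; paths are throwaway temp files.
import tempfile

from custom_mmpkg.custom_mmcv.fileio.file_client import FileClient

client = FileClient(backend='disk')
root = tempfile.mkdtemp()
txt = client.join_path(root, 'hello.txt')

client.put_text('hello world', txt)
assert client.exists(txt) and client.isfile(txt)
assert client.get_text(txt) == 'hello world'

# paths yielded by list_dir_or_file are relative to the scanned directory
assert list(client.list_dir_or_file(root, list_dir=False)) == ['hello.txt']

# for the disk backend get_local_path is a plain pass-through
with client.get_local_path(txt) as local_path:
    assert local_path == txt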
+ It also converts ``np.generic`` (including ``np.int32``, ``np.float32``, + etc.) into plain numbers of plain python built-in types. + """ + if isinstance(obj, (set, range)): + return list(obj) + elif isinstance(obj, np.ndarray): + return obj.tolist() + elif isinstance(obj, np.generic): + return obj.item() + raise TypeError(f'{type(obj)} is unsupported for json dump') + + +class JsonHandler(BaseFileHandler): + + def load_from_fileobj(self, file): + return json.load(file) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('default', set_default) + json.dump(obj, file, **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('default', set_default) + return json.dumps(obj, **kwargs) diff --git a/custom_mmpkg/custom_mmcv/fileio/handlers/pickle_handler.py b/custom_mmpkg/custom_mmcv/fileio/handlers/pickle_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..b37c79bed4ef9fd8913715e62dbe3fc5cafdc3aa --- /dev/null +++ b/custom_mmpkg/custom_mmcv/fileio/handlers/pickle_handler.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pickle + +from .base import BaseFileHandler + + +class PickleHandler(BaseFileHandler): + + str_like = False + + def load_from_fileobj(self, file, **kwargs): + return pickle.load(file, **kwargs) + + def load_from_path(self, filepath, **kwargs): + return super(PickleHandler, self).load_from_path( + filepath, mode='rb', **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('protocol', 2) + return pickle.dumps(obj, **kwargs) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('protocol', 2) + pickle.dump(obj, file, **kwargs) + + def dump_to_path(self, obj, filepath, **kwargs): + super(PickleHandler, self).dump_to_path( + obj, filepath, mode='wb', **kwargs) diff --git a/custom_mmpkg/custom_mmcv/fileio/handlers/yaml_handler.py b/custom_mmpkg/custom_mmcv/fileio/handlers/yaml_handler.py new file mode 100644 index 0000000000000000000000000000000000000000..c5aa2eea1e8c76f8baf753d1c8c959dee665e543 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/fileio/handlers/yaml_handler.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import yaml + +try: + from yaml import CLoader as Loader, CDumper as Dumper +except ImportError: + from yaml import Loader, Dumper + +from .base import BaseFileHandler # isort:skip + + +class YamlHandler(BaseFileHandler): + + def load_from_fileobj(self, file, **kwargs): + kwargs.setdefault('Loader', Loader) + return yaml.load(file, **kwargs) + + def dump_to_fileobj(self, obj, file, **kwargs): + kwargs.setdefault('Dumper', Dumper) + yaml.dump(obj, file, **kwargs) + + def dump_to_str(self, obj, **kwargs): + kwargs.setdefault('Dumper', Dumper) + return yaml.dump(obj, **kwargs) diff --git a/custom_mmpkg/custom_mmcv/fileio/io.py b/custom_mmpkg/custom_mmcv/fileio/io.py new file mode 100644 index 0000000000000000000000000000000000000000..aaefde58aa3ea5b58f86249ce7e1c40c186eb8dd --- /dev/null +++ b/custom_mmpkg/custom_mmcv/fileio/io.py @@ -0,0 +1,151 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
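# --- Editor's sketch (not part of the patch above): the handlers can also be
# used directly; set_default lets JsonHandler serialize numpy scalars, arrays
# and sets that the stdlib json module rejects.
import numpy as np

from custom_mmpkg.custom_mmcv.fileio.handlers import JsonHandler, YamlHandler

meta = {'mean': np.float32(0.5), 'ids': np.arange(3), 'splits': {'train', 'val'}}
print(JsonHandler().dump_to_str(meta, sort_keys=True))
# e.g. {"ids": [0, 1, 2], "mean": 0.5, "splits": ["train", "val"]}

print(YamlHandler().dump_to_str({'lr': 0.01, 'steps': [8, 11]}))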
+from io import BytesIO, StringIO +from pathlib import Path + +from ..utils import is_list_of, is_str +from .file_client import FileClient +from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler + +file_handlers = { + 'json': JsonHandler(), + 'yaml': YamlHandler(), + 'yml': YamlHandler(), + 'pickle': PickleHandler(), + 'pkl': PickleHandler() +} + + +def load(file, file_format=None, file_client_args=None, **kwargs): + """Load data from json/yaml/pickle files. + + This method provides a unified api for loading data from serialized files. + + Note: + In v1.3.16 and later, ``load`` supports loading data from serialized + files those can be storaged in different backends. + + Args: + file (str or :obj:`Path` or file-like object): Filename or a file-like + object. + file_format (str, optional): If not specified, the file format will be + inferred from the file extension, otherwise use the specified one. + Currently supported formats include "json", "yaml/yml" and + "pickle/pkl". + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> load('/path/of/your/file') # file is storaged in disk + >>> load('https://path/of/your/file') # file is storaged in Internet + >>> load('s3://path/of/your/file') # file is storaged in petrel + + Returns: + The content from the file. + """ + if isinstance(file, Path): + file = str(file) + if file_format is None and is_str(file): + file_format = file.split('.')[-1] + if file_format not in file_handlers: + raise TypeError(f'Unsupported format: {file_format}') + + handler = file_handlers[file_format] + if is_str(file): + file_client = FileClient.infer_client(file_client_args, file) + if handler.str_like: + with StringIO(file_client.get_text(file)) as f: + obj = handler.load_from_fileobj(f, **kwargs) + else: + with BytesIO(file_client.get(file)) as f: + obj = handler.load_from_fileobj(f, **kwargs) + elif hasattr(file, 'read'): + obj = handler.load_from_fileobj(file, **kwargs) + else: + raise TypeError('"file" must be a filepath str or a file-object') + return obj + + +def dump(obj, file=None, file_format=None, file_client_args=None, **kwargs): + """Dump data to json/yaml/pickle strings or files. + + This method provides a unified api for dumping data as strings or to files, + and also supports custom arguments for each file format. + + Note: + In v1.3.16 and later, ``dump`` supports dumping data as strings or to + files which is saved to different backends. + + Args: + obj (any): The python object to be dumped. + file (str or :obj:`Path` or file-like object, optional): If not + specified, then the object is dumped to a str, otherwise to a file + specified by the filename or file-like object. + file_format (str, optional): Same as :func:`load`. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> dump('hello world', '/path/of/your/file') # disk + >>> dump('hello world', 's3://path/of/your/file') # ceph or petrel + + Returns: + bool: True for success, False otherwise. 
+ """ + if isinstance(file, Path): + file = str(file) + if file_format is None: + if is_str(file): + file_format = file.split('.')[-1] + elif file is None: + raise ValueError( + 'file_format must be specified since file is None') + if file_format not in file_handlers: + raise TypeError(f'Unsupported format: {file_format}') + + handler = file_handlers[file_format] + if file is None: + return handler.dump_to_str(obj, **kwargs) + elif is_str(file): + file_client = FileClient.infer_client(file_client_args, file) + if handler.str_like: + with StringIO() as f: + handler.dump_to_fileobj(obj, f, **kwargs) + file_client.put_text(f.getvalue(), file) + else: + with BytesIO() as f: + handler.dump_to_fileobj(obj, f, **kwargs) + file_client.put(f.getvalue(), file) + elif hasattr(file, 'write'): + handler.dump_to_fileobj(obj, file, **kwargs) + else: + raise TypeError('"file" must be a filename str or a file-object') + + +def _register_handler(handler, file_formats): + """Register a handler for some file extensions. + + Args: + handler (:obj:`BaseFileHandler`): Handler to be registered. + file_formats (str or list[str]): File formats to be handled by this + handler. + """ + if not isinstance(handler, BaseFileHandler): + raise TypeError( + f'handler must be a child of BaseFileHandler, not {type(handler)}') + if isinstance(file_formats, str): + file_formats = [file_formats] + if not is_list_of(file_formats, str): + raise TypeError('file_formats must be a str or a list of str') + for ext in file_formats: + file_handlers[ext] = handler + + +def register_handler(file_formats, **kwargs): + + def wrap(cls): + _register_handler(cls(**kwargs), file_formats) + return cls + + return wrap diff --git a/custom_mmpkg/custom_mmcv/fileio/parse.py b/custom_mmpkg/custom_mmcv/fileio/parse.py new file mode 100644 index 0000000000000000000000000000000000000000..f60f0d611b8d75692221d0edd7dc993b0a6445c9 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/fileio/parse.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from io import StringIO + +from .file_client import FileClient + + +def list_from_file(filename, + prefix='', + offset=0, + max_num=0, + encoding='utf-8', + file_client_args=None): + """Load a text file and parse the content as a list of strings. + + Note: + In v1.3.16 and later, ``list_from_file`` supports loading a text file + which can be storaged in different backends and parsing the content as + a list for strings. + + Args: + filename (str): Filename. + prefix (str): The prefix to be inserted to the beginning of each item. + offset (int): The offset of lines. + max_num (int): The maximum number of lines to be read, + zeros and negatives mean no limitation. + encoding (str): Encoding used to open the file. Default utf-8. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> list_from_file('/path/of/your/file') # disk + ['hello', 'world'] + >>> list_from_file('s3://path/of/your/file') # ceph or petrel + ['hello', 'world'] + + Returns: + list[str]: A list of strings. 
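# --- Editor's sketch (not part of the patch above): round-tripping a dict
# through the unified load/dump API on the local disk backend; the handler is
# picked from the file extension.
import os.path as osp
import tempfile

from custom_mmpkg.custom_mmcv.fileio.io import dump, load

meta = {'classes': ['cat', 'dog'], 'num_images': 2}
root = tempfile.mkdtemp()

for ext in ('json', 'yaml', 'pkl'):
    path = osp.join(root, f'meta.{ext}')
    dump(meta, path)                     # JsonHandler / YamlHandler / PickleHandler
    assert load(path) == meta

print(dump(meta, file_format='json'))    # file=None -> dumped to a string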
+ """ + cnt = 0 + item_list = [] + file_client = FileClient.infer_client(file_client_args, filename) + with StringIO(file_client.get_text(filename, encoding)) as f: + for _ in range(offset): + f.readline() + for line in f: + if 0 < max_num <= cnt: + break + item_list.append(prefix + line.rstrip('\n\r')) + cnt += 1 + return item_list + + +def dict_from_file(filename, + key_type=str, + encoding='utf-8', + file_client_args=None): + """Load a text file and parse the content as a dict. + + Each line of the text file will be two or more columns split by + whitespaces or tabs. The first column will be parsed as dict keys, and + the following columns will be parsed as dict values. + + Note: + In v1.3.16 and later, ``dict_from_file`` supports loading a text file + which can be storaged in different backends and parsing the content as + a dict. + + Args: + filename(str): Filename. + key_type(type): Type of the dict keys. str is user by default and + type conversion will be performed if specified. + encoding (str): Encoding used to open the file. Default utf-8. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + + Examples: + >>> dict_from_file('/path/of/your/file') # disk + {'key1': 'value1', 'key2': 'value2'} + >>> dict_from_file('s3://path/of/your/file') # ceph or petrel + {'key1': 'value1', 'key2': 'value2'} + + Returns: + dict: The parsed contents. + """ + mapping = {} + file_client = FileClient.infer_client(file_client_args, filename) + with StringIO(file_client.get_text(filename, encoding)) as f: + for line in f: + items = line.rstrip('\n').split() + assert len(items) >= 2 + key = key_type(items[0]) + val = items[1:] if len(items) > 2 else items[1] + mapping[key] = val + return mapping diff --git a/custom_mmpkg/custom_mmcv/image/__init__.py b/custom_mmpkg/custom_mmcv/image/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d0051d609d3de4e7562e3fe638335c66617c4d91 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/image/__init__.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
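# --- Editor's sketch (not part of the patch above): parsing a plain-text
# annotation file with list_from_file and dict_from_file; the file content is
# made up purely for illustration.
import os.path as osp
import tempfile

from custom_mmpkg.custom_mmcv.fileio.parse import dict_from_file, list_from_file

ann = osp.join(tempfile.mkdtemp(), 'ann.txt')
with open(ann, 'w') as f:
    f.write('img_001 cat\nimg_002 dog\nimg_003 dog person\n')

# skip the first line, read at most one line, prepend a prefix to each item
assert list_from_file(ann, prefix='data/', offset=1, max_num=1) == ['data/img_002 dog']

# first column -> key, remaining columns -> value (a list if more than one)
assert dict_from_file(ann) == {'img_001': 'cat',
                               'img_002': 'dog',
                               'img_003': ['dog', 'person']}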
+from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr, + gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert, + rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb) +from .geometric import (cutout, imcrop, imflip, imflip_, impad, + impad_to_multiple, imrescale, imresize, imresize_like, + imresize_to_multiple, imrotate, imshear, imtranslate, + rescale_size) +from .io import imfrombytes, imread, imwrite, supported_backends, use_backend +from .misc import tensor2imgs +from .photometric import (adjust_brightness, adjust_color, adjust_contrast, + adjust_lighting, adjust_sharpness, auto_contrast, + clahe, imdenormalize, imequalize, iminvert, + imnormalize, imnormalize_, lut_transform, posterize, + solarize) + +__all__ = [ + 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb', + 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale', + 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size', + 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate', + 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend', + 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize', + 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr', + 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize', + 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe', + 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting' +] diff --git a/custom_mmpkg/custom_mmcv/image/colorspace.py b/custom_mmpkg/custom_mmcv/image/colorspace.py new file mode 100644 index 0000000000000000000000000000000000000000..814533952fdfda23d67cb6a3073692d8c1156add --- /dev/null +++ b/custom_mmpkg/custom_mmcv/image/colorspace.py @@ -0,0 +1,306 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + + +def imconvert(img, src, dst): + """Convert an image from the src colorspace to dst colorspace. + + Args: + img (ndarray): The input image. + src (str): The source colorspace, e.g., 'rgb', 'hsv'. + dst (str): The destination colorspace, e.g., 'rgb', 'hsv'. + + Returns: + ndarray: The converted image. + """ + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + out_img = cv2.cvtColor(img, code) + return out_img + + +def bgr2gray(img, keepdim=False): + """Convert a BGR image to grayscale image. + + Args: + img (ndarray): The input image. + keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. + """ + out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def rgb2gray(img, keepdim=False): + """Convert a RGB image to grayscale image. + + Args: + img (ndarray): The input image. + keepdim (bool): If False (by default), then return the grayscale image + with 2 dims, otherwise 3 dims. + + Returns: + ndarray: The converted grayscale image. + """ + out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY) + if keepdim: + out_img = out_img[..., None] + return out_img + + +def gray2bgr(img): + """Convert a grayscale image to BGR image. + + Args: + img (ndarray): The input image. + + Returns: + ndarray: The converted BGR image. + """ + img = img[..., None] if img.ndim == 2 else img + out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + return out_img + + +def gray2rgb(img): + """Convert a grayscale image to RGB image. + + Args: + img (ndarray): The input image. + + Returns: + ndarray: The converted RGB image. 
+ """ + img = img[..., None] if img.ndim == 2 else img + out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) + return out_img + + +def _convert_input_type_range(img): + """Convert the type and range of the input image. + + It converts the input image to np.float32 type and range of [0, 1]. + It is mainly used for pre-processing the input image in colorspace + conversion functions such as rgb2ycbcr and ycbcr2rgb. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + (ndarray): The converted image with type of np.float32 and range of + [0, 1]. + """ + img_type = img.dtype + img = img.astype(np.float32) + if img_type == np.float32: + pass + elif img_type == np.uint8: + img /= 255. + else: + raise TypeError('The img type should be np.float32 or np.uint8, ' + f'but got {img_type}') + return img + + +def _convert_output_type_range(img, dst_type): + """Convert the type and range of the image according to dst_type. + + It converts the image to desired type and range. If `dst_type` is np.uint8, + images will be converted to np.uint8 type with range [0, 255]. If + `dst_type` is np.float32, it converts the image to np.float32 type with + range [0, 1]. + It is mainly used for post-processing images in colorspace conversion + functions such as rgb2ycbcr and ycbcr2rgb. + + Args: + img (ndarray): The image to be converted with np.float32 type and + range [0, 255]. + dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it + converts the image to np.uint8 type with range [0, 255]. If + dst_type is np.float32, it converts the image to np.float32 type + with range [0, 1]. + + Returns: + (ndarray): The converted image with desired type and range. + """ + if dst_type not in (np.uint8, np.float32): + raise TypeError('The dst_type should be np.float32 or np.uint8, ' + f'but got {dst_type}') + if dst_type == np.uint8: + img = img.round() + else: + img /= 255. + return img.astype(dst_type) + + +def rgb2ycbcr(img, y_only=False): + """Convert a RGB image to YCbCr image. + + This function produces the same results as Matlab's `rgb2ycbcr` function. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0 + else: + out_img = np.matmul( + img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786], + [24.966, 112.0, -18.214]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def bgr2ycbcr(img, y_only=False): + """Convert a BGR image to YCbCr image. + + The bgr version of rgb2ycbcr. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`. 
+ In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + y_only (bool): Whether to only return Y channel. Default: False. + + Returns: + ndarray: The converted YCbCr image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) + if y_only: + out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0 + else: + out_img = np.matmul( + img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786], + [65.481, -37.797, 112.0]]) + [16, 128, 128] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2rgb(img): + """Convert a YCbCr image to RGB image. + + This function produces the same results as Matlab's ycbcr2rgb function. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted RGB image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0, -0.00153632, 0.00791071], + [0.00625893, -0.00318811, 0]]) * 255.0 + [ + -222.921, 135.576, -276.836 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def ycbcr2bgr(img): + """Convert a YCbCr image to BGR image. + + The bgr version of ycbcr2rgb. + It implements the ITU-R BT.601 conversion for standard-definition + television. See more details in + https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion. + + It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`. + In OpenCV, it implements a JPEG conversion. See more details in + https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion. + + Args: + img (ndarray): The input image. It accepts: + 1. np.uint8 type with range [0, 255]; + 2. np.float32 type with range [0, 1]. + + Returns: + ndarray: The converted BGR image. The output image has the same type + and range as input image. + """ + img_type = img.dtype + img = _convert_input_type_range(img) * 255 + out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621], + [0.00791071, -0.00153632, 0], + [0, -0.00318811, 0.00625893]]) * 255.0 + [ + -276.836, 135.576, -222.921 + ] + out_img = _convert_output_type_range(out_img, img_type) + return out_img + + +def convert_color_factory(src, dst): + + code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}') + + def convert_color(img): + out_img = cv2.cvtColor(img, code) + return out_img + + convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()} + image. + + Args: + img (ndarray or str): The input image. + + Returns: + ndarray: The converted {dst.upper()} image. 
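# --- Editor's sketch (not part of the patch above): exercising the colorspace
# helpers defined in this file on a random uint8 image; rgb2ycbcr/ycbcr2rgb
# follow the BT.601 (Matlab) convention and keep the input dtype and range.
import numpy as np

from custom_mmpkg.custom_mmcv.image.colorspace import (bgr2gray, imconvert,
                                                       rgb2ycbcr, ycbcr2rgb)

img_bgr = np.random.randint(0, 256, (4, 6, 3), dtype=np.uint8)

img_rgb = imconvert(img_bgr, 'bgr', 'rgb')
assert np.array_equal(img_rgb[..., 0], img_bgr[..., 2])   # channels swapped

gray = bgr2gray(img_bgr, keepdim=True)
assert gray.shape == (4, 6, 1)

ycbcr = rgb2ycbcr(img_rgb)               # same dtype/range as the input
restored = ycbcr2rgb(ycbcr)
assert ycbcr.dtype == np.uint8 and restored.shape == img_rgb.shape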
+ """ + + return convert_color + + +bgr2rgb = convert_color_factory('bgr', 'rgb') + +rgb2bgr = convert_color_factory('rgb', 'bgr') + +bgr2hsv = convert_color_factory('bgr', 'hsv') + +hsv2bgr = convert_color_factory('hsv', 'bgr') + +bgr2hls = convert_color_factory('bgr', 'hls') + +hls2bgr = convert_color_factory('hls', 'bgr') diff --git a/custom_mmpkg/custom_mmcv/image/geometric.py b/custom_mmpkg/custom_mmcv/image/geometric.py new file mode 100644 index 0000000000000000000000000000000000000000..cf97c201cb4e43796c911919d03fb26a07ed817d --- /dev/null +++ b/custom_mmpkg/custom_mmcv/image/geometric.py @@ -0,0 +1,728 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numbers + +import cv2 +import numpy as np + +from ..utils import to_2tuple +from .io import imread_backend + +try: + from PIL import Image +except ImportError: + Image = None + + +def _scale_size(size, scale): + """Rescale a size by a ratio. + + Args: + size (tuple[int]): (w, h). + scale (float | tuple(float)): Scaling factor. + + Returns: + tuple[int]: scaled size. + """ + if isinstance(scale, (float, int)): + scale = (scale, scale) + w, h = size + return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5) + + +cv2_interp_codes = { + 'nearest': cv2.INTER_NEAREST, + 'bilinear': cv2.INTER_LINEAR, + 'bicubic': cv2.INTER_CUBIC, + 'area': cv2.INTER_AREA, + 'lanczos': cv2.INTER_LANCZOS4 +} + +if Image is not None: + pillow_interp_codes = { + 'nearest': Image.NEAREST, + 'bilinear': Image.BILINEAR, + 'bicubic': Image.BICUBIC, + 'box': Image.BOX, + 'lanczos': Image.LANCZOS, + 'hamming': Image.HAMMING + } + + +def imresize(img, + size, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): + """Resize image to a given size. + + Args: + img (ndarray): The input image. + size (tuple[int]): Target size (w, h). + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if backend is None: + backend = imread_backend + if backend not in ['cv2', 'pillow']: + raise ValueError(f'backend: {backend} is not supported for resize.' + f"Supported backends are 'cv2', 'pillow'") + + if backend == 'pillow': + assert img.dtype == np.uint8, 'Pillow backend only support uint8 type' + pil_image = Image.fromarray(img) + pil_image = pil_image.resize(size, pillow_interp_codes[interpolation]) + resized_img = np.array(pil_image) + else: + resized_img = cv2.resize( + img, size, dst=out, interpolation=cv2_interp_codes[interpolation]) + if not return_scale: + return resized_img + else: + w_scale = size[0] / w + h_scale = size[1] / h + return resized_img, w_scale, h_scale + + +def imresize_to_multiple(img, + divisor, + size=None, + scale_factor=None, + keep_ratio=False, + return_scale=False, + interpolation='bilinear', + out=None, + backend=None): + """Resize image according to a given size or scale factor and then rounds + up the the resized or rescaled image size to the nearest value that can be + divided by the divisor. + + Args: + img (ndarray): The input image. 
+ divisor (int | tuple): Resized image size will be a multiple of + divisor. If divisor is a tuple, divisor should be + (w_divisor, h_divisor). + size (None | int | tuple[int]): Target size (w, h). Default: None. + scale_factor (None | float | tuple[float]): Multiplier for spatial + size. Should match input size if it is a tuple and the 2D style is + (w_scale_factor, h_scale_factor). Default: None. + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. Default: False. + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Interpolation method, accepted values are + "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2' + backend, "nearest", "bilinear" for 'pillow' backend. + out (ndarray): The output destination. + backend (str | None): The image resize backend type. Options are `cv2`, + `pillow`, `None`. If backend is None, the global imread_backend + specified by ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = img.shape[:2] + if size is not None and scale_factor is not None: + raise ValueError('only one of size or scale_factor should be defined') + elif size is None and scale_factor is None: + raise ValueError('one of size or scale_factor should be defined') + elif size is not None: + size = to_2tuple(size) + if keep_ratio: + size = rescale_size((w, h), size, return_scale=False) + else: + size = _scale_size((w, h), scale_factor) + + divisor = to_2tuple(divisor) + size = tuple([int(np.ceil(s / d)) * d for s, d in zip(size, divisor)]) + resized_img, w_scale, h_scale = imresize( + img, + size, + return_scale=True, + interpolation=interpolation, + out=out, + backend=backend) + if return_scale: + return resized_img, w_scale, h_scale + else: + return resized_img + + +def imresize_like(img, + dst_img, + return_scale=False, + interpolation='bilinear', + backend=None): + """Resize image to the same size of a given image. + + Args: + img (ndarray): The input image. + dst_img (ndarray): The target image. + return_scale (bool): Whether to return `w_scale` and `h_scale`. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or + `resized_img`. + """ + h, w = dst_img.shape[:2] + return imresize(img, (w, h), return_scale, interpolation, backend=backend) + + +def rescale_size(old_size, scale, return_scale=False): + """Calculate the new size to be rescaled to. + + Args: + old_size (tuple[int]): The old size (w, h) of image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image size. + + Returns: + tuple[int]: The new rescaled image size. 
+ """ + w, h = old_size + if isinstance(scale, (float, int)): + if scale <= 0: + raise ValueError(f'Invalid scale {scale}, must be positive.') + scale_factor = scale + elif isinstance(scale, tuple): + max_long_edge = max(scale) + max_short_edge = min(scale) + scale_factor = min(max_long_edge / max(h, w), + max_short_edge / min(h, w)) + else: + raise TypeError( + f'Scale must be a number or tuple of int, but got {type(scale)}') + + new_size = _scale_size((w, h), scale_factor) + + if return_scale: + return new_size, scale_factor + else: + return new_size + + +def imrescale(img, + scale, + return_scale=False, + interpolation='bilinear', + backend=None): + """Resize image while keeping the aspect ratio. + + Args: + img (ndarray): The input image. + scale (float | tuple[int]): The scaling factor or maximum size. + If it is a float number, then the image will be rescaled by this + factor, else if it is a tuple of 2 integers, then the image will + be rescaled as large as possible within the scale. + return_scale (bool): Whether to return the scaling factor besides the + rescaled image. + interpolation (str): Same as :func:`resize`. + backend (str | None): Same as :func:`resize`. + + Returns: + ndarray: The rescaled image. + """ + h, w = img.shape[:2] + new_size, scale_factor = rescale_size((w, h), scale, return_scale=True) + rescaled_img = imresize( + img, new_size, interpolation=interpolation, backend=backend) + if return_scale: + return rescaled_img, scale_factor + else: + return rescaled_img + + +def imflip(img, direction='horizontal'): + """Flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image. + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return np.flip(img, axis=1) + elif direction == 'vertical': + return np.flip(img, axis=0) + else: + return np.flip(img, axis=(0, 1)) + + +def imflip_(img, direction='horizontal'): + """Inplace flip an image horizontally or vertically. + + Args: + img (ndarray): Image to be flipped. + direction (str): The flip direction, either "horizontal" or + "vertical" or "diagonal". + + Returns: + ndarray: The flipped image (inplace). + """ + assert direction in ['horizontal', 'vertical', 'diagonal'] + if direction == 'horizontal': + return cv2.flip(img, 1, img) + elif direction == 'vertical': + return cv2.flip(img, 0, img) + else: + return cv2.flip(img, -1, img) + + +def imrotate(img, + angle, + center=None, + scale=1.0, + border_value=0, + interpolation='bilinear', + auto_bound=False): + """Rotate an image. + + Args: + img (ndarray): Image to be rotated. + angle (float): Rotation angle in degrees, positive values mean + clockwise rotation. + center (tuple[float], optional): Center point (w, h) of the rotation in + the source image. If not specified, the center of the image will be + used. + scale (float): Isotropic scale factor. + border_value (int): Border value. + interpolation (str): Same as :func:`resize`. + auto_bound (bool): Whether to adjust the image size to cover the whole + rotated image. + + Returns: + ndarray: The rotated image. 
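# --- Editor's sketch (not part of the patch above): the resize helpers take
# sizes as (w, h) while numpy shapes are (h, w, c); this checks the
# conventions on a dummy image.
import numpy as np

from custom_mmpkg.custom_mmcv.image.geometric import imflip, imrescale, imresize

img = np.zeros((100, 200, 3), dtype=np.uint8)        # h=100, w=200

resized, w_scale, h_scale = imresize(img, (50, 25), return_scale=True)
assert resized.shape[:2] == (25, 50) and (w_scale, h_scale) == (0.25, 0.25)

# imrescale keeps the aspect ratio: long edge capped at 100, short edge at 80
rescaled, scale = imrescale(img, (100, 80), return_scale=True)
assert rescaled.shape[:2] == (50, 100) and scale == 0.5

flipped = imflip(img, direction='diagonal')          # flip both axes
assert flipped.shape == img.shape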
+ """ + if center is not None and auto_bound: + raise ValueError('`auto_bound` conflicts with `center`') + h, w = img.shape[:2] + if center is None: + center = ((w - 1) * 0.5, (h - 1) * 0.5) + assert isinstance(center, tuple) + + matrix = cv2.getRotationMatrix2D(center, -angle, scale) + if auto_bound: + cos = np.abs(matrix[0, 0]) + sin = np.abs(matrix[0, 1]) + new_w = h * sin + w * cos + new_h = h * cos + w * sin + matrix[0, 2] += (new_w - w) * 0.5 + matrix[1, 2] += (new_h - h) * 0.5 + w = int(np.round(new_w)) + h = int(np.round(new_h)) + rotated = cv2.warpAffine( + img, + matrix, (w, h), + flags=cv2_interp_codes[interpolation], + borderValue=border_value) + return rotated + + +def bbox_clip(bboxes, img_shape): + """Clip bboxes to fit the image shape. + + Args: + bboxes (ndarray): Shape (..., 4*k) + img_shape (tuple[int]): (height, width) of the image. + + Returns: + ndarray: Clipped bboxes. + """ + assert bboxes.shape[-1] % 4 == 0 + cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype) + cmin[0::2] = img_shape[1] - 1 + cmin[1::2] = img_shape[0] - 1 + clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0) + return clipped_bboxes + + +def bbox_scaling(bboxes, scale, clip_shape=None): + """Scaling bboxes w.r.t the box center. + + Args: + bboxes (ndarray): Shape(..., 4). + scale (float): Scaling factor. + clip_shape (tuple[int], optional): If specified, bboxes that exceed the + boundary will be clipped according to the given shape (h, w). + + Returns: + ndarray: Scaled bboxes. + """ + if float(scale) == 1.0: + scaled_bboxes = bboxes.copy() + else: + w = bboxes[..., 2] - bboxes[..., 0] + 1 + h = bboxes[..., 3] - bboxes[..., 1] + 1 + dw = (w * (scale - 1)) * 0.5 + dh = (h * (scale - 1)) * 0.5 + scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1) + if clip_shape is not None: + return bbox_clip(scaled_bboxes, clip_shape) + else: + return scaled_bboxes + + +def imcrop(img, bboxes, scale=1.0, pad_fill=None): + """Crop image patches. + + 3 steps: scale the bboxes -> clip bboxes -> crop and pad. + + Args: + img (ndarray): Image to be cropped. + bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes. + scale (float, optional): Scale ratio of bboxes, the default value + 1.0 means no padding. + pad_fill (Number | list[Number]): Value to be filled for padding. + Default: None, which means no padding. + + Returns: + list[ndarray] | ndarray: The cropped image patches. + """ + chn = 1 if img.ndim == 2 else img.shape[2] + if pad_fill is not None: + if isinstance(pad_fill, (int, float)): + pad_fill = [pad_fill for _ in range(chn)] + assert len(pad_fill) == chn + + _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes + scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32) + clipped_bbox = bbox_clip(scaled_bboxes, img.shape) + + patches = [] + for i in range(clipped_bbox.shape[0]): + x1, y1, x2, y2 = tuple(clipped_bbox[i, :]) + if pad_fill is None: + patch = img[y1:y2 + 1, x1:x2 + 1, ...] + else: + _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :]) + if chn == 1: + patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1) + else: + patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn) + patch = np.array( + pad_fill, dtype=img.dtype) * np.ones( + patch_shape, dtype=img.dtype) + x_start = 0 if _x1 >= 0 else -_x1 + y_start = 0 if _y1 >= 0 else -_y1 + w = x2 - x1 + 1 + h = y2 - y1 + 1 + patch[y_start:y_start + h, x_start:x_start + w, + ...] = img[y1:y1 + h, x1:x1 + w, ...] 
+ patches.append(patch) + + if bboxes.ndim == 1: + return patches[0] + else: + return patches + + +def impad(img, + *, + shape=None, + padding=None, + pad_val=0, + padding_mode='constant'): + """Pad the given image to a certain shape or pad on all sides with + specified padding mode and padding value. + + Args: + img (ndarray): Image to be padded. + shape (tuple[int]): Expected padding shape (h, w). Default: None. + padding (int or tuple[int]): Padding on each border. If a single int is + provided this is used to pad all borders. If tuple of length 2 is + provided this is the padding on left/right and top/bottom + respectively. If a tuple of length 4 is provided this is the + padding for the left, top, right and bottom borders respectively. + Default: None. Note that `shape` and `padding` can not be both + set. + pad_val (Number | Sequence[Number]): Values to be filled in padding + areas when padding_mode is 'constant'. Default: 0. + padding_mode (str): Type of padding. Should be: constant, edge, + reflect or symmetric. Default: constant. + + - constant: pads with a constant value, this value is specified + with pad_val. + - edge: pads with the last value at the edge of the image. + - reflect: pads with reflection of image without repeating the + last value on the edge. For example, padding [1, 2, 3, 4] + with 2 elements on both sides in reflect mode will result + in [3, 2, 1, 2, 3, 4, 3, 2]. + - symmetric: pads with reflection of image repeating the last + value on the edge. For example, padding [1, 2, 3, 4] with + 2 elements on both sides in symmetric mode will result in + [2, 1, 1, 2, 3, 4, 4, 3] + + Returns: + ndarray: The padded image. + """ + + assert (shape is not None) ^ (padding is not None) + if shape is not None: + padding = (0, 0, shape[1] - img.shape[1], shape[0] - img.shape[0]) + + # check pad_val + if isinstance(pad_val, tuple): + assert len(pad_val) == img.shape[-1] + elif not isinstance(pad_val, numbers.Number): + raise TypeError('pad_val must be a int or a tuple. ' + f'But received {type(pad_val)}') + + # check padding + if isinstance(padding, tuple) and len(padding) in [2, 4]: + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + elif isinstance(padding, numbers.Number): + padding = (padding, padding, padding, padding) + else: + raise ValueError('Padding must be a int or a 2, or 4 element tuple.' + f'But received {padding}') + + # check padding mode + assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] + + border_type = { + 'constant': cv2.BORDER_CONSTANT, + 'edge': cv2.BORDER_REPLICATE, + 'reflect': cv2.BORDER_REFLECT_101, + 'symmetric': cv2.BORDER_REFLECT + } + img = cv2.copyMakeBorder( + img, + padding[1], + padding[3], + padding[0], + padding[2], + border_type[padding_mode], + value=pad_val) + + return img + + +def impad_to_multiple(img, divisor, pad_val=0): + """Pad an image to ensure each edge to be multiple to some number. + + Args: + img (ndarray): Image to be padded. + divisor (int): Padded image edges will be multiple to divisor. + pad_val (Number | Sequence[Number]): Same as :func:`impad`. + + Returns: + ndarray: The padded image. + """ + pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor + pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor + return impad(img, shape=(pad_h, pad_w), pad_val=pad_val) + + +def cutout(img, shape, pad_val=0): + """Randomly cut out a rectangle from the original img. + + Args: + img (ndarray): Image to be cutout. + shape (int | tuple[int]): Expected cutout shape (h, w). 
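# --- Editor's sketch (not part of the patch above): cropping and padding.
# Bboxes are (x1, y1, x2, y2) with inclusive corners, `shape` arguments are
# (h, w), and `padding` follows the (left, top, right, bottom) convention.
import numpy as np

from custom_mmpkg.custom_mmcv.image.geometric import (imcrop, impad,
                                                      impad_to_multiple)

img = np.ones((10, 15, 3), dtype=np.uint8)

patch = imcrop(img, np.array([2, 3, 6, 8]))          # single bbox -> single patch
assert patch.shape == (6, 5, 3)                      # inclusive corners

padded = impad(img, shape=(16, 16), pad_val=255)
assert padded.shape == (16, 16, 3)

multiple = impad_to_multiple(img, divisor=8)         # edges rounded up to 16
assert multiple.shape == (16, 16, 3)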
If given as a + int, the value will be used for both h and w. + pad_val (int | float | tuple[int | float]): Values to be filled in the + cut area. Defaults to 0. + + Returns: + ndarray: The cutout image. + """ + + channels = 1 if img.ndim == 2 else img.shape[2] + if isinstance(shape, int): + cut_h, cut_w = shape, shape + else: + assert isinstance(shape, tuple) and len(shape) == 2, \ + f'shape must be a int or a tuple with length 2, but got type ' \ + f'{type(shape)} instead.' + cut_h, cut_w = shape + if isinstance(pad_val, (int, float)): + pad_val = tuple([pad_val] * channels) + elif isinstance(pad_val, tuple): + assert len(pad_val) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(pad_val), channels) + else: + raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`') + + img_h, img_w = img.shape[:2] + y0 = np.random.uniform(img_h) + x0 = np.random.uniform(img_w) + + y1 = int(max(0, y0 - cut_h / 2.)) + x1 = int(max(0, x0 - cut_w / 2.)) + y2 = min(img_h, y1 + cut_h) + x2 = min(img_w, x1 + cut_w) + + if img.ndim == 2: + patch_shape = (y2 - y1, x2 - x1) + else: + patch_shape = (y2 - y1, x2 - x1, channels) + + img_cutout = img.copy() + patch = np.array( + pad_val, dtype=img.dtype) * np.ones( + patch_shape, dtype=img.dtype) + img_cutout[y1:y2, x1:x2, ...] = patch + + return img_cutout + + +def _get_shear_matrix(magnitude, direction='horizontal'): + """Generate the shear matrix for transformation. + + Args: + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + + Returns: + ndarray: The shear matrix with dtype float32. + """ + if direction == 'horizontal': + shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]]) + elif direction == 'vertical': + shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]]) + return shear_matrix + + +def imshear(img, + magnitude, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Shear an image. + + Args: + img (ndarray): Image to be sheared with format (h, w) + or (h, w, c). + magnitude (int | float): The magnitude used for shear. + direction (str): The flip direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as :func:`resize`. + + Returns: + ndarray: The sheared image. + """ + assert direction in ['horizontal', + 'vertical'], f'Invalid direction: {direction}' + height, width = img.shape[:2] + if img.ndim == 2: + channels = 1 + elif img.ndim == 3: + channels = img.shape[-1] + if isinstance(border_value, int): + border_value = tuple([border_value] * channels) + elif isinstance(border_value, tuple): + assert len(border_value) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(border_value), channels) + else: + raise ValueError( + f'Invalid type {type(border_value)} for `border_value`') + shear_matrix = _get_shear_matrix(magnitude, direction) + sheared = cv2.warpAffine( + img, + shear_matrix, + (width, height), + # Note case when the number elements in `border_value` + # greater than 3 (e.g. shearing masks whose channels large + # than 3) will raise TypeError in `cv2.warpAffine`. + # Here simply slice the first 3 values in `border_value`. 
+ borderValue=border_value[:3], + flags=cv2_interp_codes[interpolation]) + return sheared + + +def _get_translate_matrix(offset, direction='horizontal'): + """Generate the translate matrix. + + Args: + offset (int | float): The offset used for translate. + direction (str): The translate direction, either + "horizontal" or "vertical". + + Returns: + ndarray: The translate matrix with dtype float32. + """ + if direction == 'horizontal': + translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]]) + elif direction == 'vertical': + translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]]) + return translate_matrix + + +def imtranslate(img, + offset, + direction='horizontal', + border_value=0, + interpolation='bilinear'): + """Translate an image. + + Args: + img (ndarray): Image to be translated with format + (h, w) or (h, w, c). + offset (int | float): The offset used for translate. + direction (str): The translate direction, either "horizontal" + or "vertical". + border_value (int | tuple[int]): Value used in case of a + constant border. + interpolation (str): Same as :func:`resize`. + + Returns: + ndarray: The translated image. + """ + assert direction in ['horizontal', + 'vertical'], f'Invalid direction: {direction}' + height, width = img.shape[:2] + if img.ndim == 2: + channels = 1 + elif img.ndim == 3: + channels = img.shape[-1] + if isinstance(border_value, int): + border_value = tuple([border_value] * channels) + elif isinstance(border_value, tuple): + assert len(border_value) == channels, \ + 'Expected the num of elements in tuple equals the channels' \ + 'of input image. Found {} vs {}'.format( + len(border_value), channels) + else: + raise ValueError( + f'Invalid type {type(border_value)} for `border_value`.') + translate_matrix = _get_translate_matrix(offset, direction) + translated = cv2.warpAffine( + img, + translate_matrix, + (width, height), + # Note case when the number elements in `border_value` + # greater than 3 (e.g. translating masks whose channels + # large than 3) will raise TypeError in `cv2.warpAffine`. + # Here simply slice the first 3 values in `border_value`. + borderValue=border_value[:3], + flags=cv2_interp_codes[interpolation]) + return translated diff --git a/custom_mmpkg/custom_mmcv/image/io.py b/custom_mmpkg/custom_mmcv/image/io.py new file mode 100644 index 0000000000000000000000000000000000000000..8fe4400ddc5751cd01a554131b33eca3154e4ca7 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/image/io.py @@ -0,0 +1,258 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import io +import os.path as osp +from pathlib import Path + +import cv2 +import numpy as np +from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION, + IMREAD_UNCHANGED) + +from custom_mmpkg.custom_mmcv.utils import check_file_exist, is_str, mkdir_or_exist + +try: + from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG +except ImportError: + TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None + +try: + from PIL import Image, ImageOps +except ImportError: + Image = None + +try: + import tifffile +except ImportError: + tifffile = None + +jpeg = None +supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile'] + +imread_flags = { + 'color': IMREAD_COLOR, + 'grayscale': IMREAD_GRAYSCALE, + 'unchanged': IMREAD_UNCHANGED, + 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR, + 'grayscale_ignore_orientation': + IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE +} + +imread_backend = 'cv2' + + +def use_backend(backend): + """Select a backend for image decoding. 
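# --- Editor's sketch (not part of the patch above; it refers back to the
# geometric helpers, not to image/io.py which starts here): cutout, imshear
# and imtranslate all preserve the input shape, and integer border/pad values
# are broadcast over the channels.
import numpy as np

from custom_mmpkg.custom_mmcv.image.geometric import cutout, imshear, imtranslate

img = np.random.randint(0, 256, (32, 32, 3), dtype=np.uint8)

holed = cutout(img, shape=(8, 8), pad_val=0)         # random 8x8 hole filled with 0
sheared = imshear(img, magnitude=0.3, direction='horizontal', border_value=0)
shifted = imtranslate(img, offset=5, direction='vertical', border_value=0)
assert holed.shape == sheared.shape == shifted.shape == img.shape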
+ + Args: + backend (str): The image decoding backend type. Options are `cv2`, + `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG) + and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg` + file format. + """ + assert backend in supported_backends + global imread_backend + imread_backend = backend + if imread_backend == 'turbojpeg': + if TurboJPEG is None: + raise ImportError('`PyTurboJPEG` is not installed') + global jpeg + if jpeg is None: + jpeg = TurboJPEG() + elif imread_backend == 'pillow': + if Image is None: + raise ImportError('`Pillow` is not installed') + elif imread_backend == 'tifffile': + if tifffile is None: + raise ImportError('`tifffile` is not installed') + + +def _jpegflag(flag='color', channel_order='bgr'): + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'color': + if channel_order == 'bgr': + return TJPF_BGR + elif channel_order == 'rgb': + return TJCS_RGB + elif flag == 'grayscale': + return TJPF_GRAY + else: + raise ValueError('flag must be "color" or "grayscale"') + + +def _pillow2array(img, flag='color', channel_order='bgr'): + """Convert a pillow image to numpy array. + + Args: + img (:obj:`PIL.Image.Image`): The image loaded using PIL + flag (str): Flags specifying the color type of a loaded image, + candidates are 'color', 'grayscale' and 'unchanged'. + Default to 'color'. + channel_order (str): The channel order of the output image array, + candidates are 'bgr' and 'rgb'. Default to 'bgr'. + + Returns: + np.ndarray: The converted numpy array + """ + channel_order = channel_order.lower() + if channel_order not in ['rgb', 'bgr']: + raise ValueError('channel order must be either "rgb" or "bgr"') + + if flag == 'unchanged': + array = np.array(img) + if array.ndim >= 3 and array.shape[2] >= 3: # color image + array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR + else: + # Handle exif orientation tag + if flag in ['color', 'grayscale']: + img = ImageOps.exif_transpose(img) + # If the image mode is not 'RGB', convert it to 'RGB' first. + if img.mode != 'RGB': + if img.mode != 'LA': + # Most formats except 'LA' can be directly converted to RGB + img = img.convert('RGB') + else: + # When the mode is 'LA', the default conversion will fill in + # the canvas with black, which sometimes shadows black objects + # in the foreground. + # + # Therefore, a random color (124, 117, 104) is used for canvas + img_rgba = img.convert('RGBA') + img = Image.new('RGB', img_rgba.size, (124, 117, 104)) + img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha + if flag in ['color', 'color_ignore_orientation']: + array = np.array(img) + if channel_order != 'rgb': + array = array[:, :, ::-1] # RGB to BGR + elif flag in ['grayscale', 'grayscale_ignore_orientation']: + img = img.convert('L') + array = np.array(img) + else: + raise ValueError( + 'flag must be "color", "grayscale", "unchanged", ' + f'"color_ignore_orientation" or "grayscale_ignore_orientation"' + f' but got {flag}') + return array + + +def imread(img_or_path, flag='color', channel_order='bgr', backend=None): + """Read an image. + + Args: + img_or_path (ndarray or str or Path): Either a numpy array or str or + pathlib.Path. If it is a numpy array (loaded image), then + it will be returned as is. + flag (str): Flags specifying the color type of a loaded image, + candidates are `color`, `grayscale`, `unchanged`, + `color_ignore_orientation` and `grayscale_ignore_orientation`. 
+ By default, `cv2` and `pillow` backend would rotate the image + according to its EXIF info unless called with `unchanged` or + `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend + always ignore image's EXIF info regardless of the flag. + The `turbojpeg` backend only supports `color` and `grayscale`. + channel_order (str): Order of channel, candidates are `bgr` and `rgb`. + backend (str | None): The image decoding backend type. Options are + `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. + If backend is None, the global imread_backend specified by + ``mmcv.use_backend()`` will be used. Default: None. + + Returns: + ndarray: Loaded image array. + """ + + if backend is None: + backend = imread_backend + if backend not in supported_backends: + raise ValueError(f'backend: {backend} is not supported. Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow'") + if isinstance(img_or_path, Path): + img_or_path = str(img_or_path) + + if isinstance(img_or_path, np.ndarray): + return img_or_path + elif is_str(img_or_path): + check_file_exist(img_or_path, + f'img file does not exist: {img_or_path}') + if backend == 'turbojpeg': + with open(img_or_path, 'rb') as in_file: + img = jpeg.decode(in_file.read(), + _jpegflag(flag, channel_order)) + if img.shape[-1] == 1: + img = img[:, :, 0] + return img + elif backend == 'pillow': + img = Image.open(img_or_path) + img = _pillow2array(img, flag, channel_order) + return img + elif backend == 'tifffile': + img = tifffile.imread(img_or_path) + return img + else: + flag = imread_flags[flag] if is_str(flag) else flag + img = cv2.imread(img_or_path, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img + else: + raise TypeError('"img" must be a numpy array or a str or ' + 'a pathlib.Path object') + + +def imfrombytes(content, flag='color', channel_order='bgr', backend=None): + """Read an image from bytes. + + Args: + content (bytes): Image bytes got from files or other streams. + flag (str): Same as :func:`imread`. + backend (str | None): The image decoding backend type. Options are + `cv2`, `pillow`, `turbojpeg`, `None`. If backend is None, the + global imread_backend specified by ``mmcv.use_backend()`` will be + used. Default: None. + + Returns: + ndarray: Loaded image array. + """ + + if backend is None: + backend = imread_backend + if backend not in supported_backends: + raise ValueError(f'backend: {backend} is not supported. Supported ' + "backends are 'cv2', 'turbojpeg', 'pillow'") + if backend == 'turbojpeg': + img = jpeg.decode(content, _jpegflag(flag, channel_order)) + if img.shape[-1] == 1: + img = img[:, :, 0] + return img + elif backend == 'pillow': + buff = io.BytesIO(content) + img = Image.open(buff) + img = _pillow2array(img, flag, channel_order) + return img + else: + img_np = np.frombuffer(content, np.uint8) + flag = imread_flags[flag] if is_str(flag) else flag + img = cv2.imdecode(img_np, flag) + if flag == IMREAD_COLOR and channel_order == 'rgb': + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + return img + + +def imwrite(img, file_path, params=None, auto_mkdir=True): + """Write image to file. + + Args: + img (ndarray): Image array to be written. + file_path (str): Image file path. + params (None or list): Same as opencv :func:`imwrite` interface. + auto_mkdir (bool): If the parent folder of `file_path` does not exist, + whether to create it automatically. + + Returns: + bool: Successful or not. 
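A hedged sketch of switching decoding backends and reading images; `demo.jpg` is a placeholder path, and the `pillow` backend additionally needs Pillow installed:

```python
from custom_mmpkg.custom_mmcv.image.io import imread, imfrombytes, use_backend

# Default backend is cv2; 'demo.jpg' is a placeholder file name.
img_bgr = imread('demo.jpg', flag='color', channel_order='bgr')

# Pillow honours the EXIF orientation for 'color'/'grayscale' flags.
use_backend('pillow')
img_rgb = imread('demo.jpg', flag='color', channel_order='rgb')

# Decoding straight from bytes, e.g. when streaming from a file object.
with open('demo.jpg', 'rb') as f:
    img_gray = imfrombytes(f.read(), flag='grayscale')
```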
+ """ + if auto_mkdir: + dir_name = osp.abspath(osp.dirname(file_path)) + mkdir_or_exist(dir_name) + return cv2.imwrite(file_path, img, params) diff --git a/custom_mmpkg/custom_mmcv/image/misc.py b/custom_mmpkg/custom_mmcv/image/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..f8a1aae4510cdef05b9f61a664818c06760cea77 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/image/misc.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np + +import custom_mmpkg.custom_mmcv as mmcv + +try: + import torch +except ImportError: + torch = None + + +def tensor2imgs(tensor, mean=(0, 0, 0), std=(1, 1, 1), to_rgb=True): + """Convert tensor to 3-channel images. + + Args: + tensor (torch.Tensor): Tensor that contains multiple images, shape ( + N, C, H, W). + mean (tuple[float], optional): Mean of images. Defaults to (0, 0, 0). + std (tuple[float], optional): Standard deviation of images. + Defaults to (1, 1, 1). + to_rgb (bool, optional): Whether the tensor was converted to RGB + format in the first place. If so, convert it back to BGR. + Defaults to True. + + Returns: + list[np.ndarray]: A list that contains multiple images. + """ + + if torch is None: + raise RuntimeError('pytorch is not installed') + assert torch.is_tensor(tensor) and tensor.ndim == 4 + assert len(mean) == 3 + assert len(std) == 3 + + num_imgs = tensor.size(0) + mean = np.array(mean, dtype=np.float32) + std = np.array(std, dtype=np.float32) + imgs = [] + for img_id in range(num_imgs): + img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0) + img = mmcv.imdenormalize( + img, mean, std, to_bgr=to_rgb).astype(np.uint8) + imgs.append(np.ascontiguousarray(img)) + return imgs diff --git a/custom_mmpkg/custom_mmcv/image/photometric.py b/custom_mmpkg/custom_mmcv/image/photometric.py new file mode 100644 index 0000000000000000000000000000000000000000..5085d012019c0cbf56f66f421a378278c1a058ae --- /dev/null +++ b/custom_mmpkg/custom_mmcv/image/photometric.py @@ -0,0 +1,428 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + +from ..utils import is_tuple_of +from .colorspace import bgr2gray, gray2bgr + + +def imnormalize(img, mean, std, to_rgb=True): + """Normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. + """ + img = img.copy().astype(np.float32) + return imnormalize_(img, mean, std, to_rgb) + + +def imnormalize_(img, mean, std, to_rgb=True): + """Inplace normalize an image with mean and std. + + Args: + img (ndarray): Image to be normalized. + mean (ndarray): The mean to be used for normalize. + std (ndarray): The std to be used for normalize. + to_rgb (bool): Whether to convert to rgb. + + Returns: + ndarray: The normalized image. 
+ """ + # cv2 inplace normalization does not accept uint8 + assert img.dtype != np.uint8 + mean = np.float64(mean.reshape(1, -1)) + stdinv = 1 / np.float64(std.reshape(1, -1)) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + cv2.subtract(img, mean, img) # inplace + cv2.multiply(img, stdinv, img) # inplace + return img + + +def imdenormalize(img, mean, std, to_bgr=True): + assert img.dtype != np.uint8 + mean = mean.reshape(1, -1).astype(np.float64) + std = std.reshape(1, -1).astype(np.float64) + img = cv2.multiply(img, std) # make a copy + cv2.add(img, mean, img) # inplace + if to_bgr: + cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace + return img + + +def iminvert(img): + """Invert (negate) an image. + + Args: + img (ndarray): Image to be inverted. + + Returns: + ndarray: The inverted image. + """ + return np.full_like(img, 255) - img + + +def solarize(img, thr=128): + """Solarize an image (invert all pixel values above a threshold) + + Args: + img (ndarray): Image to be solarized. + thr (int): Threshold for solarizing (0 - 255). + + Returns: + ndarray: The solarized image. + """ + img = np.where(img < thr, img, 255 - img) + return img + + +def posterize(img, bits): + """Posterize an image (reduce the number of bits for each color channel) + + Args: + img (ndarray): Image to be posterized. + bits (int): Number of bits (1 to 8) to use for posterizing. + + Returns: + ndarray: The posterized image. + """ + shift = 8 - bits + img = np.left_shift(np.right_shift(img, shift), shift) + return img + + +def adjust_color(img, alpha=1, beta=None, gamma=0): + r"""It blends the source image and its gray image: + + .. math:: + output = img * alpha + gray\_img * beta + gamma + + Args: + img (ndarray): The input source image. + alpha (int | float): Weight for the source image. Default 1. + beta (int | float): Weight for the converted gray image. + If None, it's assigned the value (1 - `alpha`). + gamma (int | float): Scalar added to each sum. + Same as :func:`cv2.addWeighted`. Default 0. + + Returns: + ndarray: Colored image which has the same size and dtype as input. + """ + gray_img = bgr2gray(img) + gray_img = np.tile(gray_img[..., None], [1, 1, 3]) + if beta is None: + beta = 1 - alpha + colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma) + if not colored_img.dtype == np.uint8: + # Note when the dtype of `img` is not the default `np.uint8` + # (e.g. np.float32), the value in `colored_img` got from cv2 + # is not guaranteed to be in range [0, 255], so here clip + # is needed. + colored_img = np.clip(colored_img, 0, 255) + return colored_img + + +def imequalize(img): + """Equalize the image histogram. + + This function applies a non-linear mapping to the input image, + in order to create a uniform distribution of grayscale values + in the output image. + + Args: + img (ndarray): Image to be equalized. + + Returns: + ndarray: The equalized image. + """ + + def _scale_channel(im, c): + """Scale the data in the corresponding channel.""" + im = im[:, :, c] + # Compute the histogram of the image channel. + histo = np.histogram(im, 256, (0, 255))[0] + # For computing the step, filter out the nonzeros. + nonzero_histo = histo[histo > 0] + step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255 + if not step: + lut = np.array(range(256)) + else: + # Compute the cumulative sum, shifted by step // 2 + # and then normalized by step. + lut = (np.cumsum(histo) + (step // 2)) // step + # Shift lut, prepending with 0. 
+ lut = np.concatenate([[0], lut[:-1]], 0) + # handle potential integer overflow + lut[lut > 255] = 255 + # If step is zero, return the original image. + # Otherwise, index from lut. + return np.where(np.equal(step, 0), im, lut[im]) + + # Scales each channel independently and then stacks + # the result. + s1 = _scale_channel(img, 0) + s2 = _scale_channel(img, 1) + s3 = _scale_channel(img, 2) + equalized_img = np.stack([s1, s2, s3], axis=-1) + return equalized_img.astype(img.dtype) + + +def adjust_brightness(img, factor=1.): + """Adjust image brightness. + + This function controls the brightness of an image. An + enhancement factor of 0.0 gives a black image. + A factor of 1.0 gives the original image. This function + blends the source image and the degenerated black image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be brightened. + factor (float): A value controls the enhancement. + Factor 1.0 returns the original image, lower + factors mean less color (brightness, contrast, + etc), and higher values more. Default 1. + + Returns: + ndarray: The brightened image. + """ + degenerated = np.zeros_like(img) + # Note manually convert the dtype to np.float32, to + # achieve as close results as PIL.ImageEnhance.Brightness. + # Set beta=1-factor, and gamma=0 + brightened_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + brightened_img = np.clip(brightened_img, 0, 255) + return brightened_img.astype(img.dtype) + + +def adjust_contrast(img, factor=1.): + """Adjust image contrast. + + This function controls the contrast of an image. An + enhancement factor of 0.0 gives a solid grey + image. A factor of 1.0 gives the original image. It + blends the source image and the degenerated mean image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be contrasted. BGR order. + factor (float): Same as :func:`mmcv.adjust_brightness`. + + Returns: + ndarray: The contrasted image. + """ + gray_img = bgr2gray(img) + hist = np.histogram(gray_img, 256, (0, 255))[0] + mean = round(np.sum(gray_img) / np.sum(hist)) + degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype) + degenerated = gray2bgr(degenerated) + contrasted_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + contrasted_img = np.clip(contrasted_img, 0, 255) + return contrasted_img.astype(img.dtype) + + +def auto_contrast(img, cutoff=0): + """Auto adjust image contrast. + + This function maximize (normalize) image contrast by first removing cutoff + percent of the lightest and darkest pixels from the histogram and remapping + the image so that the darkest pixel becomes black (0), and the lightest + becomes white (255). + + Args: + img (ndarray): Image to be contrasted. BGR order. + cutoff (int | float | tuple): The cutoff percent of the lightest and + darkest pixels to be removed. If given as tuple, it shall be + (low, high). Otherwise, the single value will be used for both. + Defaults to 0. + + Returns: + ndarray: The contrasted image. + """ + + def _auto_contrast_channel(im, c, cutoff): + im = im[:, :, c] + # Compute the histogram of the image channel. 
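A short sketch of the blend-based enhancement helpers defined above; the factors are arbitrary illustration values:

```python
import numpy as np
from custom_mmpkg.custom_mmcv.image.photometric import (
    adjust_brightness, adjust_contrast, imequalize)

img = (np.random.rand(48, 48, 3) * 255).astype(np.uint8)  # BGR order assumed

darker = adjust_brightness(img, factor=0.5)    # blend towards black
brighter = adjust_brightness(img, factor=1.5)  # amplify away from black
flat = adjust_contrast(img, factor=0.0)        # solid grey (per-image mean)
punchy = adjust_contrast(img, factor=1.8)      # stronger contrast
equalized = imequalize(img)                    # per-channel histogram equalization
```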
+ histo = np.histogram(im, 256, (0, 255))[0] + # Remove cut-off percent pixels from histo + histo_sum = np.cumsum(histo) + cut_low = histo_sum[-1] * cutoff[0] // 100 + cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100 + histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low + histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0) + + # Compute mapping + low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1] + # If all the values have been cut off, return the origin img + if low >= high: + return im + scale = 255.0 / (high - low) + offset = -low * scale + lut = np.array(range(256)) + lut = lut * scale + offset + lut = np.clip(lut, 0, 255) + return lut[im] + + if isinstance(cutoff, (int, float)): + cutoff = (cutoff, cutoff) + else: + assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \ + f'float or tuple, but got {type(cutoff)} instead.' + # Auto adjusts contrast for each channel independently and then stacks + # the result. + s1 = _auto_contrast_channel(img, 0, cutoff) + s2 = _auto_contrast_channel(img, 1, cutoff) + s3 = _auto_contrast_channel(img, 2, cutoff) + contrasted_img = np.stack([s1, s2, s3], axis=-1) + return contrasted_img.astype(img.dtype) + + +def adjust_sharpness(img, factor=1., kernel=None): + """Adjust image sharpness. + + This function controls the sharpness of an image. An + enhancement factor of 0.0 gives a blurred image. A + factor of 1.0 gives the original image. And a factor + of 2.0 gives a sharpened image. It blends the source + image and the degenerated mean image: + + .. math:: + output = img * factor + degenerated * (1 - factor) + + Args: + img (ndarray): Image to be sharpened. BGR order. + factor (float): Same as :func:`mmcv.adjust_brightness`. + kernel (np.ndarray, optional): Filter kernel to be applied on the img + to obtain the degenerated img. Defaults to None. + + Note: + No value sanity check is enforced on the kernel set by users. So with + an inappropriate kernel, the ``adjust_sharpness`` may fail to perform + the function its name indicates but end up performing whatever + transform determined by the kernel. + + Returns: + ndarray: The sharpened image. + """ + + if kernel is None: + # adopted from PIL.ImageFilter.SMOOTH + kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13 + assert isinstance(kernel, np.ndarray), \ + f'kernel must be of type np.ndarray, but got {type(kernel)} instead.' + assert kernel.ndim == 2, \ + f'kernel must have a dimension of 2, but got {kernel.ndim} instead.' + + degenerated = cv2.filter2D(img, -1, kernel) + sharpened_img = cv2.addWeighted( + img.astype(np.float32), factor, degenerated.astype(np.float32), + 1 - factor, 0) + sharpened_img = np.clip(sharpened_img, 0, 255) + return sharpened_img.astype(img.dtype) + + +def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True): + """AlexNet-style PCA jitter. + + This data augmentation is proposed in `ImageNet Classification with Deep + Convolutional Neural Networks + `_. + + Args: + img (ndarray): Image to be adjusted lighting. BGR order. + eigval (ndarray): the eigenvalue of the convariance matrix of pixel + values, respectively. + eigvec (ndarray): the eigenvector of the convariance matrix of pixel + values, respectively. + alphastd (float): The standard deviation for distribution of alpha. + Defaults to 0.1 + to_rgb (bool): Whether to convert img to rgb. + + Returns: + ndarray: The adjusted image. 
+ """ + assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \ + f'eigval and eigvec should both be of type np.ndarray, got ' \ + f'{type(eigval)} and {type(eigvec)} instead.' + + assert eigval.ndim == 1 and eigvec.ndim == 2 + assert eigvec.shape == (3, eigval.shape[0]) + n_eigval = eigval.shape[0] + assert isinstance(alphastd, float), 'alphastd should be of type float, ' \ + f'got {type(alphastd)} instead.' + + img = img.copy().astype(np.float32) + if to_rgb: + cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace + + alpha = np.random.normal(0, alphastd, n_eigval) + alter = eigvec \ + * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \ + * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval)) + alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape) + img_adjusted = img + alter + return img_adjusted + + +def lut_transform(img, lut_table): + """Transform array by look-up table. + + The function lut_transform fills the output array with values from the + look-up table. Indices of the entries are taken from the input array. + + Args: + img (ndarray): Image to be transformed. + lut_table (ndarray): look-up table of 256 elements; in case of + multi-channel input array, the table should either have a single + channel (in this case the same table is used for all channels) or + the same number of channels as in the input array. + + Returns: + ndarray: The transformed image. + """ + assert isinstance(img, np.ndarray) + assert 0 <= np.min(img) and np.max(img) <= 255 + assert isinstance(lut_table, np.ndarray) + assert lut_table.shape == (256, ) + + return cv2.LUT(np.array(img, dtype=np.uint8), lut_table) + + +def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)): + """Use CLAHE method to process the image. + + See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. + Graphics Gems, 1994:474-485.` for more information. + + Args: + img (ndarray): Image to be processed. + clip_limit (float): Threshold for contrast limiting. Default: 40.0. + tile_grid_size (tuple[int]): Size of grid for histogram equalization. + Input image will be divided into equally sized rectangular tiles. + It defines the number of tiles in row and column. Default: (8, 8). + + Returns: + ndarray: The processed image. 
+ """ + assert isinstance(img, np.ndarray) + assert img.ndim == 2 + assert isinstance(clip_limit, (float, int)) + assert is_tuple_of(tile_grid_size, int) + assert len(tile_grid_size) == 2 + + clahe = cv2.createCLAHE(clip_limit, tile_grid_size) + return clahe.apply(np.array(img, dtype=np.uint8)) diff --git a/custom_mmpkg/custom_mmcv/model_zoo/deprecated.json b/custom_mmpkg/custom_mmcv/model_zoo/deprecated.json new file mode 100644 index 0000000000000000000000000000000000000000..25cf6f28caecc22a77e3136fefa6b8dfc0e6cb5b --- /dev/null +++ b/custom_mmpkg/custom_mmcv/model_zoo/deprecated.json @@ -0,0 +1,6 @@ +{ + "resnet50_caffe": "detectron/resnet50_caffe", + "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", + "resnet101_caffe": "detectron/resnet101_caffe", + "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" +} diff --git a/custom_mmpkg/custom_mmcv/model_zoo/mmcls.json b/custom_mmpkg/custom_mmcv/model_zoo/mmcls.json new file mode 100644 index 0000000000000000000000000000000000000000..bdb311d9fe6d9f317290feedc9e37236c6cf6e8f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/model_zoo/mmcls.json @@ -0,0 +1,31 @@ +{ + "vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth", + "vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth", + "vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth", + "vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth", + "vgg11_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth", + "vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth", + "vgg16_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth", + "vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth", + "resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_batch256_imagenet_20200708-34ab8f90.pth", + "resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_batch256_imagenet_20200708-32ffb4f7.pth", + "resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_batch256_imagenet_20200708-cfb998bf.pth", + "resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_batch256_imagenet_20200708-753f3608.pth", + "resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_batch256_imagenet_20200708-ec25b1f9.pth", + "resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_batch256_imagenet_20200708-1ad0ce94.pth", + "resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_batch256_imagenet_20200708-9cb302ef.pth", + "resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_batch256_imagenet_20200708-e79cb6a2.pth", + "resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth", + "resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth", + "resnext101_32x8d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth", + "resnext152_32x4d": 
"https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth", + "se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth", + "se-resnet101": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth", + "resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth", + "resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth", + "resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth", + "resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth", + "shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth", + "shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth", + "mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth" +} diff --git a/custom_mmpkg/custom_mmcv/model_zoo/open_mmlab.json b/custom_mmpkg/custom_mmcv/model_zoo/open_mmlab.json new file mode 100644 index 0000000000000000000000000000000000000000..8311db4feef92faa0841c697d75efbee8430c3a0 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/model_zoo/open_mmlab.json @@ -0,0 +1,50 @@ +{ + "vgg16_caffe": "https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth", + "detectron/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth", + "detectron2/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth", + "detectron/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth", + "detectron2/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth", + "detectron2/resnext101_32x8d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth", + "resnext50_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth", + "resnext101_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth", + "resnext101_64x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth", + "contrib/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth", + "detectron/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth", + "detectron/resnet101_gn": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth", + "jhu/resnet50_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth", + "jhu/resnet101_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth", + "jhu/resnext50_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth", + "jhu/resnext101_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth", + "jhu/resnext50_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth", + "jhu/resnext101_32x4d_gn": 
"https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth", + "msra/hrnetv2_w18_small": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth", + "msra/hrnetv2_w18": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth", + "msra/hrnetv2_w32": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth", + "msra/hrnetv2_w40": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth", + "msra/hrnetv2_w48": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth", + "bninception_caffe": "https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth", + "kin400/i3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth", + "kin400/nl3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth", + "res2net101_v1d_26w_4s": "https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth", + "regnetx_400mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth", + "regnetx_800mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth", + "regnetx_1.6gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth", + "regnetx_3.2gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth", + "regnetx_4.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth", + "regnetx_6.4gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth", + "regnetx_8.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth", + "regnetx_12gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth", + "resnet18_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth", + "resnet50_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth", + "resnet101_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth", + "mmedit/vgg16": "https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth", + "mmedit/res34_en_nomixup": "https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth", + "mmedit/mobilenet_v2": "https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth", + "contrib/mobilenet_v3_large": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth", + "contrib/mobilenet_v3_small": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth", + "resnest50": "https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth", + "resnest101": "https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth", + "resnest200": "https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth", + "darknet53": "https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth", + "mmdet/mobilenet_v2": "https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth" +} diff --git a/custom_mmpkg/custom_mmcv/ops/__init__.py b/custom_mmpkg/custom_mmcv/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..999e090a458ee148ceca0649f1e3806a40e909bd --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/__init__.py @@ -0,0 +1,81 @@ +# Copyright 
(c) OpenMMLab. All rights reserved. +from .assign_score_withk import assign_score_withk +from .ball_query import ball_query +from .bbox import bbox_overlaps +from .border_align import BorderAlign, border_align +from .box_iou_rotated import box_iou_rotated +from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive +from .cc_attention import CrissCrossAttention +from .contour_expand import contour_expand +from .corner_pool import CornerPool +from .correlation import Correlation +from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d +from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack, + ModulatedDeformRoIPoolPack, deform_roi_pool) +from .deprecated_wrappers import Conv2d_deprecated as Conv2d +from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d +from .deprecated_wrappers import Linear_deprecated as Linear +from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d +from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss, + sigmoid_focal_loss, softmax_focal_loss) +from .furthest_point_sample import (furthest_point_sample, + furthest_point_sample_with_dist) +from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu +from .gather_points import gather_points +from .group_points import GroupAll, QueryAndGroup, grouping_operation +from .info import (get_compiler_version, get_compiling_cuda_version, + get_onnxruntime_op_path) +from .iou3d import boxes_iou_bev, nms_bev, nms_normal_bev +from .knn import knn +from .masked_conv import MaskedConv2d, masked_conv2d +from .modulated_deform_conv import (ModulatedDeformConv2d, + ModulatedDeformConv2dPack, + modulated_deform_conv2d) +from .multi_scale_deform_attn import MultiScaleDeformableAttention +from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms +from .pixel_group import pixel_group +from .point_sample import (SimpleRoIAlign, point_sample, + rel_roi_point_to_rel_img_point) +from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu, + points_in_boxes_part) +from .points_sampler import PointsSampler +from .psa_mask import PSAMask +from .roi_align import RoIAlign, roi_align +from .roi_align_rotated import RoIAlignRotated, roi_align_rotated +from .roi_pool import RoIPool, roi_pool +from .roiaware_pool3d import RoIAwarePool3d +from .roipoint_pool3d import RoIPointPool3d +from .saconv import SAConv2d +from .scatter_points import DynamicScatter, dynamic_scatter +from .sync_bn import SyncBatchNorm +from .three_interpolate import three_interpolate +from .three_nn import three_nn +from .tin_shift import TINShift, tin_shift +from .upfirdn2d import upfirdn2d +from .voxelize import Voxelization, voxelization + +__all__ = [ + 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe', + 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack', + 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack', + 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss', + 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss', + 'get_compiler_version', 'get_compiling_cuda_version', + 'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d', + 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack', + 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match', + 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d', + 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask', + 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign', + 'SAConv2d', 'TINShift', 
'tin_shift', 'assign_score_withk', + 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query', + 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu', + 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup', + 'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn', + 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign', + 'border_align', 'gather_points', 'furthest_point_sample', + 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation', + 'boxes_iou_bev', 'nms_bev', 'nms_normal_bev', 'Voxelization', + 'voxelization', 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', + 'points_in_boxes_part', 'points_in_boxes_cpu', 'points_in_boxes_all' +] diff --git a/custom_mmpkg/custom_mmcv/ops/assign_score_withk.py b/custom_mmpkg/custom_mmcv/ops/assign_score_withk.py new file mode 100644 index 0000000000000000000000000000000000000000..4906adaa2cffd1b46912fbe7d4f87ef2f9fa0012 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/assign_score_withk.py @@ -0,0 +1,123 @@ +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward']) + + +class AssignScoreWithK(Function): + r"""Perform weighted sum to generate output features according to scores. + Modified from `PAConv `_. + + This is a memory-efficient CUDA implementation of assign_scores operation, + which first transform all point features with weight bank, then assemble + neighbor features with ``knn_idx`` and perform weighted sum of ``scores``. + + See the `paper `_ appendix Sec. D for + more detailed descriptions. + + Note: + This implementation assumes using ``neighbor`` kernel input, which is + (point_features - center_features, point_features). + See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/ + pointnet2/paconv.py#L128 for more details. + """ + + @staticmethod + def forward(ctx, + scores, + point_features, + center_features, + knn_idx, + aggregate='sum'): + """ + Args: + scores (torch.Tensor): (B, npoint, K, M), predicted scores to + aggregate weight matrices in the weight bank. + ``npoint`` is the number of sampled centers. + ``K`` is the number of queried neighbors. + ``M`` is the number of weight matrices in the weight bank. + point_features (torch.Tensor): (B, N, M, out_dim) + Pre-computed point features to be aggregated. + center_features (torch.Tensor): (B, N, M, out_dim) + Pre-computed center features to be aggregated. + knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN. + We assume the first idx in each row is the idx of the center. + aggregate (str, optional): Aggregation method. + Can be 'sum', 'avg' or 'max'. Defaults: 'sum'. + + Returns: + torch.Tensor: (B, out_dim, npoint, K), the aggregated features. 
+ """ + agg = {'sum': 0, 'avg': 1, 'max': 2} + + B, N, M, out_dim = point_features.size() + _, npoint, K, _ = scores.size() + + output = point_features.new_zeros((B, out_dim, npoint, K)) + ext_module.assign_score_withk_forward( + point_features.contiguous(), + center_features.contiguous(), + scores.contiguous(), + knn_idx.contiguous(), + output, + B=B, + N0=N, + N1=npoint, + M=M, + K=K, + O=out_dim, + aggregate=agg[aggregate]) + + ctx.save_for_backward(output, point_features, center_features, scores, + knn_idx) + ctx.agg = agg[aggregate] + + return output + + @staticmethod + def backward(ctx, grad_out): + """ + Args: + grad_out (torch.Tensor): (B, out_dim, npoint, K) + + Returns: + grad_scores (torch.Tensor): (B, npoint, K, M) + grad_point_features (torch.Tensor): (B, N, M, out_dim) + grad_center_features (torch.Tensor): (B, N, M, out_dim) + """ + _, point_features, center_features, scores, knn_idx = ctx.saved_tensors + + agg = ctx.agg + + B, N, M, out_dim = point_features.size() + _, npoint, K, _ = scores.size() + + grad_point_features = point_features.new_zeros(point_features.shape) + grad_center_features = center_features.new_zeros(center_features.shape) + grad_scores = scores.new_zeros(scores.shape) + + ext_module.assign_score_withk_backward( + grad_out.contiguous(), + point_features.contiguous(), + center_features.contiguous(), + scores.contiguous(), + knn_idx.contiguous(), + grad_point_features, + grad_center_features, + grad_scores, + B=B, + N0=N, + N1=npoint, + M=M, + K=K, + O=out_dim, + aggregate=agg) + + return grad_scores, grad_point_features, \ + grad_center_features, None, None + + +assign_score_withk = AssignScoreWithK.apply diff --git a/custom_mmpkg/custom_mmcv/ops/ball_query.py b/custom_mmpkg/custom_mmcv/ops/ball_query.py new file mode 100644 index 0000000000000000000000000000000000000000..d0466847c6e5c1239e359a0397568413ebc1504a --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/ball_query.py @@ -0,0 +1,55 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['ball_query_forward']) + + +class BallQuery(Function): + """Find nearby points in spherical space.""" + + @staticmethod + def forward(ctx, min_radius: float, max_radius: float, sample_num: int, + xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor: + """ + Args: + min_radius (float): minimum radius of the balls. + max_radius (float): maximum radius of the balls. + sample_num (int): maximum number of features in the balls. + xyz (Tensor): (B, N, 3) xyz coordinates of the features. + center_xyz (Tensor): (B, npoint, 3) centers of the ball query. + + Returns: + Tensor: (B, npoint, nsample) tensor with the indices of + the features that form the query balls. 
+ """ + assert center_xyz.is_contiguous() + assert xyz.is_contiguous() + assert min_radius < max_radius + + B, N, _ = xyz.size() + npoint = center_xyz.size(1) + idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int) + + ext_module.ball_query_forward( + center_xyz, + xyz, + idx, + b=B, + n=N, + m=npoint, + min_radius=min_radius, + max_radius=max_radius, + nsample=sample_num) + if torch.__version__ != 'parrots': + ctx.mark_non_differentiable(idx) + return idx + + @staticmethod + def backward(ctx, a=None): + return None, None, None, None + + +ball_query = BallQuery.apply diff --git a/custom_mmpkg/custom_mmcv/ops/bbox.py b/custom_mmpkg/custom_mmcv/ops/bbox.py new file mode 100644 index 0000000000000000000000000000000000000000..0c4d58b6c91f652933974f519acd3403a833e906 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/bbox.py @@ -0,0 +1,72 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps']) + + +def bbox_overlaps(bboxes1, bboxes2, mode='iou', aligned=False, offset=0): + """Calculate overlap between two set of bboxes. + + If ``aligned`` is ``False``, then calculate the ious between each bbox + of bboxes1 and bboxes2, otherwise the ious between each aligned pair of + bboxes1 and bboxes2. + + Args: + bboxes1 (Tensor): shape (m, 4) in format or empty. + bboxes2 (Tensor): shape (n, 4) in format or empty. + If aligned is ``True``, then m and n must be equal. + mode (str): "iou" (intersection over union) or iof (intersection over + foreground). + + Returns: + ious(Tensor): shape (m, n) if aligned == False else shape (m, 1) + + Example: + >>> bboxes1 = torch.FloatTensor([ + >>> [0, 0, 10, 10], + >>> [10, 10, 20, 20], + >>> [32, 32, 38, 42], + >>> ]) + >>> bboxes2 = torch.FloatTensor([ + >>> [0, 0, 10, 20], + >>> [0, 10, 10, 19], + >>> [10, 10, 20, 20], + >>> ]) + >>> bbox_overlaps(bboxes1, bboxes2) + tensor([[0.5000, 0.0000, 0.0000], + [0.0000, 0.0000, 1.0000], + [0.0000, 0.0000, 0.0000]]) + + Example: + >>> empty = torch.FloatTensor([]) + >>> nonempty = torch.FloatTensor([ + >>> [0, 0, 10, 9], + >>> ]) + >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1) + >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0) + >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0) + """ + + mode_dict = {'iou': 0, 'iof': 1} + assert mode in mode_dict.keys() + mode_flag = mode_dict[mode] + # Either the boxes are empty or the length of boxes' last dimension is 4 + assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0) + assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0) + assert offset == 1 or offset == 0 + + rows = bboxes1.size(0) + cols = bboxes2.size(0) + if aligned: + assert rows == cols + + if rows * cols == 0: + return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols) + + if aligned: + ious = bboxes1.new_zeros(rows) + else: + ious = bboxes1.new_zeros((rows, cols)) + ext_module.bbox_overlaps( + bboxes1, bboxes2, ious, mode=mode_flag, aligned=aligned, offset=offset) + return ious diff --git a/custom_mmpkg/custom_mmcv/ops/border_align.py b/custom_mmpkg/custom_mmcv/ops/border_align.py new file mode 100644 index 0000000000000000000000000000000000000000..ff305be328e9b0a15e1bbb5e6b41beb940f55c81 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/border_align.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+# modified from +# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['border_align_forward', 'border_align_backward']) + + +class BorderAlignFunction(Function): + + @staticmethod + def symbolic(g, input, boxes, pool_size): + return g.op( + 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size) + + @staticmethod + def forward(ctx, input, boxes, pool_size): + ctx.pool_size = pool_size + ctx.input_shape = input.size() + + assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]' + assert boxes.size(2) == 4, \ + 'the last dimension of boxes must be (x1, y1, x2, y2)' + assert input.size(1) % 4 == 0, \ + 'the channel for input feature must be divisible by factor 4' + + # [B, C//4, H*W, 4] + output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4) + output = input.new_zeros(output_shape) + # `argmax_idx` only used for backward + argmax_idx = input.new_zeros(output_shape).to(torch.int) + + ext_module.border_align_forward( + input, boxes, output, argmax_idx, pool_size=ctx.pool_size) + + ctx.save_for_backward(boxes, argmax_idx) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + boxes, argmax_idx = ctx.saved_tensors + grad_input = grad_output.new_zeros(ctx.input_shape) + # complex head architecture may cause grad_output uncontiguous + grad_output = grad_output.contiguous() + ext_module.border_align_backward( + grad_output, + boxes, + argmax_idx, + grad_input, + pool_size=ctx.pool_size) + return grad_input, None, None + + +border_align = BorderAlignFunction.apply + + +class BorderAlign(nn.Module): + r"""Border align pooling layer. + + Applies border_align over the input feature based on predicted bboxes. + The details were described in the paper + `BorderDet: Border Feature for Dense Object Detection + `_. + + For each border line (e.g. top, left, bottom or right) of each box, + border_align does the following: + 1. uniformly samples `pool_size`+1 positions on this line, involving \ + the start and end points. + 2. the corresponding features on these points are computed by \ + bilinear interpolation. + 3. max pooling over all the `pool_size`+1 positions are used for \ + computing pooled feature. + + Args: + pool_size (int): number of positions sampled over the boxes' borders + (e.g. top, bottom, left, right). + + """ + + def __init__(self, pool_size): + super(BorderAlign, self).__init__() + self.pool_size = pool_size + + def forward(self, input, boxes): + """ + Args: + input: Features with shape [N,4C,H,W]. Channels ranged in [0,C), + [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom, + right features respectively. + boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2). + + Returns: + Tensor: Pooled features with shape [N,C,H*W,4]. The order is + (top,left,bottom,right) for the last dimension. 
+ """ + return border_align(input, boxes, self.pool_size) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(pool_size={self.pool_size})' + return s diff --git a/custom_mmpkg/custom_mmcv/ops/box_iou_rotated.py b/custom_mmpkg/custom_mmcv/ops/box_iou_rotated.py new file mode 100644 index 0000000000000000000000000000000000000000..2d78015e9c2a9e7a52859b4e18f84a9aa63481a0 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/box_iou_rotated.py @@ -0,0 +1,45 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated']) + + +def box_iou_rotated(bboxes1, bboxes2, mode='iou', aligned=False): + """Return intersection-over-union (Jaccard index) of boxes. + + Both sets of boxes are expected to be in + (x_center, y_center, width, height, angle) format. + + If ``aligned`` is ``False``, then calculate the ious between each bbox + of bboxes1 and bboxes2, otherwise the ious between each aligned pair of + bboxes1 and bboxes2. + + Arguments: + boxes1 (Tensor): rotated bboxes 1. \ + It has shape (N, 5), indicating (x, y, w, h, theta) for each row. + Note that theta is in radian. + boxes2 (Tensor): rotated bboxes 2. \ + It has shape (M, 5), indicating (x, y, w, h, theta) for each row. + Note that theta is in radian. + mode (str): "iou" (intersection over union) or iof (intersection over + foreground). + + Returns: + ious(Tensor): shape (N, M) if aligned == False else shape (N,) + """ + assert mode in ['iou', 'iof'] + mode_dict = {'iou': 0, 'iof': 1} + mode_flag = mode_dict[mode] + rows = bboxes1.size(0) + cols = bboxes2.size(0) + if aligned: + ious = bboxes1.new_zeros(rows) + else: + ious = bboxes1.new_zeros((rows * cols)) + bboxes1 = bboxes1.contiguous() + bboxes2 = bboxes2.contiguous() + ext_module.box_iou_rotated( + bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned) + if not aligned: + ious = ious.view(rows, cols) + return ious diff --git a/custom_mmpkg/custom_mmcv/ops/carafe.py b/custom_mmpkg/custom_mmcv/ops/carafe.py new file mode 100644 index 0000000000000000000000000000000000000000..5154cb3abfccfbbe0a1b2daa67018dbf80aaf6d2 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/carafe.py @@ -0,0 +1,287 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd import Function +from torch.nn.modules.module import Module + +from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward', + 'carafe_backward' +]) + + +class CARAFENaiveFunction(Function): + + @staticmethod + def symbolic(g, features, masks, kernel_size, group_size, scale_factor): + return g.op( + 'mmcv::MMCVCARAFENaive', + features, + masks, + kernel_size_i=kernel_size, + group_size_i=group_size, + scale_factor_f=scale_factor) + + @staticmethod + def forward(ctx, features, masks, kernel_size, group_size, scale_factor): + assert scale_factor >= 1 + assert masks.size(1) == kernel_size * kernel_size * group_size + assert masks.size(-1) == features.size(-1) * scale_factor + assert masks.size(-2) == features.size(-2) * scale_factor + assert features.size(1) % group_size == 0 + assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 + ctx.kernel_size = kernel_size + ctx.group_size = group_size + ctx.scale_factor = scale_factor + ctx.feature_size = features.size() + ctx.mask_size = masks.size() + + n, c, h, w = features.size() + output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) + ext_module.carafe_naive_forward( + features, + masks, + output, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + + if features.requires_grad or masks.requires_grad: + ctx.save_for_backward(features, masks) + return output + + @staticmethod + def backward(ctx, grad_output): + assert grad_output.is_cuda + + features, masks = ctx.saved_tensors + kernel_size = ctx.kernel_size + group_size = ctx.group_size + scale_factor = ctx.scale_factor + + grad_input = torch.zeros_like(features) + grad_masks = torch.zeros_like(masks) + ext_module.carafe_naive_backward( + grad_output.contiguous(), + features, + masks, + grad_input, + grad_masks, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + + return grad_input, grad_masks, None, None, None + + +carafe_naive = CARAFENaiveFunction.apply + + +class CARAFENaive(Module): + + def __init__(self, kernel_size, group_size, scale_factor): + super(CARAFENaive, self).__init__() + + assert isinstance(kernel_size, int) and isinstance( + group_size, int) and isinstance(scale_factor, int) + self.kernel_size = kernel_size + self.group_size = group_size + self.scale_factor = scale_factor + + def forward(self, features, masks): + return carafe_naive(features, masks, self.kernel_size, self.group_size, + self.scale_factor) + + +class CARAFEFunction(Function): + + @staticmethod + def symbolic(g, features, masks, kernel_size, group_size, scale_factor): + return g.op( + 'mmcv::MMCVCARAFE', + features, + masks, + kernel_size_i=kernel_size, + group_size_i=group_size, + scale_factor_f=scale_factor) + + @staticmethod + def forward(ctx, features, masks, kernel_size, group_size, scale_factor): + assert scale_factor >= 1 + assert masks.size(1) == kernel_size * kernel_size * group_size + assert masks.size(-1) == features.size(-1) * scale_factor + assert masks.size(-2) == features.size(-2) * scale_factor + assert features.size(1) % group_size == 0 + assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1 + ctx.kernel_size = kernel_size + ctx.group_size = group_size + ctx.scale_factor = scale_factor + ctx.feature_size = features.size() + ctx.mask_size = masks.size() + + n, c, h, w = features.size() + 
output = features.new_zeros((n, c, h * scale_factor, w * scale_factor)) + routput = features.new_zeros(output.size(), requires_grad=False) + rfeatures = features.new_zeros(features.size(), requires_grad=False) + rmasks = masks.new_zeros(masks.size(), requires_grad=False) + ext_module.carafe_forward( + features, + masks, + rfeatures, + routput, + rmasks, + output, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + + if features.requires_grad or masks.requires_grad: + ctx.save_for_backward(features, masks, rfeatures) + return output + + @staticmethod + def backward(ctx, grad_output): + assert grad_output.is_cuda + + features, masks, rfeatures = ctx.saved_tensors + kernel_size = ctx.kernel_size + group_size = ctx.group_size + scale_factor = ctx.scale_factor + + rgrad_output = torch.zeros_like(grad_output, requires_grad=False) + rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False) + rgrad_input = torch.zeros_like(features, requires_grad=False) + rgrad_masks = torch.zeros_like(masks, requires_grad=False) + grad_input = torch.zeros_like(features, requires_grad=False) + grad_masks = torch.zeros_like(masks, requires_grad=False) + ext_module.carafe_backward( + grad_output.contiguous(), + rfeatures, + masks, + rgrad_output, + rgrad_input_hs, + rgrad_input, + rgrad_masks, + grad_input, + grad_masks, + kernel_size=kernel_size, + group_size=group_size, + scale_factor=scale_factor) + return grad_input, grad_masks, None, None, None + + +carafe = CARAFEFunction.apply + + +class CARAFE(Module): + """ CARAFE: Content-Aware ReAssembly of FEatures + + Please refer to https://arxiv.org/abs/1905.02188 for more details. + + Args: + kernel_size (int): reassemble kernel size + group_size (int): reassemble group size + scale_factor (int): upsample ratio + + Returns: + upsampled feature map + """ + + def __init__(self, kernel_size, group_size, scale_factor): + super(CARAFE, self).__init__() + + assert isinstance(kernel_size, int) and isinstance( + group_size, int) and isinstance(scale_factor, int) + self.kernel_size = kernel_size + self.group_size = group_size + self.scale_factor = scale_factor + + def forward(self, features, masks): + return carafe(features, masks, self.kernel_size, self.group_size, + self.scale_factor) + + +@UPSAMPLE_LAYERS.register_module(name='carafe') +class CARAFEPack(nn.Module): + """A unified package of CARAFE upsampler that contains: 1) channel + compressor 2) content encoder 3) CARAFE op. + + Official implementation of ICCV 2019 paper + CARAFE: Content-Aware ReAssembly of FEatures + Please refer to https://arxiv.org/abs/1905.02188 for more details. 
+
+    Args:
+        channels (int): input feature channels
+        scale_factor (int): upsample ratio
+        up_kernel (int): kernel size of CARAFE op
+        up_group (int): group size of CARAFE op
+        encoder_kernel (int): kernel size of content encoder
+        encoder_dilation (int): dilation of content encoder
+        compressed_channels (int): output channels of channels compressor
+
+    Returns:
+        upsampled feature map
+    """
+
+    def __init__(self,
+                 channels,
+                 scale_factor,
+                 up_kernel=5,
+                 up_group=1,
+                 encoder_kernel=3,
+                 encoder_dilation=1,
+                 compressed_channels=64):
+        super(CARAFEPack, self).__init__()
+        self.channels = channels
+        self.scale_factor = scale_factor
+        self.up_kernel = up_kernel
+        self.up_group = up_group
+        self.encoder_kernel = encoder_kernel
+        self.encoder_dilation = encoder_dilation
+        self.compressed_channels = compressed_channels
+        self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,
+                                            1)
+        self.content_encoder = nn.Conv2d(
+            self.compressed_channels,
+            self.up_kernel * self.up_kernel * self.up_group *
+            self.scale_factor * self.scale_factor,
+            self.encoder_kernel,
+            padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
+            dilation=self.encoder_dilation,
+            groups=1)
+        self.init_weights()
+
+    def init_weights(self):
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                xavier_init(m, distribution='uniform')
+        normal_init(self.content_encoder, std=0.001)
+
+    def kernel_normalizer(self, mask):
+        mask = F.pixel_shuffle(mask, self.scale_factor)
+        n, mask_c, h, w = mask.size()
+        # use float division explicitly,
+        # to avoid inconsistency while exporting to onnx
+        mask_channel = int(mask_c / float(self.up_kernel**2))
+        mask = mask.view(n, mask_channel, -1, h, w)
+
+        mask = F.softmax(mask, dim=2, dtype=mask.dtype)
+        mask = mask.view(n, mask_c, h, w).contiguous()
+
+        return mask
+
+    def feature_reassemble(self, x, mask):
+        x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
+        return x
+
+    def forward(self, x):
+        compressed_x = self.channel_compressor(x)
+        mask = self.content_encoder(compressed_x)
+        mask = self.kernel_normalizer(mask)
+
+        x = self.feature_reassemble(x, mask)
+        return x
diff --git a/custom_mmpkg/custom_mmcv/ops/cc_attention.py b/custom_mmpkg/custom_mmcv/ops/cc_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6868974cae6e5a7b9a6841845f9fca909a27155
--- /dev/null
+++ b/custom_mmpkg/custom_mmcv/ops/cc_attention.py
@@ -0,0 +1,83 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from custom_mmpkg.custom_mmcv.cnn import PLUGIN_LAYERS, Scale
+
+
+def NEG_INF_DIAG(n, device):
+    """Returns a diagonal matrix of size [n, n].
+
+    The diagonal entries are all ``-inf``, so that the overlapping (diagonal)
+    element of the criss-cross attention is not counted twice.
+    """
+    return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)
+
+
+@PLUGIN_LAYERS.register_module()
+class CrissCrossAttention(nn.Module):
+    """Criss-Cross Attention Module.
+
+    .. note::
+        Before v1.3.13, a CUDA op was used. Since v1.3.13, this has been
+        switched to a pure PyTorch, equivalent implementation. For more
+        details, please refer to https://github.com/open-mmlab/mmcv/pull/1201.
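+
+    Each position attends to all positions in its row and its column of the
+    feature map (the ``energy_H`` and ``energy_W`` terms in ``forward``), so
+    only :math:`H + W - 1` attention weights are kept per position instead of
+    a full :math:`H \times W` map.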
+ + Speed comparison for one forward pass + + - Input size: [2,512,97,97] + - Device: 1 NVIDIA GeForce RTX 2080 Ti + + +-----------------------+---------------+------------+---------------+ + | |PyTorch version|CUDA version|Relative speed | + +=======================+===============+============+===============+ + |with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x | + +-----------------------+---------------+------------+---------------+ + |no with torch.no_grad()|0.00562803 s |0.0301349 s |5.4x | + +-----------------------+---------------+------------+---------------+ + + Args: + in_channels (int): Channels of the input feature map. + """ + + def __init__(self, in_channels): + super().__init__() + self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1) + self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1) + self.value_conv = nn.Conv2d(in_channels, in_channels, 1) + self.gamma = Scale(0.) + self.in_channels = in_channels + + def forward(self, x): + """forward function of Criss-Cross Attention. + + Args: + x (Tensor): Input feature. \ + shape (batch_size, in_channels, height, width) + Returns: + Tensor: Output of the layer, with shape of \ + (batch_size, in_channels, height, width) + """ + B, C, H, W = x.size() + query = self.query_conv(x) + key = self.key_conv(x) + value = self.value_conv(x) + energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG( + H, query.device) + energy_H = energy_H.transpose(1, 2) + energy_W = torch.einsum('bchw,bchj->bhwj', query, key) + attn = F.softmax( + torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)] + out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H]) + out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:]) + + out = self.gamma(out) + x + out = out.contiguous() + + return out + + def __repr__(self): + s = self.__class__.__name__ + s += f'(in_channels={self.in_channels})' + return s diff --git a/custom_mmpkg/custom_mmcv/ops/contour_expand.py b/custom_mmpkg/custom_mmcv/ops/contour_expand.py new file mode 100644 index 0000000000000000000000000000000000000000..ea1111e1768b5f27e118bf7dbc0d9c70a7afd6d7 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/contour_expand.py @@ -0,0 +1,49 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numpy as np +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['contour_expand']) + + +def contour_expand(kernel_mask, internal_kernel_label, min_kernel_area, + kernel_num): + """Expand kernel contours so that foreground pixels are assigned into + instances. + + Arguments: + kernel_mask (np.array or Tensor): The instance kernel mask with + size hxw. + internal_kernel_label (np.array or Tensor): The instance internal + kernel label with size hxw. + min_kernel_area (int): The minimum kernel area. + kernel_num (int): The instance kernel number. + + Returns: + label (list): The instance index map with size hxw. 
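+
+    Note:
+        Both ``kernel_mask`` and ``internal_kernel_label`` may be passed either
+        as ``np.ndarray`` or as ``torch.Tensor``; numpy inputs are converted to
+        tensors before the extension op is called.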
+ """ + assert isinstance(kernel_mask, (torch.Tensor, np.ndarray)) + assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray)) + assert isinstance(min_kernel_area, int) + assert isinstance(kernel_num, int) + + if isinstance(kernel_mask, np.ndarray): + kernel_mask = torch.from_numpy(kernel_mask) + if isinstance(internal_kernel_label, np.ndarray): + internal_kernel_label = torch.from_numpy(internal_kernel_label) + + if torch.__version__ == 'parrots': + if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0: + label = [] + else: + label = ext_module.contour_expand( + kernel_mask, + internal_kernel_label, + min_kernel_area=min_kernel_area, + kernel_num=kernel_num) + label = label.tolist() + else: + label = ext_module.contour_expand(kernel_mask, internal_kernel_label, + min_kernel_area, kernel_num) + return label diff --git a/custom_mmpkg/custom_mmcv/ops/corner_pool.py b/custom_mmpkg/custom_mmcv/ops/corner_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..a33d798b43d405e4c86bee4cd6389be21ca9c637 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/corner_pool.py @@ -0,0 +1,161 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'top_pool_forward', 'top_pool_backward', 'bottom_pool_forward', + 'bottom_pool_backward', 'left_pool_forward', 'left_pool_backward', + 'right_pool_forward', 'right_pool_backward' +]) + +_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3} + + +class TopPoolFunction(Function): + + @staticmethod + def symbolic(g, input): + output = g.op( + 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top'])) + return output + + @staticmethod + def forward(ctx, input): + output = ext_module.top_pool_forward(input) + ctx.save_for_backward(input) + return output + + @staticmethod + def backward(ctx, grad_output): + input, = ctx.saved_tensors + output = ext_module.top_pool_backward(input, grad_output) + return output + + +class BottomPoolFunction(Function): + + @staticmethod + def symbolic(g, input): + output = g.op( + 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom'])) + return output + + @staticmethod + def forward(ctx, input): + output = ext_module.bottom_pool_forward(input) + ctx.save_for_backward(input) + return output + + @staticmethod + def backward(ctx, grad_output): + input, = ctx.saved_tensors + output = ext_module.bottom_pool_backward(input, grad_output) + return output + + +class LeftPoolFunction(Function): + + @staticmethod + def symbolic(g, input): + output = g.op( + 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left'])) + return output + + @staticmethod + def forward(ctx, input): + output = ext_module.left_pool_forward(input) + ctx.save_for_backward(input) + return output + + @staticmethod + def backward(ctx, grad_output): + input, = ctx.saved_tensors + output = ext_module.left_pool_backward(input, grad_output) + return output + + +class RightPoolFunction(Function): + + @staticmethod + def symbolic(g, input): + output = g.op( + 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right'])) + return output + + @staticmethod + def forward(ctx, input): + output = ext_module.right_pool_forward(input) + ctx.save_for_backward(input) + return output + + @staticmethod + def backward(ctx, grad_output): + input, = ctx.saved_tensors + output = ext_module.right_pool_backward(input, grad_output) + return output + + +class CornerPool(nn.Module): + """Corner 
Pooling. + + Corner Pooling is a new type of pooling layer that helps a + convolutional network better localize corners of bounding boxes. + + Please refer to https://arxiv.org/abs/1808.01244 for more details. + Code is modified from https://github.com/princeton-vl/CornerNet-Lite. + + Args: + mode(str): Pooling orientation for the pooling layer + + - 'bottom': Bottom Pooling + - 'left': Left Pooling + - 'right': Right Pooling + - 'top': Top Pooling + + Returns: + Feature map after pooling. + """ + + pool_functions = { + 'bottom': BottomPoolFunction, + 'left': LeftPoolFunction, + 'right': RightPoolFunction, + 'top': TopPoolFunction, + } + + cummax_dim_flip = { + 'bottom': (2, False), + 'left': (3, True), + 'right': (3, False), + 'top': (2, True), + } + + def __init__(self, mode): + super(CornerPool, self).__init__() + assert mode in self.pool_functions + self.mode = mode + self.corner_pool = self.pool_functions[mode] + + def forward(self, x): + if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0': + if torch.onnx.is_in_onnx_export(): + assert torch.__version__ >= '1.7.0', \ + 'When `cummax` serves as an intermediate component whose '\ + 'outputs is used as inputs for another modules, it\'s '\ + 'expected that pytorch version must be >= 1.7.0, '\ + 'otherwise Error appears like: `RuntimeError: tuple '\ + 'appears in op that does not forward tuples, unsupported '\ + 'kind: prim::PythonOp`.' + + dim, flip = self.cummax_dim_flip[self.mode] + if flip: + x = x.flip(dim) + pool_tensor, _ = torch.cummax(x, dim=dim) + if flip: + pool_tensor = pool_tensor.flip(dim) + return pool_tensor + else: + return self.corner_pool.apply(x) diff --git a/custom_mmpkg/custom_mmcv/ops/correlation.py b/custom_mmpkg/custom_mmcv/ops/correlation.py new file mode 100644 index 0000000000000000000000000000000000000000..3d0b79c301b29915dfaf4d2b1846c59be73127d3 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/correlation.py @@ -0,0 +1,196 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
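+# Correlation (cost volume) op: the ``Correlation`` module below compares two
+# feature maps within a local search window and is typically used for
+# optical-flow style matching.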
+import torch +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['correlation_forward', 'correlation_backward']) + + +class CorrelationFunction(Function): + + @staticmethod + def forward(ctx, + input1, + input2, + kernel_size=1, + max_displacement=1, + stride=1, + padding=1, + dilation=1, + dilation_patch=1): + + ctx.save_for_backward(input1, input2) + + kH, kW = ctx.kernel_size = _pair(kernel_size) + patch_size = max_displacement * 2 + 1 + ctx.patch_size = patch_size + dH, dW = ctx.stride = _pair(stride) + padH, padW = ctx.padding = _pair(padding) + dilationH, dilationW = ctx.dilation = _pair(dilation) + dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair( + dilation_patch) + + output_size = CorrelationFunction._output_size(ctx, input1) + + output = input1.new_zeros(output_size) + + ext_module.correlation_forward( + input1, + input2, + output, + kH=kH, + kW=kW, + patchH=patch_size, + patchW=patch_size, + padH=padH, + padW=padW, + dilationH=dilationH, + dilationW=dilationW, + dilation_patchH=dilation_patchH, + dilation_patchW=dilation_patchW, + dH=dH, + dW=dW) + + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input1, input2 = ctx.saved_tensors + + kH, kW = ctx.kernel_size + patch_size = ctx.patch_size + padH, padW = ctx.padding + dilationH, dilationW = ctx.dilation + dilation_patchH, dilation_patchW = ctx.dilation_patch + dH, dW = ctx.stride + grad_input1 = torch.zeros_like(input1) + grad_input2 = torch.zeros_like(input2) + + ext_module.correlation_backward( + grad_output, + input1, + input2, + grad_input1, + grad_input2, + kH=kH, + kW=kW, + patchH=patch_size, + patchW=patch_size, + padH=padH, + padW=padW, + dilationH=dilationH, + dilationW=dilationW, + dilation_patchH=dilation_patchH, + dilation_patchW=dilation_patchW, + dH=dH, + dW=dW) + return grad_input1, grad_input2, None, None, None, None, None, None + + @staticmethod + def _output_size(ctx, input1): + iH, iW = input1.size(2), input1.size(3) + batch_size = input1.size(0) + kH, kW = ctx.kernel_size + patch_size = ctx.patch_size + dH, dW = ctx.stride + padH, padW = ctx.padding + dilationH, dilationW = ctx.dilation + dilatedKH = (kH - 1) * dilationH + 1 + dilatedKW = (kW - 1) * dilationW + 1 + + oH = int((iH + 2 * padH - dilatedKH) / dH + 1) + oW = int((iW + 2 * padW - dilatedKW) / dW + 1) + + output_size = (batch_size, patch_size, patch_size, oH, oW) + return output_size + + +class Correlation(nn.Module): + r"""Correlation operator + + This correlation operator works for optical flow correlation computation. + + There are two batched tensors with shape :math:`(N, C, H, W)`, + and the correlation output's shape is :math:`(N, max\_displacement \times + 2 + 1, max\_displacement * 2 + 1, H_{out}, W_{out})` + + where + + .. math:: + H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding - + dilation \times (kernel\_size - 1) - 1} + {stride} + 1\right\rfloor + + .. math:: + W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation + \times (kernel\_size - 1) - 1} + {stride} + 1\right\rfloor + + the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding + window convolution between input1 and shifted input2, + + .. 
math:: + Corr(N_i, dx, dy) = + \sum_{c=0}^{C-1} + input1(N_i, c) \star + \mathcal{S}(input2(N_i, c), dy, dx) + + where :math:`\star` is the valid 2d sliding window convolution operator, + and :math:`\mathcal{S}` means shifting the input features (auto-complete + zero marginal), and :math:`dx, dy` are shifting distance, :math:`dx, dy \in + [-max\_displacement \times dilation\_patch, max\_displacement \times + dilation\_patch]`. + + Args: + kernel_size (int): The size of sliding window i.e. local neighborhood + representing the center points and involved in correlation + computation. Defaults to 1. + max_displacement (int): The radius for computing correlation volume, + but the actual working space can be dilated by dilation_patch. + Defaults to 1. + stride (int): The stride of the sliding blocks in the input spatial + dimensions. Defaults to 1. + padding (int): Zero padding added to all four sides of the input1. + Defaults to 0. + dilation (int): The spacing of local neighborhood that will involved + in correlation. Defaults to 1. + dilation_patch (int): The spacing between position need to compute + correlation. Defaults to 1. + """ + + def __init__(self, + kernel_size: int = 1, + max_displacement: int = 1, + stride: int = 1, + padding: int = 0, + dilation: int = 1, + dilation_patch: int = 1) -> None: + super().__init__() + self.kernel_size = kernel_size + self.max_displacement = max_displacement + self.stride = stride + self.padding = padding + self.dilation = dilation + self.dilation_patch = dilation_patch + + def forward(self, input1: Tensor, input2: Tensor) -> Tensor: + return CorrelationFunction.apply(input1, input2, self.kernel_size, + self.max_displacement, self.stride, + self.padding, self.dilation, + self.dilation_patch) + + def __repr__(self) -> str: + s = self.__class__.__name__ + s += f'(kernel_size={self.kernel_size}, ' + s += f'max_displacement={self.max_displacement}, ' + s += f'stride={self.stride}, ' + s += f'padding={self.padding}, ' + s += f'dilation={self.dilation}, ' + s += f'dilation_patch={self.dilation_patch})' + return s diff --git a/custom_mmpkg/custom_mmcv/ops/deform_conv.py b/custom_mmpkg/custom_mmcv/ops/deform_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..c8e80c5af51a525915875a1f9cb030e77d24f190 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/deform_conv.py @@ -0,0 +1,405 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
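+# Deformable convolution (DCN): a convolution whose sampling grid is shifted by
+# learned per-location offsets; see ``DeformConv2d`` and the
+# ``DeformConv2dPack`` wrapper below.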
+from typing import Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair, _single + +from custom_mmpkg.custom_mmcv.utils import deprecated_api_warning +from ..cnn import CONV_LAYERS +from ..utils import ext_loader, print_log + +ext_module = ext_loader.load_ext('_ext', [ + 'deform_conv_forward', 'deform_conv_backward_input', + 'deform_conv_backward_parameters' +]) + + +class DeformConv2dFunction(Function): + + @staticmethod + def symbolic(g, + input, + offset, + weight, + stride, + padding, + dilation, + groups, + deform_groups, + bias=False, + im2col_step=32): + return g.op( + 'mmcv::MMCVDeformConv2d', + input, + offset, + weight, + stride_i=stride, + padding_i=padding, + dilation_i=dilation, + groups_i=groups, + deform_groups_i=deform_groups, + bias_i=bias, + im2col_step_i=im2col_step) + + @staticmethod + def forward(ctx, + input, + offset, + weight, + stride=1, + padding=0, + dilation=1, + groups=1, + deform_groups=1, + bias=False, + im2col_step=32): + if input is not None and input.dim() != 4: + raise ValueError( + f'Expected 4D tensor as input, got {input.dim()}D tensor \ + instead.') + assert bias is False, 'Only support bias is False.' + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deform_groups = deform_groups + ctx.im2col_step = im2col_step + + # When pytorch version >= 1.6.0, amp is adopted for fp16 mode; + # amp won't cast the type of model (float32), but "offset" is cast + # to float16 by nn.Conv2d automatically, leading to the type + # mismatch with input (when it is float32) or weight. + # The flag for whether to use fp16 or amp is the type of "offset", + # we cast weight and input to temporarily support fp16 and amp + # whatever the pytorch version is. 
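+        # Hence: follow the dtype of `offset` for both `input` and `weight`.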
+ input = input.type_as(offset) + weight = weight.type_as(input) + ctx.save_for_backward(input, offset, weight) + + output = input.new_empty( + DeformConv2dFunction._output_size(ctx, input, weight)) + + ctx.bufs_ = [input.new_empty(0), input.new_empty(0)] # columns, ones + + cur_im2col_step = min(ctx.im2col_step, input.size(0)) + assert (input.size(0) % + cur_im2col_step) == 0, 'im2col step must divide batchsize' + ext_module.deform_conv_forward( + input, + weight, + offset, + output, + ctx.bufs_[0], + ctx.bufs_[1], + kW=weight.size(3), + kH=weight.size(2), + dW=ctx.stride[1], + dH=ctx.stride[0], + padW=ctx.padding[1], + padH=ctx.padding[0], + dilationW=ctx.dilation[1], + dilationH=ctx.dilation[0], + group=ctx.groups, + deformable_group=ctx.deform_groups, + im2col_step=cur_im2col_step) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, weight = ctx.saved_tensors + + grad_input = grad_offset = grad_weight = None + + cur_im2col_step = min(ctx.im2col_step, input.size(0)) + assert (input.size(0) % cur_im2col_step + ) == 0, 'batch size must be divisible by im2col_step' + + grad_output = grad_output.contiguous() + if ctx.needs_input_grad[0] or ctx.needs_input_grad[1]: + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + ext_module.deform_conv_backward_input( + input, + offset, + grad_output, + grad_input, + grad_offset, + weight, + ctx.bufs_[0], + kW=weight.size(3), + kH=weight.size(2), + dW=ctx.stride[1], + dH=ctx.stride[0], + padW=ctx.padding[1], + padH=ctx.padding[0], + dilationW=ctx.dilation[1], + dilationH=ctx.dilation[0], + group=ctx.groups, + deformable_group=ctx.deform_groups, + im2col_step=cur_im2col_step) + + if ctx.needs_input_grad[2]: + grad_weight = torch.zeros_like(weight) + ext_module.deform_conv_backward_parameters( + input, + offset, + grad_output, + grad_weight, + ctx.bufs_[0], + ctx.bufs_[1], + kW=weight.size(3), + kH=weight.size(2), + dW=ctx.stride[1], + dH=ctx.stride[0], + padW=ctx.padding[1], + padH=ctx.padding[0], + dilationW=ctx.dilation[1], + dilationH=ctx.dilation[0], + group=ctx.groups, + deformable_group=ctx.deform_groups, + scale=1, + im2col_step=cur_im2col_step) + + return grad_input, grad_offset, grad_weight, \ + None, None, None, None, None, None, None + + @staticmethod + def _output_size(ctx, input, weight): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = ctx.padding[d] + kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = ctx.stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + 'convolution input is too small (output would be ' + + 'x'.join(map(str, output_size)) + ')') + return output_size + + +deform_conv2d = DeformConv2dFunction.apply + + +class DeformConv2d(nn.Module): + r"""Deformable 2D convolution. + + Applies a deformable 2D convolution over an input signal composed of + several input planes. DeformConv2d was described in the paper + `Deformable Convolutional Networks + `_ + + Note: + The argument ``im2col_step`` was added in version 1.3.17, which means + number of samples processed by the ``im2col_cuda_kernel`` per call. + It enables users to define ``batch_size`` and ``im2col_step`` more + flexibly and solved `issue mmcv#1440 + `_. + + Args: + in_channels (int): Number of channels in the input image. 
+ out_channels (int): Number of channels produced by the convolution. + kernel_size(int, tuple): Size of the convolving kernel. + stride(int, tuple): Stride of the convolution. Default: 1. + padding (int or tuple): Zero-padding added to both sides of the input. + Default: 0. + dilation (int or tuple): Spacing between kernel elements. Default: 1. + groups (int): Number of blocked connections from input. + channels to output channels. Default: 1. + deform_groups (int): Number of deformable group partitions. + bias (bool): If True, adds a learnable bias to the output. + Default: False. + im2col_step (int): Number of samples processed by im2col_cuda_kernel + per call. It will work when ``batch_size`` > ``im2col_step``, but + ``batch_size`` must be divisible by ``im2col_step``. Default: 32. + `New in version 1.3.17.` + """ + + @deprecated_api_warning({'deformable_groups': 'deform_groups'}, + cls_name='DeformConv2d') + def __init__(self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, ...]], + stride: Union[int, Tuple[int, ...]] = 1, + padding: Union[int, Tuple[int, ...]] = 0, + dilation: Union[int, Tuple[int, ...]] = 1, + groups: int = 1, + deform_groups: int = 1, + bias: bool = False, + im2col_step: int = 32) -> None: + super(DeformConv2d, self).__init__() + + assert not bias, \ + f'bias={bias} is not supported in DeformConv2d.' + assert in_channels % groups == 0, \ + f'in_channels {in_channels} cannot be divisible by groups {groups}' + assert out_channels % groups == 0, \ + f'out_channels {out_channels} cannot be divisible by groups \ + {groups}' + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deform_groups = deform_groups + self.im2col_step = im2col_step + # enable compatibility with nn.Conv2d + self.transposed = False + self.output_padding = _single(0) + + # only weight, no bias + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // self.groups, + *self.kernel_size)) + + self.reset_parameters() + + def reset_parameters(self): + # switch the initialization of `self.weight` to the standard kaiming + # method described in `Delving deep into rectifiers: Surpassing + # human-level performance on ImageNet classification` - He, K. et al. + # (2015), using a uniform distribution + nn.init.kaiming_uniform_(self.weight, nonlinearity='relu') + + def forward(self, x: Tensor, offset: Tensor) -> Tensor: + """Deformable Convolutional forward function. + + Args: + x (Tensor): Input feature, shape (B, C_in, H_in, W_in) + offset (Tensor): Offset for deformable convolution, shape + (B, deform_groups*kernel_size[0]*kernel_size[1]*2, + H_out, W_out), H_out, W_out are equal to the output's. + + An offset is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. + The spatial arrangement is like: + + .. code:: text + + (x0, y0) (x1, y1) (x2, y2) + (x3, y3) (x4, y4) (x5, y5) + (x6, y6) (x7, y7) (x8, y8) + + Returns: + Tensor: Output of the layer. 
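+
+        A minimal usage sketch (the offsets are usually predicted by a small
+        ``nn.Conv2d``; the underlying extension op is generally run on CUDA
+        tensors):
+
+        .. code:: python
+
+            conv = DeformConv2d(16, 32, kernel_size=3, padding=1).cuda()
+            conv_offset = nn.Conv2d(16, 2 * 3 * 3, 3, padding=1).cuda()
+            x = torch.randn(2, 16, 24, 24).cuda()
+            offset = conv_offset(x)  # (2, 18, 24, 24)
+            out = conv(x, offset)    # (2, 32, 24, 24)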
+ """ + # To fix an assert error in deform_conv_cuda.cpp:128 + # input image is smaller than kernel + input_pad = (x.size(2) < self.kernel_size[0]) or (x.size(3) < + self.kernel_size[1]) + if input_pad: + pad_h = max(self.kernel_size[0] - x.size(2), 0) + pad_w = max(self.kernel_size[1] - x.size(3), 0) + x = F.pad(x, (0, pad_w, 0, pad_h), 'constant', 0).contiguous() + offset = F.pad(offset, (0, pad_w, 0, pad_h), 'constant', 0) + offset = offset.contiguous() + out = deform_conv2d(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deform_groups, + False, self.im2col_step) + if input_pad: + out = out[:, :, :out.size(2) - pad_h, :out.size(3) - + pad_w].contiguous() + return out + + def __repr__(self): + s = self.__class__.__name__ + s += f'(in_channels={self.in_channels},\n' + s += f'out_channels={self.out_channels},\n' + s += f'kernel_size={self.kernel_size},\n' + s += f'stride={self.stride},\n' + s += f'padding={self.padding},\n' + s += f'dilation={self.dilation},\n' + s += f'groups={self.groups},\n' + s += f'deform_groups={self.deform_groups},\n' + # bias is not supported in DeformConv2d. + s += 'bias=False)' + return s + + +@CONV_LAYERS.register_module('DCN') +class DeformConv2dPack(DeformConv2d): + """A Deformable Conv Encapsulation that acts as normal Conv layers. + + The offset tensor is like `[y0, x0, y1, x1, y2, x2, ..., y8, x8]`. + The spatial arrangement is like: + + .. code:: text + + (x0, y0) (x1, y1) (x2, y2) + (x3, y3) (x4, y4) (x5, y5) + (x6, y6) (x7, y7) (x8, y8) + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int or tuple[int]): Same as nn.Conv2d. + padding (int or tuple[int]): Same as nn.Conv2d. + dilation (int or tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + bias (bool or str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if norm_cfg is None, otherwise + False. + """ + + _version = 2 + + def __init__(self, *args, **kwargs): + super(DeformConv2dPack, self).__init__(*args, **kwargs) + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deform_groups * 2 * self.kernel_size[0] * self.kernel_size[1], + kernel_size=self.kernel_size, + stride=_pair(self.stride), + padding=_pair(self.padding), + dilation=_pair(self.dilation), + bias=True) + self.init_offset() + + def init_offset(self): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x): + offset = self.conv_offset(x) + return deform_conv2d(x, offset, self.weight, self.stride, self.padding, + self.dilation, self.groups, self.deform_groups, + False, self.im2col_step) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + version = local_metadata.get('version', None) + + if version is None or version < 2: + # the key is different in early versions + # In version < 2, DeformConvPack loads previous benchmark models. 
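+            # Rename the legacy '*_offset.*' parameter keys to the current
+            # 'conv_offset.*' names so that old checkpoints keep loading.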
+ if (prefix + 'conv_offset.weight' not in state_dict + and prefix[:-1] + '_offset.weight' in state_dict): + state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( + prefix[:-1] + '_offset.weight') + if (prefix + 'conv_offset.bias' not in state_dict + and prefix[:-1] + '_offset.bias' in state_dict): + state_dict[prefix + + 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + + '_offset.bias') + + if version is not None and version > 1: + print_log( + f'DeformConv2dPack {prefix.rstrip(".")} is upgraded to ' + 'version 2.', + logger='root') + + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) diff --git a/custom_mmpkg/custom_mmcv/ops/deform_roi_pool.py b/custom_mmpkg/custom_mmcv/ops/deform_roi_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..cc245ba91fee252226ba22e76bb94a35db9a629b --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/deform_roi_pool.py @@ -0,0 +1,204 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch import nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['deform_roi_pool_forward', 'deform_roi_pool_backward']) + + +class DeformRoIPoolFunction(Function): + + @staticmethod + def symbolic(g, input, rois, offset, output_size, spatial_scale, + sampling_ratio, gamma): + return g.op( + 'mmcv::MMCVDeformRoIPool', + input, + rois, + offset, + pooled_height_i=output_size[0], + pooled_width_i=output_size[1], + spatial_scale_f=spatial_scale, + sampling_ratio_f=sampling_ratio, + gamma_f=gamma) + + @staticmethod + def forward(ctx, + input, + rois, + offset, + output_size, + spatial_scale=1.0, + sampling_ratio=0, + gamma=0.1): + if offset is None: + offset = input.new_zeros(0) + ctx.output_size = _pair(output_size) + ctx.spatial_scale = float(spatial_scale) + ctx.sampling_ratio = int(sampling_ratio) + ctx.gamma = float(gamma) + + assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' 
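+        # One (C, pooled_height, pooled_width) output map is produced per RoI.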
+ + output_shape = (rois.size(0), input.size(1), ctx.output_size[0], + ctx.output_size[1]) + output = input.new_zeros(output_shape) + + ext_module.deform_roi_pool_forward( + input, + rois, + offset, + output, + pooled_height=ctx.output_size[0], + pooled_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + gamma=ctx.gamma) + + ctx.save_for_backward(input, rois, offset) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, rois, offset = ctx.saved_tensors + grad_input = grad_output.new_zeros(input.shape) + grad_offset = grad_output.new_zeros(offset.shape) + + ext_module.deform_roi_pool_backward( + grad_output, + input, + rois, + offset, + grad_input, + grad_offset, + pooled_height=ctx.output_size[0], + pooled_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + gamma=ctx.gamma) + if grad_offset.numel() == 0: + grad_offset = None + return grad_input, None, grad_offset, None, None, None, None + + +deform_roi_pool = DeformRoIPoolFunction.apply + + +class DeformRoIPool(nn.Module): + + def __init__(self, + output_size, + spatial_scale=1.0, + sampling_ratio=0, + gamma=0.1): + super(DeformRoIPool, self).__init__() + self.output_size = _pair(output_size) + self.spatial_scale = float(spatial_scale) + self.sampling_ratio = int(sampling_ratio) + self.gamma = float(gamma) + + def forward(self, input, rois, offset=None): + return deform_roi_pool(input, rois, offset, self.output_size, + self.spatial_scale, self.sampling_ratio, + self.gamma) + + +class DeformRoIPoolPack(DeformRoIPool): + + def __init__(self, + output_size, + output_channels, + deform_fc_channels=1024, + spatial_scale=1.0, + sampling_ratio=0, + gamma=0.1): + super(DeformRoIPoolPack, self).__init__(output_size, spatial_scale, + sampling_ratio, gamma) + + self.output_channels = output_channels + self.deform_fc_channels = deform_fc_channels + + self.offset_fc = nn.Sequential( + nn.Linear( + self.output_size[0] * self.output_size[1] * + self.output_channels, self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, + self.output_size[0] * self.output_size[1] * 2)) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + def forward(self, input, rois): + assert input.size(1) == self.output_channels + x = deform_roi_pool(input, rois, None, self.output_size, + self.spatial_scale, self.sampling_ratio, + self.gamma) + rois_num = rois.size(0) + offset = self.offset_fc(x.view(rois_num, -1)) + offset = offset.view(rois_num, 2, self.output_size[0], + self.output_size[1]) + return deform_roi_pool(input, rois, offset, self.output_size, + self.spatial_scale, self.sampling_ratio, + self.gamma) + + +class ModulatedDeformRoIPoolPack(DeformRoIPool): + + def __init__(self, + output_size, + output_channels, + deform_fc_channels=1024, + spatial_scale=1.0, + sampling_ratio=0, + gamma=0.1): + super(ModulatedDeformRoIPoolPack, + self).__init__(output_size, spatial_scale, sampling_ratio, gamma) + + self.output_channels = output_channels + self.deform_fc_channels = deform_fc_channels + + self.offset_fc = nn.Sequential( + nn.Linear( + self.output_size[0] * self.output_size[1] * + self.output_channels, self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, + 
self.output_size[0] * self.output_size[1] * 2)) + self.offset_fc[-1].weight.data.zero_() + self.offset_fc[-1].bias.data.zero_() + + self.mask_fc = nn.Sequential( + nn.Linear( + self.output_size[0] * self.output_size[1] * + self.output_channels, self.deform_fc_channels), + nn.ReLU(inplace=True), + nn.Linear(self.deform_fc_channels, + self.output_size[0] * self.output_size[1] * 1), + nn.Sigmoid()) + self.mask_fc[2].weight.data.zero_() + self.mask_fc[2].bias.data.zero_() + + def forward(self, input, rois): + assert input.size(1) == self.output_channels + x = deform_roi_pool(input, rois, None, self.output_size, + self.spatial_scale, self.sampling_ratio, + self.gamma) + rois_num = rois.size(0) + offset = self.offset_fc(x.view(rois_num, -1)) + offset = offset.view(rois_num, 2, self.output_size[0], + self.output_size[1]) + mask = self.mask_fc(x.view(rois_num, -1)) + mask = mask.view(rois_num, 1, self.output_size[0], self.output_size[1]) + d = deform_roi_pool(input, rois, offset, self.output_size, + self.spatial_scale, self.sampling_ratio, + self.gamma) + return d * mask diff --git a/custom_mmpkg/custom_mmcv/ops/deprecated_wrappers.py b/custom_mmpkg/custom_mmcv/ops/deprecated_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..a2e593df9ee57637038683d7a1efaa347b2b69e7 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/deprecated_wrappers.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# This file is for backward compatibility. +# Module wrappers for empty tensor have been moved to mmcv.cnn.bricks. +import warnings + +from ..cnn.bricks.wrappers import Conv2d, ConvTranspose2d, Linear, MaxPool2d + + +class Conv2d_deprecated(Conv2d): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'Importing Conv2d wrapper from "mmcv.ops" will be deprecated in' + ' the future. Please import them from "mmcv.cnn" instead') + + +class ConvTranspose2d_deprecated(ConvTranspose2d): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'Importing ConvTranspose2d wrapper from "mmcv.ops" will be ' + 'deprecated in the future. Please import them from "mmcv.cnn" ' + 'instead') + + +class MaxPool2d_deprecated(MaxPool2d): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'Importing MaxPool2d wrapper from "mmcv.ops" will be deprecated in' + ' the future. Please import them from "mmcv.cnn" instead') + + +class Linear_deprecated(Linear): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + warnings.warn( + 'Importing Linear wrapper from "mmcv.ops" will be deprecated in' + ' the future. Please import them from "mmcv.cnn" instead') diff --git a/custom_mmpkg/custom_mmcv/ops/focal_loss.py b/custom_mmpkg/custom_mmcv/ops/focal_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..763bc93bd2575c49ca8ccf20996bbd92d1e0d1a4 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/focal_loss.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
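+# Sigmoid and softmax variants of the focal loss (Lin et al.,
+# https://arxiv.org/abs/1708.02002), backed by '_ext' extension kernels; see
+# ``SigmoidFocalLoss`` and ``SoftmaxFocalLoss`` below.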
+import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'sigmoid_focal_loss_forward', 'sigmoid_focal_loss_backward', + 'softmax_focal_loss_forward', 'softmax_focal_loss_backward' +]) + + +class SigmoidFocalLossFunction(Function): + + @staticmethod + def symbolic(g, input, target, gamma, alpha, weight, reduction): + return g.op( + 'mmcv::MMCVSigmoidFocalLoss', + input, + target, + gamma_f=gamma, + alpha_f=alpha, + weight_f=weight, + reduction_s=reduction) + + @staticmethod + def forward(ctx, + input, + target, + gamma=2.0, + alpha=0.25, + weight=None, + reduction='mean'): + + assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) + assert input.dim() == 2 + assert target.dim() == 1 + assert input.size(0) == target.size(0) + if weight is None: + weight = input.new_empty(0) + else: + assert weight.dim() == 1 + assert input.size(1) == weight.size(0) + ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2} + assert reduction in ctx.reduction_dict.keys() + + ctx.gamma = float(gamma) + ctx.alpha = float(alpha) + ctx.reduction = ctx.reduction_dict[reduction] + + output = input.new_zeros(input.size()) + + ext_module.sigmoid_focal_loss_forward( + input, target, weight, output, gamma=ctx.gamma, alpha=ctx.alpha) + if ctx.reduction == ctx.reduction_dict['mean']: + output = output.sum() / input.size(0) + elif ctx.reduction == ctx.reduction_dict['sum']: + output = output.sum() + ctx.save_for_backward(input, target, weight) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, target, weight = ctx.saved_tensors + + grad_input = input.new_zeros(input.size()) + + ext_module.sigmoid_focal_loss_backward( + input, + target, + weight, + grad_input, + gamma=ctx.gamma, + alpha=ctx.alpha) + + grad_input *= grad_output + if ctx.reduction == ctx.reduction_dict['mean']: + grad_input /= input.size(0) + return grad_input, None, None, None, None, None + + +sigmoid_focal_loss = SigmoidFocalLossFunction.apply + + +class SigmoidFocalLoss(nn.Module): + + def __init__(self, gamma, alpha, weight=None, reduction='mean'): + super(SigmoidFocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + self.register_buffer('weight', weight) + self.reduction = reduction + + def forward(self, input, target): + return sigmoid_focal_loss(input, target, self.gamma, self.alpha, + self.weight, self.reduction) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(gamma={self.gamma}, ' + s += f'alpha={self.alpha}, ' + s += f'reduction={self.reduction})' + return s + + +class SoftmaxFocalLossFunction(Function): + + @staticmethod + def symbolic(g, input, target, gamma, alpha, weight, reduction): + return g.op( + 'mmcv::MMCVSoftmaxFocalLoss', + input, + target, + gamma_f=gamma, + alpha_f=alpha, + weight_f=weight, + reduction_s=reduction) + + @staticmethod + def forward(ctx, + input, + target, + gamma=2.0, + alpha=0.25, + weight=None, + reduction='mean'): + + assert isinstance(target, (torch.LongTensor, torch.cuda.LongTensor)) + assert input.dim() == 2 + assert target.dim() == 1 + assert input.size(0) == target.size(0) + if weight is None: + weight = input.new_empty(0) + else: + assert weight.dim() == 1 + assert input.size(1) == weight.size(0) + ctx.reduction_dict = {'none': 0, 'mean': 1, 'sum': 2} + assert reduction in ctx.reduction_dict.keys() + + ctx.gamma = float(gamma) + ctx.alpha = float(alpha) + 
ctx.reduction = ctx.reduction_dict[reduction] + + channel_stats, _ = torch.max(input, dim=1) + input_softmax = input - channel_stats.unsqueeze(1).expand_as(input) + input_softmax.exp_() + + channel_stats = input_softmax.sum(dim=1) + input_softmax /= channel_stats.unsqueeze(1).expand_as(input) + + output = input.new_zeros(input.size(0)) + ext_module.softmax_focal_loss_forward( + input_softmax, + target, + weight, + output, + gamma=ctx.gamma, + alpha=ctx.alpha) + + if ctx.reduction == ctx.reduction_dict['mean']: + output = output.sum() / input.size(0) + elif ctx.reduction == ctx.reduction_dict['sum']: + output = output.sum() + ctx.save_for_backward(input_softmax, target, weight) + return output + + @staticmethod + def backward(ctx, grad_output): + input_softmax, target, weight = ctx.saved_tensors + buff = input_softmax.new_zeros(input_softmax.size(0)) + grad_input = input_softmax.new_zeros(input_softmax.size()) + + ext_module.softmax_focal_loss_backward( + input_softmax, + target, + weight, + buff, + grad_input, + gamma=ctx.gamma, + alpha=ctx.alpha) + + grad_input *= grad_output + if ctx.reduction == ctx.reduction_dict['mean']: + grad_input /= input_softmax.size(0) + return grad_input, None, None, None, None, None + + +softmax_focal_loss = SoftmaxFocalLossFunction.apply + + +class SoftmaxFocalLoss(nn.Module): + + def __init__(self, gamma, alpha, weight=None, reduction='mean'): + super(SoftmaxFocalLoss, self).__init__() + self.gamma = gamma + self.alpha = alpha + self.register_buffer('weight', weight) + self.reduction = reduction + + def forward(self, input, target): + return softmax_focal_loss(input, target, self.gamma, self.alpha, + self.weight, self.reduction) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(gamma={self.gamma}, ' + s += f'alpha={self.alpha}, ' + s += f'reduction={self.reduction})' + return s diff --git a/custom_mmpkg/custom_mmcv/ops/furthest_point_sample.py b/custom_mmpkg/custom_mmcv/ops/furthest_point_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..374b7a878f1972c183941af28ba1df216ac1a60f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/furthest_point_sample.py @@ -0,0 +1,83 @@ +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'furthest_point_sampling_forward', + 'furthest_point_sampling_with_dist_forward' +]) + + +class FurthestPointSampling(Function): + """Uses iterative furthest point sampling to select a set of features whose + corresponding points have the furthest distance.""" + + @staticmethod + def forward(ctx, points_xyz: torch.Tensor, + num_points: int) -> torch.Tensor: + """ + Args: + points_xyz (Tensor): (B, N, 3) where N > num_points. + num_points (int): Number of points in the sampled set. + + Returns: + Tensor: (B, num_points) indices of the sampled points. 
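+
+        A minimal sketch (the extension op expects contiguous CUDA tensors):
+
+        .. code:: python
+
+            xyz = torch.rand(2, 1024, 3).cuda()
+            idx = furthest_point_sample(xyz, 128)  # (2, 128) int32 indices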
+ """ + assert points_xyz.is_contiguous() + + B, N = points_xyz.size()[:2] + output = torch.cuda.IntTensor(B, num_points) + temp = torch.cuda.FloatTensor(B, N).fill_(1e10) + + ext_module.furthest_point_sampling_forward( + points_xyz, + temp, + output, + b=B, + n=N, + m=num_points, + ) + if torch.__version__ != 'parrots': + ctx.mark_non_differentiable(output) + return output + + @staticmethod + def backward(xyz, a=None): + return None, None + + +class FurthestPointSamplingWithDist(Function): + """Uses iterative furthest point sampling to select a set of features whose + corresponding points have the furthest distance.""" + + @staticmethod + def forward(ctx, points_dist: torch.Tensor, + num_points: int) -> torch.Tensor: + """ + Args: + points_dist (Tensor): (B, N, N) Distance between each point pair. + num_points (int): Number of points in the sampled set. + + Returns: + Tensor: (B, num_points) indices of the sampled points. + """ + assert points_dist.is_contiguous() + + B, N, _ = points_dist.size() + output = points_dist.new_zeros([B, num_points], dtype=torch.int32) + temp = points_dist.new_zeros([B, N]).fill_(1e10) + + ext_module.furthest_point_sampling_with_dist_forward( + points_dist, temp, output, b=B, n=N, m=num_points) + if torch.__version__ != 'parrots': + ctx.mark_non_differentiable(output) + return output + + @staticmethod + def backward(xyz, a=None): + return None, None + + +furthest_point_sample = FurthestPointSampling.apply +furthest_point_sample_with_dist = FurthestPointSamplingWithDist.apply diff --git a/custom_mmpkg/custom_mmcv/ops/fused_bias_leakyrelu.py b/custom_mmpkg/custom_mmcv/ops/fused_bias_leakyrelu.py new file mode 100644 index 0000000000000000000000000000000000000000..6d12508469c6c8fa1884debece44c58d158cb6fa --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/fused_bias_leakyrelu.py @@ -0,0 +1,268 @@ +# modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/fused_act.py # noqa:E501 + +# Copyright (c) 2021, NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator +# Augmentation (ADA) +# ======================================================================= + +# 1. Definitions + +# "Licensor" means any person or entity that distributes its Work. + +# "Software" means the original work of authorship made available under +# this License. + +# "Work" means the Software and any additions to or derivative works of +# the Software that are made available under this License. + +# The terms "reproduce," "reproduction," "derivative works," and +# "distribution" have the meaning as provided under U.S. copyright law; +# provided, however, that for the purposes of this License, derivative +# works shall not include works that remain separable from, or merely +# link (or bind by name) to the interfaces of, the Work. + +# Works, including the Software, are "made available" under this License +# by including in or with the Work either (a) a copyright notice +# referencing the applicability of this License to the Work, or (b) a +# copy of this License. + +# 2. License Grants + +# 2.1 Copyright Grant. Subject to the terms and conditions of this +# License, each Licensor grants to you a perpetual, worldwide, +# non-exclusive, royalty-free, copyright license to reproduce, +# prepare derivative works of, publicly display, publicly perform, +# sublicense and distribute its Work and any resulting derivative +# works in any form. + +# 3. Limitations + +# 3.1 Redistribution. 
You may reproduce or distribute the Work only +# if (a) you do so under this License, (b) you include a complete +# copy of this License with your distribution, and (c) you retain +# without modification any copyright, patent, trademark, or +# attribution notices that are present in the Work. + +# 3.2 Derivative Works. You may specify that additional or different +# terms apply to the use, reproduction, and distribution of your +# derivative works of the Work ("Your Terms") only if (a) Your Terms +# provide that the use limitation in Section 3.3 applies to your +# derivative works, and (b) you identify the specific derivative +# works that are subject to Your Terms. Notwithstanding Your Terms, +# this License (including the redistribution requirements in Section +# 3.1) will continue to apply to the Work itself. + +# 3.3 Use Limitation. The Work and any derivative works thereof only +# may be used or intended for use non-commercially. Notwithstanding +# the foregoing, NVIDIA and its affiliates may use the Work and any +# derivative works commercially. As used herein, "non-commercially" +# means for research or evaluation purposes only. + +# 3.4 Patent Claims. If you bring or threaten to bring a patent claim +# against any Licensor (including any claim, cross-claim or +# counterclaim in a lawsuit) to enforce any patents that you allege +# are infringed by any Work, then your rights under this License from +# such Licensor (including the grant in Section 2.1) will terminate +# immediately. + +# 3.5 Trademarks. This License does not grant any rights to use any +# Licensor’s or its affiliates’ names, logos, or trademarks, except +# as necessary to reproduce the notices described in this License. + +# 3.6 Termination. If you violate any term of this License, then your +# rights under this License (including the grant in Section 2.1) will +# terminate immediately. + +# 4. Disclaimer of Warranty. + +# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +# THIS LICENSE. + +# 5. Limitation of Liability. + +# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGES. + +# ======================================================================= + +import torch +import torch.nn.functional as F +from torch import nn +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['fused_bias_leakyrelu']) + + +class FusedBiasLeakyReLUFunctionBackward(Function): + """Calculate second order deviation. + + This function is to compute the second order deviation for the fused leaky + relu operation. 
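+
+    Supporting double backward here is what typically allows gradient
+    penalties (e.g. the R1 and path-length regularizers used when training
+    StyleGAN2) to backpropagate through the fused op.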
+ """ + + @staticmethod + def forward(ctx, grad_output, out, negative_slope, scale): + ctx.save_for_backward(out) + ctx.negative_slope = negative_slope + ctx.scale = scale + + empty = grad_output.new_empty(0) + + grad_input = ext_module.fused_bias_leakyrelu( + grad_output, + empty, + out, + act=3, + grad=1, + alpha=negative_slope, + scale=scale) + + dim = [0] + + if grad_input.ndim > 2: + dim += list(range(2, grad_input.ndim)) + + grad_bias = grad_input.sum(dim).detach() + + return grad_input, grad_bias + + @staticmethod + def backward(ctx, gradgrad_input, gradgrad_bias): + out, = ctx.saved_tensors + + # The second order deviation, in fact, contains two parts, while the + # the first part is zero. Thus, we direct consider the second part + # which is similar with the first order deviation in implementation. + gradgrad_out = ext_module.fused_bias_leakyrelu( + gradgrad_input, + gradgrad_bias.to(out.dtype), + out, + act=3, + grad=1, + alpha=ctx.negative_slope, + scale=ctx.scale) + + return gradgrad_out, None, None, None + + +class FusedBiasLeakyReLUFunction(Function): + + @staticmethod + def forward(ctx, input, bias, negative_slope, scale): + empty = input.new_empty(0) + + out = ext_module.fused_bias_leakyrelu( + input, + bias, + empty, + act=3, + grad=0, + alpha=negative_slope, + scale=scale) + ctx.save_for_backward(out) + ctx.negative_slope = negative_slope + ctx.scale = scale + + return out + + @staticmethod + def backward(ctx, grad_output): + out, = ctx.saved_tensors + + grad_input, grad_bias = FusedBiasLeakyReLUFunctionBackward.apply( + grad_output, out, ctx.negative_slope, ctx.scale) + + return grad_input, grad_bias, None, None + + +class FusedBiasLeakyReLU(nn.Module): + """Fused bias leaky ReLU. + + This function is introduced in the StyleGAN2: + http://arxiv.org/abs/1912.04958 + + The bias term comes from the convolution operation. In addition, to keep + the variance of the feature map or gradients unchanged, they also adopt a + scale similarly with Kaiming initialization. However, since the + :math:`1+{alpha}^2` : is too small, we can just ignore it. Therefore, the + final scale is just :math:`\sqrt{2}`:. Of course, you may change it with # noqa: W605, E501 + your own scale. + + TODO: Implement the CPU version. + + Args: + channel (int): The channel number of the feature map. + negative_slope (float, optional): Same as nn.LeakyRelu. + Defaults to 0.2. + scale (float, optional): A scalar to adjust the variance of the feature + map. Defaults to 2**0.5. + """ + + def __init__(self, num_channels, negative_slope=0.2, scale=2**0.5): + super(FusedBiasLeakyReLU, self).__init__() + + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.negative_slope = negative_slope + self.scale = scale + + def forward(self, input): + return fused_bias_leakyrelu(input, self.bias, self.negative_slope, + self.scale) + + +def fused_bias_leakyrelu(input, bias, negative_slope=0.2, scale=2**0.5): + """Fused bias leaky ReLU function. + + This function is introduced in the StyleGAN2: + http://arxiv.org/abs/1912.04958 + + The bias term comes from the convolution operation. In addition, to keep + the variance of the feature map or gradients unchanged, they also adopt a + scale similarly with Kaiming initialization. However, since the + :math:`1+{alpha}^2` : is too small, we can just ignore it. Therefore, the + final scale is just :math:`\sqrt{2}`:. Of course, you may change it with # noqa: W605, E501 + your own scale. + + Args: + input (torch.Tensor): Input feature map. 
+ bias (nn.Parameter): The bias from convolution operation. + negative_slope (float, optional): Same as nn.LeakyRelu. + Defaults to 0.2. + scale (float, optional): A scalar to adjust the variance of the feature + map. Defaults to 2**0.5. + + Returns: + torch.Tensor: Feature map after non-linear activation. + """ + + if not input.is_cuda: + return bias_leakyrelu_ref(input, bias, negative_slope, scale) + + return FusedBiasLeakyReLUFunction.apply(input, bias.to(input.dtype), + negative_slope, scale) + + +def bias_leakyrelu_ref(x, bias, negative_slope=0.2, scale=2**0.5): + + if bias is not None: + assert bias.ndim == 1 + assert bias.shape[0] == x.shape[1] + x = x + bias.reshape([-1 if i == 1 else 1 for i in range(x.ndim)]) + + x = F.leaky_relu(x, negative_slope) + if scale != 1: + x = x * scale + + return x diff --git a/custom_mmpkg/custom_mmcv/ops/gather_points.py b/custom_mmpkg/custom_mmcv/ops/gather_points.py new file mode 100644 index 0000000000000000000000000000000000000000..f52f1677d8ea0facafc56a3672d37adb44677ff3 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/gather_points.py @@ -0,0 +1,57 @@ +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['gather_points_forward', 'gather_points_backward']) + + +class GatherPoints(Function): + """Gather points with given index.""" + + @staticmethod + def forward(ctx, features: torch.Tensor, + indices: torch.Tensor) -> torch.Tensor: + """ + Args: + features (Tensor): (B, C, N) features to gather. + indices (Tensor): (B, M) where M is the number of points. + + Returns: + Tensor: (B, C, M) where M is the number of points. + """ + assert features.is_contiguous() + assert indices.is_contiguous() + + B, npoint = indices.size() + _, C, N = features.size() + output = torch.cuda.FloatTensor(B, C, npoint) + + ext_module.gather_points_forward( + features, indices, output, b=B, c=C, n=N, npoints=npoint) + + ctx.for_backwards = (indices, C, N) + if torch.__version__ != 'parrots': + ctx.mark_non_differentiable(indices) + return output + + @staticmethod + def backward(ctx, grad_out): + idx, C, N = ctx.for_backwards + B, npoint = idx.size() + + grad_features = torch.cuda.FloatTensor(B, C, N).zero_() + grad_out_data = grad_out.data.contiguous() + ext_module.gather_points_backward( + grad_out_data, + idx, + grad_features.data, + b=B, + c=C, + n=N, + npoints=npoint) + return grad_features, None + + +gather_points = GatherPoints.apply diff --git a/custom_mmpkg/custom_mmcv/ops/group_points.py b/custom_mmpkg/custom_mmcv/ops/group_points.py new file mode 100644 index 0000000000000000000000000000000000000000..6c3ec9d758ebe4e1c2205882af4be154008253a5 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/group_points.py @@ -0,0 +1,224 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple + +import torch +from torch import nn as nn +from torch.autograd import Function + +from ..utils import ext_loader +from .ball_query import ball_query +from .knn import knn + +ext_module = ext_loader.load_ext( + '_ext', ['group_points_forward', 'group_points_backward']) + + +class QueryAndGroup(nn.Module): + """Groups points with a ball query of radius. + + Args: + max_radius (float): The maximum radius of the balls. + If None is given, we will use kNN sampling instead of ball query. + sample_num (int): Maximum number of features to gather in the ball. + min_radius (float, optional): The minimum radius of the balls. + Default: 0. + use_xyz (bool, optional): Whether to use xyz. 
+ Default: True. + return_grouped_xyz (bool, optional): Whether to return grouped xyz. + Default: False. + normalize_xyz (bool, optional): Whether to normalize xyz. + Default: False. + uniform_sample (bool, optional): Whether to sample uniformly. + Default: False + return_unique_cnt (bool, optional): Whether to return the count of + unique samples. Default: False. + return_grouped_idx (bool, optional): Whether to return grouped idx. + Default: False. + """ + + def __init__(self, + max_radius, + sample_num, + min_radius=0, + use_xyz=True, + return_grouped_xyz=False, + normalize_xyz=False, + uniform_sample=False, + return_unique_cnt=False, + return_grouped_idx=False): + super().__init__() + self.max_radius = max_radius + self.min_radius = min_radius + self.sample_num = sample_num + self.use_xyz = use_xyz + self.return_grouped_xyz = return_grouped_xyz + self.normalize_xyz = normalize_xyz + self.uniform_sample = uniform_sample + self.return_unique_cnt = return_unique_cnt + self.return_grouped_idx = return_grouped_idx + if self.return_unique_cnt: + assert self.uniform_sample, \ + 'uniform_sample should be True when ' \ + 'returning the count of unique samples' + if self.max_radius is None: + assert not self.normalize_xyz, \ + 'can not normalize grouped xyz when max_radius is None' + + def forward(self, points_xyz, center_xyz, features=None): + """ + Args: + points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. + center_xyz (Tensor): (B, npoint, 3) coordinates of the centriods. + features (Tensor): (B, C, N) Descriptors of the features. + + Returns: + Tensor: (B, 3 + C, npoint, sample_num) Grouped feature. + """ + # if self.max_radius is None, we will perform kNN instead of ball query + # idx is of shape [B, npoint, sample_num] + if self.max_radius is None: + idx = knn(self.sample_num, points_xyz, center_xyz, False) + idx = idx.transpose(1, 2).contiguous() + else: + idx = ball_query(self.min_radius, self.max_radius, self.sample_num, + points_xyz, center_xyz) + + if self.uniform_sample: + unique_cnt = torch.zeros((idx.shape[0], idx.shape[1])) + for i_batch in range(idx.shape[0]): + for i_region in range(idx.shape[1]): + unique_ind = torch.unique(idx[i_batch, i_region, :]) + num_unique = unique_ind.shape[0] + unique_cnt[i_batch, i_region] = num_unique + sample_ind = torch.randint( + 0, + num_unique, (self.sample_num - num_unique, ), + dtype=torch.long) + all_ind = torch.cat((unique_ind, unique_ind[sample_ind])) + idx[i_batch, i_region, :] = all_ind + + xyz_trans = points_xyz.transpose(1, 2).contiguous() + # (B, 3, npoint, sample_num) + grouped_xyz = grouping_operation(xyz_trans, idx) + grouped_xyz_diff = grouped_xyz - \ + center_xyz.transpose(1, 2).unsqueeze(-1) # relative offsets + if self.normalize_xyz: + grouped_xyz_diff /= self.max_radius + + if features is not None: + grouped_features = grouping_operation(features, idx) + if self.use_xyz: + # (B, C + 3, npoint, sample_num) + new_features = torch.cat([grouped_xyz_diff, grouped_features], + dim=1) + else: + new_features = grouped_features + else: + assert (self.use_xyz + ), 'Cannot have not features and not use xyz as a feature!' + new_features = grouped_xyz_diff + + ret = [new_features] + if self.return_grouped_xyz: + ret.append(grouped_xyz) + if self.return_unique_cnt: + ret.append(unique_cnt) + if self.return_grouped_idx: + ret.append(idx) + if len(ret) == 1: + return ret[0] + else: + return tuple(ret) + + +class GroupAll(nn.Module): + """Group xyz with feature. + + Args: + use_xyz (bool): Whether to use xyz. 
+ """ + + def __init__(self, use_xyz: bool = True): + super().__init__() + self.use_xyz = use_xyz + + def forward(self, + xyz: torch.Tensor, + new_xyz: torch.Tensor, + features: torch.Tensor = None): + """ + Args: + xyz (Tensor): (B, N, 3) xyz coordinates of the features. + new_xyz (Tensor): new xyz coordinates of the features. + features (Tensor): (B, C, N) features to group. + + Returns: + Tensor: (B, C + 3, 1, N) Grouped feature. + """ + grouped_xyz = xyz.transpose(1, 2).unsqueeze(2) + if features is not None: + grouped_features = features.unsqueeze(2) + if self.use_xyz: + # (B, 3 + C, 1, N) + new_features = torch.cat([grouped_xyz, grouped_features], + dim=1) + else: + new_features = grouped_features + else: + new_features = grouped_xyz + + return new_features + + +class GroupingOperation(Function): + """Group feature with given index.""" + + @staticmethod + def forward(ctx, features: torch.Tensor, + indices: torch.Tensor) -> torch.Tensor: + """ + Args: + features (Tensor): (B, C, N) tensor of features to group. + indices (Tensor): (B, npoint, nsample) the indices of + features to group with. + + Returns: + Tensor: (B, C, npoint, nsample) Grouped features. + """ + features = features.contiguous() + indices = indices.contiguous() + + B, nfeatures, nsample = indices.size() + _, C, N = features.size() + output = torch.cuda.FloatTensor(B, C, nfeatures, nsample) + + ext_module.group_points_forward(B, C, N, nfeatures, nsample, features, + indices, output) + + ctx.for_backwards = (indices, N) + return output + + @staticmethod + def backward(ctx, + grad_out: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + grad_out (Tensor): (B, C, npoint, nsample) tensor of the gradients + of the output from forward. + + Returns: + Tensor: (B, C, N) gradient of the features. + """ + idx, N = ctx.for_backwards + + B, C, npoint, nsample = grad_out.size() + grad_features = torch.cuda.FloatTensor(B, C, N).zero_() + + grad_out_data = grad_out.data.contiguous() + ext_module.group_points_backward(B, C, N, npoint, nsample, + grad_out_data, idx, + grad_features.data) + return grad_features, None + + +grouping_operation = GroupingOperation.apply diff --git a/custom_mmpkg/custom_mmcv/ops/info.py b/custom_mmpkg/custom_mmcv/ops/info.py new file mode 100644 index 0000000000000000000000000000000000000000..29f2e5598ae2bb5866ccd15a7d3b4de33c0cd14d --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/info.py @@ -0,0 +1,36 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
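+# Helpers that report which compiler and CUDA toolkit the compiled '_ext'
+# extension was built with, and that locate the optional ONNX Runtime
+# custom-op library ('_ext_ort.*.so') shipped alongside this package.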
+import glob +import os + +import torch + +if torch.__version__ == 'parrots': + import parrots + + def get_compiler_version(): + return 'GCC ' + parrots.version.compiler + + def get_compiling_cuda_version(): + return parrots.version.cuda +else: + from ..utils import ext_loader + ext_module = ext_loader.load_ext( + '_ext', ['get_compiler_version', 'get_compiling_cuda_version']) + + def get_compiler_version(): + return ext_module.get_compiler_version() + + def get_compiling_cuda_version(): + return ext_module.get_compiling_cuda_version() + + +def get_onnxruntime_op_path(): + wildcard = os.path.join( + os.path.abspath(os.path.dirname(os.path.dirname(__file__))), + '_ext_ort.*.so') + + paths = glob.glob(wildcard) + if len(paths) > 0: + return paths[0] + else: + return '' diff --git a/custom_mmpkg/custom_mmcv/ops/iou3d.py b/custom_mmpkg/custom_mmcv/ops/iou3d.py new file mode 100644 index 0000000000000000000000000000000000000000..6fc71979190323f44c09f8b7e1761cf49cd2d76b --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/iou3d.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'iou3d_boxes_iou_bev_forward', 'iou3d_nms_forward', + 'iou3d_nms_normal_forward' +]) + + +def boxes_iou_bev(boxes_a, boxes_b): + """Calculate boxes IoU in the Bird's Eye View. + + Args: + boxes_a (torch.Tensor): Input boxes a with shape (M, 5). + boxes_b (torch.Tensor): Input boxes b with shape (N, 5). + + Returns: + ans_iou (torch.Tensor): IoU result with shape (M, N). + """ + ans_iou = boxes_a.new_zeros( + torch.Size((boxes_a.shape[0], boxes_b.shape[0]))) + + ext_module.iou3d_boxes_iou_bev_forward(boxes_a.contiguous(), + boxes_b.contiguous(), ans_iou) + + return ans_iou + + +def nms_bev(boxes, scores, thresh, pre_max_size=None, post_max_size=None): + """NMS function GPU implementation (for BEV boxes). The overlap of two + boxes for IoU calculation is defined as the exact overlapping area of the + two boxes. In this function, one can also set ``pre_max_size`` and + ``post_max_size``. + + Args: + boxes (torch.Tensor): Input boxes with the shape of [N, 5] + ([x1, y1, x2, y2, ry]). + scores (torch.Tensor): Scores of boxes with the shape of [N]. + thresh (float): Overlap threshold of NMS. + pre_max_size (int, optional): Max size of boxes before NMS. + Default: None. + post_max_size (int, optional): Max size of boxes after NMS. + Default: None. + + Returns: + torch.Tensor: Indexes after NMS. + """ + assert boxes.size(1) == 5, 'Input boxes shape should be [N, 5]' + order = scores.sort(0, descending=True)[1] + + if pre_max_size is not None: + order = order[:pre_max_size] + boxes = boxes[order].contiguous() + + keep = torch.zeros(boxes.size(0), dtype=torch.long) + num_out = ext_module.iou3d_nms_forward(boxes, keep, thresh) + keep = order[keep[:num_out].cuda(boxes.device)].contiguous() + if post_max_size is not None: + keep = keep[:post_max_size] + return keep + + +def nms_normal_bev(boxes, scores, thresh): + """Normal NMS function GPU implementation (for BEV boxes). The overlap of + two boxes for IoU calculation is defined as the exact overlapping area of + the two boxes WITH their yaw angle set to 0. + + Args: + boxes (torch.Tensor): Input boxes with shape (N, 5). + scores (torch.Tensor): Scores of predicted boxes with shape (N). + thresh (float): Overlap threshold of NMS. + + Returns: + torch.Tensor: Remaining indices with scores in descending order. 
+ """ + assert boxes.shape[1] == 5, 'Input boxes shape should be [N, 5]' + order = scores.sort(0, descending=True)[1] + + boxes = boxes[order].contiguous() + + keep = torch.zeros(boxes.size(0), dtype=torch.long) + num_out = ext_module.iou3d_nms_normal_forward(boxes, keep, thresh) + return order[keep[:num_out].cuda(boxes.device)].contiguous() diff --git a/custom_mmpkg/custom_mmcv/ops/knn.py b/custom_mmpkg/custom_mmcv/ops/knn.py new file mode 100644 index 0000000000000000000000000000000000000000..f335785036669fc19239825b0aae6dde3f73bf92 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/knn.py @@ -0,0 +1,77 @@ +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['knn_forward']) + + +class KNN(Function): + r"""KNN (CUDA) based on heap data structure. + Modified from `PAConv `_. + + Find k-nearest points. + """ + + @staticmethod + def forward(ctx, + k: int, + xyz: torch.Tensor, + center_xyz: torch.Tensor = None, + transposed: bool = False) -> torch.Tensor: + """ + Args: + k (int): number of nearest neighbors. + xyz (Tensor): (B, N, 3) if transposed == False, else (B, 3, N). + xyz coordinates of the features. + center_xyz (Tensor, optional): (B, npoint, 3) if transposed == + False, else (B, 3, npoint). centers of the knn query. + Default: None. + transposed (bool, optional): whether the input tensors are + transposed. Should not explicitly use this keyword when + calling knn (=KNN.apply), just add the fourth param. + Default: False. + + Returns: + Tensor: (B, k, npoint) tensor with the indices of + the features that form k-nearest neighbours. + """ + assert (k > 0) & (k < 100), 'k should be in range(0, 100)' + + if center_xyz is None: + center_xyz = xyz + + if transposed: + xyz = xyz.transpose(2, 1).contiguous() + center_xyz = center_xyz.transpose(2, 1).contiguous() + + assert xyz.is_contiguous() # [B, N, 3] + assert center_xyz.is_contiguous() # [B, npoint, 3] + + center_xyz_device = center_xyz.get_device() + assert center_xyz_device == xyz.get_device(), \ + 'center_xyz and xyz should be put on the same device' + if torch.cuda.current_device() != center_xyz_device: + torch.cuda.set_device(center_xyz_device) + + B, npoint, _ = center_xyz.shape + N = xyz.shape[1] + + idx = center_xyz.new_zeros((B, npoint, k)).int() + dist2 = center_xyz.new_zeros((B, npoint, k)).float() + + ext_module.knn_forward( + xyz, center_xyz, idx, dist2, b=B, n=N, m=npoint, nsample=k) + # idx shape to [B, k, npoint] + idx = idx.transpose(2, 1).contiguous() + if torch.__version__ != 'parrots': + ctx.mark_non_differentiable(idx) + return idx + + @staticmethod + def backward(ctx, a=None): + return None, None, None + + +knn = KNN.apply diff --git a/custom_mmpkg/custom_mmcv/ops/masked_conv.py b/custom_mmpkg/custom_mmcv/ops/masked_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..cd514cc204c1d571ea5dc7e74b038c0f477a008b --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/masked_conv.py @@ -0,0 +1,111 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
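+# Masked convolution: im2col gathers only the spatial locations selected by a
+# binary mask, the convolution is applied to those gathered columns, and
+# col2im scatters the results back, so no work is done where the mask is zero.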
+import math + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['masked_im2col_forward', 'masked_col2im_forward']) + + +class MaskedConv2dFunction(Function): + + @staticmethod + def symbolic(g, features, mask, weight, bias, padding, stride): + return g.op( + 'mmcv::MMCVMaskedConv2d', + features, + mask, + weight, + bias, + padding_i=padding, + stride_i=stride) + + @staticmethod + def forward(ctx, features, mask, weight, bias, padding=0, stride=1): + assert mask.dim() == 3 and mask.size(0) == 1 + assert features.dim() == 4 and features.size(0) == 1 + assert features.size()[2:] == mask.size()[1:] + pad_h, pad_w = _pair(padding) + stride_h, stride_w = _pair(stride) + if stride_h != 1 or stride_w != 1: + raise ValueError( + 'Stride could not only be 1 in masked_conv2d currently.') + out_channel, in_channel, kernel_h, kernel_w = weight.size() + + batch_size = features.size(0) + out_h = int( + math.floor((features.size(2) + 2 * pad_h - + (kernel_h - 1) - 1) / stride_h + 1)) + out_w = int( + math.floor((features.size(3) + 2 * pad_w - + (kernel_h - 1) - 1) / stride_w + 1)) + mask_inds = torch.nonzero(mask[0] > 0, as_tuple=False) + output = features.new_zeros(batch_size, out_channel, out_h, out_w) + if mask_inds.numel() > 0: + mask_h_idx = mask_inds[:, 0].contiguous() + mask_w_idx = mask_inds[:, 1].contiguous() + data_col = features.new_zeros(in_channel * kernel_h * kernel_w, + mask_inds.size(0)) + ext_module.masked_im2col_forward( + features, + mask_h_idx, + mask_w_idx, + data_col, + kernel_h=kernel_h, + kernel_w=kernel_w, + pad_h=pad_h, + pad_w=pad_w) + + masked_output = torch.addmm(1, bias[:, None], 1, + weight.view(out_channel, -1), data_col) + ext_module.masked_col2im_forward( + masked_output, + mask_h_idx, + mask_w_idx, + output, + height=out_h, + width=out_w, + channels=out_channel) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + return (None, ) * 5 + + +masked_conv2d = MaskedConv2dFunction.apply + + +class MaskedConv2d(nn.Conv2d): + """A MaskedConv2d which inherits the official Conv2d. + + The masked forward doesn't implement the backward function and only + supports the stride parameter to be 1 currently. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True): + super(MaskedConv2d, + self).__init__(in_channels, out_channels, kernel_size, stride, + padding, dilation, groups, bias) + + def forward(self, input, mask=None): + if mask is None: # fallback to the normal Conv2d + return super(MaskedConv2d, self).forward(input) + else: + return masked_conv2d(input, mask, self.weight, self.bias, + self.padding) diff --git a/custom_mmpkg/custom_mmcv/ops/merge_cells.py b/custom_mmpkg/custom_mmcv/ops/merge_cells.py new file mode 100644 index 0000000000000000000000000000000000000000..48ca8cc0a8aca8432835bd760c0403a3c35b34cf --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/merge_cells.py @@ -0,0 +1,149 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from abc import abstractmethod + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..cnn import ConvModule + + +class BaseMergeCell(nn.Module): + """The basic class for cells used in NAS-FPN and NAS-FCOS. + + BaseMergeCell takes 2 inputs. 
After applying convolution + on them, they are resized to the target size. Then, + they go through binary_op, which depends on the type of cell. + If with_out_conv is True, the result of output will go through + another convolution layer. + + Args: + in_channels (int): number of input channels in out_conv layer. + out_channels (int): number of output channels in out_conv layer. + with_out_conv (bool): Whether to use out_conv layer + out_conv_cfg (dict): Config dict for convolution layer, which should + contain "groups", "kernel_size", "padding", "bias" to build + out_conv layer. + out_norm_cfg (dict): Config dict for normalization layer in out_conv. + out_conv_order (tuple): The order of conv/norm/activation layers in + out_conv. + with_input1_conv (bool): Whether to use convolution on input1. + with_input2_conv (bool): Whether to use convolution on input2. + input_conv_cfg (dict): Config dict for building input1_conv layer and + input2_conv layer, which is expected to contain the type of + convolution. + Default: None, which means using conv2d. + input_norm_cfg (dict): Config dict for normalization layer in + input1_conv and input2_conv layer. Default: None. + upsample_mode (str): Interpolation method used to resize the output + of input1_conv and input2_conv to target size. Currently, we + support ['nearest', 'bilinear']. Default: 'nearest'. + """ + + def __init__(self, + fused_channels=256, + out_channels=256, + with_out_conv=True, + out_conv_cfg=dict( + groups=1, kernel_size=3, padding=1, bias=True), + out_norm_cfg=None, + out_conv_order=('act', 'conv', 'norm'), + with_input1_conv=False, + with_input2_conv=False, + input_conv_cfg=None, + input_norm_cfg=None, + upsample_mode='nearest'): + super(BaseMergeCell, self).__init__() + assert upsample_mode in ['nearest', 'bilinear'] + self.with_out_conv = with_out_conv + self.with_input1_conv = with_input1_conv + self.with_input2_conv = with_input2_conv + self.upsample_mode = upsample_mode + + if self.with_out_conv: + self.out_conv = ConvModule( + fused_channels, + out_channels, + **out_conv_cfg, + norm_cfg=out_norm_cfg, + order=out_conv_order) + + self.input1_conv = self._build_input_conv( + out_channels, input_conv_cfg, + input_norm_cfg) if with_input1_conv else nn.Sequential() + self.input2_conv = self._build_input_conv( + out_channels, input_conv_cfg, + input_norm_cfg) if with_input2_conv else nn.Sequential() + + def _build_input_conv(self, channel, conv_cfg, norm_cfg): + return ConvModule( + channel, + channel, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + bias=True) + + @abstractmethod + def _binary_op(self, x1, x2): + pass + + def _resize(self, x, size): + if x.shape[-2:] == size: + return x + elif x.shape[-2:] < size: + return F.interpolate(x, size=size, mode=self.upsample_mode) + else: + assert x.shape[-2] % size[-2] == 0 and x.shape[-1] % size[-1] == 0 + kernel_size = x.shape[-1] // size[-1] + x = F.max_pool2d(x, kernel_size=kernel_size, stride=kernel_size) + return x + + def forward(self, x1, x2, out_size=None): + assert x1.shape[:2] == x2.shape[:2] + assert out_size is None or len(out_size) == 2 + if out_size is None: # resize to larger one + out_size = max(x1.size()[2:], x2.size()[2:]) + + x1 = self.input1_conv(x1) + x2 = self.input2_conv(x2) + + x1 = self._resize(x1, out_size) + x2 = self._resize(x2, out_size) + + x = self._binary_op(x1, x2) + if self.with_out_conv: + x = self.out_conv(x) + return x + + +class SumCell(BaseMergeCell): + + def __init__(self, in_channels, out_channels, **kwargs): + super(SumCell, 
self).__init__(in_channels, out_channels, **kwargs) + + def _binary_op(self, x1, x2): + return x1 + x2 + + +class ConcatCell(BaseMergeCell): + + def __init__(self, in_channels, out_channels, **kwargs): + super(ConcatCell, self).__init__(in_channels * 2, out_channels, + **kwargs) + + def _binary_op(self, x1, x2): + ret = torch.cat([x1, x2], dim=1) + return ret + + +class GlobalPoolingCell(BaseMergeCell): + + def __init__(self, in_channels=None, out_channels=None, **kwargs): + super().__init__(in_channels, out_channels, **kwargs) + self.global_pool = nn.AdaptiveAvgPool2d((1, 1)) + + def _binary_op(self, x1, x2): + x2_att = self.global_pool(x2).sigmoid() + return x2 + x2_att * x1 diff --git a/custom_mmpkg/custom_mmcv/ops/modulated_deform_conv.py b/custom_mmpkg/custom_mmcv/ops/modulated_deform_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..95b4828ef5ba35445856f6e19c0d565d8855c2ed --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/modulated_deform_conv.py @@ -0,0 +1,282 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair, _single + +from custom_mmpkg.custom_mmcv.utils import deprecated_api_warning +from ..cnn import CONV_LAYERS +from ..utils import ext_loader, print_log + +ext_module = ext_loader.load_ext( + '_ext', + ['modulated_deform_conv_forward', 'modulated_deform_conv_backward']) + + +class ModulatedDeformConv2dFunction(Function): + + @staticmethod + def symbolic(g, input, offset, mask, weight, bias, stride, padding, + dilation, groups, deform_groups): + input_tensors = [input, offset, mask, weight] + if bias is not None: + input_tensors.append(bias) + return g.op( + 'mmcv::MMCVModulatedDeformConv2d', + *input_tensors, + stride_i=stride, + padding_i=padding, + dilation_i=dilation, + groups_i=groups, + deform_groups_i=deform_groups) + + @staticmethod + def forward(ctx, + input, + offset, + mask, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + deform_groups=1): + if input is not None and input.dim() != 4: + raise ValueError( + f'Expected 4D tensor as input, got {input.dim()}D tensor \ + instead.') + ctx.stride = _pair(stride) + ctx.padding = _pair(padding) + ctx.dilation = _pair(dilation) + ctx.groups = groups + ctx.deform_groups = deform_groups + ctx.with_bias = bias is not None + if not ctx.with_bias: + bias = input.new_empty(0) # fake tensor + # When pytorch version >= 1.6.0, amp is adopted for fp16 mode; + # amp won't cast the type of model (float32), but "offset" is cast + # to float16 by nn.Conv2d automatically, leading to the type + # mismatch with input (when it is float32) or weight. + # The flag for whether to use fp16 or amp is the type of "offset", + # we cast weight and input to temporarily support fp16 and amp + # whatever the pytorch version is. 
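+        # In short: input and weight are cast to offset's dtype below so the
+        # CUDA kernel receives a single consistent floating-point type.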
+ input = input.type_as(offset) + weight = weight.type_as(input) + ctx.save_for_backward(input, offset, mask, weight, bias) + output = input.new_empty( + ModulatedDeformConv2dFunction._output_size(ctx, input, weight)) + ctx._bufs = [input.new_empty(0), input.new_empty(0)] + ext_module.modulated_deform_conv_forward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + output, + ctx._bufs[1], + kernel_h=weight.size(2), + kernel_w=weight.size(3), + stride_h=ctx.stride[0], + stride_w=ctx.stride[1], + pad_h=ctx.padding[0], + pad_w=ctx.padding[1], + dilation_h=ctx.dilation[0], + dilation_w=ctx.dilation[1], + group=ctx.groups, + deformable_group=ctx.deform_groups, + with_bias=ctx.with_bias) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + input, offset, mask, weight, bias = ctx.saved_tensors + grad_input = torch.zeros_like(input) + grad_offset = torch.zeros_like(offset) + grad_mask = torch.zeros_like(mask) + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(bias) + grad_output = grad_output.contiguous() + ext_module.modulated_deform_conv_backward( + input, + weight, + bias, + ctx._bufs[0], + offset, + mask, + ctx._bufs[1], + grad_input, + grad_weight, + grad_bias, + grad_offset, + grad_mask, + grad_output, + kernel_h=weight.size(2), + kernel_w=weight.size(3), + stride_h=ctx.stride[0], + stride_w=ctx.stride[1], + pad_h=ctx.padding[0], + pad_w=ctx.padding[1], + dilation_h=ctx.dilation[0], + dilation_w=ctx.dilation[1], + group=ctx.groups, + deformable_group=ctx.deform_groups, + with_bias=ctx.with_bias) + if not ctx.with_bias: + grad_bias = None + + return (grad_input, grad_offset, grad_mask, grad_weight, grad_bias, + None, None, None, None, None) + + @staticmethod + def _output_size(ctx, input, weight): + channels = weight.size(0) + output_size = (input.size(0), channels) + for d in range(input.dim() - 2): + in_size = input.size(d + 2) + pad = ctx.padding[d] + kernel = ctx.dilation[d] * (weight.size(d + 2) - 1) + 1 + stride_ = ctx.stride[d] + output_size += ((in_size + (2 * pad) - kernel) // stride_ + 1, ) + if not all(map(lambda s: s > 0, output_size)): + raise ValueError( + 'convolution input is too small (output would be ' + + 'x'.join(map(str, output_size)) + ')') + return output_size + + +modulated_deform_conv2d = ModulatedDeformConv2dFunction.apply + + +class ModulatedDeformConv2d(nn.Module): + + @deprecated_api_warning({'deformable_groups': 'deform_groups'}, + cls_name='ModulatedDeformConv2d') + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deform_groups=1, + bias=True): + super(ModulatedDeformConv2d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = _pair(kernel_size) + self.stride = _pair(stride) + self.padding = _pair(padding) + self.dilation = _pair(dilation) + self.groups = groups + self.deform_groups = deform_groups + # enable compatibility with nn.Conv2d + self.transposed = False + self.output_padding = _single(0) + + self.weight = nn.Parameter( + torch.Tensor(out_channels, in_channels // groups, + *self.kernel_size)) + if bias: + self.bias = nn.Parameter(torch.Tensor(out_channels)) + else: + self.register_parameter('bias', None) + self.init_weights() + + def init_weights(self): + n = self.in_channels + for k in self.kernel_size: + n *= k + stdv = 1. 
/ math.sqrt(n) + self.weight.data.uniform_(-stdv, stdv) + if self.bias is not None: + self.bias.data.zero_() + + def forward(self, x, offset, mask): + return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, + self.dilation, self.groups, + self.deform_groups) + + +@CONV_LAYERS.register_module('DCNv2') +class ModulatedDeformConv2dPack(ModulatedDeformConv2d): + """A ModulatedDeformable Conv Encapsulation that acts as normal Conv + layers. + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int or tuple[int]): Same as nn.Conv2d. + stride (int): Same as nn.Conv2d, while tuple is not supported. + padding (int): Same as nn.Conv2d, while tuple is not supported. + dilation (int): Same as nn.Conv2d, while tuple is not supported. + groups (int): Same as nn.Conv2d. + bias (bool or str): If specified as `auto`, it will be decided by the + norm_cfg. Bias will be set as True if norm_cfg is None, otherwise + False. + """ + + _version = 2 + + def __init__(self, *args, **kwargs): + super(ModulatedDeformConv2dPack, self).__init__(*args, **kwargs) + self.conv_offset = nn.Conv2d( + self.in_channels, + self.deform_groups * 3 * self.kernel_size[0] * self.kernel_size[1], + kernel_size=self.kernel_size, + stride=self.stride, + padding=self.padding, + dilation=self.dilation, + bias=True) + self.init_weights() + + def init_weights(self): + super(ModulatedDeformConv2dPack, self).init_weights() + if hasattr(self, 'conv_offset'): + self.conv_offset.weight.data.zero_() + self.conv_offset.bias.data.zero_() + + def forward(self, x): + out = self.conv_offset(x) + o1, o2, mask = torch.chunk(out, 3, dim=1) + offset = torch.cat((o1, o2), dim=1) + mask = torch.sigmoid(mask) + return modulated_deform_conv2d(x, offset, mask, self.weight, self.bias, + self.stride, self.padding, + self.dilation, self.groups, + self.deform_groups) + + def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, + missing_keys, unexpected_keys, error_msgs): + version = local_metadata.get('version', None) + + if version is None or version < 2: + # the key is different in early versions + # In version < 2, ModulatedDeformConvPack + # loads previous benchmark models. + if (prefix + 'conv_offset.weight' not in state_dict + and prefix[:-1] + '_offset.weight' in state_dict): + state_dict[prefix + 'conv_offset.weight'] = state_dict.pop( + prefix[:-1] + '_offset.weight') + if (prefix + 'conv_offset.bias' not in state_dict + and prefix[:-1] + '_offset.bias' in state_dict): + state_dict[prefix + + 'conv_offset.bias'] = state_dict.pop(prefix[:-1] + + '_offset.bias') + + if version is not None and version > 1: + print_log( + f'ModulatedDeformConvPack {prefix.rstrip(".")} is upgraded to ' + 'version 2.', + logger='root') + + super()._load_from_state_dict(state_dict, prefix, local_metadata, + strict, missing_keys, unexpected_keys, + error_msgs) diff --git a/custom_mmpkg/custom_mmcv/ops/multi_scale_deform_attn.py b/custom_mmpkg/custom_mmcv/ops/multi_scale_deform_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..8696322b086872322185b6be4daf15f94d5981a0 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/multi_scale_deform_attn.py @@ -0,0 +1,358 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
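+# Multi-scale deformable attention (Deformable DETR): a CUDA autograd
+# Function, a pure-PyTorch fallback used when no GPU is available, and a
+# module wrapper registered in the ATTENTION registry.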
+import math +import warnings + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.autograd.function import Function, once_differentiable + +from custom_mmpkg.custom_mmcv import deprecated_api_warning +from custom_mmpkg.custom_mmcv.cnn import constant_init, xavier_init +from custom_mmpkg.custom_mmcv.cnn.bricks.registry import ATTENTION +from custom_mmpkg.custom_mmcv.runner import BaseModule +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['ms_deform_attn_backward', 'ms_deform_attn_forward']) + + +class MultiScaleDeformableAttnFunction(Function): + + @staticmethod + def forward(ctx, value, value_spatial_shapes, value_level_start_index, + sampling_locations, attention_weights, im2col_step): + """GPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). + attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + im2col_step (Tensor): The step used in image to column. + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + ctx.im2col_step = im2col_step + output = ext_module.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step=ctx.im2col_step) + ctx.save_for_backward(value, value_spatial_shapes, + value_level_start_index, sampling_locations, + attention_weights) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + """GPU version of backward function. + + Args: + grad_output (Tensor): Gradient + of output tensor of forward. + + Returns: + Tuple[Tensor]: Gradient + of input tensors in forward. + """ + value, value_spatial_shapes, value_level_start_index,\ + sampling_locations, attention_weights = ctx.saved_tensors + grad_value = torch.zeros_like(value) + grad_sampling_loc = torch.zeros_like(sampling_locations) + grad_attn_weight = torch.zeros_like(attention_weights) + + ext_module.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output.contiguous(), + grad_value, + grad_sampling_loc, + grad_attn_weight, + im2col_step=ctx.im2col_step) + + return grad_value, None, None, \ + grad_sampling_loc, grad_attn_weight, None + + +def multi_scale_deformable_attn_pytorch(value, value_spatial_shapes, + sampling_locations, attention_weights): + """CPU version of multi-scale deformable attention. + + Args: + value (Tensor): The value has shape + (bs, num_keys, mum_heads, embed_dims//num_heads) + value_spatial_shapes (Tensor): Spatial shape of + each feature map, has shape (num_levels, 2), + last dimension 2 represent (h, w) + sampling_locations (Tensor): The location of sampling points, + has shape + (bs ,num_queries, num_heads, num_levels, num_points, 2), + the last dimension 2 represent (x, y). 
+ attention_weights (Tensor): The weight of sampling points used + when calculate the attention, has shape + (bs ,num_queries, num_heads, num_levels, num_points), + + Returns: + Tensor: has shape (bs, num_queries, embed_dims) + """ + + bs, _, num_heads, embed_dims = value.shape + _, num_queries, num_heads, num_levels, num_points, _ =\ + sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], + dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level, (H_, W_) in enumerate(value_spatial_shapes): + # bs, H_*W_, num_heads, embed_dims -> + # bs, H_*W_, num_heads*embed_dims -> + # bs, num_heads*embed_dims, H_*W_ -> + # bs*num_heads, embed_dims, H_, W_ + value_l_ = value_list[level].flatten(2).transpose(1, 2).reshape( + bs * num_heads, embed_dims, H_, W_) + # bs, num_queries, num_heads, num_points, 2 -> + # bs, num_heads, num_queries, num_points, 2 -> + # bs*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, + level].transpose(1, 2).flatten(0, 1) + # bs*num_heads, embed_dims, num_queries, num_points + sampling_value_l_ = F.grid_sample( + value_l_, + sampling_grid_l_, + mode='bilinear', + padding_mode='zeros', + align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (bs, num_queries, num_heads, num_levels, num_points) -> + # (bs, num_heads, num_queries, num_levels, num_points) -> + # (bs, num_heads, 1, num_queries, num_levels*num_points) + attention_weights = attention_weights.transpose(1, 2).reshape( + bs * num_heads, 1, num_queries, num_levels * num_points) + output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * + attention_weights).sum(-1).view(bs, num_heads * embed_dims, + num_queries) + return output.transpose(1, 2).contiguous() + + +@ATTENTION.register_module() +class MultiScaleDeformableAttention(BaseModule): + """An attention module used in Deformable-Detr. + + `Deformable DETR: Deformable Transformers for End-to-End Object Detection. + `_. + + Args: + embed_dims (int): The embedding dimension of Attention. + Default: 256. + num_heads (int): Parallel attention heads. Default: 64. + num_levels (int): The number of feature map used in + Attention. Default: 4. + num_points (int): The number of sampling points for + each query in each head. Default: 4. + im2col_step (int): The step used in image_to_column. + Default: 64. + dropout (float): A Dropout layer on `inp_identity`. + Default: 0.1. + batch_first (bool): Key, Query and Value are shape of + (batch, n, embed_dim) + or (n, batch, embed_dim). Default to False. + norm_cfg (dict): Config dict for normalization layer. + Default: None. + init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization. + Default: None. 
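+
+    Example (illustrative; falls back to the pure-PyTorch path on CPU):
+        >>> attn = MultiScaleDeformableAttention(
+        ...     embed_dims=256, num_heads=8, num_levels=1, num_points=4)
+        >>> query = torch.rand(100, 2, 256)  # (num_query, bs, embed_dims)
+        >>> value = torch.rand(64, 2, 256)   # one 8x8 level, flattened
+        >>> reference_points = torch.rand(2, 100, 1, 2)
+        >>> spatial_shapes = torch.tensor([[8, 8]])
+        >>> level_start_index = torch.tensor([0])
+        >>> out = attn(query, value=value,
+        ...            reference_points=reference_points,
+        ...            spatial_shapes=spatial_shapes,
+        ...            level_start_index=level_start_index)
+        >>> out.shape
+        torch.Size([100, 2, 256])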
+ """ + + def __init__(self, + embed_dims=256, + num_heads=8, + num_levels=4, + num_points=4, + im2col_step=64, + dropout=0.1, + batch_first=False, + norm_cfg=None, + init_cfg=None): + super().__init__(init_cfg) + if embed_dims % num_heads != 0: + raise ValueError(f'embed_dims must be divisible by num_heads, ' + f'but got {embed_dims} and {num_heads}') + dim_per_head = embed_dims // num_heads + self.norm_cfg = norm_cfg + self.dropout = nn.Dropout(dropout) + self.batch_first = batch_first + + # you'd better set dim_per_head to a power of 2 + # which is more efficient in the CUDA implementation + def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError( + 'invalid input for _is_power_of_2: {} (type: {})'.format( + n, type(n))) + return (n & (n - 1) == 0) and n != 0 + + if not _is_power_of_2(dim_per_head): + warnings.warn( + "You'd better set embed_dims in " + 'MultiScaleDeformAttention to make ' + 'the dimension of each attention head a power of 2 ' + 'which is more efficient in our CUDA implementation.') + + self.im2col_step = im2col_step + self.embed_dims = embed_dims + self.num_levels = num_levels + self.num_heads = num_heads + self.num_points = num_points + self.sampling_offsets = nn.Linear( + embed_dims, num_heads * num_levels * num_points * 2) + self.attention_weights = nn.Linear(embed_dims, + num_heads * num_levels * num_points) + self.value_proj = nn.Linear(embed_dims, embed_dims) + self.output_proj = nn.Linear(embed_dims, embed_dims) + self.init_weights() + + def init_weights(self): + """Default initialization for Parameters of Module.""" + constant_init(self.sampling_offsets, 0.) + thetas = torch.arange( + self.num_heads, + dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / + grid_init.abs().max(-1, keepdim=True)[0]).view( + self.num_heads, 1, 1, + 2).repeat(1, self.num_levels, self.num_points, 1) + for i in range(self.num_points): + grid_init[:, :, i, :] *= i + 1 + + self.sampling_offsets.bias.data = grid_init.view(-1) + constant_init(self.attention_weights, val=0., bias=0.) + xavier_init(self.value_proj, distribution='uniform', bias=0.) + xavier_init(self.output_proj, distribution='uniform', bias=0.) + self._is_init = True + + @deprecated_api_warning({'residual': 'identity'}, + cls_name='MultiScaleDeformableAttention') + def forward(self, + query, + key=None, + value=None, + identity=None, + query_pos=None, + key_padding_mask=None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + **kwargs): + """Forward Function of MultiScaleDeformAttention. + + Args: + query (Tensor): Query of Transformer with shape + (num_query, bs, embed_dims). + key (Tensor): The key tensor with shape + `(num_key, bs, embed_dims)`. + value (Tensor): The value tensor with shape + `(num_key, bs, embed_dims)`. + identity (Tensor): The tensor used for addition, with the + same shape as `query`. Default None. If None, + `query` will be used. + query_pos (Tensor): The positional encoding for `query`. + Default: None. + key_pos (Tensor): The positional encoding for `key`. Default + None. + reference_points (Tensor): The normalized reference + points with shape (bs, num_query, num_levels, 2), + all elements is range in [0, 1], top-left (0,0), + bottom-right (1, 1), including padding area. + or (N, Length_{query}, num_levels, 4), add + additional two dimensions is (w, h) to + form reference boxes. 
+ key_padding_mask (Tensor): ByteTensor for `query`, with + shape [bs, num_key]. + spatial_shapes (Tensor): Spatial shape of features in + different levels. With shape (num_levels, 2), + last dimension represents (h, w). + level_start_index (Tensor): The start index of each level. + A tensor has shape ``(num_levels, )`` and can be represented + as [0, h_0*w_0, h_0*w_0+h_1*w_1, ...]. + + Returns: + Tensor: forwarded results with shape [num_query, bs, embed_dims]. + """ + + if value is None: + value = query + + if identity is None: + identity = query + if query_pos is not None: + query = query + query_pos + if not self.batch_first: + # change to (bs, num_query ,embed_dims) + query = query.permute(1, 0, 2) + value = value.permute(1, 0, 2) + + bs, num_query, _ = query.shape + bs, num_value, _ = value.shape + assert (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() == num_value + + value = self.value_proj(value) + if key_padding_mask is not None: + value = value.masked_fill(key_padding_mask[..., None], 0.0) + value = value.view(bs, num_value, self.num_heads, -1) + sampling_offsets = self.sampling_offsets(query).view( + bs, num_query, self.num_heads, self.num_levels, self.num_points, 2) + attention_weights = self.attention_weights(query).view( + bs, num_query, self.num_heads, self.num_levels * self.num_points) + attention_weights = attention_weights.softmax(-1) + + attention_weights = attention_weights.view(bs, num_query, + self.num_heads, + self.num_levels, + self.num_points) + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack( + [spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets \ + / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.num_points \ + * reference_points[:, :, None, :, None, 2:] \ + * 0.5 + else: + raise ValueError( + f'Last dim of reference_points must be' + f' 2 or 4, but get {reference_points.shape[-1]} instead.') + if torch.cuda.is_available() and value.is_cuda: + output = MultiScaleDeformableAttnFunction.apply( + value, spatial_shapes, level_start_index, sampling_locations, + attention_weights, self.im2col_step) + else: + output = multi_scale_deformable_attn_pytorch( + value, spatial_shapes, sampling_locations, attention_weights) + + output = self.output_proj(output) + + if not self.batch_first: + # (num_query, bs ,embed_dims) + output = output.permute(1, 0, 2) + + return self.dropout(output) + identity diff --git a/custom_mmpkg/custom_mmcv/ops/nms.py b/custom_mmpkg/custom_mmcv/ops/nms.py new file mode 100644 index 0000000000000000000000000000000000000000..080c0cf0f2ddef9c4d502b8011c85ed10eff94af --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/nms.py @@ -0,0 +1,417 @@ +import os + +import numpy as np +import torch + +from custom_mmpkg.custom_mmcv.utils import deprecated_api_warning +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['nms', 'softnms', 'nms_match', 'nms_rotated']) + + +# This function is modified from: https://github.com/pytorch/vision/ +class NMSop(torch.autograd.Function): + + @staticmethod + def forward(ctx, bboxes, scores, iou_threshold, offset, score_threshold, + max_num): + is_filtering_by_score = score_threshold > 0 + if is_filtering_by_score: + valid_mask = scores > score_threshold + bboxes, scores = bboxes[valid_mask], scores[valid_mask] + valid_inds = torch.nonzero( + valid_mask, 
as_tuple=False).squeeze(dim=1) + + inds = ext_module.nms( + bboxes, scores, iou_threshold=float(iou_threshold), offset=offset) + + if max_num > 0: + inds = inds[:max_num] + if is_filtering_by_score: + inds = valid_inds[inds] + return inds + + @staticmethod + def symbolic(g, bboxes, scores, iou_threshold, offset, score_threshold, + max_num): + from ..onnx import is_custom_op_loaded + has_custom_op = is_custom_op_loaded() + # TensorRT nms plugin is aligned with original nms in ONNXRuntime + is_trt_backend = os.environ.get('ONNX_BACKEND') == 'MMCVTensorRT' + if has_custom_op and (not is_trt_backend): + return g.op( + 'mmcv::NonMaxSuppression', + bboxes, + scores, + iou_threshold_f=float(iou_threshold), + offset_i=int(offset)) + else: + from torch.onnx.symbolic_opset9 import select, squeeze, unsqueeze + from ..onnx.onnx_utils.symbolic_helper import _size_helper + + boxes = unsqueeze(g, bboxes, 0) + scores = unsqueeze(g, unsqueeze(g, scores, 0), 0) + + if max_num > 0: + max_num = g.op( + 'Constant', + value_t=torch.tensor(max_num, dtype=torch.long)) + else: + dim = g.op('Constant', value_t=torch.tensor(0)) + max_num = _size_helper(g, bboxes, dim) + max_output_per_class = max_num + iou_threshold = g.op( + 'Constant', + value_t=torch.tensor([iou_threshold], dtype=torch.float)) + score_threshold = g.op( + 'Constant', + value_t=torch.tensor([score_threshold], dtype=torch.float)) + nms_out = g.op('NonMaxSuppression', boxes, scores, + max_output_per_class, iou_threshold, + score_threshold) + return squeeze( + g, + select( + g, nms_out, 1, + g.op( + 'Constant', + value_t=torch.tensor([2], dtype=torch.long))), 1) + + +class SoftNMSop(torch.autograd.Function): + + @staticmethod + def forward(ctx, boxes, scores, iou_threshold, sigma, min_score, method, + offset): + dets = boxes.new_empty((boxes.size(0), 5), device='cpu') + inds = ext_module.softnms( + boxes.cpu(), + scores.cpu(), + dets.cpu(), + iou_threshold=float(iou_threshold), + sigma=float(sigma), + min_score=float(min_score), + method=int(method), + offset=int(offset)) + return dets, inds + + @staticmethod + def symbolic(g, boxes, scores, iou_threshold, sigma, min_score, method, + offset): + from packaging import version + assert version.parse(torch.__version__) >= version.parse('1.7.0') + nms_out = g.op( + 'mmcv::SoftNonMaxSuppression', + boxes, + scores, + iou_threshold_f=float(iou_threshold), + sigma_f=float(sigma), + min_score_f=float(min_score), + method_i=int(method), + offset_i=int(offset), + outputs=2) + return nms_out + + +@deprecated_api_warning({'iou_thr': 'iou_threshold'}) +def nms(boxes, scores, iou_threshold, offset=0, score_threshold=0, max_num=-1): + """Dispatch to either CPU or GPU NMS implementations. + + The input can be either torch tensor or numpy array. GPU NMS will be used + if the input is gpu tensor, otherwise CPU NMS + will be used. The returned type will always be the same as inputs. + + Arguments: + boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4). + scores (torch.Tensor or np.ndarray): scores in shape (N, ). + iou_threshold (float): IoU threshold for NMS. + offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset). + score_threshold (float): score threshold for NMS. + max_num (int): maximum number of boxes after NMS. + + Returns: + tuple: kept dets(boxes and scores) and indice, which is always the \ + same data type as the input. 
+ + Example: + >>> boxes = np.array([[49.1, 32.4, 51.0, 35.9], + >>> [49.3, 32.9, 51.0, 35.3], + >>> [49.2, 31.8, 51.0, 35.4], + >>> [35.1, 11.5, 39.1, 15.7], + >>> [35.6, 11.8, 39.3, 14.2], + >>> [35.3, 11.5, 39.9, 14.5], + >>> [35.2, 11.7, 39.7, 15.7]], dtype=np.float32) + >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.5, 0.4, 0.3],\ + dtype=np.float32) + >>> iou_threshold = 0.6 + >>> dets, inds = nms(boxes, scores, iou_threshold) + >>> assert len(inds) == len(dets) == 3 + """ + assert isinstance(boxes, (torch.Tensor, np.ndarray)) + assert isinstance(scores, (torch.Tensor, np.ndarray)) + is_numpy = False + if isinstance(boxes, np.ndarray): + is_numpy = True + boxes = torch.from_numpy(boxes) + if isinstance(scores, np.ndarray): + scores = torch.from_numpy(scores) + assert boxes.size(1) == 4 + assert boxes.size(0) == scores.size(0) + assert offset in (0, 1) + + if torch.__version__ == 'parrots': + indata_list = [boxes, scores] + indata_dict = { + 'iou_threshold': float(iou_threshold), + 'offset': int(offset) + } + inds = ext_module.nms(*indata_list, **indata_dict) + else: + inds = NMSop.apply(boxes, scores, iou_threshold, offset, + score_threshold, max_num) + dets = torch.cat((boxes[inds], scores[inds].reshape(-1, 1)), dim=1) + if is_numpy: + dets = dets.cpu().numpy() + inds = inds.cpu().numpy() + return dets, inds + + +@deprecated_api_warning({'iou_thr': 'iou_threshold'}) +def soft_nms(boxes, + scores, + iou_threshold=0.3, + sigma=0.5, + min_score=1e-3, + method='linear', + offset=0): + """Dispatch to only CPU Soft NMS implementations. + + The input can be either a torch tensor or numpy array. + The returned type will always be the same as inputs. + + Arguments: + boxes (torch.Tensor or np.ndarray): boxes in shape (N, 4). + scores (torch.Tensor or np.ndarray): scores in shape (N, ). + iou_threshold (float): IoU threshold for NMS. + sigma (float): hyperparameter for gaussian method + min_score (float): score filter threshold + method (str): either 'linear' or 'gaussian' + offset (int, 0 or 1): boxes' width or height is (x2 - x1 + offset). + + Returns: + tuple: kept dets(boxes and scores) and indice, which is always the \ + same data type as the input. 
+ + Example: + >>> boxes = np.array([[4., 3., 5., 3.], + >>> [4., 3., 5., 4.], + >>> [3., 1., 3., 1.], + >>> [3., 1., 3., 1.], + >>> [3., 1., 3., 1.], + >>> [3., 1., 3., 1.]], dtype=np.float32) + >>> scores = np.array([0.9, 0.9, 0.5, 0.5, 0.4, 0.0], dtype=np.float32) + >>> iou_threshold = 0.6 + >>> dets, inds = soft_nms(boxes, scores, iou_threshold, sigma=0.5) + >>> assert len(inds) == len(dets) == 5 + """ + + assert isinstance(boxes, (torch.Tensor, np.ndarray)) + assert isinstance(scores, (torch.Tensor, np.ndarray)) + is_numpy = False + if isinstance(boxes, np.ndarray): + is_numpy = True + boxes = torch.from_numpy(boxes) + if isinstance(scores, np.ndarray): + scores = torch.from_numpy(scores) + assert boxes.size(1) == 4 + assert boxes.size(0) == scores.size(0) + assert offset in (0, 1) + method_dict = {'naive': 0, 'linear': 1, 'gaussian': 2} + assert method in method_dict.keys() + + if torch.__version__ == 'parrots': + dets = boxes.new_empty((boxes.size(0), 5), device='cpu') + indata_list = [boxes.cpu(), scores.cpu(), dets.cpu()] + indata_dict = { + 'iou_threshold': float(iou_threshold), + 'sigma': float(sigma), + 'min_score': min_score, + 'method': method_dict[method], + 'offset': int(offset) + } + inds = ext_module.softnms(*indata_list, **indata_dict) + else: + dets, inds = SoftNMSop.apply(boxes.cpu(), scores.cpu(), + float(iou_threshold), float(sigma), + float(min_score), method_dict[method], + int(offset)) + + dets = dets[:inds.size(0)] + + if is_numpy: + dets = dets.cpu().numpy() + inds = inds.cpu().numpy() + return dets, inds + else: + return dets.to(device=boxes.device), inds.to(device=boxes.device) + + +def batched_nms(boxes, scores, idxs, nms_cfg, class_agnostic=False): + """Performs non-maximum suppression in a batched fashion. + + Modified from https://github.com/pytorch/vision/blob + /505cd6957711af790211896d32b40291bea1bc21/torchvision/ops/boxes.py#L39. + In order to perform NMS independently per class, we add an offset to all + the boxes. The offset is dependent only on the class idx, and is large + enough so that boxes from different classes do not overlap. + + Arguments: + boxes (torch.Tensor): boxes in shape (N, 4). + scores (torch.Tensor): scores in shape (N, ). + idxs (torch.Tensor): each index value correspond to a bbox cluster, + and NMS will not be applied between elements of different idxs, + shape (N, ). + nms_cfg (dict): specify nms type and other parameters like iou_thr. + Possible keys includes the following. + + - iou_thr (float): IoU threshold used for NMS. + - split_thr (float): threshold number of boxes. In some cases the + number of boxes is large (e.g., 200k). To avoid OOM during + training, the users could set `split_thr` to a small value. + If the number of boxes is greater than the threshold, it will + perform NMS on each group of boxes separately and sequentially. + Defaults to 10000. + class_agnostic (bool): if true, nms is class agnostic, + i.e. IoU thresholding happens over all boxes, + regardless of the predicted class. + + Returns: + tuple: kept dets and indice. 
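+
+    Example (illustrative; requires the compiled ``_ext`` NMS op):
+        >>> boxes = torch.tensor([[0., 0., 10., 10.],
+        ...                       [1., 1., 11., 11.],
+        ...                       [20., 20., 30., 30.]])
+        >>> scores = torch.tensor([0.9, 0.8, 0.7])
+        >>> idxs = torch.tensor([0, 0, 1])
+        >>> dets, keep = batched_nms(boxes, scores, idxs,
+        ...                          dict(type='nms', iou_threshold=0.5))
+        >>> dets.shape[1]  # kept boxes with their scores appended
+        5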
+ """ + nms_cfg_ = nms_cfg.copy() + class_agnostic = nms_cfg_.pop('class_agnostic', class_agnostic) + if class_agnostic: + boxes_for_nms = boxes + else: + max_coordinate = boxes.max() + offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes)) + boxes_for_nms = boxes + offsets[:, None] + + nms_type = nms_cfg_.pop('type', 'nms') + nms_op = eval(nms_type) + + split_thr = nms_cfg_.pop('split_thr', 10000) + # Won't split to multiple nms nodes when exporting to onnx + if boxes_for_nms.shape[0] < split_thr or torch.onnx.is_in_onnx_export(): + dets, keep = nms_op(boxes_for_nms, scores, **nms_cfg_) + boxes = boxes[keep] + # -1 indexing works abnormal in TensorRT + # This assumes `dets` has 5 dimensions where + # the last dimension is score. + # TODO: more elegant way to handle the dimension issue. + # Some type of nms would reweight the score, such as SoftNMS + scores = dets[:, 4] + else: + max_num = nms_cfg_.pop('max_num', -1) + total_mask = scores.new_zeros(scores.size(), dtype=torch.bool) + # Some type of nms would reweight the score, such as SoftNMS + scores_after_nms = scores.new_zeros(scores.size()) + for id in torch.unique(idxs): + mask = (idxs == id).nonzero(as_tuple=False).view(-1) + dets, keep = nms_op(boxes_for_nms[mask], scores[mask], **nms_cfg_) + total_mask[mask[keep]] = True + scores_after_nms[mask[keep]] = dets[:, -1] + keep = total_mask.nonzero(as_tuple=False).view(-1) + + scores, inds = scores_after_nms[keep].sort(descending=True) + keep = keep[inds] + boxes = boxes[keep] + + if max_num > 0: + keep = keep[:max_num] + boxes = boxes[:max_num] + scores = scores[:max_num] + + return torch.cat([boxes, scores[:, None]], -1), keep + + +def nms_match(dets, iou_threshold): + """Matched dets into different groups by NMS. + + NMS match is Similar to NMS but when a bbox is suppressed, nms match will + record the indice of suppressed bbox and form a group with the indice of + kept bbox. In each group, indice is sorted as score order. + + Arguments: + dets (torch.Tensor | np.ndarray): Det boxes with scores, shape (N, 5). + iou_thr (float): IoU thresh for NMS. + + Returns: + List[torch.Tensor | np.ndarray]: The outer list corresponds different + matched group, the inner Tensor corresponds the indices for a group + in score order. + """ + if dets.shape[0] == 0: + matched = [] + else: + assert dets.shape[-1] == 5, 'inputs dets.shape should be (N, 5), ' \ + f'but get {dets.shape}' + if isinstance(dets, torch.Tensor): + dets_t = dets.detach().cpu() + else: + dets_t = torch.from_numpy(dets) + indata_list = [dets_t] + indata_dict = {'iou_threshold': float(iou_threshold)} + matched = ext_module.nms_match(*indata_list, **indata_dict) + if torch.__version__ == 'parrots': + matched = matched.tolist() + + if isinstance(dets, torch.Tensor): + return [dets.new_tensor(m, dtype=torch.long) for m in matched] + else: + return [np.array(m, dtype=np.int) for m in matched] + + +def nms_rotated(dets, scores, iou_threshold, labels=None): + """Performs non-maximum suppression (NMS) on the rotated boxes according to + their intersection-over-union (IoU). + + Rotated NMS iteratively removes lower scoring rotated boxes which have an + IoU greater than iou_threshold with another (higher scoring) rotated box. + + Args: + boxes (Tensor): Rotated boxes in shape (N, 5). They are expected to \ + be in (x_ctr, y_ctr, width, height, angle_radian) format. + scores (Tensor): scores in shape (N, ). + iou_threshold (float): IoU thresh for NMS. + labels (Tensor): boxes' label in shape (N,). 
+
+    Returns:
+        tuple: kept dets (boxes and scores) and indices, which are always of \
+            the same data type as the input.
+    """
+    if dets.shape[0] == 0:
+        return dets, None
+    multi_label = labels is not None
+    if multi_label:
+        dets_wl = torch.cat((dets, labels.unsqueeze(1)), 1)
+    else:
+        dets_wl = dets
+    _, order = scores.sort(0, descending=True)
+    dets_sorted = dets_wl.index_select(0, order)
+
+    if torch.__version__ == 'parrots':
+        keep_inds = ext_module.nms_rotated(
+            dets_wl,
+            scores,
+            order,
+            dets_sorted,
+            iou_threshold=iou_threshold,
+            multi_label=multi_label)
+    else:
+        keep_inds = ext_module.nms_rotated(dets_wl, scores, order, dets_sorted,
+                                           iou_threshold, multi_label)
+    dets = torch.cat((dets[keep_inds], scores[keep_inds].reshape(-1, 1)),
+                     dim=1)
+    return dets, keep_inds
diff --git a/custom_mmpkg/custom_mmcv/ops/pixel_group.py b/custom_mmpkg/custom_mmcv/ops/pixel_group.py
new file mode 100644
index 0000000000000000000000000000000000000000..2143c75f835a467c802fc3c37ecd3ac0f85bcda4
--- /dev/null
+++ b/custom_mmpkg/custom_mmcv/ops/pixel_group.py
@@ -0,0 +1,75 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['pixel_group'])
+
+
+def pixel_group(score, mask, embedding, kernel_label, kernel_contour,
+                kernel_region_num, distance_threshold):
+    """Group pixels into text instances, which is widely used in text
+    detection methods.
+
+    Arguments:
+        score (np.array or Tensor): The foreground score with size hxw.
+        mask (np.array or Tensor): The foreground mask with size hxw.
+        embedding (np.array or Tensor): The embedding with size hxwxc to
+            distinguish instances.
+        kernel_label (np.array or Tensor): The instance kernel index with
+            size hxw.
+        kernel_contour (np.array or Tensor): The kernel contour with size hxw.
+        kernel_region_num (int): The instance kernel region number.
+        distance_threshold (float): The embedding distance threshold between
+            kernel and pixel in one instance.
+
+    Returns:
+        pixel_assignment (List[List[float]]): The instance coordinate list.
+            Each element consists of averaged confidence, pixel number, and
+            coordinates (x_i, y_i for all pixels) in order.
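+
+    Note:
+        The grouping itself runs in the compiled ``_ext`` extension;
+        ``np.ndarray`` inputs are converted to tensors before the call.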
+ """ + assert isinstance(score, (torch.Tensor, np.ndarray)) + assert isinstance(mask, (torch.Tensor, np.ndarray)) + assert isinstance(embedding, (torch.Tensor, np.ndarray)) + assert isinstance(kernel_label, (torch.Tensor, np.ndarray)) + assert isinstance(kernel_contour, (torch.Tensor, np.ndarray)) + assert isinstance(kernel_region_num, int) + assert isinstance(distance_threshold, float) + + if isinstance(score, np.ndarray): + score = torch.from_numpy(score) + if isinstance(mask, np.ndarray): + mask = torch.from_numpy(mask) + if isinstance(embedding, np.ndarray): + embedding = torch.from_numpy(embedding) + if isinstance(kernel_label, np.ndarray): + kernel_label = torch.from_numpy(kernel_label) + if isinstance(kernel_contour, np.ndarray): + kernel_contour = torch.from_numpy(kernel_contour) + + if torch.__version__ == 'parrots': + label = ext_module.pixel_group( + score, + mask, + embedding, + kernel_label, + kernel_contour, + kernel_region_num=kernel_region_num, + distance_threshold=distance_threshold) + label = label.tolist() + label = label[0] + list_index = kernel_region_num + pixel_assignment = [] + for x in range(kernel_region_num): + pixel_assignment.append( + np.array( + label[list_index:list_index + int(label[x])], + dtype=np.float)) + list_index = list_index + int(label[x]) + else: + pixel_assignment = ext_module.pixel_group(score, mask, embedding, + kernel_label, kernel_contour, + kernel_region_num, + distance_threshold) + return pixel_assignment diff --git a/custom_mmpkg/custom_mmcv/ops/point_sample.py b/custom_mmpkg/custom_mmcv/ops/point_sample.py new file mode 100644 index 0000000000000000000000000000000000000000..0f09ce3ce366b9f5050f04a5f611a338484b30e7 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/point_sample.py @@ -0,0 +1,336 @@ +# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend # noqa + +from os import path as osp + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.nn.modules.utils import _pair +from torch.onnx.operators import shape_as_tensor + + +def bilinear_grid_sample(im, grid, align_corners=False): + """Given an input and a flow-field grid, computes the output using input + values and pixel locations from grid. Supported only bilinear interpolation + method to sample the input pixels. + + Args: + im (torch.Tensor): Input feature map, shape (N, C, H, W) + grid (torch.Tensor): Point coordinates, shape (N, Hg, Wg, 2) + align_corners {bool}: If set to True, the extrema (-1 and 1) are + considered as referring to the center points of the input’s + corner pixels. If set to False, they are instead considered as + referring to the corner points of the input’s corner pixels, + making the sampling more resolution agnostic. 
+ Returns: + torch.Tensor: A tensor with sampled points, shape (N, C, Hg, Wg) + """ + n, c, h, w = im.shape + gn, gh, gw, _ = grid.shape + assert n == gn + + x = grid[:, :, :, 0] + y = grid[:, :, :, 1] + + if align_corners: + x = ((x + 1) / 2) * (w - 1) + y = ((y + 1) / 2) * (h - 1) + else: + x = ((x + 1) * w - 1) / 2 + y = ((y + 1) * h - 1) / 2 + + x = x.view(n, -1) + y = y.view(n, -1) + + x0 = torch.floor(x).long() + y0 = torch.floor(y).long() + x1 = x0 + 1 + y1 = y0 + 1 + + wa = ((x1 - x) * (y1 - y)).unsqueeze(1) + wb = ((x1 - x) * (y - y0)).unsqueeze(1) + wc = ((x - x0) * (y1 - y)).unsqueeze(1) + wd = ((x - x0) * (y - y0)).unsqueeze(1) + + # Apply default for grid_sample function zero padding + im_padded = F.pad(im, pad=[1, 1, 1, 1], mode='constant', value=0) + padded_h = h + 2 + padded_w = w + 2 + # save points positions after padding + x0, x1, y0, y1 = x0 + 1, x1 + 1, y0 + 1, y1 + 1 + + # Clip coordinates to padded image size + x0 = torch.where(x0 < 0, torch.tensor(0), x0) + x0 = torch.where(x0 > padded_w - 1, torch.tensor(padded_w - 1), x0) + x1 = torch.where(x1 < 0, torch.tensor(0), x1) + x1 = torch.where(x1 > padded_w - 1, torch.tensor(padded_w - 1), x1) + y0 = torch.where(y0 < 0, torch.tensor(0), y0) + y0 = torch.where(y0 > padded_h - 1, torch.tensor(padded_h - 1), y0) + y1 = torch.where(y1 < 0, torch.tensor(0), y1) + y1 = torch.where(y1 > padded_h - 1, torch.tensor(padded_h - 1), y1) + + im_padded = im_padded.view(n, c, -1) + + x0_y0 = (x0 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1) + x0_y1 = (x0 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1) + x1_y0 = (x1 + y0 * padded_w).unsqueeze(1).expand(-1, c, -1) + x1_y1 = (x1 + y1 * padded_w).unsqueeze(1).expand(-1, c, -1) + + Ia = torch.gather(im_padded, 2, x0_y0) + Ib = torch.gather(im_padded, 2, x0_y1) + Ic = torch.gather(im_padded, 2, x1_y0) + Id = torch.gather(im_padded, 2, x1_y1) + + return (Ia * wa + Ib * wb + Ic * wc + Id * wd).reshape(n, c, gh, gw) + + +def is_in_onnx_export_without_custom_ops(): + from custom_mmpkg.custom_mmcv.ops import get_onnxruntime_op_path + ort_custom_op_path = get_onnxruntime_op_path() + return torch.onnx.is_in_onnx_export( + ) and not osp.exists(ort_custom_op_path) + + +def normalize(grid): + """Normalize input grid from [-1, 1] to [0, 1] + Args: + grid (Tensor): The grid to be normalize, range [-1, 1]. + Returns: + Tensor: Normalized grid, range [0, 1]. + """ + + return (grid + 1.0) / 2.0 + + +def denormalize(grid): + """Denormalize input grid from range [0, 1] to [-1, 1] + Args: + grid (Tensor): The grid to be denormalize, range [0, 1]. + Returns: + Tensor: Denormalized grid, range [-1, 1]. + """ + + return grid * 2.0 - 1.0 + + +def generate_grid(num_grid, size, device): + """Generate regular square grid of points in [0, 1] x [0, 1] coordinate + space. + + Args: + num_grid (int): The number of grids to sample, one for each region. + size (tuple(int, int)): The side size of the regular grid. + device (torch.device): Desired device of returned tensor. + + Returns: + (torch.Tensor): A tensor of shape (num_grid, size[0]*size[1], 2) that + contains coordinates for the regular grids. + """ + + affine_trans = torch.tensor([[[1., 0., 0.], [0., 1., 0.]]], device=device) + grid = F.affine_grid( + affine_trans, torch.Size((1, 1, *size)), align_corners=False) + grid = normalize(grid) + return grid.view(1, -1, 2).expand(num_grid, -1, -1) + + +def rel_roi_point_to_abs_img_point(rois, rel_roi_points): + """Convert roi based relative point coordinates to image based absolute + point coordinates. 
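+
+    For each RoI ``(x1, y1, x2, y2)``, a relative point ``(u, v)`` in
+    ``[0, 1] x [0, 1]`` is mapped (as implemented below) to
+    ``(x1 + u * (x2 - x1), y1 + v * (y2 - y1))`` in image coordinates.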
+ + Args: + rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5) + rel_roi_points (Tensor): Point coordinates inside RoI, relative to + RoI, location, range (0, 1), shape (N, P, 2) + Returns: + Tensor: Image based absolute point coordinates, shape (N, P, 2) + """ + + with torch.no_grad(): + assert rel_roi_points.size(0) == rois.size(0) + assert rois.dim() == 2 + assert rel_roi_points.dim() == 3 + assert rel_roi_points.size(2) == 2 + # remove batch idx + if rois.size(1) == 5: + rois = rois[:, 1:] + abs_img_points = rel_roi_points.clone() + # To avoid an error during exporting to onnx use independent + # variables instead inplace computation + xs = abs_img_points[:, :, 0] * (rois[:, None, 2] - rois[:, None, 0]) + ys = abs_img_points[:, :, 1] * (rois[:, None, 3] - rois[:, None, 1]) + xs += rois[:, None, 0] + ys += rois[:, None, 1] + abs_img_points = torch.stack([xs, ys], dim=2) + return abs_img_points + + +def get_shape_from_feature_map(x): + """Get spatial resolution of input feature map considering exporting to + onnx mode. + + Args: + x (torch.Tensor): Input tensor, shape (N, C, H, W) + Returns: + torch.Tensor: Spatial resolution (width, height), shape (1, 1, 2) + """ + if torch.onnx.is_in_onnx_export(): + img_shape = shape_as_tensor(x)[2:].flip(0).view(1, 1, 2).to( + x.device).float() + else: + img_shape = torch.tensor(x.shape[2:]).flip(0).view(1, 1, 2).to( + x.device).float() + return img_shape + + +def abs_img_point_to_rel_img_point(abs_img_points, img, spatial_scale=1.): + """Convert image based absolute point coordinates to image based relative + coordinates for sampling. + + Args: + abs_img_points (Tensor): Image based absolute point coordinates, + shape (N, P, 2) + img (tuple/Tensor): (height, width) of image or feature map. + spatial_scale (float): Scale points by this factor. Default: 1. + + Returns: + Tensor: Image based relative point coordinates for sampling, + shape (N, P, 2) + """ + + assert (isinstance(img, tuple) and len(img) == 2) or \ + (isinstance(img, torch.Tensor) and len(img.shape) == 4) + + if isinstance(img, tuple): + h, w = img + scale = torch.tensor([w, h], + dtype=torch.float, + device=abs_img_points.device) + scale = scale.view(1, 1, 2) + else: + scale = get_shape_from_feature_map(img) + + return abs_img_points / scale * spatial_scale + + +def rel_roi_point_to_rel_img_point(rois, + rel_roi_points, + img, + spatial_scale=1.): + """Convert roi based relative point coordinates to image based absolute + point coordinates. + + Args: + rois (Tensor): RoIs or BBoxes, shape (N, 4) or (N, 5) + rel_roi_points (Tensor): Point coordinates inside RoI, relative to + RoI, location, range (0, 1), shape (N, P, 2) + img (tuple/Tensor): (height, width) of image or feature map. + spatial_scale (float): Scale points by this factor. Default: 1. + + Returns: + Tensor: Image based relative point coordinates for sampling, + shape (N, P, 2) + """ + + abs_img_point = rel_roi_point_to_abs_img_point(rois, rel_roi_points) + rel_img_point = abs_img_point_to_rel_img_point(abs_img_point, img, + spatial_scale) + + return rel_img_point + + +def point_sample(input, points, align_corners=False, **kwargs): + """A wrapper around :func:`grid_sample` to support 3D point_coords tensors + Unlike :func:`torch.nn.functional.grid_sample` it assumes point_coords to + lie inside ``[0, 1] x [0, 1]`` square. + + Args: + input (Tensor): Feature map, shape (N, C, H, W). + points (Tensor): Image based absolute point coordinates (normalized), + range [0, 1] x [0, 1], shape (N, P, 2) or (N, Hgrid, Wgrid, 2). 
+ align_corners (bool): Whether align_corners. Default: False + + Returns: + Tensor: Features of `point` on `input`, shape (N, C, P) or + (N, C, Hgrid, Wgrid). + """ + + add_dim = False + if points.dim() == 3: + add_dim = True + points = points.unsqueeze(2) + if is_in_onnx_export_without_custom_ops(): + # If custom ops for onnx runtime not compiled use python + # implementation of grid_sample function to make onnx graph + # with supported nodes + output = bilinear_grid_sample( + input, denormalize(points), align_corners=align_corners) + else: + output = F.grid_sample( + input, denormalize(points), align_corners=align_corners, **kwargs) + if add_dim: + output = output.squeeze(3) + return output + + +class SimpleRoIAlign(nn.Module): + + def __init__(self, output_size, spatial_scale, aligned=True): + """Simple RoI align in PointRend, faster than standard RoIAlign. + + Args: + output_size (tuple[int]): h, w + spatial_scale (float): scale the input boxes by this number + aligned (bool): if False, use the legacy implementation in + MMDetection, align_corners=True will be used in F.grid_sample. + If True, align the results more perfectly. + """ + + super(SimpleRoIAlign, self).__init__() + self.output_size = _pair(output_size) + self.spatial_scale = float(spatial_scale) + # to be consistent with other RoI ops + self.use_torchvision = False + self.aligned = aligned + + def forward(self, features, rois): + num_imgs = features.size(0) + num_rois = rois.size(0) + rel_roi_points = generate_grid( + num_rois, self.output_size, device=rois.device) + + if torch.onnx.is_in_onnx_export(): + rel_img_points = rel_roi_point_to_rel_img_point( + rois, rel_roi_points, features, self.spatial_scale) + rel_img_points = rel_img_points.reshape(num_imgs, -1, + *rel_img_points.shape[1:]) + point_feats = point_sample( + features, rel_img_points, align_corners=not self.aligned) + point_feats = point_feats.transpose(1, 2) + else: + point_feats = [] + for batch_ind in range(num_imgs): + # unravel batch dim + feat = features[batch_ind].unsqueeze(0) + inds = (rois[:, 0].long() == batch_ind) + if inds.any(): + rel_img_points = rel_roi_point_to_rel_img_point( + rois[inds], rel_roi_points[inds], feat, + self.spatial_scale).unsqueeze(0) + point_feat = point_sample( + feat, rel_img_points, align_corners=not self.aligned) + point_feat = point_feat.squeeze(0).transpose(0, 1) + point_feats.append(point_feat) + + point_feats = torch.cat(point_feats, dim=0) + + channels = features.size(1) + roi_feats = point_feats.reshape(num_rois, channels, *self.output_size) + + return roi_feats + + def __repr__(self): + format_str = self.__class__.__name__ + format_str += '(output_size={}, spatial_scale={}'.format( + self.output_size, self.spatial_scale) + return format_str diff --git a/custom_mmpkg/custom_mmcv/ops/points_in_boxes.py b/custom_mmpkg/custom_mmcv/ops/points_in_boxes.py new file mode 100644 index 0000000000000000000000000000000000000000..4003173a53052161dbcd687a2fa1d755642fdab8 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/points_in_boxes.py @@ -0,0 +1,133 @@ +import torch + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'points_in_boxes_part_forward', 'points_in_boxes_cpu_forward', + 'points_in_boxes_all_forward' +]) + + +def points_in_boxes_part(points, boxes): + """Find the box in which each point is (CUDA). 
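+
+    A minimal illustrative call (random values, CUDA required; shapes follow
+    the argument description below)::
+
+        import torch
+
+        points = torch.rand(1, 16, 3).cuda()
+        boxes = torch.rand(1, 4, 7).cuda()
+        idx = points_in_boxes_part(points, boxes)  # (1, 16), -1 = background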
+ + Args: + points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate + boxes (torch.Tensor): [B, T, 7], + num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz] in + LiDAR/DEPTH coordinate, (x, y, z) is the bottom center + + Returns: + box_idxs_of_pts (torch.Tensor): (B, M), default background = -1 + """ + assert points.shape[0] == boxes.shape[0], \ + 'Points and boxes should have the same batch size, ' \ + f'but got {points.shape[0]} and {boxes.shape[0]}' + assert boxes.shape[2] == 7, \ + 'boxes dimension should be 7, ' \ + f'but got unexpected shape {boxes.shape[2]}' + assert points.shape[2] == 3, \ + 'points dimension should be 3, ' \ + f'but got unexpected shape {points.shape[2]}' + batch_size, num_points, _ = points.shape + + box_idxs_of_pts = points.new_zeros((batch_size, num_points), + dtype=torch.int).fill_(-1) + + # If manually put the tensor 'points' or 'boxes' on a device + # which is not the current device, some temporary variables + # will be created on the current device in the cuda op, + # and the output will be incorrect. + # Therefore, we force the current device to be the same + # as the device of the tensors if it was not. + # Please refer to https://github.com/open-mmlab/mmdetection3d/issues/305 + # for the incorrect output before the fix. + points_device = points.get_device() + assert points_device == boxes.get_device(), \ + 'Points and boxes should be put on the same device' + if torch.cuda.current_device() != points_device: + torch.cuda.set_device(points_device) + + ext_module.points_in_boxes_part_forward(boxes.contiguous(), + points.contiguous(), + box_idxs_of_pts) + + return box_idxs_of_pts + + +def points_in_boxes_cpu(points, boxes): + """Find all boxes in which each point is (CPU). The CPU version of + :meth:`points_in_boxes_all`. + + Args: + points (torch.Tensor): [B, M, 3], [x, y, z] in + LiDAR/DEPTH coordinate + boxes (torch.Tensor): [B, T, 7], + num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], + (x, y, z) is the bottom center. + + Returns: + box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0. + """ + assert points.shape[0] == boxes.shape[0], \ + 'Points and boxes should have the same batch size, ' \ + f'but got {points.shape[0]} and {boxes.shape[0]}' + assert boxes.shape[2] == 7, \ + 'boxes dimension should be 7, ' \ + f'but got unexpected shape {boxes.shape[2]}' + assert points.shape[2] == 3, \ + 'points dimension should be 3, ' \ + f'but got unexpected shape {points.shape[2]}' + batch_size, num_points, _ = points.shape + num_boxes = boxes.shape[1] + + point_indices = points.new_zeros((batch_size, num_boxes, num_points), + dtype=torch.int) + for b in range(batch_size): + ext_module.points_in_boxes_cpu_forward(boxes[b].float().contiguous(), + points[b].float().contiguous(), + point_indices[b]) + point_indices = point_indices.transpose(1, 2) + + return point_indices + + +def points_in_boxes_all(points, boxes): + """Find all boxes in which each point is (CUDA). + + Args: + points (torch.Tensor): [B, M, 3], [x, y, z] in LiDAR/DEPTH coordinate + boxes (torch.Tensor): [B, T, 7], + num_valid_boxes <= T, [x, y, z, x_size, y_size, z_size, rz], + (x, y, z) is the bottom center. + + Returns: + box_idxs_of_pts (torch.Tensor): (B, M, T), default background = 0. 
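+
+    Example:
+        An illustrative call on random data (CUDA required; the box layout
+        follows the argument description above)::
+
+            import torch
+
+            points = torch.rand(1, 32, 3).cuda()
+            boxes = torch.rand(1, 8, 7).cuda()
+            flags = points_in_boxes_all(points, boxes)  # (1, 32, 8), 0/1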
+ """ + assert boxes.shape[0] == points.shape[0], \ + 'Points and boxes should have the same batch size, ' \ + f'but got {boxes.shape[0]} and {boxes.shape[0]}' + assert boxes.shape[2] == 7, \ + 'boxes dimension should be 7, ' \ + f'but got unexpected shape {boxes.shape[2]}' + assert points.shape[2] == 3, \ + 'points dimension should be 3, ' \ + f'but got unexpected shape {points.shape[2]}' + batch_size, num_points, _ = points.shape + num_boxes = boxes.shape[1] + + box_idxs_of_pts = points.new_zeros((batch_size, num_points, num_boxes), + dtype=torch.int).fill_(0) + + # Same reason as line 25-32 + points_device = points.get_device() + assert points_device == boxes.get_device(), \ + 'Points and boxes should be put on the same device' + if torch.cuda.current_device() != points_device: + torch.cuda.set_device(points_device) + + ext_module.points_in_boxes_all_forward(boxes.contiguous(), + points.contiguous(), + box_idxs_of_pts) + + return box_idxs_of_pts diff --git a/custom_mmpkg/custom_mmcv/ops/points_sampler.py b/custom_mmpkg/custom_mmcv/ops/points_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..1df321530990289ebfe426434635351b3687dce6 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/points_sampler.py @@ -0,0 +1,177 @@ +from typing import List + +import torch +from torch import nn as nn + +from custom_mmpkg.custom_mmcv.runner import force_fp32 +from .furthest_point_sample import (furthest_point_sample, + furthest_point_sample_with_dist) + + +def calc_square_dist(point_feat_a, point_feat_b, norm=True): + """Calculating square distance between a and b. + + Args: + point_feat_a (Tensor): (B, N, C) Feature vector of each point. + point_feat_b (Tensor): (B, M, C) Feature vector of each point. + norm (Bool, optional): Whether to normalize the distance. + Default: True. + + Returns: + Tensor: (B, N, M) Distance between each pair points. + """ + num_channel = point_feat_a.shape[-1] + # [bs, n, 1] + a_square = torch.sum(point_feat_a.unsqueeze(dim=2).pow(2), dim=-1) + # [bs, 1, m] + b_square = torch.sum(point_feat_b.unsqueeze(dim=1).pow(2), dim=-1) + + corr_matrix = torch.matmul(point_feat_a, point_feat_b.transpose(1, 2)) + + dist = a_square + b_square - 2 * corr_matrix + if norm: + dist = torch.sqrt(dist) / num_channel + return dist + + +def get_sampler_cls(sampler_type): + """Get the type and mode of points sampler. + + Args: + sampler_type (str): The type of points sampler. + The valid value are "D-FPS", "F-FPS", or "FS". + + Returns: + class: Points sampler type. + """ + sampler_mappings = { + 'D-FPS': DFPSSampler, + 'F-FPS': FFPSSampler, + 'FS': FSSampler, + } + try: + return sampler_mappings[sampler_type] + except KeyError: + raise KeyError( + f'Supported `sampler_type` are {sampler_mappings.keys()}, but got \ + {sampler_type}') + + +class PointsSampler(nn.Module): + """Points sampling. + + Args: + num_point (list[int]): Number of sample points. + fps_mod_list (list[str], optional): Type of FPS method, valid mod + ['F-FPS', 'D-FPS', 'FS'], Default: ['D-FPS']. + F-FPS: using feature distances for FPS. + D-FPS: using Euclidean distances of points for FPS. + FS: using F-FPS and D-FPS simultaneously. + fps_sample_range_list (list[int], optional): + Range of points to apply FPS. Default: [-1]. 
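+
+    Example:
+        A sketch of a sampler that applies D-FPS to the whole point set
+        (values are illustrative; the compiled CUDA ops backing
+        ``furthest_point_sample`` must be available)::
+
+            import torch
+
+            sampler = PointsSampler(num_point=[128],
+                                    fps_mod_list=['D-FPS'],
+                                    fps_sample_range_list=[-1])
+            points_xyz = torch.rand(2, 1024, 3).cuda()
+            features = torch.rand(2, 16, 1024).cuda()
+            indices = sampler(points_xyz, features)  # (2, 128)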
+ """ + + def __init__(self, + num_point: List[int], + fps_mod_list: List[str] = ['D-FPS'], + fps_sample_range_list: List[int] = [-1]): + super().__init__() + # FPS would be applied to different fps_mod in the list, + # so the length of the num_point should be equal to + # fps_mod_list and fps_sample_range_list. + assert len(num_point) == len(fps_mod_list) == len( + fps_sample_range_list) + self.num_point = num_point + self.fps_sample_range_list = fps_sample_range_list + self.samplers = nn.ModuleList() + for fps_mod in fps_mod_list: + self.samplers.append(get_sampler_cls(fps_mod)()) + self.fp16_enabled = False + + @force_fp32() + def forward(self, points_xyz, features): + """ + Args: + points_xyz (Tensor): (B, N, 3) xyz coordinates of the features. + features (Tensor): (B, C, N) Descriptors of the features. + + Returns: + Tensor: (B, npoint, sample_num) Indices of sampled points. + """ + indices = [] + last_fps_end_index = 0 + + for fps_sample_range, sampler, npoint in zip( + self.fps_sample_range_list, self.samplers, self.num_point): + assert fps_sample_range < points_xyz.shape[1] + + if fps_sample_range == -1: + sample_points_xyz = points_xyz[:, last_fps_end_index:] + if features is not None: + sample_features = features[:, :, last_fps_end_index:] + else: + sample_features = None + else: + sample_points_xyz = \ + points_xyz[:, last_fps_end_index:fps_sample_range] + if features is not None: + sample_features = features[:, :, last_fps_end_index: + fps_sample_range] + else: + sample_features = None + + fps_idx = sampler(sample_points_xyz.contiguous(), sample_features, + npoint) + + indices.append(fps_idx + last_fps_end_index) + last_fps_end_index += fps_sample_range + indices = torch.cat(indices, dim=1) + + return indices + + +class DFPSSampler(nn.Module): + """Using Euclidean distances of points for FPS.""" + + def __init__(self): + super().__init__() + + def forward(self, points, features, npoint): + """Sampling points with D-FPS.""" + fps_idx = furthest_point_sample(points.contiguous(), npoint) + return fps_idx + + +class FFPSSampler(nn.Module): + """Using feature distances for FPS.""" + + def __init__(self): + super().__init__() + + def forward(self, points, features, npoint): + """Sampling points with F-FPS.""" + assert features is not None, \ + 'feature input to FFPS_Sampler should not be None' + features_for_fps = torch.cat([points, features.transpose(1, 2)], dim=2) + features_dist = calc_square_dist( + features_for_fps, features_for_fps, norm=False) + fps_idx = furthest_point_sample_with_dist(features_dist, npoint) + return fps_idx + + +class FSSampler(nn.Module): + """Using F-FPS and D-FPS simultaneously.""" + + def __init__(self): + super().__init__() + + def forward(self, points, features, npoint): + """Sampling points with FS_Sampling.""" + assert features is not None, \ + 'feature input to FS_Sampler should not be None' + ffps_sampler = FFPSSampler() + dfps_sampler = DFPSSampler() + fps_idx_ffps = ffps_sampler(points, features, npoint) + fps_idx_dfps = dfps_sampler(points, features, npoint) + fps_idx = torch.cat([fps_idx_ffps, fps_idx_dfps], dim=1) + return fps_idx diff --git a/custom_mmpkg/custom_mmcv/ops/psa_mask.py b/custom_mmpkg/custom_mmcv/ops/psa_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..cdf14e62b50e8d4dd6856c94333c703bcc4c9ab6 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/psa_mask.py @@ -0,0 +1,92 @@ +# Modified from https://github.com/hszhao/semseg/blob/master/lib/psa +from torch import nn +from torch.autograd import Function +from 
torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', + ['psamask_forward', 'psamask_backward']) + + +class PSAMaskFunction(Function): + + @staticmethod + def symbolic(g, input, psa_type, mask_size): + return g.op( + 'mmcv::MMCVPSAMask', + input, + psa_type_i=psa_type, + mask_size_i=mask_size) + + @staticmethod + def forward(ctx, input, psa_type, mask_size): + ctx.psa_type = psa_type + ctx.mask_size = _pair(mask_size) + ctx.save_for_backward(input) + + h_mask, w_mask = ctx.mask_size + batch_size, channels, h_feature, w_feature = input.size() + assert channels == h_mask * w_mask + output = input.new_zeros( + (batch_size, h_feature * w_feature, h_feature, w_feature)) + + ext_module.psamask_forward( + input, + output, + psa_type=psa_type, + num_=batch_size, + h_feature=h_feature, + w_feature=w_feature, + h_mask=h_mask, + w_mask=w_mask, + half_h_mask=(h_mask - 1) // 2, + half_w_mask=(w_mask - 1) // 2) + return output + + @staticmethod + def backward(ctx, grad_output): + input = ctx.saved_tensors[0] + psa_type = ctx.psa_type + h_mask, w_mask = ctx.mask_size + batch_size, channels, h_feature, w_feature = input.size() + grad_input = grad_output.new_zeros( + (batch_size, channels, h_feature, w_feature)) + ext_module.psamask_backward( + grad_output, + grad_input, + psa_type=psa_type, + num_=batch_size, + h_feature=h_feature, + w_feature=w_feature, + h_mask=h_mask, + w_mask=w_mask, + half_h_mask=(h_mask - 1) // 2, + half_w_mask=(w_mask - 1) // 2) + return grad_input, None, None, None + + +psa_mask = PSAMaskFunction.apply + + +class PSAMask(nn.Module): + + def __init__(self, psa_type, mask_size=None): + super(PSAMask, self).__init__() + assert psa_type in ['collect', 'distribute'] + if psa_type == 'collect': + psa_type_enum = 0 + else: + psa_type_enum = 1 + self.psa_type_enum = psa_type_enum + self.mask_size = mask_size + self.psa_type = psa_type + + def forward(self, input): + return psa_mask(input, self.psa_type_enum, self.mask_size) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(psa_type={self.psa_type}, ' + s += f'mask_size={self.mask_size})' + return s diff --git a/custom_mmpkg/custom_mmcv/ops/roi_align.py b/custom_mmpkg/custom_mmcv/ops/roi_align.py new file mode 100644 index 0000000000000000000000000000000000000000..0755aefc66e67233ceae0f4b77948301c443e9fb --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/roi_align.py @@ -0,0 +1,223 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
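+#
+# Quick usage sketch for the RoIAlign module defined in this file (shapes and
+# values below are illustrative assumptions; the compiled `_ext` extension is
+# required unless `use_torchvision=True`):
+#
+#     roi_layer = RoIAlign(output_size=(7, 7), spatial_scale=1 / 16.)
+#     feats = torch.rand(2, 256, 64, 64)
+#     rois = torch.tensor([[0., 16., 16., 48., 48.]])  # (batch_idx, x1, y1, x2, y2)
+#     out = roi_layer(feats, rois)  # -> (1, 256, 7, 7)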
+import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import deprecated_api_warning, ext_loader + +ext_module = ext_loader.load_ext('_ext', + ['roi_align_forward', 'roi_align_backward']) + + +class RoIAlignFunction(Function): + + @staticmethod + def symbolic(g, input, rois, output_size, spatial_scale, sampling_ratio, + pool_mode, aligned): + from ..onnx import is_custom_op_loaded + has_custom_op = is_custom_op_loaded() + if has_custom_op: + return g.op( + 'mmcv::MMCVRoiAlign', + input, + rois, + output_height_i=output_size[0], + output_width_i=output_size[1], + spatial_scale_f=spatial_scale, + sampling_ratio_i=sampling_ratio, + mode_s=pool_mode, + aligned_i=aligned) + else: + from torch.onnx.symbolic_opset9 import sub, squeeze + from torch.onnx.symbolic_helper import _slice_helper + from torch.onnx import TensorProtoDataType + # batch_indices = rois[:, 0].long() + batch_indices = _slice_helper( + g, rois, axes=[1], starts=[0], ends=[1]) + batch_indices = squeeze(g, batch_indices, 1) + batch_indices = g.op( + 'Cast', batch_indices, to_i=TensorProtoDataType.INT64) + # rois = rois[:, 1:] + rois = _slice_helper(g, rois, axes=[1], starts=[1], ends=[5]) + if aligned: + # rois -= 0.5/spatial_scale + aligned_offset = g.op( + 'Constant', + value_t=torch.tensor([0.5 / spatial_scale], + dtype=torch.float32)) + rois = sub(g, rois, aligned_offset) + # roi align + return g.op( + 'RoiAlign', + input, + rois, + batch_indices, + output_height_i=output_size[0], + output_width_i=output_size[1], + spatial_scale_f=spatial_scale, + sampling_ratio_i=max(0, sampling_ratio), + mode_s=pool_mode) + + @staticmethod + def forward(ctx, + input, + rois, + output_size, + spatial_scale=1.0, + sampling_ratio=0, + pool_mode='avg', + aligned=True): + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.sampling_ratio = sampling_ratio + assert pool_mode in ('max', 'avg') + ctx.pool_mode = 0 if pool_mode == 'max' else 1 + ctx.aligned = aligned + ctx.input_shape = input.size() + + assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' + + output_shape = (rois.size(0), input.size(1), ctx.output_size[0], + ctx.output_size[1]) + output = input.new_zeros(output_shape) + if ctx.pool_mode == 0: + argmax_y = input.new_zeros(output_shape) + argmax_x = input.new_zeros(output_shape) + else: + argmax_y = input.new_zeros(0) + argmax_x = input.new_zeros(0) + + ext_module.roi_align_forward( + input, + rois, + output, + argmax_y, + argmax_x, + aligned_height=ctx.output_size[0], + aligned_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + pool_mode=ctx.pool_mode, + aligned=ctx.aligned) + + ctx.save_for_backward(rois, argmax_y, argmax_x) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + rois, argmax_y, argmax_x = ctx.saved_tensors + grad_input = grad_output.new_zeros(ctx.input_shape) + # complex head architecture may cause grad_output uncontiguous. 
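+        # The underlying `_ext` backward kernel assumes densely packed
+        # (contiguous) memory, hence the explicit copy below.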
+ grad_output = grad_output.contiguous() + ext_module.roi_align_backward( + grad_output, + rois, + argmax_y, + argmax_x, + grad_input, + aligned_height=ctx.output_size[0], + aligned_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale, + sampling_ratio=ctx.sampling_ratio, + pool_mode=ctx.pool_mode, + aligned=ctx.aligned) + return grad_input, None, None, None, None, None, None + + +roi_align = RoIAlignFunction.apply + + +class RoIAlign(nn.Module): + """RoI align pooling layer. + + Args: + output_size (tuple): h, w + spatial_scale (float): scale the input boxes by this number + sampling_ratio (int): number of inputs samples to take for each + output sample. 0 to take samples densely for current models. + pool_mode (str, 'avg' or 'max'): pooling mode in each bin. + aligned (bool): if False, use the legacy implementation in + MMDetection. If True, align the results more perfectly. + use_torchvision (bool): whether to use roi_align from torchvision. + + Note: + The implementation of RoIAlign when aligned=True is modified from + https://github.com/facebookresearch/detectron2/ + + The meaning of aligned=True: + + Given a continuous coordinate c, its two neighboring pixel + indices (in our pixel model) are computed by floor(c - 0.5) and + ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete + indices [0] and [1] (which are sampled from the underlying signal + at continuous coordinates 0.5 and 1.5). But the original roi_align + (aligned=False) does not subtract the 0.5 when computing + neighboring pixel indices and therefore it uses pixels with a + slightly incorrect alignment (relative to our pixel model) when + performing bilinear interpolation. + + With `aligned=True`, + we first appropriately scale the ROI and then shift it by -0.5 + prior to calling roi_align. This produces the correct neighbors; + + The difference does not make a difference to the model's + performance if ROIAlign is used together with conv layers. + """ + + @deprecated_api_warning( + { + 'out_size': 'output_size', + 'sample_num': 'sampling_ratio' + }, + cls_name='RoIAlign') + def __init__(self, + output_size, + spatial_scale=1.0, + sampling_ratio=0, + pool_mode='avg', + aligned=True, + use_torchvision=False): + super(RoIAlign, self).__init__() + + self.output_size = _pair(output_size) + self.spatial_scale = float(spatial_scale) + self.sampling_ratio = int(sampling_ratio) + self.pool_mode = pool_mode + self.aligned = aligned + self.use_torchvision = use_torchvision + + def forward(self, input, rois): + """ + Args: + input: NCHW images + rois: Bx5 boxes. First column is the index into N.\ + The other 4 columns are xyxy. + """ + if self.use_torchvision: + from torchvision.ops import roi_align as tv_roi_align + if 'aligned' in tv_roi_align.__code__.co_varnames: + return tv_roi_align(input, rois, self.output_size, + self.spatial_scale, self.sampling_ratio, + self.aligned) + else: + if self.aligned: + rois -= rois.new_tensor([0.] 
+ + [0.5 / self.spatial_scale] * 4) + return tv_roi_align(input, rois, self.output_size, + self.spatial_scale, self.sampling_ratio) + else: + return roi_align(input, rois, self.output_size, self.spatial_scale, + self.sampling_ratio, self.pool_mode, self.aligned) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(output_size={self.output_size}, ' + s += f'spatial_scale={self.spatial_scale}, ' + s += f'sampling_ratio={self.sampling_ratio}, ' + s += f'pool_mode={self.pool_mode}, ' + s += f'aligned={self.aligned}, ' + s += f'use_torchvision={self.use_torchvision})' + return s diff --git a/custom_mmpkg/custom_mmcv/ops/roi_align_rotated.py b/custom_mmpkg/custom_mmcv/ops/roi_align_rotated.py new file mode 100644 index 0000000000000000000000000000000000000000..0ce4961a3555d4da8bc3e32f1f7d5ad50036587d --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/roi_align_rotated.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['roi_align_rotated_forward', 'roi_align_rotated_backward']) + + +class RoIAlignRotatedFunction(Function): + + @staticmethod + def symbolic(g, features, rois, out_size, spatial_scale, sample_num, + aligned, clockwise): + if isinstance(out_size, int): + out_h = out_size + out_w = out_size + elif isinstance(out_size, tuple): + assert len(out_size) == 2 + assert isinstance(out_size[0], int) + assert isinstance(out_size[1], int) + out_h, out_w = out_size + else: + raise TypeError( + '"out_size" must be an integer or tuple of integers') + return g.op( + 'mmcv::MMCVRoIAlignRotated', + features, + rois, + output_height_i=out_h, + output_width_i=out_h, + spatial_scale_f=spatial_scale, + sampling_ratio_i=sample_num, + aligned_i=aligned, + clockwise_i=clockwise) + + @staticmethod + def forward(ctx, + features, + rois, + out_size, + spatial_scale, + sample_num=0, + aligned=True, + clockwise=False): + if isinstance(out_size, int): + out_h = out_size + out_w = out_size + elif isinstance(out_size, tuple): + assert len(out_size) == 2 + assert isinstance(out_size[0], int) + assert isinstance(out_size[1], int) + out_h, out_w = out_size + else: + raise TypeError( + '"out_size" must be an integer or tuple of integers') + ctx.spatial_scale = spatial_scale + ctx.sample_num = sample_num + ctx.aligned = aligned + ctx.clockwise = clockwise + ctx.save_for_backward(rois) + ctx.feature_size = features.size() + + batch_size, num_channels, data_height, data_width = features.size() + num_rois = rois.size(0) + + output = features.new_zeros(num_rois, num_channels, out_h, out_w) + ext_module.roi_align_rotated_forward( + features, + rois, + output, + pooled_height=out_h, + pooled_width=out_w, + spatial_scale=spatial_scale, + sample_num=sample_num, + aligned=aligned, + clockwise=clockwise) + return output + + @staticmethod + def backward(ctx, grad_output): + feature_size = ctx.feature_size + spatial_scale = ctx.spatial_scale + aligned = ctx.aligned + clockwise = ctx.clockwise + sample_num = ctx.sample_num + rois = ctx.saved_tensors[0] + assert feature_size is not None + batch_size, num_channels, data_height, data_width = feature_size + + out_w = grad_output.size(3) + out_h = grad_output.size(2) + + grad_input = grad_rois = None + + if ctx.needs_input_grad[0]: + grad_input = rois.new_zeros(batch_size, num_channels, data_height, + data_width) + ext_module.roi_align_rotated_backward( + grad_output.contiguous(), + rois, + grad_input, + 
pooled_height=out_h, + pooled_width=out_w, + spatial_scale=spatial_scale, + sample_num=sample_num, + aligned=aligned, + clockwise=clockwise) + return grad_input, grad_rois, None, None, None, None, None + + +roi_align_rotated = RoIAlignRotatedFunction.apply + + +class RoIAlignRotated(nn.Module): + """RoI align pooling layer for rotated proposals. + + It accepts a feature map of shape (N, C, H, W) and rois with shape + (n, 6) with each roi decoded as (batch_index, center_x, center_y, + w, h, angle). The angle is in radian. + + Args: + out_size (tuple): h, w + spatial_scale (float): scale the input boxes by this number + sample_num (int): number of inputs samples to take for each + output sample. 0 to take samples densely for current models. + aligned (bool): if False, use the legacy implementation in + MMDetection. If True, align the results more perfectly. + Default: True. + clockwise (bool): If True, the angle in each proposal follows a + clockwise fashion in image space, otherwise, the angle is + counterclockwise. Default: False. + + Note: + The implementation of RoIAlign when aligned=True is modified from + https://github.com/facebookresearch/detectron2/ + + The meaning of aligned=True: + + Given a continuous coordinate c, its two neighboring pixel + indices (in our pixel model) are computed by floor(c - 0.5) and + ceil(c - 0.5). For example, c=1.3 has pixel neighbors with discrete + indices [0] and [1] (which are sampled from the underlying signal + at continuous coordinates 0.5 and 1.5). But the original roi_align + (aligned=False) does not subtract the 0.5 when computing + neighboring pixel indices and therefore it uses pixels with a + slightly incorrect alignment (relative to our pixel model) when + performing bilinear interpolation. + + With `aligned=True`, + we first appropriately scale the ROI and then shift it by -0.5 + prior to calling roi_align. This produces the correct neighbors; + + The difference does not make a difference to the model's + performance if ROIAlign is used together with conv layers. + """ + + def __init__(self, + out_size, + spatial_scale, + sample_num=0, + aligned=True, + clockwise=False): + super(RoIAlignRotated, self).__init__() + + self.out_size = out_size + self.spatial_scale = float(spatial_scale) + self.sample_num = int(sample_num) + self.aligned = aligned + self.clockwise = clockwise + + def forward(self, features, rois): + return RoIAlignRotatedFunction.apply(features, rois, self.out_size, + self.spatial_scale, + self.sample_num, self.aligned, + self.clockwise) diff --git a/custom_mmpkg/custom_mmcv/ops/roi_pool.py b/custom_mmpkg/custom_mmcv/ops/roi_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..d339d8f2941eabc1cbe181a9c6c5ab5ff4ff4e5f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/roi_pool.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
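+#
+# Quick usage sketch for the functional `roi_pool` interface defined in this
+# file (illustrative values only; the compiled `_ext` extension is required):
+#
+#     out = roi_pool(torch.rand(1, 8, 32, 32),
+#                    torch.tensor([[0., 0., 0., 16., 16.]]),
+#                    (7, 7), 1.0)  # -> (1, 8, 7, 7)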
+import torch +import torch.nn as nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', + ['roi_pool_forward', 'roi_pool_backward']) + + +class RoIPoolFunction(Function): + + @staticmethod + def symbolic(g, input, rois, output_size, spatial_scale): + return g.op( + 'MaxRoiPool', + input, + rois, + pooled_shape_i=output_size, + spatial_scale_f=spatial_scale) + + @staticmethod + def forward(ctx, input, rois, output_size, spatial_scale=1.0): + ctx.output_size = _pair(output_size) + ctx.spatial_scale = spatial_scale + ctx.input_shape = input.size() + + assert rois.size(1) == 5, 'RoI must be (idx, x1, y1, x2, y2)!' + + output_shape = (rois.size(0), input.size(1), ctx.output_size[0], + ctx.output_size[1]) + output = input.new_zeros(output_shape) + argmax = input.new_zeros(output_shape, dtype=torch.int) + + ext_module.roi_pool_forward( + input, + rois, + output, + argmax, + pooled_height=ctx.output_size[0], + pooled_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale) + + ctx.save_for_backward(rois, argmax) + return output + + @staticmethod + @once_differentiable + def backward(ctx, grad_output): + rois, argmax = ctx.saved_tensors + grad_input = grad_output.new_zeros(ctx.input_shape) + + ext_module.roi_pool_backward( + grad_output, + rois, + argmax, + grad_input, + pooled_height=ctx.output_size[0], + pooled_width=ctx.output_size[1], + spatial_scale=ctx.spatial_scale) + + return grad_input, None, None, None + + +roi_pool = RoIPoolFunction.apply + + +class RoIPool(nn.Module): + + def __init__(self, output_size, spatial_scale=1.0): + super(RoIPool, self).__init__() + + self.output_size = _pair(output_size) + self.spatial_scale = float(spatial_scale) + + def forward(self, input, rois): + return roi_pool(input, rois, self.output_size, self.spatial_scale) + + def __repr__(self): + s = self.__class__.__name__ + s += f'(output_size={self.output_size}, ' + s += f'spatial_scale={self.spatial_scale})' + return s diff --git a/custom_mmpkg/custom_mmcv/ops/roiaware_pool3d.py b/custom_mmpkg/custom_mmcv/ops/roiaware_pool3d.py new file mode 100644 index 0000000000000000000000000000000000000000..00d8a4d7f99181f224bda079ff7487aae5b92383 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/roiaware_pool3d.py @@ -0,0 +1,114 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn +from torch.autograd import Function + +import custom_mmpkg.custom_mmcv as mmcv +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['roiaware_pool3d_forward', 'roiaware_pool3d_backward']) + + +class RoIAwarePool3d(nn.Module): + """Encode the geometry-specific features of each 3D proposal. + + Please refer to `PartA2 `_ for more + details. + + Args: + out_size (int or tuple): The size of output features. n or + [n1, n2, n3]. + max_pts_per_voxel (int, optional): The maximum number of points per + voxel. Default: 128. + mode (str, optional): Pooling method of RoIAware, 'max' or 'avg'. + Default: 'max'. 
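+
+    Example:
+        An illustrative construction and call (random values, CUDA required;
+        shapes follow the ``forward`` docstring below)::
+
+            import torch
+
+            pool = RoIAwarePool3d(out_size=4, max_pts_per_voxel=128)
+            rois = torch.rand(2, 7).cuda()
+            pts = torch.rand(512, 3).cuda()
+            pts_feature = torch.rand(512, 16).cuda()
+            pooled = pool(rois, pts, pts_feature)  # (2, 4, 4, 4, 16)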
+ """ + + def __init__(self, out_size, max_pts_per_voxel=128, mode='max'): + super().__init__() + + self.out_size = out_size + self.max_pts_per_voxel = max_pts_per_voxel + assert mode in ['max', 'avg'] + pool_mapping = {'max': 0, 'avg': 1} + self.mode = pool_mapping[mode] + + def forward(self, rois, pts, pts_feature): + """ + Args: + rois (torch.Tensor): [N, 7], in LiDAR coordinate, + (x, y, z) is the bottom center of rois. + pts (torch.Tensor): [npoints, 3], coordinates of input points. + pts_feature (torch.Tensor): [npoints, C], features of input points. + + Returns: + pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C] + """ + + return RoIAwarePool3dFunction.apply(rois, pts, pts_feature, + self.out_size, + self.max_pts_per_voxel, self.mode) + + +class RoIAwarePool3dFunction(Function): + + @staticmethod + def forward(ctx, rois, pts, pts_feature, out_size, max_pts_per_voxel, + mode): + """ + Args: + rois (torch.Tensor): [N, 7], in LiDAR coordinate, + (x, y, z) is the bottom center of rois. + pts (torch.Tensor): [npoints, 3], coordinates of input points. + pts_feature (torch.Tensor): [npoints, C], features of input points. + out_size (int or tuple): The size of output features. n or + [n1, n2, n3]. + max_pts_per_voxel (int): The maximum number of points per voxel. + Default: 128. + mode (int): Pooling method of RoIAware, 0 (max pool) or 1 (average + pool). + + Returns: + pooled_features (torch.Tensor): [N, out_x, out_y, out_z, C], output + pooled features. + """ + + if isinstance(out_size, int): + out_x = out_y = out_z = out_size + else: + assert len(out_size) == 3 + assert mmcv.is_tuple_of(out_size, int) + out_x, out_y, out_z = out_size + + num_rois = rois.shape[0] + num_channels = pts_feature.shape[-1] + num_pts = pts.shape[0] + + pooled_features = pts_feature.new_zeros( + (num_rois, out_x, out_y, out_z, num_channels)) + argmax = pts_feature.new_zeros( + (num_rois, out_x, out_y, out_z, num_channels), dtype=torch.int) + pts_idx_of_voxels = pts_feature.new_zeros( + (num_rois, out_x, out_y, out_z, max_pts_per_voxel), + dtype=torch.int) + + ext_module.roiaware_pool3d_forward(rois, pts, pts_feature, argmax, + pts_idx_of_voxels, pooled_features, + mode) + + ctx.roiaware_pool3d_for_backward = (pts_idx_of_voxels, argmax, mode, + num_pts, num_channels) + return pooled_features + + @staticmethod + def backward(ctx, grad_out): + ret = ctx.roiaware_pool3d_for_backward + pts_idx_of_voxels, argmax, mode, num_pts, num_channels = ret + + grad_in = grad_out.new_zeros((num_pts, num_channels)) + ext_module.roiaware_pool3d_backward(pts_idx_of_voxels, argmax, + grad_out.contiguous(), grad_in, + mode) + + return None, None, grad_in, None, None, None diff --git a/custom_mmpkg/custom_mmcv/ops/roipoint_pool3d.py b/custom_mmpkg/custom_mmcv/ops/roipoint_pool3d.py new file mode 100644 index 0000000000000000000000000000000000000000..0a21412c0728431c04b84245bc2e3109eea9aefc --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/roipoint_pool3d.py @@ -0,0 +1,77 @@ +from torch import nn as nn +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['roipoint_pool3d_forward']) + + +class RoIPointPool3d(nn.Module): + """Encode the geometry-specific features of each 3D proposal. + + Please refer to `Paper of PartA2 `_ + for more details. + + Args: + num_sampled_points (int, optional): Number of samples in each roi. + Default: 512. 
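+
+    Example:
+        A sketch with random inputs (CUDA required; shapes follow the
+        ``forward`` docstring below)::
+
+            import torch
+
+            pool = RoIPointPool3d(num_sampled_points=512)
+            points = torch.rand(1, 1024, 3).cuda()
+            point_features = torch.rand(1, 1024, 4).cuda()
+            boxes3d = torch.rand(1, 6, 7).cuda()
+            feats, empty_flag = pool(points, point_features, boxes3d)
+            # feats: (1, 6, 512, 7), empty_flag: (1, 6)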
+ """ + + def __init__(self, num_sampled_points=512): + super().__init__() + self.num_sampled_points = num_sampled_points + + def forward(self, points, point_features, boxes3d): + """ + Args: + points (torch.Tensor): Input points whose shape is (B, N, C). + point_features (torch.Tensor): Features of input points whose shape + is (B, N, C). + boxes3d (B, M, 7), Input bounding boxes whose shape is (B, M, 7). + + Returns: + pooled_features (torch.Tensor): The output pooled features whose + shape is (B, M, 512, 3 + C). + pooled_empty_flag (torch.Tensor): Empty flag whose shape is (B, M). + """ + return RoIPointPool3dFunction.apply(points, point_features, boxes3d, + self.num_sampled_points) + + +class RoIPointPool3dFunction(Function): + + @staticmethod + def forward(ctx, points, point_features, boxes3d, num_sampled_points=512): + """ + Args: + points (torch.Tensor): Input points whose shape is (B, N, C). + point_features (torch.Tensor): Features of input points whose shape + is (B, N, C). + boxes3d (B, M, 7), Input bounding boxes whose shape is (B, M, 7). + num_sampled_points (int, optional): The num of sampled points. + Default: 512. + + Returns: + pooled_features (torch.Tensor): The output pooled features whose + shape is (B, M, 512, 3 + C). + pooled_empty_flag (torch.Tensor): Empty flag whose shape is (B, M). + """ + assert len(points.shape) == 3 and points.shape[2] == 3 + batch_size, boxes_num, feature_len = points.shape[0], boxes3d.shape[ + 1], point_features.shape[2] + pooled_boxes3d = boxes3d.view(batch_size, -1, 7) + pooled_features = point_features.new_zeros( + (batch_size, boxes_num, num_sampled_points, 3 + feature_len)) + pooled_empty_flag = point_features.new_zeros( + (batch_size, boxes_num)).int() + + ext_module.roipoint_pool3d_forward(points.contiguous(), + pooled_boxes3d.contiguous(), + point_features.contiguous(), + pooled_features, pooled_empty_flag) + + return pooled_features, pooled_empty_flag + + @staticmethod + def backward(ctx, grad_out): + raise NotImplementedError diff --git a/custom_mmpkg/custom_mmcv/ops/saconv.py b/custom_mmpkg/custom_mmcv/ops/saconv.py new file mode 100644 index 0000000000000000000000000000000000000000..46d26992534cba3ba0ee36f08b700c5489fea30d --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/saconv.py @@ -0,0 +1,145 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_mmpkg.custom_mmcv.cnn import CONV_LAYERS, ConvAWS2d, constant_init +from custom_mmpkg.custom_mmcv.ops.deform_conv import deform_conv2d +from custom_mmpkg.custom_mmcv.utils import TORCH_VERSION, digit_version + + +@CONV_LAYERS.register_module(name='SAC') +class SAConv2d(ConvAWS2d): + """SAC (Switchable Atrous Convolution) + + This is an implementation of SAC in DetectoRS + (https://arxiv.org/pdf/2006.02334.pdf). + + Args: + in_channels (int): Number of channels in the input image + out_channels (int): Number of channels produced by the convolution + kernel_size (int or tuple): Size of the convolving kernel + stride (int or tuple, optional): Stride of the convolution. Default: 1 + padding (int or tuple, optional): Zero-padding added to both sides of + the input. Default: 0 + padding_mode (string, optional): ``'zeros'``, ``'reflect'``, + ``'replicate'`` or ``'circular'``. Default: ``'zeros'`` + dilation (int or tuple, optional): Spacing between kernel elements. + Default: 1 + groups (int, optional): Number of blocked connections from input + channels to output channels. 
Default: 1 + bias (bool, optional): If ``True``, adds a learnable bias to the + output. Default: ``True`` + use_deform: If ``True``, replace convolution with deformable + convolution. Default: ``False``. + """ + + def __init__(self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + use_deform=False): + super().__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias) + self.use_deform = use_deform + self.switch = nn.Conv2d( + self.in_channels, 1, kernel_size=1, stride=stride, bias=True) + self.weight_diff = nn.Parameter(torch.Tensor(self.weight.size())) + self.pre_context = nn.Conv2d( + self.in_channels, self.in_channels, kernel_size=1, bias=True) + self.post_context = nn.Conv2d( + self.out_channels, self.out_channels, kernel_size=1, bias=True) + if self.use_deform: + self.offset_s = nn.Conv2d( + self.in_channels, + 18, + kernel_size=3, + padding=1, + stride=stride, + bias=True) + self.offset_l = nn.Conv2d( + self.in_channels, + 18, + kernel_size=3, + padding=1, + stride=stride, + bias=True) + self.init_weights() + + def init_weights(self): + constant_init(self.switch, 0, bias=1) + self.weight_diff.data.zero_() + constant_init(self.pre_context, 0) + constant_init(self.post_context, 0) + if self.use_deform: + constant_init(self.offset_s, 0) + constant_init(self.offset_l, 0) + + def forward(self, x): + # pre-context + avg_x = F.adaptive_avg_pool2d(x, output_size=1) + avg_x = self.pre_context(avg_x) + avg_x = avg_x.expand_as(x) + x = x + avg_x + # switch + avg_x = F.pad(x, pad=(2, 2, 2, 2), mode='reflect') + avg_x = F.avg_pool2d(avg_x, kernel_size=5, stride=1, padding=0) + switch = self.switch(avg_x) + # sac + weight = self._get_weight(self.weight) + zero_bias = torch.zeros( + self.out_channels, device=weight.device, dtype=weight.dtype) + + if self.use_deform: + offset = self.offset_s(avg_x) + out_s = deform_conv2d(x, offset, weight, self.stride, self.padding, + self.dilation, self.groups, 1) + else: + if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.5.0')): + out_s = super().conv2d_forward(x, weight) + elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'): + # bias is a required argument of _conv_forward in torch 1.8.0 + out_s = super()._conv_forward(x, weight, zero_bias) + else: + out_s = super()._conv_forward(x, weight) + ori_p = self.padding + ori_d = self.dilation + self.padding = tuple(3 * p for p in self.padding) + self.dilation = tuple(3 * d for d in self.dilation) + weight = weight + self.weight_diff + if self.use_deform: + offset = self.offset_l(avg_x) + out_l = deform_conv2d(x, offset, weight, self.stride, self.padding, + self.dilation, self.groups, 1) + else: + if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.5.0')): + out_l = super().conv2d_forward(x, weight) + elif digit_version(TORCH_VERSION) >= digit_version('1.8.0'): + # bias is a required argument of _conv_forward in torch 1.8.0 + out_l = super()._conv_forward(x, weight, zero_bias) + else: + out_l = super()._conv_forward(x, weight) + + out = switch * out_s + (1 - switch) * out_l + self.padding = ori_p + self.dilation = ori_d + # post-context + avg_x = F.adaptive_avg_pool2d(out, output_size=1) + avg_x = self.post_context(avg_x) + avg_x = avg_x.expand_as(out) + out = out + avg_x + return out diff --git a/custom_mmpkg/custom_mmcv/ops/scatter_points.py b/custom_mmpkg/custom_mmcv/ops/scatter_points.py new file 
mode 100644 index 0000000000000000000000000000000000000000..2b8aa4169e9f6ca4a6f845ce17d6d1e4db416bb8 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/scatter_points.py @@ -0,0 +1,135 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', + ['dynamic_point_to_voxel_forward', 'dynamic_point_to_voxel_backward']) + + +class _DynamicScatter(Function): + + @staticmethod + def forward(ctx, feats, coors, reduce_type='max'): + """convert kitti points(N, >=3) to voxels. + + Args: + feats (torch.Tensor): [N, C]. Points features to be reduced + into voxels. + coors (torch.Tensor): [N, ndim]. Corresponding voxel coordinates + (specifically multi-dim voxel index) of each points. + reduce_type (str, optional): Reduce op. support 'max', 'sum' and + 'mean'. Default: 'max'. + + Returns: + voxel_feats (torch.Tensor): [M, C]. Reduced features, input + features that shares the same voxel coordinates are reduced to + one row. + voxel_coors (torch.Tensor): [M, ndim]. Voxel coordinates. + """ + results = ext_module.dynamic_point_to_voxel_forward( + feats, coors, reduce_type) + (voxel_feats, voxel_coors, point2voxel_map, + voxel_points_count) = results + ctx.reduce_type = reduce_type + ctx.save_for_backward(feats, voxel_feats, point2voxel_map, + voxel_points_count) + ctx.mark_non_differentiable(voxel_coors) + return voxel_feats, voxel_coors + + @staticmethod + def backward(ctx, grad_voxel_feats, grad_voxel_coors=None): + (feats, voxel_feats, point2voxel_map, + voxel_points_count) = ctx.saved_tensors + grad_feats = torch.zeros_like(feats) + # TODO: whether to use index put or use cuda_backward + # To use index put, need point to voxel index + ext_module.dynamic_point_to_voxel_backward( + grad_feats, grad_voxel_feats.contiguous(), feats, voxel_feats, + point2voxel_map, voxel_points_count, ctx.reduce_type) + return grad_feats, None, None + + +dynamic_scatter = _DynamicScatter.apply + + +class DynamicScatter(nn.Module): + """Scatters points into voxels, used in the voxel encoder with dynamic + voxelization. + + Note: + The CPU and GPU implementation get the same output, but have numerical + difference after summation and division (e.g., 5e-7). + + Args: + voxel_size (list): list [x, y, z] size of three dimension. + point_cloud_range (list): The coordinate range of points, [x_min, + y_min, z_min, x_max, y_max, z_max]. + average_points (bool): whether to use avg pooling to scatter points + into voxel. + """ + + def __init__(self, voxel_size, point_cloud_range, average_points: bool): + super().__init__() + + self.voxel_size = voxel_size + self.point_cloud_range = point_cloud_range + self.average_points = average_points + + def forward_single(self, points, coors): + """Scatters points into voxels. + + Args: + points (torch.Tensor): Points to be reduced into voxels. + coors (torch.Tensor): Corresponding voxel coordinates (specifically + multi-dim voxel index) of each points. + + Returns: + voxel_feats (torch.Tensor): Reduced features, input features that + shares the same voxel coordinates are reduced to one row. + voxel_coors (torch.Tensor): Voxel coordinates. + """ + reduce = 'mean' if self.average_points else 'max' + return dynamic_scatter(points.contiguous(), coors.contiguous(), reduce) + + def forward(self, points, coors): + """Scatters points/features into voxels. + + Args: + points (torch.Tensor): Points to be reduced into voxels. 
+ coors (torch.Tensor): Corresponding voxel coordinates (specifically + multi-dim voxel index) of each points. + + Returns: + voxel_feats (torch.Tensor): Reduced features, input features that + shares the same voxel coordinates are reduced to one row. + voxel_coors (torch.Tensor): Voxel coordinates. + """ + if coors.size(-1) == 3: + return self.forward_single(points, coors) + else: + batch_size = coors[-1, 0] + 1 + voxels, voxel_coors = [], [] + for i in range(batch_size): + inds = torch.where(coors[:, 0] == i) + voxel, voxel_coor = self.forward_single( + points[inds], coors[inds][:, 1:]) + coor_pad = nn.functional.pad( + voxel_coor, (1, 0), mode='constant', value=i) + voxel_coors.append(coor_pad) + voxels.append(voxel) + features = torch.cat(voxels, dim=0) + feature_coors = torch.cat(voxel_coors, dim=0) + + return features, feature_coors + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'voxel_size=' + str(self.voxel_size) + s += ', point_cloud_range=' + str(self.point_cloud_range) + s += ', average_points=' + str(self.average_points) + s += ')' + return s diff --git a/custom_mmpkg/custom_mmcv/ops/sync_bn.py b/custom_mmpkg/custom_mmcv/ops/sync_bn.py new file mode 100644 index 0000000000000000000000000000000000000000..f885caac860ae7197ba2a29433b3c3debfdb2e65 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/sync_bn.py @@ -0,0 +1,279 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.distributed as dist +import torch.nn.functional as F +from torch.autograd import Function +from torch.autograd.function import once_differentiable +from torch.nn.modules.module import Module +from torch.nn.parameter import Parameter + +from custom_mmpkg.custom_mmcv.cnn import NORM_LAYERS +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', [ + 'sync_bn_forward_mean', 'sync_bn_forward_var', 'sync_bn_forward_output', + 'sync_bn_backward_param', 'sync_bn_backward_data' +]) + + +class SyncBatchNormFunction(Function): + + @staticmethod + def symbolic(g, input, running_mean, running_var, weight, bias, momentum, + eps, group, group_size, stats_mode): + return g.op( + 'mmcv::MMCVSyncBatchNorm', + input, + running_mean, + running_var, + weight, + bias, + momentum_f=momentum, + eps_f=eps, + group_i=group, + group_size_i=group_size, + stats_mode=stats_mode) + + @staticmethod + def forward(self, input, running_mean, running_var, weight, bias, momentum, + eps, group, group_size, stats_mode): + self.momentum = momentum + self.eps = eps + self.group = group + self.group_size = group_size + self.stats_mode = stats_mode + + assert isinstance( + input, (torch.HalfTensor, torch.FloatTensor, + torch.cuda.HalfTensor, torch.cuda.FloatTensor)), \ + f'only support Half or Float Tensor, but {input.type()}' + output = torch.zeros_like(input) + input3d = input.flatten(start_dim=2) + output3d = output.view_as(input3d) + num_channels = input3d.size(1) + + # ensure mean/var/norm/std are initialized as zeros + # ``torch.empty()`` does not guarantee that + mean = torch.zeros( + num_channels, dtype=torch.float, device=input3d.device) + var = torch.zeros( + num_channels, dtype=torch.float, device=input3d.device) + norm = torch.zeros_like( + input3d, dtype=torch.float, device=input3d.device) + std = torch.zeros( + num_channels, dtype=torch.float, device=input3d.device) + + batch_size = input3d.size(0) + if batch_size > 0: + ext_module.sync_bn_forward_mean(input3d, mean) + batch_flag = torch.ones([1], device=mean.device, dtype=mean.dtype) + else: + # skip updating mean and leave 
it as zeros when the input is empty + batch_flag = torch.zeros([1], device=mean.device, dtype=mean.dtype) + + # synchronize mean and the batch flag + vec = torch.cat([mean, batch_flag]) + if self.stats_mode == 'N': + vec *= batch_size + if self.group_size > 1: + dist.all_reduce(vec, group=self.group) + total_batch = vec[-1].detach() + mean = vec[:num_channels] + + if self.stats_mode == 'default': + mean = mean / self.group_size + elif self.stats_mode == 'N': + mean = mean / total_batch.clamp(min=1) + else: + raise NotImplementedError + + # leave var as zeros when the input is empty + if batch_size > 0: + ext_module.sync_bn_forward_var(input3d, mean, var) + + if self.stats_mode == 'N': + var *= batch_size + if self.group_size > 1: + dist.all_reduce(var, group=self.group) + + if self.stats_mode == 'default': + var /= self.group_size + elif self.stats_mode == 'N': + var /= total_batch.clamp(min=1) + else: + raise NotImplementedError + + # if the total batch size over all the ranks is zero, + # we should not update the statistics in the current batch + update_flag = total_batch.clamp(max=1) + momentum = update_flag * self.momentum + ext_module.sync_bn_forward_output( + input3d, + mean, + var, + weight, + bias, + running_mean, + running_var, + norm, + std, + output3d, + eps=self.eps, + momentum=momentum, + group_size=self.group_size) + self.save_for_backward(norm, std, weight) + return output + + @staticmethod + @once_differentiable + def backward(self, grad_output): + norm, std, weight = self.saved_tensors + grad_weight = torch.zeros_like(weight) + grad_bias = torch.zeros_like(weight) + grad_input = torch.zeros_like(grad_output) + grad_output3d = grad_output.flatten(start_dim=2) + grad_input3d = grad_input.view_as(grad_output3d) + + batch_size = grad_input3d.size(0) + if batch_size > 0: + ext_module.sync_bn_backward_param(grad_output3d, norm, grad_weight, + grad_bias) + + # all reduce + if self.group_size > 1: + dist.all_reduce(grad_weight, group=self.group) + dist.all_reduce(grad_bias, group=self.group) + grad_weight /= self.group_size + grad_bias /= self.group_size + + if batch_size > 0: + ext_module.sync_bn_backward_data(grad_output3d, weight, + grad_weight, grad_bias, norm, std, + grad_input3d) + + return grad_input, None, None, grad_weight, grad_bias, \ + None, None, None, None, None + + +@NORM_LAYERS.register_module(name='MMSyncBN') +class SyncBatchNorm(Module): + """Synchronized Batch Normalization. + + Args: + num_features (int): number of features/chennels in input tensor + eps (float, optional): a value added to the denominator for numerical + stability. Defaults to 1e-5. + momentum (float, optional): the value used for the running_mean and + running_var computation. Defaults to 0.1. + affine (bool, optional): whether to use learnable affine parameters. + Defaults to True. + track_running_stats (bool, optional): whether to track the running + mean and variance during training. When set to False, this + module does not track such statistics, and initializes statistics + buffers ``running_mean`` and ``running_var`` as ``None``. When + these buffers are ``None``, this module always uses batch + statistics in both training and eval modes. Defaults to True. + group (int, optional): synchronization of stats happen within + each process group individually. By default it is synchronization + across the whole world. Defaults to None. + stats_mode (str, optional): The statistical mode. Available options + includes ``'default'`` and ``'N'``. Defaults to 'default'. 
+ When ``stats_mode=='default'``, it computes the overall statistics + using those from each worker with equal weight, i.e., the + statistics are synchronized and simply divied by ``group``. This + mode will produce inaccurate statistics when empty tensors occur. + When ``stats_mode=='N'``, it compute the overall statistics using + the total number of batches in each worker ignoring the number of + group, i.e., the statistics are synchronized and then divied by + the total batch ``N``. This mode is beneficial when empty tensors + occur during training, as it average the total mean by the real + number of batch. + """ + + def __init__(self, + num_features, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + group=None, + stats_mode='default'): + super(SyncBatchNorm, self).__init__() + self.num_features = num_features + self.eps = eps + self.momentum = momentum + self.affine = affine + self.track_running_stats = track_running_stats + group = dist.group.WORLD if group is None else group + self.group = group + self.group_size = dist.get_world_size(group) + assert stats_mode in ['default', 'N'], \ + f'"stats_mode" only accepts "default" and "N", got "{stats_mode}"' + self.stats_mode = stats_mode + if self.affine: + self.weight = Parameter(torch.Tensor(num_features)) + self.bias = Parameter(torch.Tensor(num_features)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + if self.track_running_stats: + self.register_buffer('running_mean', torch.zeros(num_features)) + self.register_buffer('running_var', torch.ones(num_features)) + self.register_buffer('num_batches_tracked', + torch.tensor(0, dtype=torch.long)) + else: + self.register_buffer('running_mean', None) + self.register_buffer('running_var', None) + self.register_buffer('num_batches_tracked', None) + self.reset_parameters() + + def reset_running_stats(self): + if self.track_running_stats: + self.running_mean.zero_() + self.running_var.fill_(1) + self.num_batches_tracked.zero_() + + def reset_parameters(self): + self.reset_running_stats() + if self.affine: + self.weight.data.uniform_() # pytorch use ones_() + self.bias.data.zero_() + + def forward(self, input): + if input.dim() < 2: + raise ValueError( + f'expected at least 2D input, got {input.dim()}D input') + if self.momentum is None: + exponential_average_factor = 0.0 + else: + exponential_average_factor = self.momentum + + if self.training and self.track_running_stats: + if self.num_batches_tracked is not None: + self.num_batches_tracked += 1 + if self.momentum is None: # use cumulative moving average + exponential_average_factor = 1.0 / float( + self.num_batches_tracked) + else: # use exponential moving average + exponential_average_factor = self.momentum + + if self.training or not self.track_running_stats: + return SyncBatchNormFunction.apply( + input, self.running_mean, self.running_var, self.weight, + self.bias, exponential_average_factor, self.eps, self.group, + self.group_size, self.stats_mode) + else: + return F.batch_norm(input, self.running_mean, self.running_var, + self.weight, self.bias, False, + exponential_average_factor, self.eps) + + def __repr__(self): + s = self.__class__.__name__ + s += f'({self.num_features}, ' + s += f'eps={self.eps}, ' + s += f'momentum={self.momentum}, ' + s += f'affine={self.affine}, ' + s += f'track_running_stats={self.track_running_stats}, ' + s += f'group_size={self.group_size},' + s += f'stats_mode={self.stats_mode})' + return s diff --git 
a/custom_mmpkg/custom_mmcv/ops/three_interpolate.py b/custom_mmpkg/custom_mmcv/ops/three_interpolate.py new file mode 100644 index 0000000000000000000000000000000000000000..203f47f05d58087e034fb3cd8cd6a09233947b4a --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/three_interpolate.py @@ -0,0 +1,68 @@ +from typing import Tuple + +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['three_interpolate_forward', 'three_interpolate_backward']) + + +class ThreeInterpolate(Function): + """Performs weighted linear interpolation on 3 features. + + Please refer to `Paper of PointNet++ `_ + for more details. + """ + + @staticmethod + def forward(ctx, features: torch.Tensor, indices: torch.Tensor, + weight: torch.Tensor) -> torch.Tensor: + """ + Args: + features (Tensor): (B, C, M) Features descriptors to be + interpolated + indices (Tensor): (B, n, 3) index three nearest neighbors + of the target features in features + weight (Tensor): (B, n, 3) weights of interpolation + + Returns: + Tensor: (B, C, N) tensor of the interpolated features + """ + assert features.is_contiguous() + assert indices.is_contiguous() + assert weight.is_contiguous() + + B, c, m = features.size() + n = indices.size(1) + ctx.three_interpolate_for_backward = (indices, weight, m) + output = torch.cuda.FloatTensor(B, c, n) + + ext_module.three_interpolate_forward( + features, indices, weight, output, b=B, c=c, m=m, n=n) + return output + + @staticmethod + def backward( + ctx, grad_out: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Args: + grad_out (Tensor): (B, C, N) tensor with gradients of outputs + + Returns: + Tensor: (B, C, M) tensor with gradients of features + """ + idx, weight, m = ctx.three_interpolate_for_backward + B, c, n = grad_out.size() + + grad_features = torch.cuda.FloatTensor(B, c, m).zero_() + grad_out_data = grad_out.data.contiguous() + + ext_module.three_interpolate_backward( + grad_out_data, idx, weight, grad_features.data, b=B, c=c, n=n, m=m) + return grad_features, None, None + + +three_interpolate = ThreeInterpolate.apply diff --git a/custom_mmpkg/custom_mmcv/ops/three_nn.py b/custom_mmpkg/custom_mmcv/ops/three_nn.py new file mode 100644 index 0000000000000000000000000000000000000000..2b01047a129989cd5545a0a86f23a487f4a13ce1 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/three_nn.py @@ -0,0 +1,51 @@ +from typing import Tuple + +import torch +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', ['three_nn_forward']) + + +class ThreeNN(Function): + """Find the top-3 nearest neighbors of the target set from the source set. + + Please refer to `Paper of PointNet++ `_ + for more details. + """ + + @staticmethod + def forward(ctx, target: torch.Tensor, + source: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Args: + target (Tensor): shape (B, N, 3), points set that needs to + find the nearest neighbors. + source (Tensor): shape (B, M, 3), points set that is used + to find the nearest neighbors of points in target set. + + Returns: + Tensor: shape (B, N, 3), L2 distance of each point in target + set to their corresponding nearest neighbors. 
+ """ + target = target.contiguous() + source = source.contiguous() + + B, N, _ = target.size() + m = source.size(1) + dist2 = torch.cuda.FloatTensor(B, N, 3) + idx = torch.cuda.IntTensor(B, N, 3) + + ext_module.three_nn_forward(target, source, dist2, idx, b=B, n=N, m=m) + if torch.__version__ != 'parrots': + ctx.mark_non_differentiable(idx) + + return torch.sqrt(dist2), idx + + @staticmethod + def backward(ctx, a=None, b=None): + return None, None + + +three_nn = ThreeNN.apply diff --git a/custom_mmpkg/custom_mmcv/ops/tin_shift.py b/custom_mmpkg/custom_mmcv/ops/tin_shift.py new file mode 100644 index 0000000000000000000000000000000000000000..472c9fcfe45a124e819b7ed5653e585f94a8811e --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/tin_shift.py @@ -0,0 +1,68 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Code reference from "Temporal Interlacing Network" +# https://github.com/deepcs233/TIN/blob/master/cuda_shift/rtc_wrap.py +# Hao Shao, Shengju Qian, Yu Liu +# shaoh19@mails.tsinghua.edu.cn, sjqian@cse.cuhk.edu.hk, yuliu@ee.cuhk.edu.hk + +import torch +import torch.nn as nn +from torch.autograd import Function + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext('_ext', + ['tin_shift_forward', 'tin_shift_backward']) + + +class TINShiftFunction(Function): + + @staticmethod + def forward(ctx, input, shift): + C = input.size(2) + num_segments = shift.size(1) + if C // num_segments <= 0 or C % num_segments != 0: + raise ValueError('C should be a multiple of num_segments, ' + f'but got C={C} and num_segments={num_segments}.') + + ctx.save_for_backward(shift) + + out = torch.zeros_like(input) + ext_module.tin_shift_forward(input, shift, out) + + return out + + @staticmethod + def backward(ctx, grad_output): + + shift = ctx.saved_tensors[0] + data_grad_input = grad_output.new(*grad_output.size()).zero_() + shift_grad_input = shift.new(*shift.size()).zero_() + ext_module.tin_shift_backward(grad_output, shift, data_grad_input) + + return data_grad_input, shift_grad_input + + +tin_shift = TINShiftFunction.apply + + +class TINShift(nn.Module): + """Temporal Interlace Shift. + + Temporal Interlace shift is a differentiable temporal-wise frame shifting + which is proposed in "Temporal Interlacing Network" + + Please refer to https://arxiv.org/abs/2001.06499 for more details. + Code is modified from https://github.com/mit-han-lab/temporal-shift-module + """ + + def forward(self, input, shift): + """Perform temporal interlace shift. + + Args: + input (Tensor): Feature map with shape [N, num_segments, C, H * W]. + shift (Tensor): Shift tensor with shape [N, num_segments]. + + Returns: + Feature map after temporal interlace shift. + """ + return tin_shift(input, shift) diff --git a/custom_mmpkg/custom_mmcv/ops/upfirdn2d.py b/custom_mmpkg/custom_mmcv/ops/upfirdn2d.py new file mode 100644 index 0000000000000000000000000000000000000000..ef4a5236dda57340017f0e16857bca297d4e1b2f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/upfirdn2d.py @@ -0,0 +1,330 @@ +# modified from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py # noqa:E501 + +# Copyright (c) 2021, NVIDIA Corporation. All rights reserved. +# NVIDIA Source Code License for StyleGAN2 with Adaptive Discriminator +# Augmentation (ADA) +# ======================================================================= + +# 1. Definitions + +# "Licensor" means any person or entity that distributes its Work. + +# "Software" means the original work of authorship made available under +# this License. 
+ +# "Work" means the Software and any additions to or derivative works of +# the Software that are made available under this License. + +# The terms "reproduce," "reproduction," "derivative works," and +# "distribution" have the meaning as provided under U.S. copyright law; +# provided, however, that for the purposes of this License, derivative +# works shall not include works that remain separable from, or merely +# link (or bind by name) to the interfaces of, the Work. + +# Works, including the Software, are "made available" under this License +# by including in or with the Work either (a) a copyright notice +# referencing the applicability of this License to the Work, or (b) a +# copy of this License. + +# 2. License Grants + +# 2.1 Copyright Grant. Subject to the terms and conditions of this +# License, each Licensor grants to you a perpetual, worldwide, +# non-exclusive, royalty-free, copyright license to reproduce, +# prepare derivative works of, publicly display, publicly perform, +# sublicense and distribute its Work and any resulting derivative +# works in any form. + +# 3. Limitations + +# 3.1 Redistribution. You may reproduce or distribute the Work only +# if (a) you do so under this License, (b) you include a complete +# copy of this License with your distribution, and (c) you retain +# without modification any copyright, patent, trademark, or +# attribution notices that are present in the Work. + +# 3.2 Derivative Works. You may specify that additional or different +# terms apply to the use, reproduction, and distribution of your +# derivative works of the Work ("Your Terms") only if (a) Your Terms +# provide that the use limitation in Section 3.3 applies to your +# derivative works, and (b) you identify the specific derivative +# works that are subject to Your Terms. Notwithstanding Your Terms, +# this License (including the redistribution requirements in Section +# 3.1) will continue to apply to the Work itself. + +# 3.3 Use Limitation. The Work and any derivative works thereof only +# may be used or intended for use non-commercially. Notwithstanding +# the foregoing, NVIDIA and its affiliates may use the Work and any +# derivative works commercially. As used herein, "non-commercially" +# means for research or evaluation purposes only. + +# 3.4 Patent Claims. If you bring or threaten to bring a patent claim +# against any Licensor (including any claim, cross-claim or +# counterclaim in a lawsuit) to enforce any patents that you allege +# are infringed by any Work, then your rights under this License from +# such Licensor (including the grant in Section 2.1) will terminate +# immediately. + +# 3.5 Trademarks. This License does not grant any rights to use any +# Licensor’s or its affiliates’ names, logos, or trademarks, except +# as necessary to reproduce the notices described in this License. + +# 3.6 Termination. If you violate any term of this License, then your +# rights under this License (including the grant in Section 2.1) will +# terminate immediately. + +# 4. Disclaimer of Warranty. + +# THE WORK IS PROVIDED "AS IS" WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WARRANTIES OR CONDITIONS OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE OR +# NON-INFRINGEMENT. YOU BEAR THE RISK OF UNDERTAKING ANY ACTIVITIES UNDER +# THIS LICENSE. + +# 5. Limitation of Liability. 
+ +# EXCEPT AS PROHIBITED BY APPLICABLE LAW, IN NO EVENT AND UNDER NO LEGAL +# THEORY, WHETHER IN TORT (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE +# SHALL ANY LICENSOR BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY DIRECT, +# INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES ARISING OUT OF +# OR RELATED TO THIS LICENSE, THE USE OR INABILITY TO USE THE WORK +# (INCLUDING BUT NOT LIMITED TO LOSS OF GOODWILL, BUSINESS INTERRUPTION, +# LOST PROFITS OR DATA, COMPUTER FAILURE OR MALFUNCTION, OR ANY OTHER +# COMMERCIAL DAMAGES OR LOSSES), EVEN IF THE LICENSOR HAS BEEN ADVISED OF +# THE POSSIBILITY OF SUCH DAMAGES. + +# ======================================================================= + +import torch +from torch.autograd import Function +from torch.nn import functional as F + +from custom_mmpkg.custom_mmcv.utils import to_2tuple +from ..utils import ext_loader + +upfirdn2d_ext = ext_loader.load_ext('_ext', ['upfirdn2d']) + + +class UpFirDn2dBackward(Function): + + @staticmethod + def forward(ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, + in_size, out_size): + + up_x, up_y = up + down_x, down_y = down + g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad + + grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1) + + grad_input = upfirdn2d_ext.upfirdn2d( + grad_output, + grad_kernel, + up_x=down_x, + up_y=down_y, + down_x=up_x, + down_y=up_y, + pad_x0=g_pad_x0, + pad_x1=g_pad_x1, + pad_y0=g_pad_y0, + pad_y1=g_pad_y1) + grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], + in_size[3]) + + ctx.save_for_backward(kernel) + + pad_x0, pad_x1, pad_y0, pad_y1 = pad + + ctx.up_x = up_x + ctx.up_y = up_y + ctx.down_x = down_x + ctx.down_y = down_y + ctx.pad_x0 = pad_x0 + ctx.pad_x1 = pad_x1 + ctx.pad_y0 = pad_y0 + ctx.pad_y1 = pad_y1 + ctx.in_size = in_size + ctx.out_size = out_size + + return grad_input + + @staticmethod + def backward(ctx, gradgrad_input): + kernel, = ctx.saved_tensors + + gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], + ctx.in_size[3], 1) + + gradgrad_out = upfirdn2d_ext.upfirdn2d( + gradgrad_input, + kernel, + up_x=ctx.up_x, + up_y=ctx.up_y, + down_x=ctx.down_x, + down_y=ctx.down_y, + pad_x0=ctx.pad_x0, + pad_x1=ctx.pad_x1, + pad_y0=ctx.pad_y0, + pad_y1=ctx.pad_y1) + # gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.out_size[0], + # ctx.out_size[1], ctx.in_size[3]) + gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.in_size[1], + ctx.out_size[0], ctx.out_size[1]) + + return gradgrad_out, None, None, None, None, None, None, None, None + + +class UpFirDn2d(Function): + + @staticmethod + def forward(ctx, input, kernel, up, down, pad): + up_x, up_y = up + down_x, down_y = down + pad_x0, pad_x1, pad_y0, pad_y1 = pad + + kernel_h, kernel_w = kernel.shape + batch, channel, in_h, in_w = input.shape + ctx.in_size = input.shape + + input = input.reshape(-1, in_h, in_w, 1) + + ctx.save_for_backward(kernel, torch.flip(kernel, [0, 1])) + + out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 + out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 + ctx.out_size = (out_h, out_w) + + ctx.up = (up_x, up_y) + ctx.down = (down_x, down_y) + ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1) + + g_pad_x0 = kernel_w - pad_x0 - 1 + g_pad_y0 = kernel_h - pad_y0 - 1 + g_pad_x1 = in_w * up_x - out_w * down_x + pad_x0 - up_x + 1 + g_pad_y1 = in_h * up_y - out_h * down_y + pad_y0 - up_y + 1 + + ctx.g_pad = (g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1) + + out = upfirdn2d_ext.upfirdn2d( + input, + kernel, + up_x=up_x, + up_y=up_y, + 
down_x=down_x, + down_y=down_y, + pad_x0=pad_x0, + pad_x1=pad_x1, + pad_y0=pad_y0, + pad_y1=pad_y1) + # out = out.view(major, out_h, out_w, minor) + out = out.view(-1, channel, out_h, out_w) + + return out + + @staticmethod + def backward(ctx, grad_output): + kernel, grad_kernel = ctx.saved_tensors + + grad_input = UpFirDn2dBackward.apply( + grad_output, + kernel, + grad_kernel, + ctx.up, + ctx.down, + ctx.pad, + ctx.g_pad, + ctx.in_size, + ctx.out_size, + ) + + return grad_input, None, None, None, None + + +def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)): + """UpFRIDn for 2d features. + + UpFIRDn is short for upsample, apply FIR filter and downsample. More + details can be found in: + https://www.mathworks.com/help/signal/ref/upfirdn.html + + Args: + input (Tensor): Tensor with shape of (n, c, h, w). + kernel (Tensor): Filter kernel. + up (int | tuple[int], optional): Upsampling factor. If given a number, + we will use this factor for the both height and width side. + Defaults to 1. + down (int | tuple[int], optional): Downsampling factor. If given a + number, we will use this factor for the both height and width side. + Defaults to 1. + pad (tuple[int], optional): Padding for tensors, (x_pad, y_pad) or + (x_pad_0, x_pad_1, y_pad_0, y_pad_1). Defaults to (0, 0). + + Returns: + Tensor: Tensor after UpFIRDn. + """ + if input.device.type == 'cpu': + if len(pad) == 2: + pad = (pad[0], pad[1], pad[0], pad[1]) + + up = to_2tuple(up) + + down = to_2tuple(down) + + out = upfirdn2d_native(input, kernel, up[0], up[1], down[0], down[1], + pad[0], pad[1], pad[2], pad[3]) + else: + _up = to_2tuple(up) + + _down = to_2tuple(down) + + if len(pad) == 4: + _pad = pad + elif len(pad) == 2: + _pad = (pad[0], pad[1], pad[0], pad[1]) + + out = UpFirDn2d.apply(input, kernel, _up, _down, _pad) + + return out + + +def upfirdn2d_native(input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, + pad_y0, pad_y1): + _, channel, in_h, in_w = input.shape + input = input.reshape(-1, in_h, in_w, 1) + + _, in_h, in_w, minor = input.shape + kernel_h, kernel_w = kernel.shape + + out = input.view(-1, in_h, 1, in_w, 1, minor) + out = F.pad(out, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1]) + out = out.view(-1, in_h * up_y, in_w * up_x, minor) + + out = F.pad( + out, + [0, 0, + max(pad_x0, 0), + max(pad_x1, 0), + max(pad_y0, 0), + max(pad_y1, 0)]) + out = out[:, + max(-pad_y0, 0):out.shape[1] - max(-pad_y1, 0), + max(-pad_x0, 0):out.shape[2] - max(-pad_x1, 0), :, ] + + out = out.permute(0, 3, 1, 2) + out = out.reshape( + [-1, 1, in_h * up_y + pad_y0 + pad_y1, in_w * up_x + pad_x0 + pad_x1]) + w = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w) + out = F.conv2d(out, w) + out = out.reshape( + -1, + minor, + in_h * up_y + pad_y0 + pad_y1 - kernel_h + 1, + in_w * up_x + pad_x0 + pad_x1 - kernel_w + 1, + ) + out = out.permute(0, 2, 3, 1) + out = out[:, ::down_y, ::down_x, :] + + out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1 + out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1 + + return out.view(-1, channel, out_h, out_w) diff --git a/custom_mmpkg/custom_mmcv/ops/voxelize.py b/custom_mmpkg/custom_mmcv/ops/voxelize.py new file mode 100644 index 0000000000000000000000000000000000000000..ca3226a4fbcbfe58490fa2ea8e1c16b531214121 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/ops/voxelize.py @@ -0,0 +1,132 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
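A minimal usage sketch for the `upfirdn2d` op defined above (upsample, apply FIR filter, downsample). This is not part of the patch; it assumes the package is importable as `custom_mmpkg.custom_mmcv` and that the compiled `_ext` extension loads (the module imports it even though CPU tensors are routed to the pure-PyTorch `upfirdn2d_native` path):

```python
import torch
from custom_mmpkg.custom_mmcv.ops.upfirdn2d import upfirdn2d

x = torch.randn(1, 3, 16, 16)            # (n, c, h, w) feature map
k = torch.tensor([1., 3., 3., 1.])
kernel = k[:, None] * k[None, :]
kernel = kernel / kernel.sum()           # normalized 4x4 separable FIR filter

# 2x upsampling: zero-insertion, filtering, no downsampling. With a 4-tap
# kernel, pad=(1, 2) makes the output exactly (n, c, 2*h, 2*w), following
# out_h = (in_h*up + pad0 + pad1 - kernel_h) // down + 1.
y = upfirdn2d(x, kernel, up=2, down=1, pad=(1, 2))
print(y.shape)                           # torch.Size([1, 3, 32, 32])
```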
+import torch +from torch import nn +from torch.autograd import Function +from torch.nn.modules.utils import _pair + +from ..utils import ext_loader + +ext_module = ext_loader.load_ext( + '_ext', ['dynamic_voxelize_forward', 'hard_voxelize_forward']) + + +class _Voxelization(Function): + + @staticmethod + def forward(ctx, + points, + voxel_size, + coors_range, + max_points=35, + max_voxels=20000): + """Convert kitti points(N, >=3) to voxels. + + Args: + points (torch.Tensor): [N, ndim]. Points[:, :3] contain xyz points + and points[:, 3:] contain other information like reflectivity. + voxel_size (tuple or float): The size of voxel with the shape of + [3]. + coors_range (tuple or float): The coordinate range of voxel with + the shape of [6]. + max_points (int, optional): maximum points contained in a voxel. if + max_points=-1, it means using dynamic_voxelize. Default: 35. + max_voxels (int, optional): maximum voxels this function create. + for second, 20000 is a good choice. Users should shuffle points + before call this function because max_voxels may drop points. + Default: 20000. + + Returns: + voxels_out (torch.Tensor): Output voxels with the shape of [M, + max_points, ndim]. Only contain points and returned when + max_points != -1. + coors_out (torch.Tensor): Output coordinates with the shape of + [M, 3]. + num_points_per_voxel_out (torch.Tensor): Num points per voxel with + the shape of [M]. Only returned when max_points != -1. + """ + if max_points == -1 or max_voxels == -1: + coors = points.new_zeros(size=(points.size(0), 3), dtype=torch.int) + ext_module.dynamic_voxelize_forward(points, coors, voxel_size, + coors_range, 3) + return coors + else: + voxels = points.new_zeros( + size=(max_voxels, max_points, points.size(1))) + coors = points.new_zeros(size=(max_voxels, 3), dtype=torch.int) + num_points_per_voxel = points.new_zeros( + size=(max_voxels, ), dtype=torch.int) + voxel_num = ext_module.hard_voxelize_forward( + points, voxels, coors, num_points_per_voxel, voxel_size, + coors_range, max_points, max_voxels, 3) + # select the valid voxels + voxels_out = voxels[:voxel_num] + coors_out = coors[:voxel_num] + num_points_per_voxel_out = num_points_per_voxel[:voxel_num] + return voxels_out, coors_out, num_points_per_voxel_out + + +voxelization = _Voxelization.apply + + +class Voxelization(nn.Module): + """Convert kitti points(N, >=3) to voxels. + + Please refer to `PVCNN `_ for more + details. + + Args: + voxel_size (tuple or float): The size of voxel with the shape of [3]. + point_cloud_range (tuple or float): The coordinate range of voxel with + the shape of [6]. + max_num_points (int): maximum points contained in a voxel. if + max_points=-1, it means using dynamic_voxelize. + max_voxels (int, optional): maximum voxels this function create. + for second, 20000 is a good choice. Users should shuffle points + before call this function because max_voxels may drop points. + Default: 20000. 
+ """ + + def __init__(self, + voxel_size, + point_cloud_range, + max_num_points, + max_voxels=20000): + super().__init__() + + self.voxel_size = voxel_size + self.point_cloud_range = point_cloud_range + self.max_num_points = max_num_points + if isinstance(max_voxels, tuple): + self.max_voxels = max_voxels + else: + self.max_voxels = _pair(max_voxels) + + point_cloud_range = torch.tensor( + point_cloud_range, dtype=torch.float32) + voxel_size = torch.tensor(voxel_size, dtype=torch.float32) + grid_size = (point_cloud_range[3:] - + point_cloud_range[:3]) / voxel_size + grid_size = torch.round(grid_size).long() + input_feat_shape = grid_size[:2] + self.grid_size = grid_size + # the origin shape is as [x-len, y-len, z-len] + # [w, h, d] -> [d, h, w] + self.pcd_shape = [*input_feat_shape, 1][::-1] + + def forward(self, input): + if self.training: + max_voxels = self.max_voxels[0] + else: + max_voxels = self.max_voxels[1] + + return voxelization(input, self.voxel_size, self.point_cloud_range, + self.max_num_points, max_voxels) + + def __repr__(self): + s = self.__class__.__name__ + '(' + s += 'voxel_size=' + str(self.voxel_size) + s += ', point_cloud_range=' + str(self.point_cloud_range) + s += ', max_num_points=' + str(self.max_num_points) + s += ', max_voxels=' + str(self.max_voxels) + s += ')' + return s diff --git a/custom_mmpkg/custom_mmcv/parallel/__init__.py b/custom_mmpkg/custom_mmcv/parallel/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed2c17ad357742e423beeaf4d35db03fe9af469 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .collate import collate +from .data_container import DataContainer +from .data_parallel import MMDataParallel +from .distributed import MMDistributedDataParallel +from .registry import MODULE_WRAPPERS +from .scatter_gather import scatter, scatter_kwargs +from .utils import is_module_wrapper + +__all__ = [ + 'collate', 'DataContainer', 'MMDataParallel', 'MMDistributedDataParallel', + 'scatter', 'scatter_kwargs', 'is_module_wrapper', 'MODULE_WRAPPERS' +] diff --git a/custom_mmpkg/custom_mmcv/parallel/_functions.py b/custom_mmpkg/custom_mmcv/parallel/_functions.py new file mode 100644 index 0000000000000000000000000000000000000000..9b5a8a44483ab991411d07122b22a1d027e4be8e --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/_functions.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn.parallel._functions import _get_stream + + +def scatter(input, devices, streams=None): + """Scatters tensor across multiple GPUs.""" + if streams is None: + streams = [None] * len(devices) + + if isinstance(input, list): + chunk_size = (len(input) - 1) // len(devices) + 1 + outputs = [ + scatter(input[i], [devices[i // chunk_size]], + [streams[i // chunk_size]]) for i in range(len(input)) + ] + return outputs + elif isinstance(input, torch.Tensor): + output = input.contiguous() + # TODO: copy to a pinned buffer first (if copying from CPU) + stream = streams[0] if output.numel() > 0 else None + if devices != [-1]: + with torch.cuda.device(devices[0]), torch.cuda.stream(stream): + output = output.cuda(devices[0], non_blocking=True) + else: + # unsqueeze the first dimension thus the tensor's shape is the + # same as those scattered with GPU. 
+ output = output.unsqueeze(0) + return output + else: + raise Exception(f'Unknown type {type(input)}.') + + +def synchronize_stream(output, devices, streams): + if isinstance(output, list): + chunk_size = len(output) // len(devices) + for i in range(len(devices)): + for j in range(chunk_size): + synchronize_stream(output[i * chunk_size + j], [devices[i]], + [streams[i]]) + elif isinstance(output, torch.Tensor): + if output.numel() != 0: + with torch.cuda.device(devices[0]): + main_stream = torch.cuda.current_stream() + main_stream.wait_stream(streams[0]) + output.record_stream(main_stream) + else: + raise Exception(f'Unknown type {type(output)}.') + + +def get_input_device(input): + if isinstance(input, list): + for item in input: + input_device = get_input_device(item) + if input_device != -1: + return input_device + return -1 + elif isinstance(input, torch.Tensor): + return input.get_device() if input.is_cuda else -1 + else: + raise Exception(f'Unknown type {type(input)}.') + + +class Scatter: + + @staticmethod + def forward(target_gpus, input): + input_device = get_input_device(input) + streams = None + if input_device == -1 and target_gpus != [-1]: + # Perform CPU to GPU copies in a background stream + streams = [_get_stream(device) for device in target_gpus] + + outputs = scatter(input, target_gpus, streams) + # Synchronize with the copy stream + if streams is not None: + synchronize_stream(outputs, target_gpus, streams) + + return tuple(outputs) diff --git a/custom_mmpkg/custom_mmcv/parallel/collate.py b/custom_mmpkg/custom_mmcv/parallel/collate.py new file mode 100644 index 0000000000000000000000000000000000000000..ad749197df21b0d74297548be5f66a696adebf7f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/collate.py @@ -0,0 +1,84 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections.abc import Mapping, Sequence + +import torch +import torch.nn.functional as F +from torch.utils.data.dataloader import default_collate + +from .data_container import DataContainer + + +def collate(batch, samples_per_gpu=1): + """Puts each data field into a tensor/DataContainer with outer dimension + batch size. + + Extend default_collate to add support for + :type:`~mmcv.parallel.DataContainer`. There are 3 cases. + + 1. cpu_only = True, e.g., meta data + 2. cpu_only = False, stack = True, e.g., images tensors + 3. 
cpu_only = False, stack = False, e.g., gt bboxes + """ + + if not isinstance(batch, Sequence): + raise TypeError(f'{batch.dtype} is not supported.') + + if isinstance(batch[0], DataContainer): + stacked = [] + if batch[0].cpu_only: + for i in range(0, len(batch), samples_per_gpu): + stacked.append( + [sample.data for sample in batch[i:i + samples_per_gpu]]) + return DataContainer( + stacked, batch[0].stack, batch[0].padding_value, cpu_only=True) + elif batch[0].stack: + for i in range(0, len(batch), samples_per_gpu): + assert isinstance(batch[i].data, torch.Tensor) + + if batch[i].pad_dims is not None: + ndim = batch[i].dim() + assert ndim > batch[i].pad_dims + max_shape = [0 for _ in range(batch[i].pad_dims)] + for dim in range(1, batch[i].pad_dims + 1): + max_shape[dim - 1] = batch[i].size(-dim) + for sample in batch[i:i + samples_per_gpu]: + for dim in range(0, ndim - batch[i].pad_dims): + assert batch[i].size(dim) == sample.size(dim) + for dim in range(1, batch[i].pad_dims + 1): + max_shape[dim - 1] = max(max_shape[dim - 1], + sample.size(-dim)) + padded_samples = [] + for sample in batch[i:i + samples_per_gpu]: + pad = [0 for _ in range(batch[i].pad_dims * 2)] + for dim in range(1, batch[i].pad_dims + 1): + pad[2 * dim - + 1] = max_shape[dim - 1] - sample.size(-dim) + padded_samples.append( + F.pad( + sample.data, pad, value=sample.padding_value)) + stacked.append(default_collate(padded_samples)) + elif batch[i].pad_dims is None: + stacked.append( + default_collate([ + sample.data + for sample in batch[i:i + samples_per_gpu] + ])) + else: + raise ValueError( + 'pad_dims should be either None or integers (1-3)') + + else: + for i in range(0, len(batch), samples_per_gpu): + stacked.append( + [sample.data for sample in batch[i:i + samples_per_gpu]]) + return DataContainer(stacked, batch[0].stack, batch[0].padding_value) + elif isinstance(batch[0], Sequence): + transposed = zip(*batch) + return [collate(samples, samples_per_gpu) for samples in transposed] + elif isinstance(batch[0], Mapping): + return { + key: collate([d[key] for d in batch], samples_per_gpu) + for key in batch[0] + } + else: + return default_collate(batch) diff --git a/custom_mmpkg/custom_mmcv/parallel/data_container.py b/custom_mmpkg/custom_mmcv/parallel/data_container.py new file mode 100644 index 0000000000000000000000000000000000000000..cedb0d32a51a1f575a622b38de2cee3ab4757821 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/data_container.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools + +import torch + + +def assert_tensor_type(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if not isinstance(args[0].data, torch.Tensor): + raise AttributeError( + f'{args[0].__class__.__name__} has no attribute ' + f'{func.__name__} for type {args[0].datatype}') + return func(*args, **kwargs) + + return wrapper + + +class DataContainer: + """A container for any type of objects. + + Typically tensors will be stacked in the collate function and sliced along + some dimension in the scatter function. This behavior has some limitations. + 1. All tensors have to be the same size. + 2. Types are limited (numpy array or Tensor). + + We design `DataContainer` and `MMDataParallel` to overcome these + limitations. The behavior can be either of the following. 
+ + - copy to GPU, pad all tensors to the same size and stack them + - copy to GPU without stacking + - leave the objects as is and pass it to the model + - pad_dims specifies the number of last few dimensions to do padding + """ + + def __init__(self, + data, + stack=False, + padding_value=0, + cpu_only=False, + pad_dims=2): + self._data = data + self._cpu_only = cpu_only + self._stack = stack + self._padding_value = padding_value + assert pad_dims in [None, 1, 2, 3] + self._pad_dims = pad_dims + + def __repr__(self): + return f'{self.__class__.__name__}({repr(self.data)})' + + def __len__(self): + return len(self._data) + + @property + def data(self): + return self._data + + @property + def datatype(self): + if isinstance(self.data, torch.Tensor): + return self.data.type() + else: + return type(self.data) + + @property + def cpu_only(self): + return self._cpu_only + + @property + def stack(self): + return self._stack + + @property + def padding_value(self): + return self._padding_value + + @property + def pad_dims(self): + return self._pad_dims + + @assert_tensor_type + def size(self, *args, **kwargs): + return self.data.size(*args, **kwargs) + + @assert_tensor_type + def dim(self): + return self.data.dim() diff --git a/custom_mmpkg/custom_mmcv/parallel/data_parallel.py b/custom_mmpkg/custom_mmcv/parallel/data_parallel.py new file mode 100644 index 0000000000000000000000000000000000000000..79b5f69b654cf647dc7ae9174223781ab5c607d2 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/data_parallel.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from itertools import chain + +from torch.nn.parallel import DataParallel + +from .scatter_gather import scatter_kwargs + + +class MMDataParallel(DataParallel): + """The DataParallel module that supports DataContainer. + + MMDataParallel has two main differences with PyTorch DataParallel: + + - It supports a custom type :class:`DataContainer` which allows more + flexible control of input data during both GPU and CPU inference. + - It implement two more APIs ``train_step()`` and ``val_step()``. + + Args: + module (:class:`nn.Module`): Module to be encapsulated. + device_ids (list[int]): Device IDS of modules to be scattered to. + Defaults to None when GPU is not available. + output_device (str | int): Device ID for output. Defaults to None. + dim (int): Dimension used to scatter the data. Defaults to 0. + """ + + def __init__(self, *args, dim=0, **kwargs): + super(MMDataParallel, self).__init__(*args, dim=dim, **kwargs) + self.dim = dim + + def forward(self, *inputs, **kwargs): + """Override the original forward function. + + The main difference lies in the CPU inference where the data in + :class:`DataContainers` will still be gathered. 
+ """ + if not self.device_ids: + # We add the following line thus the module could gather and + # convert data containers as those in GPU inference + inputs, kwargs = self.scatter(inputs, kwargs, [-1]) + return self.module(*inputs[0], **kwargs[0]) + else: + return super().forward(*inputs, **kwargs) + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def train_step(self, *inputs, **kwargs): + if not self.device_ids: + # We add the following line thus the module could gather and + # convert data containers as those in GPU inference + inputs, kwargs = self.scatter(inputs, kwargs, [-1]) + return self.module.train_step(*inputs[0], **kwargs[0]) + + assert len(self.device_ids) == 1, \ + ('MMDataParallel only supports single GPU training, if you need to' + ' train with multiple GPUs, please use MMDistributedDataParallel' + 'instead.') + + for t in chain(self.module.parameters(), self.module.buffers()): + if t.device != self.src_device_obj: + raise RuntimeError( + 'module must have its parameters and buffers ' + f'on device {self.src_device_obj} (device_ids[0]) but ' + f'found one of them on device: {t.device}') + + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + return self.module.train_step(*inputs[0], **kwargs[0]) + + def val_step(self, *inputs, **kwargs): + if not self.device_ids: + # We add the following line thus the module could gather and + # convert data containers as those in GPU inference + inputs, kwargs = self.scatter(inputs, kwargs, [-1]) + return self.module.val_step(*inputs[0], **kwargs[0]) + + assert len(self.device_ids) == 1, \ + ('MMDataParallel only supports single GPU training, if you need to' + ' train with multiple GPUs, please use MMDistributedDataParallel' + ' instead.') + + for t in chain(self.module.parameters(), self.module.buffers()): + if t.device != self.src_device_obj: + raise RuntimeError( + 'module must have its parameters and buffers ' + f'on device {self.src_device_obj} (device_ids[0]) but ' + f'found one of them on device: {t.device}') + + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + return self.module.val_step(*inputs[0], **kwargs[0]) diff --git a/custom_mmpkg/custom_mmcv/parallel/distributed.py b/custom_mmpkg/custom_mmcv/parallel/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..fa1bae90f8d4078f7c52bfc565f8349f1e5c8db0 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/distributed.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn.parallel.distributed import (DistributedDataParallel, + _find_tensors) + +from custom_mmpkg.custom_mmcv import print_log +from custom_mmpkg.custom_mmcv.utils import TORCH_VERSION, digit_version +from .scatter_gather import scatter_kwargs + + +class MMDistributedDataParallel(DistributedDataParallel): + """The DDP module that supports DataContainer. + + MMDDP has two main differences with PyTorch DDP: + + - It supports a custom type :class:`DataContainer` which allows more + flexible control of input data. + - It implement two APIs ``train_step()`` and ``val_step()``. 
+ """ + + def to_kwargs(self, inputs, kwargs, device_id): + # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8 + # to move all tensors to device_id + return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim) + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def train_step(self, *inputs, **kwargs): + """train_step() API for module wrapped by DistributedDataParallel. + + This method is basically the same as + ``DistributedDataParallel.forward()``, while replacing + ``self.module.forward()`` with ``self.module.train_step()``. + It is compatible with PyTorch 1.1 - 1.5. + """ + + # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the + # end of backward to the beginning of forward. + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) >= digit_version('1.7') + and self.reducer._rebuild_buckets()): + print_log( + 'Reducer buckets have been rebuilt in this iteration.', + logger='mmcv') + + if getattr(self, 'require_forward_param_sync', True): + self._sync_params() + if self.device_ids: + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + if len(self.device_ids) == 1: + output = self.module.train_step(*inputs[0], **kwargs[0]) + else: + outputs = self.parallel_apply( + self._module_copies[:len(inputs)], inputs, kwargs) + output = self.gather(outputs, self.output_device) + else: + output = self.module.train_step(*inputs, **kwargs) + + if torch.is_grad_enabled() and getattr( + self, 'require_backward_grad_sync', True): + if self.find_unused_parameters: + self.reducer.prepare_for_backward(list(_find_tensors(output))) + else: + self.reducer.prepare_for_backward([]) + else: + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) > digit_version('1.2')): + self.require_forward_param_sync = False + return output + + def val_step(self, *inputs, **kwargs): + """val_step() API for module wrapped by DistributedDataParallel. + + This method is basically the same as + ``DistributedDataParallel.forward()``, while replacing + ``self.module.forward()`` with ``self.module.val_step()``. + It is compatible with PyTorch 1.1 - 1.5. + """ + # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the + # end of backward to the beginning of forward. 
+ if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) >= digit_version('1.7') + and self.reducer._rebuild_buckets()): + print_log( + 'Reducer buckets have been rebuilt in this iteration.', + logger='mmcv') + + if getattr(self, 'require_forward_param_sync', True): + self._sync_params() + if self.device_ids: + inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) + if len(self.device_ids) == 1: + output = self.module.val_step(*inputs[0], **kwargs[0]) + else: + outputs = self.parallel_apply( + self._module_copies[:len(inputs)], inputs, kwargs) + output = self.gather(outputs, self.output_device) + else: + output = self.module.val_step(*inputs, **kwargs) + + if torch.is_grad_enabled() and getattr( + self, 'require_backward_grad_sync', True): + if self.find_unused_parameters: + self.reducer.prepare_for_backward(list(_find_tensors(output))) + else: + self.reducer.prepare_for_backward([]) + else: + if ('parrots' not in TORCH_VERSION + and digit_version(TORCH_VERSION) > digit_version('1.2')): + self.require_forward_param_sync = False + return output diff --git a/custom_mmpkg/custom_mmcv/parallel/distributed_deprecated.py b/custom_mmpkg/custom_mmcv/parallel/distributed_deprecated.py new file mode 100644 index 0000000000000000000000000000000000000000..d31f7be0eb5b7f92c0d2fca6faca69152472ac27 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/distributed_deprecated.py @@ -0,0 +1,70 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.distributed as dist +import torch.nn as nn +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + +from custom_mmpkg.custom_mmcv.utils import TORCH_VERSION, digit_version +from .registry import MODULE_WRAPPERS +from .scatter_gather import scatter_kwargs + + +@MODULE_WRAPPERS.register_module() +class MMDistributedDataParallel(nn.Module): + + def __init__(self, + module, + dim=0, + broadcast_buffers=True, + bucket_cap_mb=25): + super(MMDistributedDataParallel, self).__init__() + self.module = module + self.dim = dim + self.broadcast_buffers = broadcast_buffers + + self.broadcast_bucket_size = bucket_cap_mb * 1024 * 1024 + self._sync_params() + + def _dist_broadcast_coalesced(self, tensors, buffer_size): + for tensors in _take_tensors(tensors, buffer_size): + flat_tensors = _flatten_dense_tensors(tensors) + dist.broadcast(flat_tensors, 0) + for tensor, synced in zip( + tensors, _unflatten_dense_tensors(flat_tensors, tensors)): + tensor.copy_(synced) + + def _sync_params(self): + module_states = list(self.module.state_dict().values()) + if len(module_states) > 0: + self._dist_broadcast_coalesced(module_states, + self.broadcast_bucket_size) + if self.broadcast_buffers: + if (TORCH_VERSION != 'parrots' + and digit_version(TORCH_VERSION) < digit_version('1.0')): + buffers = [b.data for b in self.module._all_buffers()] + else: + buffers = [b.data for b in self.module.buffers()] + if len(buffers) > 0: + self._dist_broadcast_coalesced(buffers, + self.broadcast_bucket_size) + + def scatter(self, inputs, kwargs, device_ids): + return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) + + def forward(self, *inputs, **kwargs): + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + return self.module(*inputs[0], **kwargs[0]) + + def train_step(self, *inputs, **kwargs): + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + output = self.module.train_step(*inputs[0], **kwargs[0]) + return output + + def val_step(self, *inputs, 
**kwargs): + inputs, kwargs = self.scatter(inputs, kwargs, + [torch.cuda.current_device()]) + output = self.module.val_step(*inputs[0], **kwargs[0]) + return output diff --git a/custom_mmpkg/custom_mmcv/parallel/registry.py b/custom_mmpkg/custom_mmcv/parallel/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..3a0e9f6639628c444e4682d639eabbef76114d01 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/registry.py @@ -0,0 +1,8 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from torch.nn.parallel import DataParallel, DistributedDataParallel + +from custom_mmpkg.custom_mmcv.utils import Registry + +MODULE_WRAPPERS = Registry('module wrapper') +MODULE_WRAPPERS.register_module(module=DataParallel) +MODULE_WRAPPERS.register_module(module=DistributedDataParallel) diff --git a/custom_mmpkg/custom_mmcv/parallel/scatter_gather.py b/custom_mmpkg/custom_mmcv/parallel/scatter_gather.py new file mode 100644 index 0000000000000000000000000000000000000000..900ff88566f8f14830590459dc4fd16d4b382e47 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/scatter_gather.py @@ -0,0 +1,59 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch.nn.parallel._functions import Scatter as OrigScatter + +from ._functions import Scatter +from .data_container import DataContainer + + +def scatter(inputs, target_gpus, dim=0): + """Scatter inputs to target gpus. + + The only difference from original :func:`scatter` is to add support for + :type:`~mmcv.parallel.DataContainer`. + """ + + def scatter_map(obj): + if isinstance(obj, torch.Tensor): + if target_gpus != [-1]: + return OrigScatter.apply(target_gpus, None, dim, obj) + else: + # for CPU inference we use self-implemented scatter + return Scatter.forward(target_gpus, obj) + if isinstance(obj, DataContainer): + if obj.cpu_only: + return obj.data + else: + return Scatter.forward(target_gpus, obj.data) + if isinstance(obj, tuple) and len(obj) > 0: + return list(zip(*map(scatter_map, obj))) + if isinstance(obj, list) and len(obj) > 0: + out = list(map(list, zip(*map(scatter_map, obj)))) + return out + if isinstance(obj, dict) and len(obj) > 0: + out = list(map(type(obj), zip(*map(scatter_map, obj.items())))) + return out + return [obj for targets in target_gpus] + + # After scatter_map is called, a scatter_map cell will exist. This cell + # has a reference to the actual function scatter_map, which has references + # to a closure that has a reference to the scatter_map cell (because the + # fn is recursive). To avoid this reference cycle, we set the function to + # None, clearing the cell + try: + return scatter_map(inputs) + finally: + scatter_map = None + + +def scatter_kwargs(inputs, kwargs, target_gpus, dim=0): + """Scatter with support for kwargs dictionary.""" + inputs = scatter(inputs, target_gpus, dim) if inputs else [] + kwargs = scatter(kwargs, target_gpus, dim) if kwargs else [] + if len(inputs) < len(kwargs): + inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) + elif len(kwargs) < len(inputs): + kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) + inputs = tuple(inputs) + kwargs = tuple(kwargs) + return inputs, kwargs diff --git a/custom_mmpkg/custom_mmcv/parallel/utils.py b/custom_mmpkg/custom_mmcv/parallel/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..0f5712cb42c38a2e8563bf563efb6681383cab9b --- /dev/null +++ b/custom_mmpkg/custom_mmcv/parallel/utils.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
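The `MODULE_WRAPPERS` registry above and the `is_module_wrapper()` helper defined just below are meant to be used together: registering a custom wrapper class makes the rest of the package treat it like `DataParallel`. A minimal sketch (the `MyWrapper` class is hypothetical and not part of the patch):

```python
import torch.nn as nn
from custom_mmpkg.custom_mmcv.parallel import MODULE_WRAPPERS, is_module_wrapper


@MODULE_WRAPPERS.register_module()
class MyWrapper(nn.Module):
    """Hypothetical wrapper that simply forwards to the wrapped module."""

    def __init__(self, module):
        super().__init__()
        self.module = module

    def forward(self, *args, **kwargs):
        return self.module(*args, **kwargs)


wrapped = MyWrapper(nn.Linear(4, 2))
assert is_module_wrapper(wrapped)              # recognized via the registry
assert not is_module_wrapper(nn.Linear(4, 2))  # plain modules are not wrappers
```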
+from .registry import MODULE_WRAPPERS + + +def is_module_wrapper(module): + """Check if a module is a module wrapper. + + The following 3 modules in MMCV (and their subclasses) are regarded as + module wrappers: DataParallel, DistributedDataParallel, + MMDistributedDataParallel (the deprecated version). You may add you own + module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS. + + Args: + module (nn.Module): The module to be checked. + + Returns: + bool: True if the input module is a module wrapper. + """ + module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values()) + return isinstance(module, module_wrappers) diff --git a/custom_mmpkg/custom_mmcv/runner/__init__.py b/custom_mmpkg/custom_mmcv/runner/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..52e4b48d383a84a055dcd7f6236f6e8e58eab924 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/__init__.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .base_module import BaseModule, ModuleList, Sequential +from .base_runner import BaseRunner +from .builder import RUNNERS, build_runner +from .checkpoint import (CheckpointLoader, _load_checkpoint, + _load_checkpoint_with_prefix, load_checkpoint, + load_state_dict, save_checkpoint, weights_to_cpu) +from .default_constructor import DefaultRunnerConstructor +from .dist_utils import (allreduce_grads, allreduce_params, get_dist_info, + init_dist, master_only) +from .epoch_based_runner import EpochBasedRunner, Runner +from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model +from .hooks import (HOOKS, CheckpointHook, ClosureHook, DistEvalHook, + DistSamplerSeedHook, DvcliveLoggerHook, EMAHook, EvalHook, + Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook, + GradientCumulativeOptimizerHook, Hook, IterTimerHook, + LoggerHook, LrUpdaterHook, MlflowLoggerHook, + NeptuneLoggerHook, OptimizerHook, PaviLoggerHook, + SyncBuffersHook, TensorboardLoggerHook, TextLoggerHook, + WandbLoggerHook) +from .iter_based_runner import IterBasedRunner, IterLoader +from .log_buffer import LogBuffer +from .optimizer import (OPTIMIZER_BUILDERS, OPTIMIZERS, + DefaultOptimizerConstructor, build_optimizer, + build_optimizer_constructor) +from .priority import Priority, get_priority +from .utils import get_host_info, get_time_str, obj_from_dict, set_random_seed + +__all__ = [ + 'BaseRunner', 'Runner', 'EpochBasedRunner', 'IterBasedRunner', 'LogBuffer', + 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook', + 'OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook', 'LoggerHook', + 'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook', + 'NeptuneLoggerHook', 'WandbLoggerHook', 'MlflowLoggerHook', + 'DvcliveLoggerHook', '_load_checkpoint', 'load_state_dict', + 'load_checkpoint', 'weights_to_cpu', 'save_checkpoint', 'Priority', + 'get_priority', 'get_host_info', 'get_time_str', 'obj_from_dict', + 'init_dist', 'get_dist_info', 'master_only', 'OPTIMIZER_BUILDERS', + 'OPTIMIZERS', 'DefaultOptimizerConstructor', 'build_optimizer', + 'build_optimizer_constructor', 'IterLoader', 'set_random_seed', + 'auto_fp16', 'force_fp32', 'wrap_fp16_model', 'Fp16OptimizerHook', + 'SyncBuffersHook', 'EMAHook', 'build_runner', 'RUNNERS', 'allreduce_grads', + 'allreduce_params', 'LossScaler', 'CheckpointLoader', 'BaseModule', + '_load_checkpoint_with_prefix', 'EvalHook', 'DistEvalHook', 'Sequential', + 'ModuleList', 'GradientCumulativeOptimizerHook', + 'GradientCumulativeFp16OptimizerHook', 'DefaultRunnerConstructor' +] diff --git 
a/custom_mmpkg/custom_mmcv/runner/base_module.py b/custom_mmpkg/custom_mmcv/runner/base_module.py new file mode 100644 index 0000000000000000000000000000000000000000..362b0ae39a9e5e92b22f52918eaecc11dfde10b3 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/base_module.py @@ -0,0 +1,195 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import warnings +from abc import ABCMeta +from collections import defaultdict +from logging import FileHandler + +import torch.nn as nn + +from custom_mmpkg.custom_mmcv.runner.dist_utils import master_only +from custom_mmpkg.custom_mmcv.utils.logging import get_logger, logger_initialized, print_log + + +class BaseModule(nn.Module, metaclass=ABCMeta): + """Base module for all modules in openmmlab. + + ``BaseModule`` is a wrapper of ``torch.nn.Module`` with additional + functionality of parameter initialization. Compared with + ``torch.nn.Module``, ``BaseModule`` mainly adds three attributes. + + - ``init_cfg``: the config to control the initialization. + - ``init_weights``: The function of parameter + initialization and recording initialization + information. + - ``_params_init_info``: Used to track the parameter + initialization information. This attribute only + exists during executing the ``init_weights``. + + Args: + init_cfg (dict, optional): Initialization config dict. + """ + + def __init__(self, init_cfg=None): + """Initialize BaseModule, inherited from `torch.nn.Module`""" + + # NOTE init_cfg can be defined in different levels, but init_cfg + # in low levels has a higher priority. + + super(BaseModule, self).__init__() + # define default value of init_cfg instead of hard code + # in init_weights() function + self._is_init = False + + self.init_cfg = copy.deepcopy(init_cfg) + + # Backward compatibility in derived classes + # if pretrained is not None: + # warnings.warn('DeprecationWarning: pretrained is a deprecated \ + # key, please consider using init_cfg') + # self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + + @property + def is_init(self): + return self._is_init + + def init_weights(self): + """Initialize the weights.""" + + is_top_level_module = False + # check if it is top-level module + if not hasattr(self, '_params_init_info'): + # The `_params_init_info` is used to record the initialization + # information of the parameters + # the key should be the obj:`nn.Parameter` of model and the value + # should be a dict containing + # - init_info (str): The string that describes the initialization. + # - tmp_mean_value (FloatTensor): The mean of the parameter, + # which indicates whether the parameter has been modified. + # this attribute would be deleted after all parameters + # is initialized. + self._params_init_info = defaultdict(dict) + is_top_level_module = True + + # Initialize the `_params_init_info`, + # When detecting the `tmp_mean_value` of + # the corresponding parameter is changed, update related + # initialization information + for name, param in self.named_parameters(): + self._params_init_info[param][ + 'init_info'] = f'The value is the same before and ' \ + f'after calling `init_weights` ' \ + f'of {self.__class__.__name__} ' + self._params_init_info[param][ + 'tmp_mean_value'] = param.data.mean() + + # pass `params_init_info` to all submodules + # All submodules share the same `params_init_info`, + # so it will be updated when parameters are + # modified at any level of the model. 
+ for sub_module in self.modules(): + sub_module._params_init_info = self._params_init_info + + # Get the initialized logger, if not exist, + # create a logger named `mmcv` + logger_names = list(logger_initialized.keys()) + logger_name = logger_names[0] if logger_names else 'mmcv' + + from ..cnn import initialize + from ..cnn.utils.weight_init import update_init_info + module_name = self.__class__.__name__ + if not self._is_init: + if self.init_cfg: + print_log( + f'initialize {module_name} with init_cfg {self.init_cfg}', + logger=logger_name) + initialize(self, self.init_cfg) + if isinstance(self.init_cfg, dict): + # prevent the parameters of + # the pre-trained model + # from being overwritten by + # the `init_weights` + if self.init_cfg['type'] == 'Pretrained': + return + + for m in self.children(): + if hasattr(m, 'init_weights'): + m.init_weights() + # users may overload the `init_weights` + update_init_info( + m, + init_info=f'Initialized by ' + f'user-defined `init_weights`' + f' in {m.__class__.__name__} ') + + self._is_init = True + else: + warnings.warn(f'init_weights of {self.__class__.__name__} has ' + f'been called more than once.') + + if is_top_level_module: + self._dump_init_info(logger_name) + + for sub_module in self.modules(): + del sub_module._params_init_info + + @master_only + def _dump_init_info(self, logger_name): + """Dump the initialization information to a file named + `initialization.log.json` in workdir. + + Args: + logger_name (str): The name of logger. + """ + + logger = get_logger(logger_name) + + with_file_handler = False + # dump the information to the logger file if there is a `FileHandler` + for handler in logger.handlers: + if isinstance(handler, FileHandler): + handler.stream.write( + 'Name of parameter - Initialization information\n') + for name, param in self.named_parameters(): + handler.stream.write( + f'\n{name} - {param.shape}: ' + f"\n{self._params_init_info[param]['init_info']} \n") + handler.stream.flush() + with_file_handler = True + if not with_file_handler: + for name, param in self.named_parameters(): + print_log( + f'\n{name} - {param.shape}: ' + f"\n{self._params_init_info[param]['init_info']} \n ", + logger=logger_name) + + def __repr__(self): + s = super().__repr__() + if self.init_cfg: + s += f'\ninit_cfg={self.init_cfg}' + return s + + +class Sequential(BaseModule, nn.Sequential): + """Sequential module in openmmlab. + + Args: + init_cfg (dict, optional): Initialization config dict. + """ + + def __init__(self, *args, init_cfg=None): + BaseModule.__init__(self, init_cfg) + nn.Sequential.__init__(self, *args) + + +class ModuleList(BaseModule, nn.ModuleList): + """ModuleList in openmmlab. + + Args: + modules (iterable, optional): an iterable of modules to add. + init_cfg (dict, optional): Initialization config dict. + """ + + def __init__(self, modules=None, init_cfg=None): + BaseModule.__init__(self, init_cfg) + nn.ModuleList.__init__(self, modules) diff --git a/custom_mmpkg/custom_mmcv/runner/base_runner.py b/custom_mmpkg/custom_mmcv/runner/base_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..6f15d71940ae558c10fcd4372d0c87f1efde93a9 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/base_runner.py @@ -0,0 +1,542 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
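A minimal sketch of how a subclass typically uses the `BaseModule` class above: pass an `init_cfg` and call `init_weights()` once the model is built. The `ToyHead` class is hypothetical and not part of the patch; it assumes the standard mmcv initializers (e.g. `Kaiming`) shipped with `custom_mmpkg.custom_mmcv.cnn` are available:

```python
import torch.nn as nn
from custom_mmpkg.custom_mmcv.runner import BaseModule


class ToyHead(BaseModule):
    """Hypothetical head whose conv is initialized from ``init_cfg``."""

    def __init__(self, init_cfg=None):
        if init_cfg is None:
            init_cfg = dict(type='Kaiming', layer='Conv2d')
        super().__init__(init_cfg=init_cfg)
        self.conv = nn.Conv2d(16, 16, 3, padding=1)

    def forward(self, x):
        return self.conv(x)


head = ToyHead()
head.init_weights()  # applies the Kaiming init and logs per-parameter init info
```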
+import copy +import logging +import os.path as osp +import warnings +from abc import ABCMeta, abstractmethod + +import torch +from torch.optim import Optimizer + +import custom_mmpkg.custom_mmcv as mmcv +from ..parallel import is_module_wrapper +from .checkpoint import load_checkpoint +from .dist_utils import get_dist_info +from .hooks import HOOKS, Hook +from .log_buffer import LogBuffer +from .priority import Priority, get_priority +from .utils import get_time_str + + +class BaseRunner(metaclass=ABCMeta): + """The base class of Runner, a training helper for PyTorch. + + All subclasses should implement the following APIs: + + - ``run()`` + - ``train()`` + - ``val()`` + - ``save_checkpoint()`` + + Args: + model (:obj:`torch.nn.Module`): The model to be run. + batch_processor (callable): A callable method that process a data + batch. The interface of this method should be + `batch_processor(model, data, train_mode) -> dict` + optimizer (dict or :obj:`torch.optim.Optimizer`): It can be either an + optimizer (in most cases) or a dict of optimizers (in models that + requires more than one optimizer, e.g., GAN). + work_dir (str, optional): The working directory to save checkpoints + and logs. Defaults to None. + logger (:obj:`logging.Logger`): Logger used during training. + Defaults to None. (The default value is just for backward + compatibility) + meta (dict | None): A dict records some import information such as + environment info and seed, which will be logged in logger hook. + Defaults to None. + max_epochs (int, optional): Total training epochs. + max_iters (int, optional): Total training iterations. + """ + + def __init__(self, + model, + batch_processor=None, + optimizer=None, + work_dir=None, + logger=None, + meta=None, + max_iters=None, + max_epochs=None): + if batch_processor is not None: + if not callable(batch_processor): + raise TypeError('batch_processor must be callable, ' + f'but got {type(batch_processor)}') + warnings.warn('batch_processor is deprecated, please implement ' + 'train_step() and val_step() in the model instead.') + # raise an error is `batch_processor` is not None and + # `model.train_step()` exists. 
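The docstring above spells out the two ways a `BaseRunner` can drive a model: a (deprecated) standalone `batch_processor(model, data, train_mode) -> dict`, or `train_step()`/`val_step()` methods on the model itself. Below is a hedged sketch of both contracts; `ToyModel` and the `'loss'` key convention (consumed by mmcv's optimizer hook) are assumptions, while the dict return type and the `'log_vars'`/`'num_samples'` keys are what the runner code in this diff actually uses.

```python
import torch
import torch.nn as nn


def batch_processor(model, data, train_mode):
    """Deprecated interface: must return a dict."""
    loss = model(data).mean()
    return dict(loss=loss,
                log_vars=dict(loss=loss.item()),
                num_samples=data.size(0))


class ToyModel(nn.Module):
    """Preferred interface: the model owns train_step()/val_step()."""

    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(4, 1)

    def forward(self, x):
        return self.fc(x)

    def train_step(self, data_batch, optimizer, **kwargs):
        loss = self(data_batch).mean()
        return dict(loss=loss,
                    log_vars=dict(loss=loss.item()),
                    num_samples=data_batch.size(0))

    def val_step(self, data_batch, optimizer, **kwargs):
        return self.train_step(data_batch, optimizer, **kwargs)
```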
+ if is_module_wrapper(model): + _model = model.module + else: + _model = model + if hasattr(_model, 'train_step') or hasattr(_model, 'val_step'): + raise RuntimeError( + 'batch_processor and model.train_step()/model.val_step() ' + 'cannot be both available.') + else: + assert hasattr(model, 'train_step') + + # check the type of `optimizer` + if isinstance(optimizer, dict): + for name, optim in optimizer.items(): + if not isinstance(optim, Optimizer): + raise TypeError( + f'optimizer must be a dict of torch.optim.Optimizers, ' + f'but optimizer["{name}"] is a {type(optim)}') + elif not isinstance(optimizer, Optimizer) and optimizer is not None: + raise TypeError( + f'optimizer must be a torch.optim.Optimizer object ' + f'or dict or None, but got {type(optimizer)}') + + # check the type of `logger` + if not isinstance(logger, logging.Logger): + raise TypeError(f'logger must be a logging.Logger object, ' + f'but got {type(logger)}') + + # check the type of `meta` + if meta is not None and not isinstance(meta, dict): + raise TypeError( + f'meta must be a dict or None, but got {type(meta)}') + + self.model = model + self.batch_processor = batch_processor + self.optimizer = optimizer + self.logger = logger + self.meta = meta + # create work_dir + if mmcv.is_str(work_dir): + self.work_dir = osp.abspath(work_dir) + mmcv.mkdir_or_exist(self.work_dir) + elif work_dir is None: + self.work_dir = None + else: + raise TypeError('"work_dir" must be a str or None') + + # get model name from the model class + if hasattr(self.model, 'module'): + self._model_name = self.model.module.__class__.__name__ + else: + self._model_name = self.model.__class__.__name__ + + self._rank, self._world_size = get_dist_info() + self.timestamp = get_time_str() + self.mode = None + self._hooks = [] + self._epoch = 0 + self._iter = 0 + self._inner_iter = 0 + + if max_epochs is not None and max_iters is not None: + raise ValueError( + 'Only one of `max_epochs` or `max_iters` can be set.') + + self._max_epochs = max_epochs + self._max_iters = max_iters + # TODO: Redesign LogBuffer, it is not flexible and elegant enough + self.log_buffer = LogBuffer() + + @property + def model_name(self): + """str: Name of the model, usually the module class name.""" + return self._model_name + + @property + def rank(self): + """int: Rank of current process. (distributed training)""" + return self._rank + + @property + def world_size(self): + """int: Number of processes participating in the job. + (distributed training)""" + return self._world_size + + @property + def hooks(self): + """list[:obj:`Hook`]: A list of registered hooks.""" + return self._hooks + + @property + def epoch(self): + """int: Current epoch.""" + return self._epoch + + @property + def iter(self): + """int: Current iteration.""" + return self._iter + + @property + def inner_iter(self): + """int: Iteration in an epoch.""" + return self._inner_iter + + @property + def max_epochs(self): + """int: Maximum training epochs.""" + return self._max_epochs + + @property + def max_iters(self): + """int: Maximum training iterations.""" + return self._max_iters + + @abstractmethod + def train(self): + pass + + @abstractmethod + def val(self): + pass + + @abstractmethod + def run(self, data_loaders, workflow, **kwargs): + pass + + @abstractmethod + def save_checkpoint(self, + out_dir, + filename_tmpl, + save_optimizer=True, + meta=None, + create_symlink=True): + pass + + def current_lr(self): + """Get current learning rates. 
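The type checks above allow `optimizer` to be either a single `torch.optim.Optimizer` or a named dict of optimizers (the GAN-style setup mentioned in the docstring); `current_lr()`/`current_momentum()` then return a dict keyed the same way. Illustrative sketch:

```python
import torch
import torch.nn as nn

gen, disc = nn.Linear(8, 8), nn.Linear(8, 1)

# single optimizer: current_lr() returns a plain list of group lrs
single_optimizer = torch.optim.SGD(gen.parameters(), lr=0.1)

# dict of optimizers: current_lr() returns {'generator': [...], ...}
optimizer_dict = dict(
    generator=torch.optim.Adam(gen.parameters(), lr=1e-4),
    discriminator=torch.optim.Adam(disc.parameters(), lr=4e-4))
```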
+ + Returns: + list[float] | dict[str, list[float]]: Current learning rates of all + param groups. If the runner has a dict of optimizers, this + method will return a dict. + """ + if isinstance(self.optimizer, torch.optim.Optimizer): + lr = [group['lr'] for group in self.optimizer.param_groups] + elif isinstance(self.optimizer, dict): + lr = dict() + for name, optim in self.optimizer.items(): + lr[name] = [group['lr'] for group in optim.param_groups] + else: + raise RuntimeError( + 'lr is not applicable because optimizer does not exist.') + return lr + + def current_momentum(self): + """Get current momentums. + + Returns: + list[float] | dict[str, list[float]]: Current momentums of all + param groups. If the runner has a dict of optimizers, this + method will return a dict. + """ + + def _get_momentum(optimizer): + momentums = [] + for group in optimizer.param_groups: + if 'momentum' in group.keys(): + momentums.append(group['momentum']) + elif 'betas' in group.keys(): + momentums.append(group['betas'][0]) + else: + momentums.append(0) + return momentums + + if self.optimizer is None: + raise RuntimeError( + 'momentum is not applicable because optimizer does not exist.') + elif isinstance(self.optimizer, torch.optim.Optimizer): + momentums = _get_momentum(self.optimizer) + elif isinstance(self.optimizer, dict): + momentums = dict() + for name, optim in self.optimizer.items(): + momentums[name] = _get_momentum(optim) + return momentums + + def register_hook(self, hook, priority='NORMAL'): + """Register a hook into the hook list. + + The hook will be inserted into a priority queue, with the specified + priority (See :class:`Priority` for details of priorities). + For hooks with the same priority, they will be triggered in the same + order as they are registered. + + Args: + hook (:obj:`Hook`): The hook to be registered. + priority (int or str or :obj:`Priority`): Hook priority. + Lower value means higher priority. + """ + assert isinstance(hook, Hook) + if hasattr(hook, 'priority'): + raise ValueError('"priority" is a reserved attribute for hooks') + priority = get_priority(priority) + hook.priority = priority + # insert the hook to a sorted list + inserted = False + for i in range(len(self._hooks) - 1, -1, -1): + if priority >= self._hooks[i].priority: + self._hooks.insert(i + 1, hook) + inserted = True + break + if not inserted: + self._hooks.insert(0, hook) + + def register_hook_from_cfg(self, hook_cfg): + """Register a hook from its cfg. + + Args: + hook_cfg (dict): Hook config. It should have at least keys 'type' + and 'priority' indicating its type and priority. + + Notes: + The specific hook class to register should not use 'type' and + 'priority' arguments during initialization. + """ + hook_cfg = hook_cfg.copy() + priority = hook_cfg.pop('priority', 'NORMAL') + hook = mmcv.build_from_cfg(hook_cfg, HOOKS) + self.register_hook(hook, priority=priority) + + def call_hook(self, fn_name): + """Call all hooks. + + Args: + fn_name (str): The function name in each hook to be called, such as + "before_train_epoch". 
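Hooks are kept in a list sorted by priority (lower value runs earlier, ties keep registration order), and `register_hook_from_cfg()` builds the hook from the `HOOKS` registry first. A runnable sketch using the concrete `EpochBasedRunner` defined later in this diff; `NoisyHook` and `DummyModel` are illustrative.

```python
import logging

import torch.nn as nn

from custom_mmpkg.custom_mmcv.runner.epoch_based_runner import EpochBasedRunner
from custom_mmpkg.custom_mmcv.runner.hooks import Hook


class NoisyHook(Hook):
    """Toy hook used only to show priority-ordered registration."""

    def before_run(self, runner):
        print('NoisyHook.before_run')


class DummyModel(nn.Module):
    """Minimal model satisfying the train_step()/val_step() contract."""

    def train_step(self, data_batch, optimizer, **kwargs):
        return dict(log_vars={}, num_samples=1)

    def val_step(self, data_batch, optimizer, **kwargs):
        return dict(log_vars={}, num_samples=1)


runner = EpochBasedRunner(
    DummyModel(), logger=logging.getLogger('demo'), max_epochs=1)
runner.register_hook(NoisyHook(), priority='VERY_LOW')      # value 90
runner.register_hook_from_cfg(dict(type='IterTimerHook'))   # NORMAL, 50
print(runner.get_hook_info())  # hooks grouped by stage with priorities
```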
+ """ + for hook in self._hooks: + getattr(hook, fn_name)(self) + + def get_hook_info(self): + # Get hooks info in each stage + stage_hook_map = {stage: [] for stage in Hook.stages} + for hook in self.hooks: + try: + priority = Priority(hook.priority).name + except ValueError: + priority = hook.priority + classname = hook.__class__.__name__ + hook_info = f'({priority:<12}) {classname:<35}' + for trigger_stage in hook.get_triggered_stages(): + stage_hook_map[trigger_stage].append(hook_info) + + stage_hook_infos = [] + for stage in Hook.stages: + hook_infos = stage_hook_map[stage] + if len(hook_infos) > 0: + info = f'{stage}:\n' + info += '\n'.join(hook_infos) + info += '\n -------------------- ' + stage_hook_infos.append(info) + return '\n'.join(stage_hook_infos) + + def load_checkpoint(self, + filename, + map_location='cpu', + strict=False, + revise_keys=[(r'^module.', '')]): + return load_checkpoint( + self.model, + filename, + map_location, + strict, + self.logger, + revise_keys=revise_keys) + + def resume(self, + checkpoint, + resume_optimizer=True, + map_location='default'): + if map_location == 'default': + if torch.cuda.is_available(): + device_id = torch.cuda.current_device() + checkpoint = self.load_checkpoint( + checkpoint, + map_location=lambda storage, loc: storage.cuda(device_id)) + else: + checkpoint = self.load_checkpoint(checkpoint) + else: + checkpoint = self.load_checkpoint( + checkpoint, map_location=map_location) + + self._epoch = checkpoint['meta']['epoch'] + self._iter = checkpoint['meta']['iter'] + if self.meta is None: + self.meta = {} + self.meta.setdefault('hook_msgs', {}) + # load `last_ckpt`, `best_score`, `best_ckpt`, etc. for hook messages + self.meta['hook_msgs'].update(checkpoint['meta'].get('hook_msgs', {})) + + # Re-calculate the number of iterations when resuming + # models with different number of GPUs + if 'config' in checkpoint['meta']: + config = mmcv.Config.fromstring( + checkpoint['meta']['config'], file_format='.py') + previous_gpu_ids = config.get('gpu_ids', None) + if previous_gpu_ids and len(previous_gpu_ids) > 0 and len( + previous_gpu_ids) != self.world_size: + self._iter = int(self._iter * len(previous_gpu_ids) / + self.world_size) + self.logger.info('the iteration number is changed due to ' + 'change of GPU number') + + # resume meta information meta + self.meta = checkpoint['meta'] + + if 'optimizer' in checkpoint and resume_optimizer: + if isinstance(self.optimizer, Optimizer): + self.optimizer.load_state_dict(checkpoint['optimizer']) + elif isinstance(self.optimizer, dict): + for k in self.optimizer.keys(): + self.optimizer[k].load_state_dict( + checkpoint['optimizer'][k]) + else: + raise TypeError( + 'Optimizer should be dict or torch.optim.Optimizer ' + f'but got {type(self.optimizer)}') + + self.logger.info('resumed epoch %d, iter %d', self.epoch, self.iter) + + def register_lr_hook(self, lr_config): + if lr_config is None: + return + elif isinstance(lr_config, dict): + assert 'policy' in lr_config + policy_type = lr_config.pop('policy') + # If the type of policy is all in lower case, e.g., 'cyclic', + # then its first letter will be capitalized, e.g., to be 'Cyclic'. + # This is for the convenient usage of Lr updater. + # Since this is not applicable for ` + # CosineAnnealingLrUpdater`, + # the string will not be changed if it contains capital letters. 
+ if policy_type == policy_type.lower(): + policy_type = policy_type.title() + hook_type = policy_type + 'LrUpdaterHook' + lr_config['type'] = hook_type + hook = mmcv.build_from_cfg(lr_config, HOOKS) + else: + hook = lr_config + self.register_hook(hook, priority='VERY_HIGH') + + def register_momentum_hook(self, momentum_config): + if momentum_config is None: + return + if isinstance(momentum_config, dict): + assert 'policy' in momentum_config + policy_type = momentum_config.pop('policy') + # If the type of policy is all in lower case, e.g., 'cyclic', + # then its first letter will be capitalized, e.g., to be 'Cyclic'. + # This is for the convenient usage of momentum updater. + # Since this is not applicable for + # `CosineAnnealingMomentumUpdater`, + # the string will not be changed if it contains capital letters. + if policy_type == policy_type.lower(): + policy_type = policy_type.title() + hook_type = policy_type + 'MomentumUpdaterHook' + momentum_config['type'] = hook_type + hook = mmcv.build_from_cfg(momentum_config, HOOKS) + else: + hook = momentum_config + self.register_hook(hook, priority='HIGH') + + def register_optimizer_hook(self, optimizer_config): + if optimizer_config is None: + return + if isinstance(optimizer_config, dict): + optimizer_config.setdefault('type', 'OptimizerHook') + hook = mmcv.build_from_cfg(optimizer_config, HOOKS) + else: + hook = optimizer_config + self.register_hook(hook, priority='ABOVE_NORMAL') + + def register_checkpoint_hook(self, checkpoint_config): + if checkpoint_config is None: + return + if isinstance(checkpoint_config, dict): + checkpoint_config.setdefault('type', 'CheckpointHook') + hook = mmcv.build_from_cfg(checkpoint_config, HOOKS) + else: + hook = checkpoint_config + self.register_hook(hook, priority='NORMAL') + + def register_logger_hooks(self, log_config): + if log_config is None: + return + log_interval = log_config['interval'] + for info in log_config['hooks']: + logger_hook = mmcv.build_from_cfg( + info, HOOKS, default_args=dict(interval=log_interval)) + self.register_hook(logger_hook, priority='VERY_LOW') + + def register_timer_hook(self, timer_config): + if timer_config is None: + return + if isinstance(timer_config, dict): + timer_config_ = copy.deepcopy(timer_config) + hook = mmcv.build_from_cfg(timer_config_, HOOKS) + else: + hook = timer_config + self.register_hook(hook, priority='LOW') + + def register_custom_hooks(self, custom_config): + if custom_config is None: + return + + if not isinstance(custom_config, list): + custom_config = [custom_config] + + for item in custom_config: + if isinstance(item, dict): + self.register_hook_from_cfg(item) + else: + self.register_hook(item, priority='NORMAL') + + def register_profiler_hook(self, profiler_config): + if profiler_config is None: + return + if isinstance(profiler_config, dict): + profiler_config.setdefault('type', 'ProfilerHook') + hook = mmcv.build_from_cfg(profiler_config, HOOKS) + else: + hook = profiler_config + self.register_hook(hook) + + def register_training_hooks(self, + lr_config, + optimizer_config=None, + checkpoint_config=None, + log_config=None, + momentum_config=None, + timer_config=dict(type='IterTimerHook'), + custom_hooks_config=None): + """Register default and custom hooks for training. 
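For `register_lr_hook()` and `register_momentum_hook()`, an all-lower-case `policy` is title-cased and suffixed to form the hook class name, so `'step'` resolves to `StepLrUpdaterHook`, while a mixed-case name like `'CosineAnnealing'` is used verbatim. A sketch continuing the toy runner; the config keys (`step`, `grad_clip`, `interval`) follow the standard mmcv hooks, which are not part of this diff.

```python
runner.register_lr_hook(dict(policy='step', step=[8, 11]))   # -> StepLrUpdaterHook
runner.register_momentum_hook(None)                          # None is skipped
runner.register_optimizer_hook(dict(grad_clip=None))         # -> OptimizerHook
runner.register_checkpoint_hook(dict(interval=1))            # -> CheckpointHook
```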
+ + Default and custom hooks include: + + +----------------------+-------------------------+ + | Hooks | Priority | + +======================+=========================+ + | LrUpdaterHook | VERY_HIGH (10) | + +----------------------+-------------------------+ + | MomentumUpdaterHook | HIGH (30) | + +----------------------+-------------------------+ + | OptimizerStepperHook | ABOVE_NORMAL (40) | + +----------------------+-------------------------+ + | CheckpointSaverHook | NORMAL (50) | + +----------------------+-------------------------+ + | IterTimerHook | LOW (70) | + +----------------------+-------------------------+ + | LoggerHook(s) | VERY_LOW (90) | + +----------------------+-------------------------+ + | CustomHook(s) | defaults to NORMAL (50) | + +----------------------+-------------------------+ + + If custom hooks have same priority with default hooks, custom hooks + will be triggered after default hooks. + """ + self.register_lr_hook(lr_config) + self.register_momentum_hook(momentum_config) + self.register_optimizer_hook(optimizer_config) + self.register_checkpoint_hook(checkpoint_config) + self.register_timer_hook(timer_config) + self.register_logger_hooks(log_config) + self.register_custom_hooks(custom_hooks_config) diff --git a/custom_mmpkg/custom_mmcv/runner/builder.py b/custom_mmpkg/custom_mmcv/runner/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..77c96ba0b2f30ead9da23f293c5dc84dd3e4a74f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/builder.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +from ..utils import Registry + +RUNNERS = Registry('runner') +RUNNER_BUILDERS = Registry('runner builder') + + +def build_runner_constructor(cfg): + return RUNNER_BUILDERS.build(cfg) + + +def build_runner(cfg, default_args=None): + runner_cfg = copy.deepcopy(cfg) + constructor_type = runner_cfg.pop('constructor', + 'DefaultRunnerConstructor') + runner_constructor = build_runner_constructor( + dict( + type=constructor_type, + runner_cfg=runner_cfg, + default_args=default_args)) + runner = runner_constructor() + return runner diff --git a/custom_mmpkg/custom_mmcv/runner/checkpoint.py b/custom_mmpkg/custom_mmcv/runner/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..da1481088ceb805007b3f1a7cad8bd528d5853f6 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/checkpoint.py @@ -0,0 +1,707 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import io +import os +import os.path as osp +import pkgutil +import re +import time +import warnings +from collections import OrderedDict +from importlib import import_module +from tempfile import TemporaryDirectory + +import torch +import torchvision +from torch.optim import Optimizer +from torch.utils import model_zoo + +import custom_mmpkg.custom_mmcv as mmcv +from ..fileio import FileClient +from ..fileio import load as load_file +from ..parallel import is_module_wrapper +from ..utils import mkdir_or_exist +from .dist_utils import get_dist_info + +ENV_MMCV_HOME = 'MMCV_HOME' +ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' +DEFAULT_CACHE_DIR = '~/.cache' + + +def _get_mmcv_home(): + mmcv_home = os.path.expanduser( + os.getenv( + ENV_MMCV_HOME, + os.path.join( + os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) + + mkdir_or_exist(mmcv_home) + return mmcv_home + + +def load_state_dict(module, state_dict, strict=False, logger=None): + """Load state_dict to a module. + + This method is modified from :meth:`torch.nn.Module.load_state_dict`. 
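`build_runner()` (from `builder.py` in this diff) routes a runner config through a constructor from `RUNNER_BUILDERS` (the `DefaultRunnerConstructor` unless `constructor=` says otherwise) and forwards `default_args` to the runner class. An illustrative sketch reusing `DummyModel` from the registration example above:

```python
import logging

from custom_mmpkg.custom_mmcv.runner.builder import build_runner

runner = build_runner(
    dict(type='EpochBasedRunner', max_epochs=12),
    default_args=dict(
        model=DummyModel(),                  # toy model from the earlier sketch
        logger=logging.getLogger('demo'),
        work_dir='./work_dirs/exp1'))        # illustrative path
```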
+ Default value for ``strict`` is set to ``False`` and the message for + param mismatch will be shown even if strict is False. + + Args: + module (Module): Module that receives the state_dict. + state_dict (OrderedDict): Weights. + strict (bool): whether to strictly enforce that the keys + in :attr:`state_dict` match the keys returned by this module's + :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. + logger (:obj:`logging.Logger`, optional): Logger to log the error + message. If not specified, print function will be used. + """ + unexpected_keys = [] + all_missing_keys = [] + err_msg = [] + + metadata = getattr(state_dict, '_metadata', None) + state_dict = state_dict.copy() + if metadata is not None: + state_dict._metadata = metadata + + # use _load_from_state_dict to enable checkpoint version control + def load(module, prefix=''): + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_module_wrapper(module): + module = module.module + local_metadata = {} if metadata is None else metadata.get( + prefix[:-1], {}) + module._load_from_state_dict(state_dict, prefix, local_metadata, True, + all_missing_keys, unexpected_keys, + err_msg) + for name, child in module._modules.items(): + if child is not None: + load(child, prefix + name + '.') + + load(module) + load = None # break load->load reference cycle + + # ignore "num_batches_tracked" of BN layers + missing_keys = [ + key for key in all_missing_keys if 'num_batches_tracked' not in key + ] + + if unexpected_keys: + err_msg.append('unexpected key in source ' + f'state_dict: {", ".join(unexpected_keys)}\n') + if missing_keys: + err_msg.append( + f'missing keys in source state_dict: {", ".join(missing_keys)}\n') + + rank, _ = get_dist_info() + if len(err_msg) > 0 and rank == 0: + err_msg.insert( + 0, 'The model and loaded state dict do not match exactly\n') + err_msg = '\n'.join(err_msg) + if strict: + raise RuntimeError(err_msg) + elif logger is not None: + logger.warning(err_msg) + else: + print(err_msg) + + +def get_torchvision_models(): + model_urls = dict() + for _, name, ispkg in pkgutil.walk_packages(torchvision.models.__path__): + if ispkg: + continue + _zoo = import_module(f'torchvision.models.{name}') + if hasattr(_zoo, 'model_urls'): + _urls = getattr(_zoo, 'model_urls') + model_urls.update(_urls) + return model_urls + + +def get_external_models(): + mmcv_home = _get_mmcv_home() + default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') + default_urls = load_file(default_json_path) + assert isinstance(default_urls, dict) + external_json_path = osp.join(mmcv_home, 'open_mmlab.json') + if osp.exists(external_json_path): + external_urls = load_file(external_json_path) + assert isinstance(external_urls, dict) + default_urls.update(external_urls) + + return default_urls + + +def get_mmcls_models(): + mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') + mmcls_urls = load_file(mmcls_json_path) + + return mmcls_urls + + +def get_deprecated_model_names(): + deprecate_json_path = osp.join(mmcv.__path__[0], + 'model_zoo/deprecated.json') + deprecate_urls = load_file(deprecate_json_path) + assert isinstance(deprecate_urls, dict) + + return deprecate_urls + + +def _process_mmcls_checkpoint(checkpoint): + state_dict = checkpoint['state_dict'] + new_state_dict = OrderedDict() + for k, v in state_dict.items(): + if k.startswith('backbone.'): + new_state_dict[k[9:]] = v + new_checkpoint = 
dict(state_dict=new_state_dict) + + return new_checkpoint + + +class CheckpointLoader: + """A general checkpoint loader to manage all schemes.""" + + _schemes = {} + + @classmethod + def _register_scheme(cls, prefixes, loader, force=False): + if isinstance(prefixes, str): + prefixes = [prefixes] + else: + assert isinstance(prefixes, (list, tuple)) + for prefix in prefixes: + if (prefix not in cls._schemes) or force: + cls._schemes[prefix] = loader + else: + raise KeyError( + f'{prefix} is already registered as a loader backend, ' + 'add "force=True" if you want to override it') + # sort, longer prefixes take priority + cls._schemes = OrderedDict( + sorted(cls._schemes.items(), key=lambda t: t[0], reverse=True)) + + @classmethod + def register_scheme(cls, prefixes, loader=None, force=False): + """Register a loader to CheckpointLoader. + + This method can be used as a normal class method or a decorator. + + Args: + prefixes (str or list[str] or tuple[str]): + The prefix of the registered loader. + loader (function, optional): The loader function to be registered. + When this method is used as a decorator, loader is None. + Defaults to None. + force (bool, optional): Whether to override the loader + if the prefix has already been registered. Defaults to False. + """ + + if loader is not None: + cls._register_scheme(prefixes, loader, force=force) + return + + def _register(loader_cls): + cls._register_scheme(prefixes, loader_cls, force=force) + return loader_cls + + return _register + + @classmethod + def _get_checkpoint_loader(cls, path): + """Finds a loader that supports the given path. Falls back to the local + loader if no other loader is found. + + Args: + path (str): checkpoint path + + Returns: + loader (function): checkpoint loader + """ + + for p in cls._schemes: + if path.startswith(p): + return cls._schemes[p] + + @classmethod + def load_checkpoint(cls, filename, map_location=None, logger=None): + """load checkpoint through URL scheme path. + + Args: + filename (str): checkpoint file name with given prefix + map_location (str, optional): Same as :func:`torch.load`. + Default: None + logger (:mod:`logging.Logger`, optional): The logger for message. + Default: None + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + checkpoint_loader = cls._get_checkpoint_loader(filename) + class_name = checkpoint_loader.__name__ + mmcv.print_log( + f'load checkpoint from {class_name[10:]} path: {filename}', logger) + return checkpoint_loader(filename, map_location) + + +@CheckpointLoader.register_scheme(prefixes='') +def load_from_local(filename, map_location): + """load checkpoint by local file path. + + Args: + filename (str): local checkpoint file path + map_location (str, optional): Same as :func:`torch.load`. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + if not osp.isfile(filename): + raise IOError(f'{filename} is not a checkpoint file') + checkpoint = torch.load(filename, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes=('http://', 'https://')) +def load_from_http(filename, map_location=None, model_dir=None): + """load checkpoint through HTTP or HTTPS scheme path. In distributed + setting, this function only download checkpoint at local rank 0. + + Args: + filename (str): checkpoint file path with modelzoo or + torchvision prefix + map_location (str, optional): Same as :func:`torch.load`. 
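`CheckpointLoader.register_scheme()` can be used as a decorator to add a loader for a new path prefix, exactly as the built-in schemes below do; longer (more specific) prefixes win over the catch-all `''` local loader. The `myhub://` scheme and paths here are purely illustrative.

```python
import torch

from custom_mmpkg.custom_mmcv.runner.checkpoint import CheckpointLoader


@CheckpointLoader.register_scheme(prefixes='myhub://')
def load_from_myhub(filename, map_location=None):
    """Resolve the made-up scheme to a local cache path (illustrative)."""
    local_path = filename.replace('myhub://', '/tmp/checkpoints/')
    return torch.load(local_path, map_location=map_location)


# 'myhub://...' is now routed to load_from_myhub instead of load_from_local
ckpt = CheckpointLoader.load_checkpoint('myhub://resnet50.pth')
```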
+ model_dir (string, optional): directory in which to save the object, + Default: None + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + rank, world_size = get_dist_info() + rank = int(os.environ.get('LOCAL_RANK', rank)) + if rank == 0: + checkpoint = model_zoo.load_url( + filename, model_dir=model_dir, map_location=map_location) + if world_size > 1: + torch.distributed.barrier() + if rank > 0: + checkpoint = model_zoo.load_url( + filename, model_dir=model_dir, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes='pavi://') +def load_from_pavi(filename, map_location=None): + """load checkpoint through the file path prefixed with pavi. In distributed + setting, this function download ckpt at all ranks to different temporary + directories. + + Args: + filename (str): checkpoint file path with pavi prefix + map_location (str, optional): Same as :func:`torch.load`. + Default: None + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + assert filename.startswith('pavi://'), \ + f'Expected filename startswith `pavi://`, but get {filename}' + model_path = filename[7:] + + try: + from pavi import modelcloud + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + + model = modelcloud.get(model_path) + with TemporaryDirectory() as tmp_dir: + downloaded_file = osp.join(tmp_dir, model.name) + model.download(downloaded_file) + checkpoint = torch.load(downloaded_file, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes='s3://') +def load_from_ceph(filename, map_location=None, backend='petrel'): + """load checkpoint through the file path prefixed with s3. In distributed + setting, this function download ckpt at all ranks to different temporary + directories. + + Args: + filename (str): checkpoint file path with s3 prefix + map_location (str, optional): Same as :func:`torch.load`. + backend (str, optional): The storage backend type. Options are 'ceph', + 'petrel'. Default: 'petrel'. + + .. warning:: + :class:`mmcv.fileio.file_client.CephBackend` will be deprecated, + please use :class:`mmcv.fileio.file_client.PetrelBackend` instead. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + allowed_backends = ['ceph', 'petrel'] + if backend not in allowed_backends: + raise ValueError(f'Load from Backend {backend} is not supported.') + + if backend == 'ceph': + warnings.warn( + 'CephBackend will be deprecated, please use PetrelBackend instead') + + # CephClient and PetrelBackend have the same prefix 's3://' and the latter + # will be chosen as default. If PetrelBackend can not be instantiated + # successfully, the CephClient will be chosen. + try: + file_client = FileClient(backend=backend) + except ImportError: + allowed_backends.remove(backend) + file_client = FileClient(backend=allowed_backends[0]) + + with io.BytesIO(file_client.get(filename)) as buffer: + checkpoint = torch.load(buffer, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes=('modelzoo://', 'torchvision://')) +def load_from_torchvision(filename, map_location=None): + """load checkpoint through the file path prefixed with modelzoo or + torchvision. + + Args: + filename (str): checkpoint file path with modelzoo or + torchvision prefix + map_location (str, optional): Same as :func:`torch.load`. + + Returns: + dict or OrderedDict: The loaded checkpoint. 
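Taken together, the registered schemes mean that the prefix of `filename` alone decides how a checkpoint is fetched; plain paths fall back to `load_from_local`. A short sketch (URLs and paths are placeholders; the `torchvision://` form needs a torchvision version that still exposes `model_urls`):

```python
from custom_mmpkg.custom_mmcv.runner.checkpoint import CheckpointLoader

ckpt_local = CheckpointLoader.load_checkpoint('work_dirs/exp1/latest.pth')
ckpt_http = CheckpointLoader.load_checkpoint(
    'https://example.com/models/toy.pth', map_location='cpu')
ckpt_tv = CheckpointLoader.load_checkpoint(
    'torchvision://resnet18', map_location='cpu')
```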
+ """ + model_urls = get_torchvision_models() + if filename.startswith('modelzoo://'): + warnings.warn('The URL scheme of "modelzoo://" is deprecated, please ' + 'use "torchvision://" instead') + model_name = filename[11:] + else: + model_name = filename[14:] + return load_from_http(model_urls[model_name], map_location=map_location) + + +@CheckpointLoader.register_scheme(prefixes=('open-mmlab://', 'openmmlab://')) +def load_from_openmmlab(filename, map_location=None): + """load checkpoint through the file path prefixed with open-mmlab or + openmmlab. + + Args: + filename (str): checkpoint file path with open-mmlab or + openmmlab prefix + map_location (str, optional): Same as :func:`torch.load`. + Default: None + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + model_urls = get_external_models() + prefix_str = 'open-mmlab://' + if filename.startswith(prefix_str): + model_name = filename[13:] + else: + model_name = filename[12:] + prefix_str = 'openmmlab://' + + deprecated_urls = get_deprecated_model_names() + if model_name in deprecated_urls: + warnings.warn(f'{prefix_str}{model_name} is deprecated in favor ' + f'of {prefix_str}{deprecated_urls[model_name]}') + model_name = deprecated_urls[model_name] + model_url = model_urls[model_name] + # check if is url + if model_url.startswith(('http://', 'https://')): + checkpoint = load_from_http(model_url, map_location=map_location) + else: + filename = osp.join(_get_mmcv_home(), model_url) + if not osp.isfile(filename): + raise IOError(f'{filename} is not a checkpoint file') + checkpoint = torch.load(filename, map_location=map_location) + return checkpoint + + +@CheckpointLoader.register_scheme(prefixes='mmcls://') +def load_from_mmcls(filename, map_location=None): + """load checkpoint through the file path prefixed with mmcls. + + Args: + filename (str): checkpoint file path with mmcls prefix + map_location (str, optional): Same as :func:`torch.load`. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + + model_urls = get_mmcls_models() + model_name = filename[8:] + checkpoint = load_from_http( + model_urls[model_name], map_location=map_location) + checkpoint = _process_mmcls_checkpoint(checkpoint) + return checkpoint + + +def _load_checkpoint(filename, map_location=None, logger=None): + """Load checkpoint from somewhere (modelzoo, file, url). + + Args: + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str, optional): Same as :func:`torch.load`. + Default: None. + logger (:mod:`logging.Logger`, optional): The logger for error message. + Default: None + + Returns: + dict or OrderedDict: The loaded checkpoint. It can be either an + OrderedDict storing model weights or a dict containing other + information, which depends on the checkpoint. + """ + return CheckpointLoader.load_checkpoint(filename, map_location, logger) + + +def _load_checkpoint_with_prefix(prefix, filename, map_location=None): + """Load partial pretrained model with specific prefix. + + Args: + prefix (str): The prefix of sub-module. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str | None): Same as :func:`torch.load`. Default: None. + + Returns: + dict or OrderedDict: The loaded checkpoint. 
+ """ + + checkpoint = _load_checkpoint(filename, map_location=map_location) + + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + if not prefix.endswith('.'): + prefix += '.' + prefix_len = len(prefix) + + state_dict = { + k[prefix_len:]: v + for k, v in state_dict.items() if k.startswith(prefix) + } + + assert state_dict, f'{prefix} is not in the pretrained model' + return state_dict + + +def load_checkpoint(model, + filename, + map_location=None, + strict=False, + logger=None, + revise_keys=[(r'^module\.', '')]): + """Load checkpoint from a file or URI. + + Args: + model (Module): Module to load checkpoint. + filename (str): Accept local filepath, URL, ``torchvision://xxx``, + ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for + details. + map_location (str): Same as :func:`torch.load`. + strict (bool): Whether to allow different params for the model and + checkpoint. + logger (:mod:`logging.Logger` or None): The logger for error message. + revise_keys (list): A list of customized keywords to modify the + state_dict in checkpoint. Each item is a (pattern, replacement) + pair of the regular expression operations. Default: strip + the prefix 'module.' by [(r'^module\\.', '')]. + + Returns: + dict or OrderedDict: The loaded checkpoint. + """ + checkpoint = _load_checkpoint(filename, map_location, logger) + # OrderedDict is a subclass of dict + if not isinstance(checkpoint, dict): + raise RuntimeError( + f'No state_dict found in checkpoint file {filename}') + # get state_dict from checkpoint + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + # strip prefix of state_dict + metadata = getattr(state_dict, '_metadata', OrderedDict()) + for p, r in revise_keys: + state_dict = OrderedDict( + {re.sub(p, r, k): v + for k, v in state_dict.items()}) + # Keep metadata in state_dict + state_dict._metadata = metadata + + # load state_dict + load_state_dict(model, state_dict, strict, logger) + return checkpoint + + +def weights_to_cpu(state_dict): + """Copy a model state_dict to cpu. + + Args: + state_dict (OrderedDict): Model weights on GPU. + + Returns: + OrderedDict: Model weights on GPU. + """ + state_dict_cpu = OrderedDict() + for key, val in state_dict.items(): + state_dict_cpu[key] = val.cpu() + # Keep metadata in state_dict + state_dict_cpu._metadata = getattr(state_dict, '_metadata', OrderedDict()) + return state_dict_cpu + + +def _save_to_state_dict(module, destination, prefix, keep_vars): + """Saves module state to `destination` dictionary. + + This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. + + Args: + module (nn.Module): The module to generate state_dict. + destination (dict): A dict where state will be stored. + prefix (str): The prefix for parameters and buffers used in this + module. + """ + for name, param in module._parameters.items(): + if param is not None: + destination[prefix + name] = param if keep_vars else param.detach() + for name, buf in module._buffers.items(): + # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d + if buf is not None: + destination[prefix + name] = buf if keep_vars else buf.detach() + + +def get_state_dict(module, destination=None, prefix='', keep_vars=False): + """Returns a dictionary containing a whole state of the module. + + Both parameters and persistent buffers (e.g. running averages) are + included. Keys are corresponding parameter and buffer names. 
+ + This method is modified from :meth:`torch.nn.Module.state_dict` to + recursively check parallel module in case that the model has a complicated + structure, e.g., nn.Module(nn.Module(DDP)). + + Args: + module (nn.Module): The module to generate state_dict. + destination (OrderedDict): Returned dict for the state of the + module. + prefix (str): Prefix of the key. + keep_vars (bool): Whether to keep the variable property of the + parameters. Default: False. + + Returns: + dict: A dictionary containing a whole state of the module. + """ + # recursively check parallel module in case that the model has a + # complicated structure, e.g., nn.Module(nn.Module(DDP)) + if is_module_wrapper(module): + module = module.module + + # below is the same as torch.nn.Module.state_dict() + if destination is None: + destination = OrderedDict() + destination._metadata = OrderedDict() + destination._metadata[prefix[:-1]] = local_metadata = dict( + version=module._version) + _save_to_state_dict(module, destination, prefix, keep_vars) + for name, child in module._modules.items(): + if child is not None: + get_state_dict( + child, destination, prefix + name + '.', keep_vars=keep_vars) + for hook in module._state_dict_hooks.values(): + hook_result = hook(module, destination, prefix, local_metadata) + if hook_result is not None: + destination = hook_result + return destination + + +def save_checkpoint(model, + filename, + optimizer=None, + meta=None, + file_client_args=None): + """Save checkpoint to file. + + The checkpoint will have 3 fields: ``meta``, ``state_dict`` and + ``optimizer``. By default ``meta`` will contain version and time info. + + Args: + model (Module): Module whose params are to be saved. + filename (str): Checkpoint filename. + optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. + meta (dict, optional): Metadata to be saved in checkpoint. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. 
+ `New in version 1.3.16.` + """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError(f'meta must be a dict or None, but got {type(meta)}') + meta.update(mmcv_version=mmcv.__version__, time=time.asctime()) + + if is_module_wrapper(model): + model = model.module + + if hasattr(model, 'CLASSES') and model.CLASSES is not None: + # save class name to the meta + meta.update(CLASSES=model.CLASSES) + + checkpoint = { + 'meta': meta, + 'state_dict': weights_to_cpu(get_state_dict(model)) + } + # save optimizer state dict in the checkpoint + if isinstance(optimizer, Optimizer): + checkpoint['optimizer'] = optimizer.state_dict() + elif isinstance(optimizer, dict): + checkpoint['optimizer'] = {} + for name, optim in optimizer.items(): + checkpoint['optimizer'][name] = optim.state_dict() + + if filename.startswith('pavi://'): + if file_client_args is not None: + raise ValueError( + 'file_client_args should be "None" if filename starts with' + f'"pavi://", but got {file_client_args}') + try: + from pavi import modelcloud + from pavi import exception + except ImportError: + raise ImportError( + 'Please install pavi to load checkpoint from modelcloud.') + model_path = filename[7:] + root = modelcloud.Folder() + model_dir, model_name = osp.split(model_path) + try: + model = modelcloud.get(model_dir) + except exception.NodeNotFoundError: + model = root.create_training_model(model_dir) + with TemporaryDirectory() as tmp_dir: + checkpoint_file = osp.join(tmp_dir, model_name) + with open(checkpoint_file, 'wb') as f: + torch.save(checkpoint, f) + f.flush() + model.create_file(checkpoint_file, name=model_name) + else: + file_client = FileClient.infer_client(file_client_args, filename) + with io.BytesIO() as f: + torch.save(checkpoint, f) + file_client.put(f.getvalue(), filename) diff --git a/custom_mmpkg/custom_mmcv/runner/default_constructor.py b/custom_mmpkg/custom_mmcv/runner/default_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..fed2e0c83e19133ce3873ea092c1a872ca254bbf --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/default_constructor.py @@ -0,0 +1,44 @@ +from .builder import RUNNER_BUILDERS, RUNNERS + + +@RUNNER_BUILDERS.register_module() +class DefaultRunnerConstructor: + """Default constructor for runners. + + Custom existing `Runner` like `EpocBasedRunner` though `RunnerConstructor`. + For example, We can inject some new properties and functions for `Runner`. + + Example: + >>> from custom_mmpkg.custom_mmcv.runner import RUNNER_BUILDERS, build_runner + >>> # Define a new RunnerReconstructor + >>> @RUNNER_BUILDERS.register_module() + >>> class MyRunnerConstructor: + ... def __init__(self, runner_cfg, default_args=None): + ... if not isinstance(runner_cfg, dict): + ... raise TypeError('runner_cfg should be a dict', + ... f'but got {type(runner_cfg)}') + ... self.runner_cfg = runner_cfg + ... self.default_args = default_args + ... + ... def __call__(self): + ... runner = RUNNERS.build(self.runner_cfg, + ... default_args=self.default_args) + ... # Add new properties for existing runner + ... runner.my_name = 'my_runner' + ... runner.my_function = lambda self: print(self.my_name) + ... ... + >>> # build your runner + >>> runner_cfg = dict(type='EpochBasedRunner', max_epochs=40, + ... 
constructor='MyRunnerConstructor') + >>> runner = build_runner(runner_cfg) + """ + + def __init__(self, runner_cfg, default_args=None): + if not isinstance(runner_cfg, dict): + raise TypeError('runner_cfg should be a dict', + f'but got {type(runner_cfg)}') + self.runner_cfg = runner_cfg + self.default_args = default_args + + def __call__(self): + return RUNNERS.build(self.runner_cfg, default_args=self.default_args) diff --git a/custom_mmpkg/custom_mmcv/runner/dist_utils.py b/custom_mmpkg/custom_mmcv/runner/dist_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..d3a1ef3fda5ceeb31bf15a73779da1b1903ab0fe --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/dist_utils.py @@ -0,0 +1,164 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import os +import subprocess +from collections import OrderedDict + +import torch +import torch.multiprocessing as mp +from torch import distributed as dist +from torch._utils import (_flatten_dense_tensors, _take_tensors, + _unflatten_dense_tensors) + + +def init_dist(launcher, backend='nccl', **kwargs): + if mp.get_start_method(allow_none=True) is None: + mp.set_start_method('spawn') + if launcher == 'pytorch': + _init_dist_pytorch(backend, **kwargs) + elif launcher == 'mpi': + _init_dist_mpi(backend, **kwargs) + elif launcher == 'slurm': + _init_dist_slurm(backend, **kwargs) + else: + raise ValueError(f'Invalid launcher type: {launcher}') + + +def _init_dist_pytorch(backend, **kwargs): + # TODO: use local_rank instead of rank % num_gpus + rank = int(os.environ['RANK']) + num_gpus = torch.cuda.device_count() + torch.cuda.set_device(rank % num_gpus) + dist.init_process_group(backend=backend, **kwargs) + + +def _init_dist_mpi(backend, **kwargs): + # TODO: use local_rank instead of rank % num_gpus + rank = int(os.environ['OMPI_COMM_WORLD_RANK']) + num_gpus = torch.cuda.device_count() + torch.cuda.set_device(rank % num_gpus) + dist.init_process_group(backend=backend, **kwargs) + + +def _init_dist_slurm(backend, port=None): + """Initialize slurm distributed training environment. + + If argument ``port`` is not specified, then the master port will be system + environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system + environment variable, then a default port ``29500`` will be used. + + Args: + backend (str): Backend of torch.distributed. + port (int, optional): Master port. Defaults to None. 
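`init_dist()` hides the launcher-specific environment handling: the `'pytorch'` launcher expects `RANK`/`MASTER_*` variables (typically set by `torch.distributed.launch` or `torchrun`), while `'slurm'` derives them from `SLURM_*` variables and accepts an explicit `port`. A sketch, meant to run inside such a job:

```python
from custom_mmpkg.custom_mmcv.runner.dist_utils import get_dist_info, init_dist

init_dist('pytorch', backend='nccl')               # under torchrun
# init_dist('slurm', backend='nccl', port=29501)   # under sbatch/srun

rank, world_size = get_dist_info()                 # defined just below
print(f'rank {rank} of {world_size}')
```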
+ """ + proc_id = int(os.environ['SLURM_PROCID']) + ntasks = int(os.environ['SLURM_NTASKS']) + node_list = os.environ['SLURM_NODELIST'] + num_gpus = torch.cuda.device_count() + torch.cuda.set_device(proc_id % num_gpus) + addr = subprocess.getoutput( + f'scontrol show hostname {node_list} | head -n1') + # specify master port + if port is not None: + os.environ['MASTER_PORT'] = str(port) + elif 'MASTER_PORT' in os.environ: + pass # use MASTER_PORT in the environment variable + else: + # 29500 is torch.distributed default port + os.environ['MASTER_PORT'] = '29500' + # use MASTER_ADDR in the environment variable if it already exists + if 'MASTER_ADDR' not in os.environ: + os.environ['MASTER_ADDR'] = addr + os.environ['WORLD_SIZE'] = str(ntasks) + os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) + os.environ['RANK'] = str(proc_id) + dist.init_process_group(backend=backend) + + +def get_dist_info(): + if dist.is_available() and dist.is_initialized(): + rank = dist.get_rank() + world_size = dist.get_world_size() + else: + rank = 0 + world_size = 1 + return rank, world_size + + +def master_only(func): + + @functools.wraps(func) + def wrapper(*args, **kwargs): + rank, _ = get_dist_info() + if rank == 0: + return func(*args, **kwargs) + + return wrapper + + +def allreduce_params(params, coalesce=True, bucket_size_mb=-1): + """Allreduce parameters. + + Args: + params (list[torch.Parameters]): List of parameters or buffers of a + model. + coalesce (bool, optional): Whether allreduce parameters as a whole. + Defaults to True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Defaults to -1. + """ + _, world_size = get_dist_info() + if world_size == 1: + return + params = [param.data for param in params] + if coalesce: + _allreduce_coalesced(params, world_size, bucket_size_mb) + else: + for tensor in params: + dist.all_reduce(tensor.div_(world_size)) + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + """Allreduce gradients. + + Args: + params (list[torch.Parameters]): List of parameters of a model + coalesce (bool, optional): Whether allreduce parameters as a whole. + Defaults to True. + bucket_size_mb (int, optional): Size of bucket, the unit is MB. + Defaults to -1. + """ + grads = [ + param.grad.data for param in params + if param.requires_grad and param.grad is not None + ] + _, world_size = get_dist_info() + if world_size == 1: + return + if coalesce: + _allreduce_coalesced(grads, world_size, bucket_size_mb) + else: + for tensor in grads: + dist.all_reduce(tensor.div_(world_size)) + + +def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1): + if bucket_size_mb > 0: + bucket_size_bytes = bucket_size_mb * 1024 * 1024 + buckets = _take_tensors(tensors, bucket_size_bytes) + else: + buckets = OrderedDict() + for tensor in tensors: + tp = tensor.type() + if tp not in buckets: + buckets[tp] = [] + buckets[tp].append(tensor) + buckets = buckets.values() + + for bucket in buckets: + flat_tensors = _flatten_dense_tensors(bucket) + dist.all_reduce(flat_tensors) + flat_tensors.div_(world_size) + for tensor, synced in zip( + bucket, _unflatten_dense_tensors(flat_tensors, bucket)): + tensor.copy_(synced) diff --git a/custom_mmpkg/custom_mmcv/runner/epoch_based_runner.py b/custom_mmpkg/custom_mmcv/runner/epoch_based_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..46b96618daec0941513cc0188edfa45e4c42dfe2 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/epoch_based_runner.py @@ -0,0 +1,187 @@ +# Copyright (c) OpenMMLab. 
All rights reserved. +import os.path as osp +import platform +import shutil +import time +import warnings + +import torch + +import custom_mmpkg.custom_mmcv as mmcv +from .base_runner import BaseRunner +from .builder import RUNNERS +from .checkpoint import save_checkpoint +from .utils import get_host_info + + +@RUNNERS.register_module() +class EpochBasedRunner(BaseRunner): + """Epoch-based Runner. + + This runner train models epoch by epoch. + """ + + def run_iter(self, data_batch, train_mode, **kwargs): + if self.batch_processor is not None: + outputs = self.batch_processor( + self.model, data_batch, train_mode=train_mode, **kwargs) + elif train_mode: + outputs = self.model.train_step(data_batch, self.optimizer, + **kwargs) + else: + outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) + if not isinstance(outputs, dict): + raise TypeError('"batch_processor()" or "model.train_step()"' + 'and "model.val_step()" must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) + self.outputs = outputs + + def train(self, data_loader, **kwargs): + self.model.train() + self.mode = 'train' + self.data_loader = data_loader + self._max_iters = self._max_epochs * len(self.data_loader) + self.call_hook('before_train_epoch') + time.sleep(2) # Prevent possible deadlock during epoch transition + for i, data_batch in enumerate(self.data_loader): + self._inner_iter = i + self.call_hook('before_train_iter') + self.run_iter(data_batch, train_mode=True, **kwargs) + self.call_hook('after_train_iter') + self._iter += 1 + + self.call_hook('after_train_epoch') + self._epoch += 1 + + @torch.no_grad() + def val(self, data_loader, **kwargs): + self.model.eval() + self.mode = 'val' + self.data_loader = data_loader + self.call_hook('before_val_epoch') + time.sleep(2) # Prevent possible deadlock during epoch transition + for i, data_batch in enumerate(self.data_loader): + self._inner_iter = i + self.call_hook('before_val_iter') + self.run_iter(data_batch, train_mode=False) + self.call_hook('after_val_iter') + + self.call_hook('after_val_epoch') + + def run(self, data_loaders, workflow, max_epochs=None, **kwargs): + """Start running. + + Args: + data_loaders (list[:obj:`DataLoader`]): Dataloaders for training + and validation. + workflow (list[tuple]): A list of (phase, epochs) to specify the + running order and epochs. E.g, [('train', 2), ('val', 1)] means + running 2 epochs for training and 1 epoch for validation, + iteratively. 
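Continuing the toy runner built earlier (whose model implements `train_step()`/`val_step()`): `run()` walks the workflow repeatedly until `max_epochs` is reached, pairing each `(phase, epochs)` entry with the dataloader at the same index. The data below is random and only illustrates the call shape.

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

train_loader = DataLoader(TensorDataset(torch.randn(32, 4)), batch_size=8)
val_loader = DataLoader(TensorDataset(torch.randn(8, 4)), batch_size=8)

# two training epochs, then one validation epoch, repeated until max_epochs
runner.run([train_loader, val_loader], [('train', 2), ('val', 1)])
```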
+ """ + assert isinstance(data_loaders, list) + assert mmcv.is_list_of(workflow, tuple) + assert len(data_loaders) == len(workflow) + if max_epochs is not None: + warnings.warn( + 'setting max_epochs in run is deprecated, ' + 'please set max_epochs in runner_config', DeprecationWarning) + self._max_epochs = max_epochs + + assert self._max_epochs is not None, ( + 'max_epochs must be specified during instantiation') + + for i, flow in enumerate(workflow): + mode, epochs = flow + if mode == 'train': + self._max_iters = self._max_epochs * len(data_loaders[i]) + break + + work_dir = self.work_dir if self.work_dir is not None else 'NONE' + self.logger.info('Start running, host: %s, work_dir: %s', + get_host_info(), work_dir) + self.logger.info('Hooks will be executed in the following order:\n%s', + self.get_hook_info()) + self.logger.info('workflow: %s, max: %d epochs', workflow, + self._max_epochs) + self.call_hook('before_run') + + while self.epoch < self._max_epochs: + for i, flow in enumerate(workflow): + mode, epochs = flow + if isinstance(mode, str): # self.train() + if not hasattr(self, mode): + raise ValueError( + f'runner has no method named "{mode}" to run an ' + 'epoch') + epoch_runner = getattr(self, mode) + else: + raise TypeError( + 'mode in workflow must be a str, but got {}'.format( + type(mode))) + + for _ in range(epochs): + if mode == 'train' and self.epoch >= self._max_epochs: + break + epoch_runner(data_loaders[i], **kwargs) + + time.sleep(1) # wait for some hooks like loggers to finish + self.call_hook('after_run') + + def save_checkpoint(self, + out_dir, + filename_tmpl='epoch_{}.pth', + save_optimizer=True, + meta=None, + create_symlink=True): + """Save the checkpoint. + + Args: + out_dir (str): The directory that checkpoints are saved. + filename_tmpl (str, optional): The checkpoint filename template, + which contains a placeholder for the epoch number. + Defaults to 'epoch_{}.pth'. + save_optimizer (bool, optional): Whether to save the optimizer to + the checkpoint. Defaults to True. + meta (dict, optional): The meta information to be saved in the + checkpoint. Defaults to None. + create_symlink (bool, optional): Whether to create a symlink + "latest.pth" to point to the latest checkpoint. + Defaults to True. + """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError( + f'meta should be a dict or None, but got {type(meta)}') + if self.meta is not None: + meta.update(self.meta) + # Note: meta.update(self.meta) should be done before + # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise + # there will be problems with resumed checkpoints. 
+ # More details in https://github.com/open-mmlab/mmcv/pull/1108 + meta.update(epoch=self.epoch + 1, iter=self.iter) + + filename = filename_tmpl.format(self.epoch + 1) + filepath = osp.join(out_dir, filename) + optimizer = self.optimizer if save_optimizer else None + save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) + # in some environments, `os.symlink` is not supported, you may need to + # set `create_symlink` to False + if create_symlink: + dst_file = osp.join(out_dir, 'latest.pth') + if platform.system() != 'Windows': + mmcv.symlink(filename, dst_file) + else: + shutil.copy(filepath, dst_file) + + +@RUNNERS.register_module() +class Runner(EpochBasedRunner): + """Deprecated name of EpochBasedRunner.""" + + def __init__(self, *args, **kwargs): + warnings.warn( + 'Runner was deprecated, please use EpochBasedRunner instead') + super().__init__(*args, **kwargs) diff --git a/custom_mmpkg/custom_mmcv/runner/fp16_utils.py b/custom_mmpkg/custom_mmcv/runner/fp16_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..752e1b23fb971a56f72ea6dfee36166670221e93 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/fp16_utils.py @@ -0,0 +1,410 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import functools +import warnings +from collections import abc +from inspect import getfullargspec + +import numpy as np +import torch +import torch.nn as nn + +from custom_mmpkg.custom_mmcv.utils import TORCH_VERSION, digit_version +from .dist_utils import allreduce_grads as _allreduce_grads + +try: + # If PyTorch version >= 1.6.0, torch.cuda.amp.autocast would be imported + # and used; otherwise, auto fp16 will adopt mmcv's implementation. + # Note that when PyTorch >= 1.6.0, we still cast tensor types to fp16 + # manually, so the behavior may not be consistent with real amp. + from torch.cuda.amp import autocast +except ImportError: + pass + + +def cast_tensor_type(inputs, src_type, dst_type): + """Recursively convert Tensor in inputs from src_type to dst_type. + + Args: + inputs: Inputs that to be casted. + src_type (torch.dtype): Source type.. + dst_type (torch.dtype): Destination type. + + Returns: + The same type with inputs, but all contained Tensors have been cast. + """ + if isinstance(inputs, nn.Module): + return inputs + elif isinstance(inputs, torch.Tensor): + return inputs.to(dst_type) + elif isinstance(inputs, str): + return inputs + elif isinstance(inputs, np.ndarray): + return inputs + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ + k: cast_tensor_type(v, src_type, dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( + cast_tensor_type(item, src_type, dst_type) for item in inputs) + else: + return inputs + + +def auto_fp16(apply_to=None, out_fp32=False): + """Decorator to enable fp16 training automatically. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If inputs arguments are fp32 tensors, they will + be converted to fp16 automatically. Arguments other than fp32 tensors are + ignored. If you are using PyTorch >= 1.6, torch.cuda.amp is used as the + backend, otherwise, original mmcv implementation will be adopted. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp32 (bool): Whether to convert the output back to fp32. 
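`cast_tensor_type()` recurses through mappings and iterables and converts only the tensors it finds, leaving strings, arrays and modules untouched; this is what both fp16 decorators below rely on. A runnable sketch:

```python
import torch

from custom_mmpkg.custom_mmcv.runner.fp16_utils import cast_tensor_type

batch = dict(
    img=torch.randn(2, 3, 4, 4),
    meta=dict(filename='000001.jpg', scale=torch.tensor([1.0])),
    labels=[torch.zeros(2), 'ignore'])

half_batch = cast_tensor_type(batch, torch.float, torch.half)
print(half_batch['img'].dtype)          # torch.float16
print(half_batch['meta']['filename'])   # string passed through unchanged
```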
+ + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp16 + >>> @auto_fp16() + >>> def forward(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp16 + >>> @auto_fp16(apply_to=('pred', )) + >>> def do_something(self, pred, others): + >>> pass + """ + + def auto_fp16_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. + if not isinstance(args[0], torch.nn.Module): + raise TypeError('@auto_fp16 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + # NOTE: default args are not taken into consideration + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.float, torch.half)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = {} + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.float, torch.half) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + if (TORCH_VERSION != 'parrots' and + digit_version(TORCH_VERSION) >= digit_version('1.6.0')): + with autocast(enabled=True): + output = old_func(*new_args, **new_kwargs) + else: + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp32: + output = cast_tensor_type(output, torch.half, torch.float) + return output + + return new_func + + return auto_fp16_wrapper + + +def force_fp32(apply_to=None, out_fp16=False): + """Decorator to convert input arguments to fp32 in force. + + This decorator is useful when you write custom modules and want to support + mixed precision training. If there are some inputs that must be processed + in fp32 mode, then this decorator can handle it. If inputs arguments are + fp16 tensors, they will be converted to fp32 automatically. Arguments other + than fp16 tensors are ignored. If you are using PyTorch >= 1.6, + torch.cuda.amp is used as the backend, otherwise, original mmcv + implementation will be adopted. + + Args: + apply_to (Iterable, optional): The argument names to be converted. + `None` indicates all arguments. + out_fp16 (bool): Whether to convert the output back to fp16. + + Example: + + >>> import torch.nn as nn + >>> class MyModule1(nn.Module): + >>> + >>> # Convert x and y to fp32 + >>> @force_fp32() + >>> def loss(self, x, y): + >>> pass + + >>> import torch.nn as nn + >>> class MyModule2(nn.Module): + >>> + >>> # convert pred to fp32 + >>> @force_fp32(apply_to=('pred', )) + >>> def post_process(self, pred, others): + >>> pass + """ + + def force_fp32_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # check if the module has set the attribute `fp16_enabled`, if not, + # just fallback to the original method. 
+ if not isinstance(args[0], torch.nn.Module): + raise TypeError('@force_fp32 can only be used to decorate the ' + 'method of nn.Module') + if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): + return old_func(*args, **kwargs) + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get the argument names to be casted + args_to_cast = args_info.args if apply_to is None else apply_to + # convert the args that need to be processed + new_args = [] + if args: + arg_names = args_info.args[:len(args)] + for i, arg_name in enumerate(arg_names): + if arg_name in args_to_cast: + new_args.append( + cast_tensor_type(args[i], torch.half, torch.float)) + else: + new_args.append(args[i]) + # convert the kwargs that need to be processed + new_kwargs = dict() + if kwargs: + for arg_name, arg_value in kwargs.items(): + if arg_name in args_to_cast: + new_kwargs[arg_name] = cast_tensor_type( + arg_value, torch.half, torch.float) + else: + new_kwargs[arg_name] = arg_value + # apply converted arguments to the decorated method + if (TORCH_VERSION != 'parrots' and + digit_version(TORCH_VERSION) >= digit_version('1.6.0')): + with autocast(enabled=False): + output = old_func(*new_args, **new_kwargs) + else: + output = old_func(*new_args, **new_kwargs) + # cast the results back to fp32 if necessary + if out_fp16: + output = cast_tensor_type(output, torch.float, torch.half) + return output + + return new_func + + return force_fp32_wrapper + + +def allreduce_grads(params, coalesce=True, bucket_size_mb=-1): + warnings.warning( + '"mmcv.runner.fp16_utils.allreduce_grads" is deprecated, and will be ' + 'removed in v2.8. Please switch to "mmcv.runner.allreduce_grads') + _allreduce_grads(params, coalesce=coalesce, bucket_size_mb=bucket_size_mb) + + +def wrap_fp16_model(model): + """Wrap the FP32 model to FP16. + + If you are using PyTorch >= 1.6, torch.cuda.amp is used as the + backend, otherwise, original mmcv implementation will be adopted. + + For PyTorch >= 1.6, this function will + 1. Set fp16 flag inside the model to True. + + Otherwise: + 1. Convert FP32 model to FP16. + 2. Remain some necessary layers to be FP32, e.g., normalization layers. + 3. Set `fp16_enabled` flag inside the model to True. + + Args: + model (nn.Module): Model in FP32. + """ + if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.6.0')): + # convert model to fp16 + model.half() + # patch the normalization layers to make it work in fp32 mode + patch_norm_fp32(model) + # set `fp16_enabled` flag + for m in model.modules(): + if hasattr(m, 'fp16_enabled'): + m.fp16_enabled = True + + +def patch_norm_fp32(module): + """Recursively convert normalization layers from FP16 to FP32. + + Args: + module (nn.Module): The modules to be converted in FP16. + + Returns: + nn.Module: The converted module, the normalization layers have been + converted to FP32. + """ + if isinstance(module, (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)): + module.float() + if isinstance(module, nn.GroupNorm) or torch.__version__ < '1.3': + module.forward = patch_forward_method(module.forward, torch.half, + torch.float) + for child in module.children(): + patch_norm_fp32(child) + return module + + +def patch_forward_method(func, src_type, dst_type, convert_output=True): + """Patch the forward method of a module. + + Args: + func (callable): The original forward method. + src_type (torch.dtype): Type of input arguments to be converted from. 
+ dst_type (torch.dtype): Type of input arguments to be converted to. + convert_output (bool): Whether to convert the output back to src_type. + + Returns: + callable: The patched forward method. + """ + + def new_forward(*args, **kwargs): + output = func(*cast_tensor_type(args, src_type, dst_type), + **cast_tensor_type(kwargs, src_type, dst_type)) + if convert_output: + output = cast_tensor_type(output, dst_type, src_type) + return output + + return new_forward + + +class LossScaler: + """Class that manages loss scaling in mixed precision training which + supports both dynamic or static mode. + + The implementation refers to + https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py. + Indirectly, by supplying ``mode='dynamic'`` for dynamic loss scaling. + It's important to understand how :class:`LossScaler` operates. + Loss scaling is designed to combat the problem of underflowing + gradients encountered at long times when training fp16 networks. + Dynamic loss scaling begins by attempting a very high loss + scale. Ironically, this may result in OVERflowing gradients. + If overflowing gradients are encountered, :class:`FP16_Optimizer` then + skips the update step for this particular iteration/minibatch, + and :class:`LossScaler` adjusts the loss scale to a lower value. + If a certain number of iterations occur without overflowing gradients + detected,:class:`LossScaler` increases the loss scale once more. + In this way :class:`LossScaler` attempts to "ride the edge" of always + using the highest loss scale possible without incurring overflow. + + Args: + init_scale (float): Initial loss scale value, default: 2**32. + scale_factor (float): Factor used when adjusting the loss scale. + Default: 2. + mode (str): Loss scaling mode. 'dynamic' or 'static' + scale_window (int): Number of consecutive iterations without an + overflow to wait before increasing the loss scale. Default: 1000. 
+ """ + + def __init__(self, + init_scale=2**32, + mode='dynamic', + scale_factor=2., + scale_window=1000): + self.cur_scale = init_scale + self.cur_iter = 0 + assert mode in ('dynamic', + 'static'), 'mode can only be dynamic or static' + self.mode = mode + self.last_overflow_iter = -1 + self.scale_factor = scale_factor + self.scale_window = scale_window + + def has_overflow(self, params): + """Check if params contain overflow.""" + if self.mode != 'dynamic': + return False + for p in params: + if p.grad is not None and LossScaler._has_inf_or_nan(p.grad.data): + return True + return False + + def _has_inf_or_nan(x): + """Check if params contain NaN.""" + try: + cpu_sum = float(x.float().sum()) + except RuntimeError as instance: + if 'value cannot be converted' not in instance.args[0]: + raise + return True + else: + if cpu_sum == float('inf') or cpu_sum == -float('inf') \ + or cpu_sum != cpu_sum: + return True + return False + + def update_scale(self, overflow): + """update the current loss scale value when overflow happens.""" + if self.mode != 'dynamic': + return + if overflow: + self.cur_scale = max(self.cur_scale / self.scale_factor, 1) + self.last_overflow_iter = self.cur_iter + else: + if (self.cur_iter - self.last_overflow_iter) % \ + self.scale_window == 0: + self.cur_scale *= self.scale_factor + self.cur_iter += 1 + + def state_dict(self): + """Returns the state of the scaler as a :class:`dict`.""" + return dict( + cur_scale=self.cur_scale, + cur_iter=self.cur_iter, + mode=self.mode, + last_overflow_iter=self.last_overflow_iter, + scale_factor=self.scale_factor, + scale_window=self.scale_window) + + def load_state_dict(self, state_dict): + """Loads the loss_scaler state dict. + + Args: + state_dict (dict): scaler state. + """ + self.cur_scale = state_dict['cur_scale'] + self.cur_iter = state_dict['cur_iter'] + self.mode = state_dict['mode'] + self.last_overflow_iter = state_dict['last_overflow_iter'] + self.scale_factor = state_dict['scale_factor'] + self.scale_window = state_dict['scale_window'] + + @property + def loss_scale(self): + return self.cur_scale diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/__init__.py b/custom_mmpkg/custom_mmcv/runner/hooks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..915af28cefab14a14c1188ed861161080fd138a3 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/__init__.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from .checkpoint import CheckpointHook +from .closure import ClosureHook +from .ema import EMAHook +from .evaluation import DistEvalHook, EvalHook +from .hook import HOOKS, Hook +from .iter_timer import IterTimerHook +from .logger import (DvcliveLoggerHook, LoggerHook, MlflowLoggerHook, + NeptuneLoggerHook, PaviLoggerHook, TensorboardLoggerHook, + TextLoggerHook, WandbLoggerHook) +from .lr_updater import LrUpdaterHook +from .memory import EmptyCacheHook +from .momentum_updater import MomentumUpdaterHook +from .optimizer import (Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook, + GradientCumulativeOptimizerHook, OptimizerHook) +from .profiler import ProfilerHook +from .sampler_seed import DistSamplerSeedHook +from .sync_buffer import SyncBuffersHook + +__all__ = [ + 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook', + 'OptimizerHook', 'Fp16OptimizerHook', 'IterTimerHook', + 'DistSamplerSeedHook', 'EmptyCacheHook', 'LoggerHook', 'MlflowLoggerHook', + 'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook', + 'NeptuneLoggerHook', 'WandbLoggerHook', 'DvcliveLoggerHook', + 'MomentumUpdaterHook', 'SyncBuffersHook', 'EMAHook', 'EvalHook', + 'DistEvalHook', 'ProfilerHook', 'GradientCumulativeOptimizerHook', + 'GradientCumulativeFp16OptimizerHook' +] diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/checkpoint.py b/custom_mmpkg/custom_mmcv/runner/hooks/checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..0a1b688bcbd9877423ba3930a81093464aed34f6 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/checkpoint.py @@ -0,0 +1,167 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings + +from custom_mmpkg.custom_mmcv.fileio import FileClient +from ..dist_utils import allreduce_params, master_only +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class CheckpointHook(Hook): + """Save checkpoints periodically. + + Args: + interval (int): The saving period. If ``by_epoch=True``, interval + indicates epochs, otherwise it indicates iterations. + Default: -1, which means "never". + by_epoch (bool): Saving checkpoints by epoch or by iteration. + Default: True. + save_optimizer (bool): Whether to save optimizer state_dict in the + checkpoint. It is usually used for resuming experiments. + Default: True. + out_dir (str, optional): The root directory to save checkpoints. If not + specified, ``runner.work_dir`` will be used by default. If + specified, the ``out_dir`` will be the concatenation of ``out_dir`` + and the last level directory of ``runner.work_dir``. + `Changed in version 1.3.16.` + max_keep_ckpts (int, optional): The maximum checkpoints to keep. + In some cases we want only the latest few checkpoints and would + like to delete old ones to save the disk space. + Default: -1, which means unlimited. + save_last (bool, optional): Whether to force the last checkpoint to be + saved regardless of interval. Default: True. + sync_buffer (bool, optional): Whether to synchronize buffers in + different gpus. Default: False. + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + `New in version 1.3.16.` + + .. warning:: + Before v1.3.16, the ``out_dir`` argument indicates the path where the + checkpoint is stored. However, since v1.3.16, ``out_dir`` indicates the + root directory and the final path to save checkpoint is the + concatenation of ``out_dir`` and the last level directory of + ``runner.work_dir``. 
Suppose the value of ``out_dir`` is "/path/of/A" + and the value of ``runner.work_dir`` is "/path/of/B", then the final + path will be "/path/of/A/B". + """ + + def __init__(self, + interval=-1, + by_epoch=True, + save_optimizer=True, + out_dir=None, + max_keep_ckpts=-1, + save_last=True, + sync_buffer=False, + file_client_args=None, + **kwargs): + self.interval = interval + self.by_epoch = by_epoch + self.save_optimizer = save_optimizer + self.out_dir = out_dir + self.max_keep_ckpts = max_keep_ckpts + self.save_last = save_last + self.args = kwargs + self.sync_buffer = sync_buffer + self.file_client_args = file_client_args + + def before_run(self, runner): + if not self.out_dir: + self.out_dir = runner.work_dir + + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + + # if `self.out_dir` is not equal to `runner.work_dir`, it means that + # `self.out_dir` is set so the final `self.out_dir` is the + # concatenation of `self.out_dir` and the last level directory of + # `runner.work_dir` + if self.out_dir != runner.work_dir: + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + + runner.logger.info((f'Checkpoints will be saved to {self.out_dir} by ' + f'{self.file_client.name}.')) + + # disable the create_symlink option because some file backends do not + # allow to create a symlink + if 'create_symlink' in self.args: + if self.args[ + 'create_symlink'] and not self.file_client.allow_symlink: + self.args['create_symlink'] = False + warnings.warn( + ('create_symlink is set as True by the user but is changed' + 'to be False because creating symbolic link is not ' + f'allowed in {self.file_client.name}')) + else: + self.args['create_symlink'] = self.file_client.allow_symlink + + def after_train_epoch(self, runner): + if not self.by_epoch: + return + + # save checkpoint for following cases: + # 1. every ``self.interval`` epochs + # 2. 
reach the last epoch of training + if self.every_n_epochs( + runner, self.interval) or (self.save_last + and self.is_last_epoch(runner)): + runner.logger.info( + f'Saving checkpoint at {runner.epoch + 1} epochs') + if self.sync_buffer: + allreduce_params(runner.model.buffers()) + self._save_checkpoint(runner) + + @master_only + def _save_checkpoint(self, runner): + """Save the current checkpoint and delete unwanted checkpoint.""" + runner.save_checkpoint( + self.out_dir, save_optimizer=self.save_optimizer, **self.args) + if runner.meta is not None: + if self.by_epoch: + cur_ckpt_filename = self.args.get( + 'filename_tmpl', 'epoch_{}.pth').format(runner.epoch + 1) + else: + cur_ckpt_filename = self.args.get( + 'filename_tmpl', 'iter_{}.pth').format(runner.iter + 1) + runner.meta.setdefault('hook_msgs', dict()) + runner.meta['hook_msgs']['last_ckpt'] = self.file_client.join_path( + self.out_dir, cur_ckpt_filename) + # remove other checkpoints + if self.max_keep_ckpts > 0: + if self.by_epoch: + name = 'epoch_{}.pth' + current_ckpt = runner.epoch + 1 + else: + name = 'iter_{}.pth' + current_ckpt = runner.iter + 1 + redundant_ckpts = range( + current_ckpt - self.max_keep_ckpts * self.interval, 0, + -self.interval) + filename_tmpl = self.args.get('filename_tmpl', name) + for _step in redundant_ckpts: + ckpt_path = self.file_client.join_path( + self.out_dir, filename_tmpl.format(_step)) + if self.file_client.isfile(ckpt_path): + self.file_client.remove(ckpt_path) + else: + break + + def after_train_iter(self, runner): + if self.by_epoch: + return + + # save checkpoint for following cases: + # 1. every ``self.interval`` iterations + # 2. reach the last iteration of training + if self.every_n_iters( + runner, self.interval) or (self.save_last + and self.is_last_iter(runner)): + runner.logger.info( + f'Saving checkpoint at {runner.iter + 1} iterations') + if self.sync_buffer: + allreduce_params(runner.model.buffers()) + self._save_checkpoint(runner) diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/closure.py b/custom_mmpkg/custom_mmcv/runner/hooks/closure.py new file mode 100644 index 0000000000000000000000000000000000000000..b955f81f425be4ac3e6bb3f4aac653887989e872 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/closure.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class ClosureHook(Hook): + + def __init__(self, fn_name, fn): + assert hasattr(self, fn_name) + assert callable(fn) + setattr(self, fn_name, fn) diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/ema.py b/custom_mmpkg/custom_mmcv/runner/hooks/ema.py new file mode 100644 index 0000000000000000000000000000000000000000..15c7e68088f019802a59e7ae41cc1fe0c7f28f96 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/ema.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ...parallel import is_module_wrapper +from ..hooks.hook import HOOKS, Hook + + +@HOOKS.register_module() +class EMAHook(Hook): + r"""Exponential Moving Average Hook. + + Use Exponential Moving Average on all parameters of model in training + process. All parameters have a ema backup, which update by the formula + as below. EMAHook takes priority over EvalHook and CheckpointSaverHook. + + .. math:: + + \text{Xema\_{t+1}} = (1 - \text{momentum}) \times + \text{Xema\_{t}} + \text{momentum} \times X_t + + Args: + momentum (float): The momentum used for updating ema parameter. + Defaults to 0.0002. 
+ interval (int): Update ema parameter every interval iteration. + Defaults to 1. + warm_up (int): During first warm_up steps, we may use smaller momentum + to update ema parameters more slowly. Defaults to 100. + resume_from (str): The checkpoint path. Defaults to None. + """ + + def __init__(self, + momentum=0.0002, + interval=1, + warm_up=100, + resume_from=None): + assert isinstance(interval, int) and interval > 0 + self.warm_up = warm_up + self.interval = interval + assert momentum > 0 and momentum < 1 + self.momentum = momentum**interval + self.checkpoint = resume_from + + def before_run(self, runner): + """To resume model with it's ema parameters more friendly. + + Register ema parameter as ``named_buffer`` to model + """ + model = runner.model + if is_module_wrapper(model): + model = model.module + self.param_ema_buffer = {} + self.model_parameters = dict(model.named_parameters(recurse=True)) + for name, value in self.model_parameters.items(): + # "." is not allowed in module's buffer name + buffer_name = f"ema_{name.replace('.', '_')}" + self.param_ema_buffer[name] = buffer_name + model.register_buffer(buffer_name, value.data.clone()) + self.model_buffers = dict(model.named_buffers(recurse=True)) + if self.checkpoint is not None: + runner.resume(self.checkpoint) + + def after_train_iter(self, runner): + """Update ema parameter every self.interval iterations.""" + curr_step = runner.iter + # We warm up the momentum considering the instability at beginning + momentum = min(self.momentum, + (1 + curr_step) / (self.warm_up + curr_step)) + if curr_step % self.interval != 0: + return + for name, parameter in self.model_parameters.items(): + buffer_name = self.param_ema_buffer[name] + buffer_parameter = self.model_buffers[buffer_name] + buffer_parameter.mul_(1 - momentum).add_(momentum, parameter.data) + + def after_train_epoch(self, runner): + """We load parameter values from ema backup to model before the + EvalHook.""" + self._swap_ema_parameters() + + def before_train_epoch(self, runner): + """We recover model's parameter from ema backup after last epoch's + EvalHook.""" + self._swap_ema_parameters() + + def _swap_ema_parameters(self): + """Swap the parameter of model with parameter in ema_buffer.""" + for name, value in self.model_parameters.items(): + temp = value.data.clone() + ema_buffer = self.model_buffers[self.param_ema_buffer[name]] + value.data.copy_(ema_buffer.data) + ema_buffer.data.copy_(temp) diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/evaluation.py b/custom_mmpkg/custom_mmcv/runner/hooks/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..1d76699d3d2d297539cdd49e1fe0626c379ec26f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/evaluation.py @@ -0,0 +1,509 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from math import inf + +import torch.distributed as dist +from torch.nn.modules.batchnorm import _BatchNorm +from torch.utils.data import DataLoader + +from custom_mmpkg.custom_mmcv.fileio import FileClient +from custom_mmpkg.custom_mmcv.utils import is_seq_of +from .hook import Hook +from .logger import LoggerHook + + +class EvalHook(Hook): + """Non-Distributed evaluation hook. + + This hook will regularly perform evaluation in a given interval when + performing in non-distributed environment. + + Args: + dataloader (DataLoader): A PyTorch dataloader, whose dataset has + implemented ``evaluate`` function. + start (int | None, optional): Evaluation starting epoch. 
It enables + evaluation before the training starts if ``start`` <= the resuming + epoch. If None, whether to evaluate is merely decided by + ``interval``. Default: None. + interval (int): Evaluation interval. Default: 1. + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + Default: True. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. The information about best + checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep + best score value and best checkpoint path, which will be also + loaded when resume checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Default: None. + rule (str | None, optional): Comparison rule for best score. If set to + None, it will infer a reasonable rule. Keys such as 'acc', 'top' + .etc will be inferred by 'greater' rule. Keys contain 'loss' will + be inferred by 'less' rule. Options are 'greater', 'less', None. + Default: None. + test_fn (callable, optional): test a model with samples from a + dataloader, and return the test results. If ``None``, the default + test function ``mmcv.engine.single_gpu_test`` will be used. + (default: ``None``) + greater_keys (List[str] | None, optional): Metric keys that will be + inferred by 'greater' comparison rule. If ``None``, + _default_greater_keys will be used. (default: ``None``) + less_keys (List[str] | None, optional): Metric keys that will be + inferred by 'less' comparison rule. If ``None``, _default_less_keys + will be used. (default: ``None``) + out_dir (str, optional): The root directory to save checkpoints. If not + specified, `runner.work_dir` will be used by default. If specified, + the `out_dir` will be the concatenation of `out_dir` and the last + level directory of `runner.work_dir`. + `New in version 1.3.16.` + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. Default: None. + `New in version 1.3.16.` + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. + + Notes: + If new arguments are added for EvalHook, tools/test.py, + tools/eval_metric.py may be affected. + """ + + # Since the key for determine greater or less is related to the downstream + # tasks, downstream repos may need to overwrite the following inner + # variable accordingly. 
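As that comment notes, downstream repositories can widen the key lists by overriding the class attributes defined immediately below. A one-line sketch of such an override (the metric names are hypothetical):

```python
class PSNREvalHook(EvalHook):
    # Treat PSNR/SSIM-style metrics as "higher is better" when save_best is used.
    _default_greater_keys = EvalHook._default_greater_keys + ['psnr', 'ssim']
```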
+ + rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} + init_value_map = {'greater': -inf, 'less': inf} + _default_greater_keys = [ + 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU', + 'mAcc', 'aAcc' + ] + _default_less_keys = ['loss'] + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=None, + less_keys=None, + out_dir=None, + file_client_args=None, + **eval_kwargs): + if not isinstance(dataloader, DataLoader): + raise TypeError(f'dataloader must be a pytorch DataLoader, ' + f'but got {type(dataloader)}') + + if interval <= 0: + raise ValueError(f'interval must be a positive number, ' + f'but got {interval}') + + assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean' + + if start is not None and start < 0: + raise ValueError(f'The evaluation start epoch {start} is smaller ' + f'than 0') + + self.dataloader = dataloader + self.interval = interval + self.start = start + self.by_epoch = by_epoch + + assert isinstance(save_best, str) or save_best is None, \ + '""save_best"" should be a str or None ' \ + f'rather than {type(save_best)}' + self.save_best = save_best + self.eval_kwargs = eval_kwargs + self.initial_flag = True + + if test_fn is None: + from custom_mmpkg.custom_mmcv.engine import single_gpu_test + self.test_fn = single_gpu_test + else: + self.test_fn = test_fn + + if greater_keys is None: + self.greater_keys = self._default_greater_keys + else: + if not isinstance(greater_keys, (list, tuple)): + greater_keys = (greater_keys, ) + assert is_seq_of(greater_keys, str) + self.greater_keys = greater_keys + + if less_keys is None: + self.less_keys = self._default_less_keys + else: + if not isinstance(less_keys, (list, tuple)): + less_keys = (less_keys, ) + assert is_seq_of(less_keys, str) + self.less_keys = less_keys + + if self.save_best is not None: + self.best_ckpt_path = None + self._init_rule(rule, self.save_best) + + self.out_dir = out_dir + self.file_client_args = file_client_args + + def _init_rule(self, rule, key_indicator): + """Initialize rule, key_indicator, comparison_func, and best score. + + Here is the rule to determine which rule is used for key indicator + when the rule is not specific (note that the key indicator matching + is case-insensitive): + 1. If the key indicator is in ``self.greater_keys``, the rule will be + specified as 'greater'. + 2. Or if the key indicator is in ``self.less_keys``, the rule will be + specified as 'less'. + 3. Or if the key indicator is equal to the substring in any one item + in ``self.greater_keys``, the rule will be specified as 'greater'. + 4. Or if the key indicator is equal to the substring in any one item + in ``self.less_keys``, the rule will be specified as 'less'. + + Args: + rule (str | None): Comparison rule for best score. + key_indicator (str | None): Key indicator to determine the + comparison rule. 
+ """ + if rule not in self.rule_map and rule is not None: + raise KeyError(f'rule must be greater, less or None, ' + f'but got {rule}.') + + if rule is None: + if key_indicator != 'auto': + # `_lc` here means we use the lower case of keys for + # case-insensitive matching + key_indicator_lc = key_indicator.lower() + greater_keys = [key.lower() for key in self.greater_keys] + less_keys = [key.lower() for key in self.less_keys] + + if key_indicator_lc in greater_keys: + rule = 'greater' + elif key_indicator_lc in less_keys: + rule = 'less' + elif any(key in key_indicator_lc for key in greater_keys): + rule = 'greater' + elif any(key in key_indicator_lc for key in less_keys): + rule = 'less' + else: + raise ValueError(f'Cannot infer the rule for key ' + f'{key_indicator}, thus a specific rule ' + f'must be specified.') + self.rule = rule + self.key_indicator = key_indicator + if self.rule is not None: + self.compare_func = self.rule_map[self.rule] + + def before_run(self, runner): + if not self.out_dir: + self.out_dir = runner.work_dir + + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + + # if `self.out_dir` is not equal to `runner.work_dir`, it means that + # `self.out_dir` is set so the final `self.out_dir` is the + # concatenation of `self.out_dir` and the last level directory of + # `runner.work_dir` + if self.out_dir != runner.work_dir: + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + runner.logger.info( + (f'The best checkpoint will be saved to {self.out_dir} by ' + f'{self.file_client.name}')) + + if self.save_best is not None: + if runner.meta is None: + warnings.warn('runner.meta is None. Creating an empty one.') + runner.meta = dict() + runner.meta.setdefault('hook_msgs', dict()) + self.best_ckpt_path = runner.meta['hook_msgs'].get( + 'best_ckpt', None) + + def before_train_iter(self, runner): + """Evaluate the model only at the start of training by iteration.""" + if self.by_epoch or not self.initial_flag: + return + if self.start is not None and runner.iter >= self.start: + self.after_train_iter(runner) + self.initial_flag = False + + def before_train_epoch(self, runner): + """Evaluate the model only at the start of training by epoch.""" + if not (self.by_epoch and self.initial_flag): + return + if self.start is not None and runner.epoch >= self.start: + self.after_train_epoch(runner) + self.initial_flag = False + + def after_train_iter(self, runner): + """Called after every training iter to evaluate the results.""" + if not self.by_epoch and self._should_evaluate(runner): + # Because the priority of EvalHook is higher than LoggerHook, the + # training log and the evaluating log are mixed. Therefore, + # we need to dump the training log and clear it before evaluating + # log is generated. In addition, this problem will only appear in + # `IterBasedRunner` whose `self.by_epoch` is False, because + # `EpochBasedRunner` whose `self.by_epoch` is True calls + # `_do_evaluate` in `after_train_epoch` stage, and at this stage + # the training log has been printed, so it will not cause any + # problem. 
more details at + # https://github.com/open-mmlab/mmsegmentation/issues/694 + for hook in runner._hooks: + if isinstance(hook, LoggerHook): + hook.after_train_iter(runner) + runner.log_buffer.clear() + + self._do_evaluate(runner) + + def after_train_epoch(self, runner): + """Called after every training epoch to evaluate the results.""" + if self.by_epoch and self._should_evaluate(runner): + self._do_evaluate(runner) + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + results = self.test_fn(runner.model, self.dataloader) + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) + # the key_score may be `None` so it needs to skip the action to save + # the best checkpoint + if self.save_best and key_score: + self._save_ckpt(runner, key_score) + + def _should_evaluate(self, runner): + """Judge whether to perform evaluation. + + Here is the rule to judge whether to perform evaluation: + 1. It will not perform evaluation during the epoch/iteration interval, + which is determined by ``self.interval``. + 2. It will not perform evaluation if the start time is larger than + current time. + 3. It will not perform evaluation when current time is larger than + the start time but during epoch/iteration interval. + + Returns: + bool: The flag indicating whether to perform evaluation. + """ + if self.by_epoch: + current = runner.epoch + check_time = self.every_n_epochs + else: + current = runner.iter + check_time = self.every_n_iters + + if self.start is None: + if not check_time(runner, self.interval): + # No evaluation during the interval. + return False + elif (current + 1) < self.start: + # No evaluation if start is larger than the current time. + return False + else: + # Evaluation only at epochs/iters 3, 5, 7... + # if start==3 and interval==2 + if (current + 1 - self.start) % self.interval: + return False + return True + + def _save_ckpt(self, runner, key_score): + """Save the best checkpoint. + + It will compare the score according to the compare function, write + related information (best score, best checkpoint path) and save the + best checkpoint into ``work_dir``. + """ + if self.by_epoch: + current = f'epoch_{runner.epoch + 1}' + cur_type, cur_time = 'epoch', runner.epoch + 1 + else: + current = f'iter_{runner.iter + 1}' + cur_type, cur_time = 'iter', runner.iter + 1 + + best_score = runner.meta['hook_msgs'].get( + 'best_score', self.init_value_map[self.rule]) + if self.compare_func(key_score, best_score): + best_score = key_score + runner.meta['hook_msgs']['best_score'] = best_score + + if self.best_ckpt_path and self.file_client.isfile( + self.best_ckpt_path): + self.file_client.remove(self.best_ckpt_path) + runner.logger.info( + (f'The previous best checkpoint {self.best_ckpt_path} was ' + 'removed')) + + best_ckpt_name = f'best_{self.key_indicator}_{current}.pth' + self.best_ckpt_path = self.file_client.join_path( + self.out_dir, best_ckpt_name) + runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path + + runner.save_checkpoint( + self.out_dir, best_ckpt_name, create_symlink=False) + runner.logger.info( + f'Now best checkpoint is saved as {best_ckpt_name}.') + runner.logger.info( + f'Best {self.key_indicator} is {best_score:0.4f} ' + f'at {cur_time} {cur_type}.') + + def evaluate(self, runner, results): + """Evaluate the results. + + Args: + runner (:obj:`mmcv.Runner`): The underlined training runner. + results (list): Output results. 
+ """ + eval_res = self.dataloader.dataset.evaluate( + results, logger=runner.logger, **self.eval_kwargs) + + for name, val in eval_res.items(): + runner.log_buffer.output[name] = val + runner.log_buffer.ready = True + + if self.save_best is not None: + # If the performance of model is pool, the `eval_res` may be an + # empty dict and it will raise exception when `self.save_best` is + # not None. More details at + # https://github.com/open-mmlab/mmdetection/issues/6265. + if not eval_res: + warnings.warn( + 'Since `eval_res` is an empty dict, the behavior to save ' + 'the best checkpoint will be skipped in this evaluation.') + return None + + if self.key_indicator == 'auto': + # infer from eval_results + self._init_rule(self.rule, list(eval_res.keys())[0]) + return eval_res[self.key_indicator] + + return None + + +class DistEvalHook(EvalHook): + """Distributed evaluation hook. + + This hook will regularly perform evaluation in a given interval when + performing in distributed environment. + + Args: + dataloader (DataLoader): A PyTorch dataloader, whose dataset has + implemented ``evaluate`` function. + start (int | None, optional): Evaluation starting epoch. It enables + evaluation before the training starts if ``start`` <= the resuming + epoch. If None, whether to evaluate is merely decided by + ``interval``. Default: None. + interval (int): Evaluation interval. Default: 1. + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + default: True. + save_best (str, optional): If a metric is specified, it would measure + the best checkpoint during evaluation. The information about best + checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep + best score value and best checkpoint path, which will be also + loaded when resume checkpoint. Options are the evaluation metrics + on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox + detection and instance segmentation. ``AR@100`` for proposal + recall. If ``save_best`` is ``auto``, the first key of the returned + ``OrderedDict`` result will be used. Default: None. + rule (str | None, optional): Comparison rule for best score. If set to + None, it will infer a reasonable rule. Keys such as 'acc', 'top' + .etc will be inferred by 'greater' rule. Keys contain 'loss' will + be inferred by 'less' rule. Options are 'greater', 'less', None. + Default: None. + test_fn (callable, optional): test a model with samples from a + dataloader in a multi-gpu manner, and return the test results. If + ``None``, the default test function ``mmcv.engine.multi_gpu_test`` + will be used. (default: ``None``) + tmpdir (str | None): Temporary directory to save the results of all + processes. Default: None. + gpu_collect (bool): Whether to use gpu or cpu to collect results. + Default: False. + broadcast_bn_buffer (bool): Whether to broadcast the + buffer(running_mean and running_var) of rank 0 to other rank + before evaluation. Default: True. + out_dir (str, optional): The root directory to save checkpoints. If not + specified, `runner.work_dir` will be used by default. If specified, + the `out_dir` will be the concatenation of `out_dir` and the last + level directory of `runner.work_dir`. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. Default: None. + **eval_kwargs: Evaluation arguments fed into the evaluate function of + the dataset. 
+ """ + + def __init__(self, + dataloader, + start=None, + interval=1, + by_epoch=True, + save_best=None, + rule=None, + test_fn=None, + greater_keys=None, + less_keys=None, + broadcast_bn_buffer=True, + tmpdir=None, + gpu_collect=False, + out_dir=None, + file_client_args=None, + **eval_kwargs): + + if test_fn is None: + from custom_mmpkg.custom_mmcv.engine import multi_gpu_test + test_fn = multi_gpu_test + + super().__init__( + dataloader, + start=start, + interval=interval, + by_epoch=by_epoch, + save_best=save_best, + rule=rule, + test_fn=test_fn, + greater_keys=greater_keys, + less_keys=less_keys, + out_dir=out_dir, + file_client_args=file_client_args, + **eval_kwargs) + + self.broadcast_bn_buffer = broadcast_bn_buffer + self.tmpdir = tmpdir + self.gpu_collect = gpu_collect + + def _do_evaluate(self, runner): + """perform evaluation and save ckpt.""" + # Synchronization of BatchNorm's buffer (running_mean + # and running_var) is not supported in the DDP of pytorch, + # which may cause the inconsistent performance of models in + # different ranks, so we broadcast BatchNorm's buffers + # of rank 0 to other ranks to avoid this. + if self.broadcast_bn_buffer: + model = runner.model + for name, module in model.named_modules(): + if isinstance(module, + _BatchNorm) and module.track_running_stats: + dist.broadcast(module.running_var, 0) + dist.broadcast(module.running_mean, 0) + + tmpdir = self.tmpdir + if tmpdir is None: + tmpdir = osp.join(runner.work_dir, '.eval_hook') + + results = self.test_fn( + runner.model, + self.dataloader, + tmpdir=tmpdir, + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) + key_score = self.evaluate(runner, results) + # the key_score may be `None` so it needs to skip the action to + # save the best checkpoint + if self.save_best and key_score: + self._save_ckpt(runner, key_score) diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/hook.py b/custom_mmpkg/custom_mmcv/runner/hooks/hook.py new file mode 100644 index 0000000000000000000000000000000000000000..9e497e18e080f726fc95e62386248425a8848b3f --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/hook.py @@ -0,0 +1,92 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from custom_mmpkg.custom_mmcv.utils import Registry, is_method_overridden + +HOOKS = Registry('hook') + + +class Hook: + stages = ('before_run', 'before_train_epoch', 'before_train_iter', + 'after_train_iter', 'after_train_epoch', 'before_val_epoch', + 'before_val_iter', 'after_val_iter', 'after_val_epoch', + 'after_run') + + def before_run(self, runner): + pass + + def after_run(self, runner): + pass + + def before_epoch(self, runner): + pass + + def after_epoch(self, runner): + pass + + def before_iter(self, runner): + pass + + def after_iter(self, runner): + pass + + def before_train_epoch(self, runner): + self.before_epoch(runner) + + def before_val_epoch(self, runner): + self.before_epoch(runner) + + def after_train_epoch(self, runner): + self.after_epoch(runner) + + def after_val_epoch(self, runner): + self.after_epoch(runner) + + def before_train_iter(self, runner): + self.before_iter(runner) + + def before_val_iter(self, runner): + self.before_iter(runner) + + def after_train_iter(self, runner): + self.after_iter(runner) + + def after_val_iter(self, runner): + self.after_iter(runner) + + def every_n_epochs(self, runner, n): + return (runner.epoch + 1) % n == 0 if n > 0 else False + + def every_n_inner_iters(self, runner, n): + return (runner.inner_iter + 1) % n == 0 if n > 0 else False + + def every_n_iters(self, runner, n): + return (runner.iter + 1) % n == 0 if n > 0 else False + + def end_of_epoch(self, runner): + return runner.inner_iter + 1 == len(runner.data_loader) + + def is_last_epoch(self, runner): + return runner.epoch + 1 == runner._max_epochs + + def is_last_iter(self, runner): + return runner.iter + 1 == runner._max_iters + + def get_triggered_stages(self): + trigger_stages = set() + for stage in Hook.stages: + if is_method_overridden(stage, Hook, self): + trigger_stages.add(stage) + + # some methods will be triggered in multi stages + # use this dict to map method to stages. + method_stages_map = { + 'before_epoch': ['before_train_epoch', 'before_val_epoch'], + 'after_epoch': ['after_train_epoch', 'after_val_epoch'], + 'before_iter': ['before_train_iter', 'before_val_iter'], + 'after_iter': ['after_train_iter', 'after_val_iter'], + } + + for method, map_stages in method_stages_map.items(): + if is_method_overridden(method, Hook, self): + trigger_stages.update(map_stages) + + return [stage for stage in Hook.stages if stage in trigger_stages] diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/iter_timer.py b/custom_mmpkg/custom_mmcv/runner/hooks/iter_timer.py new file mode 100644 index 0000000000000000000000000000000000000000..cfd5002fe85ffc6992155ac01003878064a1d9be --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/iter_timer.py @@ -0,0 +1,18 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import time + +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class IterTimerHook(Hook): + + def before_epoch(self, runner): + self.t = time.time() + + def before_iter(self, runner): + runner.log_buffer.update({'data_time': time.time() - self.t}) + + def after_iter(self, runner): + runner.log_buffer.update({'time': time.time() - self.t}) + self.t = time.time() diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/logger/__init__.py b/custom_mmpkg/custom_mmcv/runner/hooks/logger/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a0b6b345640a895368ac8a647afef6f24333d90e --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/logger/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
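The `Hook` base class above only defines empty callback slots plus the `every_n_*`/`is_last_*` helpers, so a project-specific hook is usually just a registered subclass that overrides the stages it cares about. A sketch (the hook name and the logged quantity are made up for illustration):

```python
import torch

from custom_mmpkg.custom_mmcv.runner.hooks import HOOKS, Hook


@HOOKS.register_module()
class GpuMemoryHook(Hook):
    """Hypothetical hook: periodically log peak GPU memory."""

    def __init__(self, interval=50):
        self.interval = interval

    def after_train_iter(self, runner):
        if self.every_n_iters(runner, self.interval) and torch.cuda.is_available():
            mem_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)
            runner.logger.info(f'peak GPU memory: {mem_mb:.0f} MB')
```

Because it is registered in `HOOKS`, the same hook can also be built from a config dict such as `dict(type='GpuMemoryHook', interval=50)`.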
+from .base import LoggerHook +from .dvclive import DvcliveLoggerHook +from .mlflow import MlflowLoggerHook +from .neptune import NeptuneLoggerHook +from .pavi import PaviLoggerHook +from .tensorboard import TensorboardLoggerHook +from .text import TextLoggerHook +from .wandb import WandbLoggerHook + +__all__ = [ + 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', + 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook', + 'NeptuneLoggerHook', 'DvcliveLoggerHook' +] diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/logger/base.py b/custom_mmpkg/custom_mmcv/runner/hooks/logger/base.py new file mode 100644 index 0000000000000000000000000000000000000000..f845256729458ced821762a1b8ef881e17ff9955 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/logger/base.py @@ -0,0 +1,166 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numbers +from abc import ABCMeta, abstractmethod + +import numpy as np +import torch + +from ..hook import Hook + + +class LoggerHook(Hook): + """Base class for logger hooks. + + Args: + interval (int): Logging interval (every k iterations). + ignore_last (bool): Ignore the log of last iterations in each epoch + if less than `interval`. + reset_flag (bool): Whether to clear the output buffer after logging. + by_epoch (bool): Whether EpochBasedRunner is used. + """ + + __metaclass__ = ABCMeta + + def __init__(self, + interval=10, + ignore_last=True, + reset_flag=False, + by_epoch=True): + self.interval = interval + self.ignore_last = ignore_last + self.reset_flag = reset_flag + self.by_epoch = by_epoch + + @abstractmethod + def log(self, runner): + pass + + @staticmethod + def is_scalar(val, include_np=True, include_torch=True): + """Tell the input variable is a scalar or not. + + Args: + val: Input variable. + include_np (bool): Whether include 0-d np.ndarray as a scalar. + include_torch (bool): Whether include 0-d torch.Tensor as a scalar. + + Returns: + bool: True or False. 
+ """ + if isinstance(val, numbers.Number): + return True + elif include_np and isinstance(val, np.ndarray) and val.ndim == 0: + return True + elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1: + return True + else: + return False + + def get_mode(self, runner): + if runner.mode == 'train': + if 'time' in runner.log_buffer.output: + mode = 'train' + else: + mode = 'val' + elif runner.mode == 'val': + mode = 'val' + else: + raise ValueError(f"runner mode should be 'train' or 'val', " + f'but got {runner.mode}') + return mode + + def get_epoch(self, runner): + if runner.mode == 'train': + epoch = runner.epoch + 1 + elif runner.mode == 'val': + # normal val mode + # runner.epoch += 1 has been done before val workflow + epoch = runner.epoch + else: + raise ValueError(f"runner mode should be 'train' or 'val', " + f'but got {runner.mode}') + return epoch + + def get_iter(self, runner, inner_iter=False): + """Get the current training iteration step.""" + if self.by_epoch and inner_iter: + current_iter = runner.inner_iter + 1 + else: + current_iter = runner.iter + 1 + return current_iter + + def get_lr_tags(self, runner): + tags = {} + lrs = runner.current_lr() + if isinstance(lrs, dict): + for name, value in lrs.items(): + tags[f'learning_rate/{name}'] = value[0] + else: + tags['learning_rate'] = lrs[0] + return tags + + def get_momentum_tags(self, runner): + tags = {} + momentums = runner.current_momentum() + if isinstance(momentums, dict): + for name, value in momentums.items(): + tags[f'momentum/{name}'] = value[0] + else: + tags['momentum'] = momentums[0] + return tags + + def get_loggable_tags(self, + runner, + allow_scalar=True, + allow_text=False, + add_mode=True, + tags_to_skip=('time', 'data_time')): + tags = {} + for var, val in runner.log_buffer.output.items(): + if var in tags_to_skip: + continue + if self.is_scalar(val) and not allow_scalar: + continue + if isinstance(val, str) and not allow_text: + continue + if add_mode: + var = f'{self.get_mode(runner)}/{var}' + tags[var] = val + tags.update(self.get_lr_tags(runner)) + tags.update(self.get_momentum_tags(runner)) + return tags + + def before_run(self, runner): + for hook in runner.hooks[::-1]: + if isinstance(hook, LoggerHook): + hook.reset_flag = True + break + + def before_epoch(self, runner): + runner.log_buffer.clear() # clear logs of last epoch + + def after_train_iter(self, runner): + if self.by_epoch and self.every_n_inner_iters(runner, self.interval): + runner.log_buffer.average(self.interval) + elif not self.by_epoch and self.every_n_iters(runner, self.interval): + runner.log_buffer.average(self.interval) + elif self.end_of_epoch(runner) and not self.ignore_last: + # not precise but more stable + runner.log_buffer.average(self.interval) + + if runner.log_buffer.ready: + self.log(runner) + if self.reset_flag: + runner.log_buffer.clear_output() + + def after_train_epoch(self, runner): + if runner.log_buffer.ready: + self.log(runner) + if self.reset_flag: + runner.log_buffer.clear_output() + + def after_val_epoch(self, runner): + runner.log_buffer.average() + self.log(runner) + if self.reset_flag: + runner.log_buffer.clear_output() diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/logger/dvclive.py b/custom_mmpkg/custom_mmcv/runner/hooks/logger/dvclive.py new file mode 100644 index 0000000000000000000000000000000000000000..687cdc58c0336c92b1e4f9a410ba67ebaab2bc7a --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/logger/dvclive.py @@ -0,0 +1,58 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from ...dist_utils import master_only +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class DvcliveLoggerHook(LoggerHook): + """Class to log metrics with dvclive. + + It requires `dvclive`_ to be installed. + + Args: + path (str): Directory where dvclive will write TSV log files. + interval (int): Logging interval (every k iterations). + Default 10. + ignore_last (bool): Ignore the log of last iterations in each epoch + if less than `interval`. + Default: True. + reset_flag (bool): Whether to clear the output buffer after logging. + Default: True. + by_epoch (bool): Whether EpochBasedRunner is used. + Default: True. + + .. _dvclive: + https://dvc.org/doc/dvclive + """ + + def __init__(self, + path, + interval=10, + ignore_last=True, + reset_flag=True, + by_epoch=True): + + super(DvcliveLoggerHook, self).__init__(interval, ignore_last, + reset_flag, by_epoch) + self.path = path + self.import_dvclive() + + def import_dvclive(self): + try: + import dvclive + except ImportError: + raise ImportError( + 'Please run "pip install dvclive" to install dvclive') + self.dvclive = dvclive + + @master_only + def before_run(self, runner): + self.dvclive.init(self.path) + + @master_only + def log(self, runner): + tags = self.get_loggable_tags(runner) + if tags: + for k, v in tags.items(): + self.dvclive.log(k, v, step=self.get_iter(runner)) diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/logger/mlflow.py b/custom_mmpkg/custom_mmcv/runner/hooks/logger/mlflow.py new file mode 100644 index 0000000000000000000000000000000000000000..f9a72592be47b534ce22573775fd5a7e8e86d72d --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/logger/mlflow.py @@ -0,0 +1,78 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ...dist_utils import master_only +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class MlflowLoggerHook(LoggerHook): + + def __init__(self, + exp_name=None, + tags=None, + log_model=True, + interval=10, + ignore_last=True, + reset_flag=False, + by_epoch=True): + """Class to log metrics and (optionally) a trained model to MLflow. + + It requires `MLflow`_ to be installed. + + Args: + exp_name (str, optional): Name of the experiment to be used. + Default None. + If not None, set the active experiment. + If experiment does not exist, an experiment with provided name + will be created. + tags (dict of str: str, optional): Tags for the current run. + Default None. + If not None, set tags for the current run. + log_model (bool, optional): Whether to log an MLflow artifact. + Default True. + If True, log runner.model as an MLflow artifact + for the current run. + interval (int): Logging interval (every k iterations). + ignore_last (bool): Ignore the log of last iterations in each epoch + if less than `interval`. + reset_flag (bool): Whether to clear the output buffer after logging + by_epoch (bool): Whether EpochBasedRunner is used. + + .. 
_MLflow: + https://www.mlflow.org/docs/latest/index.html + """ + super(MlflowLoggerHook, self).__init__(interval, ignore_last, + reset_flag, by_epoch) + self.import_mlflow() + self.exp_name = exp_name + self.tags = tags + self.log_model = log_model + + def import_mlflow(self): + try: + import mlflow + import mlflow.pytorch as mlflow_pytorch + except ImportError: + raise ImportError( + 'Please run "pip install mlflow" to install mlflow') + self.mlflow = mlflow + self.mlflow_pytorch = mlflow_pytorch + + @master_only + def before_run(self, runner): + super(MlflowLoggerHook, self).before_run(runner) + if self.exp_name is not None: + self.mlflow.set_experiment(self.exp_name) + if self.tags is not None: + self.mlflow.set_tags(self.tags) + + @master_only + def log(self, runner): + tags = self.get_loggable_tags(runner) + if tags: + self.mlflow.log_metrics(tags, step=self.get_iter(runner)) + + @master_only + def after_run(self, runner): + if self.log_model: + self.mlflow_pytorch.log_model(runner.model, 'models') diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/logger/neptune.py b/custom_mmpkg/custom_mmcv/runner/hooks/logger/neptune.py new file mode 100644 index 0000000000000000000000000000000000000000..7a38772b0c93a8608f32c6357b8616e77c139dc9 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/logger/neptune.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ...dist_utils import master_only +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class NeptuneLoggerHook(LoggerHook): + """Class to log metrics to NeptuneAI. + + It requires `neptune-client` to be installed. + + Args: + init_kwargs (dict): a dict contains the initialization keys as below: + - project (str): Name of a project in a form of + namespace/project_name. If None, the value of + NEPTUNE_PROJECT environment variable will be taken. + - api_token (str): User’s API token. + If None, the value of NEPTUNE_API_TOKEN environment + variable will be taken. Note: It is strongly recommended + to use NEPTUNE_API_TOKEN environment variable rather than + placing your API token in plain text in your source code. + - name (str, optional, default is 'Untitled'): Editable name of + the run. Name is displayed in the run's Details and in + Runs table as a column. + Check https://docs.neptune.ai/api-reference/neptune#init for + more init arguments. + interval (int): Logging interval (every k iterations). + ignore_last (bool): Ignore the log of last iterations in each epoch + if less than `interval`. + reset_flag (bool): Whether to clear the output buffer after logging + by_epoch (bool): Whether EpochBasedRunner is used. + + .. 
_NeptuneAI: + https://docs.neptune.ai/you-should-know/logging-metadata + """ + + def __init__(self, + init_kwargs=None, + interval=10, + ignore_last=True, + reset_flag=True, + with_step=True, + by_epoch=True): + + super(NeptuneLoggerHook, self).__init__(interval, ignore_last, + reset_flag, by_epoch) + self.import_neptune() + self.init_kwargs = init_kwargs + self.with_step = with_step + + def import_neptune(self): + try: + import neptune.new as neptune + except ImportError: + raise ImportError( + 'Please run "pip install neptune-client" to install neptune') + self.neptune = neptune + self.run = None + + @master_only + def before_run(self, runner): + if self.init_kwargs: + self.run = self.neptune.init(**self.init_kwargs) + else: + self.run = self.neptune.init() + + @master_only + def log(self, runner): + tags = self.get_loggable_tags(runner) + if tags: + for tag_name, tag_value in tags.items(): + if self.with_step: + self.run[tag_name].log( + tag_value, step=self.get_iter(runner)) + else: + tags['global_step'] = self.get_iter(runner) + self.run[tag_name].log(tags) + + @master_only + def after_run(self, runner): + self.run.stop() diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/logger/pavi.py b/custom_mmpkg/custom_mmcv/runner/hooks/logger/pavi.py new file mode 100644 index 0000000000000000000000000000000000000000..0c5f14224cc4762cd1ef18a5d3b49d023f22a1dc --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/logger/pavi.py @@ -0,0 +1,117 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import json +import os +import os.path as osp + +import torch +import yaml + +import custom_mmpkg.custom_mmcv as mmcv +from ....parallel.utils import is_module_wrapper +from ...dist_utils import master_only +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class PaviLoggerHook(LoggerHook): + + def __init__(self, + init_kwargs=None, + add_graph=False, + add_last_ckpt=False, + interval=10, + ignore_last=True, + reset_flag=False, + by_epoch=True, + img_key='img_info'): + super(PaviLoggerHook, self).__init__(interval, ignore_last, reset_flag, + by_epoch) + self.init_kwargs = init_kwargs + self.add_graph = add_graph + self.add_last_ckpt = add_last_ckpt + self.img_key = img_key + + @master_only + def before_run(self, runner): + super(PaviLoggerHook, self).before_run(runner) + try: + from pavi import SummaryWriter + except ImportError: + raise ImportError('Please run "pip install pavi" to install pavi.') + + self.run_name = runner.work_dir.split('/')[-1] + + if not self.init_kwargs: + self.init_kwargs = dict() + self.init_kwargs['name'] = self.run_name + self.init_kwargs['model'] = runner._model_name + if runner.meta is not None: + if 'config_dict' in runner.meta: + config_dict = runner.meta['config_dict'] + assert isinstance( + config_dict, + dict), ('meta["config_dict"] has to be of a dict, ' + f'but got {type(config_dict)}') + elif 'config_file' in runner.meta: + config_file = runner.meta['config_file'] + config_dict = dict(mmcv.Config.fromfile(config_file)) + else: + config_dict = None + if config_dict is not None: + # 'max_.*iter' is parsed in pavi sdk as the maximum iterations + # to properly set up the progress bar. 
+ config_dict = config_dict.copy() + config_dict.setdefault('max_iter', runner.max_iters) + # non-serializable values are first converted in + # mmcv.dump to json + config_dict = json.loads( + mmcv.dump(config_dict, file_format='json')) + session_text = yaml.dump(config_dict) + self.init_kwargs['session_text'] = session_text + self.writer = SummaryWriter(**self.init_kwargs) + + def get_step(self, runner): + """Get the total training step/epoch.""" + if self.get_mode(runner) == 'val' and self.by_epoch: + return self.get_epoch(runner) + else: + return self.get_iter(runner) + + @master_only + def log(self, runner): + tags = self.get_loggable_tags(runner, add_mode=False) + if tags: + self.writer.add_scalars( + self.get_mode(runner), tags, self.get_step(runner)) + + @master_only + def after_run(self, runner): + if self.add_last_ckpt: + ckpt_path = osp.join(runner.work_dir, 'latest.pth') + if osp.islink(ckpt_path): + ckpt_path = osp.join(runner.work_dir, os.readlink(ckpt_path)) + + if osp.isfile(ckpt_path): + # runner.epoch += 1 has been done before `after_run`. + iteration = runner.epoch if self.by_epoch else runner.iter + return self.writer.add_snapshot_file( + tag=self.run_name, + snapshot_file_path=ckpt_path, + iteration=iteration) + + # flush the buffer and send a task ending signal to Pavi + self.writer.close() + + @master_only + def before_epoch(self, runner): + if runner.epoch == 0 and self.add_graph: + if is_module_wrapper(runner.model): + _model = runner.model.module + else: + _model = runner.model + device = next(_model.parameters()).device + data = next(iter(runner.data_loader)) + image = data[self.img_key][0:1].to(device) + with torch.no_grad(): + self.writer.add_graph(_model, image) diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/logger/tensorboard.py b/custom_mmpkg/custom_mmcv/runner/hooks/logger/tensorboard.py new file mode 100644 index 0000000000000000000000000000000000000000..cc9c727ff9776c5c8d41838f2f0676a4db56186b --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/logger/tensorboard.py @@ -0,0 +1,57 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os.path as osp + +from custom_mmpkg.custom_mmcv.utils import TORCH_VERSION, digit_version +from ...dist_utils import master_only +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class TensorboardLoggerHook(LoggerHook): + + def __init__(self, + log_dir=None, + interval=10, + ignore_last=True, + reset_flag=False, + by_epoch=True): + super(TensorboardLoggerHook, self).__init__(interval, ignore_last, + reset_flag, by_epoch) + self.log_dir = log_dir + + @master_only + def before_run(self, runner): + super(TensorboardLoggerHook, self).before_run(runner) + if (TORCH_VERSION == 'parrots' + or digit_version(TORCH_VERSION) < digit_version('1.1')): + try: + from tensorboardX import SummaryWriter + except ImportError: + raise ImportError('Please install tensorboardX to use ' + 'TensorboardLoggerHook.') + else: + try: + from torch.utils.tensorboard import SummaryWriter + except ImportError: + raise ImportError( + 'Please run "pip install future tensorboard" to install ' + 'the dependencies to use torch.utils.tensorboard ' + '(applicable to PyTorch 1.1 or higher)') + + if self.log_dir is None: + self.log_dir = osp.join(runner.work_dir, 'tf_logs') + self.writer = SummaryWriter(self.log_dir) + + @master_only + def log(self, runner): + tags = self.get_loggable_tags(runner, allow_text=True) + for tag, val in tags.items(): + if isinstance(val, str): + self.writer.add_text(tag, val, self.get_iter(runner)) + else: + self.writer.add_scalar(tag, val, self.get_iter(runner)) + + @master_only + def after_run(self, runner): + self.writer.close() diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/logger/text.py b/custom_mmpkg/custom_mmcv/runner/hooks/logger/text.py new file mode 100644 index 0000000000000000000000000000000000000000..ea12c02a96d590493ae48055196bb28798bfefff --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/logger/text.py @@ -0,0 +1,256 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import datetime +import os +import os.path as osp +from collections import OrderedDict + +import torch +import torch.distributed as dist + +import custom_mmpkg.custom_mmcv as mmcv +from custom_mmpkg.custom_mmcv.fileio.file_client import FileClient +from custom_mmpkg.custom_mmcv.utils import is_tuple_of, scandir +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class TextLoggerHook(LoggerHook): + """Logger hook in text. + + In this logger hook, the information will be printed on terminal and + saved in json file. + + Args: + by_epoch (bool, optional): Whether EpochBasedRunner is used. + Default: True. + interval (int, optional): Logging interval (every k iterations). + Default: 10. + ignore_last (bool, optional): Ignore the log of last iterations in each + epoch if less than :attr:`interval`. Default: True. + reset_flag (bool, optional): Whether to clear the output buffer after + logging. Default: False. + interval_exp_name (int, optional): Logging interval for experiment + name. This feature is to help users conveniently get the experiment + information from screen or log file. Default: 1000. + out_dir (str, optional): Logs are saved in ``runner.work_dir`` default. + If ``out_dir`` is specified, logs will be copied to a new directory + which is the concatenation of ``out_dir`` and the last level + directory of ``runner.work_dir``. Default: None. + `New in version 1.3.16.` + out_suffix (str or tuple[str], optional): Those filenames ending with + ``out_suffix`` will be copied to ``out_dir``. 
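For reference, a small self-contained sketch of what `TensorboardLoggerHook.log` does with a mixed tag dict (tag names and values are made up; `tensorboard` must be installed):

```python
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('tf_logs_demo')   # the hook defaults to <work_dir>/tf_logs
tags = {'train/loss': 0.31, 'exp_name': 'demo_run'}  # made-up values
step = 100
for tag, val in tags.items():
    if isinstance(val, str):
        writer.add_text(tag, val, step)    # string tags become text entries
    else:
        writer.add_scalar(tag, val, step)  # numeric tags become scalar curves
writer.close()
```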
+ Default: ('.log.json', '.log', '.py'). + `New in version 1.3.16.` + keep_local (bool, optional): Whether to keep local log when + :attr:`out_dir` is specified. If False, the local log will be + removed. Default: True. + `New in version 1.3.16.` + file_client_args (dict, optional): Arguments to instantiate a + FileClient. See :class:`mmcv.fileio.FileClient` for details. + Default: None. + `New in version 1.3.16.` + """ + + def __init__(self, + by_epoch=True, + interval=10, + ignore_last=True, + reset_flag=False, + interval_exp_name=1000, + out_dir=None, + out_suffix=('.log.json', '.log', '.py'), + keep_local=True, + file_client_args=None): + super(TextLoggerHook, self).__init__(interval, ignore_last, reset_flag, + by_epoch) + self.by_epoch = by_epoch + self.time_sec_tot = 0 + self.interval_exp_name = interval_exp_name + + if out_dir is None and file_client_args is not None: + raise ValueError( + 'file_client_args should be "None" when `out_dir` is not' + 'specified.') + self.out_dir = out_dir + + if not (out_dir is None or isinstance(out_dir, str) + or is_tuple_of(out_dir, str)): + raise TypeError('out_dir should be "None" or string or tuple of ' + 'string, but got {out_dir}') + self.out_suffix = out_suffix + + self.keep_local = keep_local + self.file_client_args = file_client_args + if self.out_dir is not None: + self.file_client = FileClient.infer_client(file_client_args, + self.out_dir) + + def before_run(self, runner): + super(TextLoggerHook, self).before_run(runner) + + if self.out_dir is not None: + self.file_client = FileClient.infer_client(self.file_client_args, + self.out_dir) + # The final `self.out_dir` is the concatenation of `self.out_dir` + # and the last level directory of `runner.work_dir` + basename = osp.basename(runner.work_dir.rstrip(osp.sep)) + self.out_dir = self.file_client.join_path(self.out_dir, basename) + runner.logger.info( + (f'Text logs will be saved to {self.out_dir} by ' + f'{self.file_client.name} after the training process.')) + + self.start_iter = runner.iter + self.json_log_path = osp.join(runner.work_dir, + f'{runner.timestamp}.log.json') + if runner.meta is not None: + self._dump_log(runner.meta, runner) + + def _get_max_memory(self, runner): + device = getattr(runner.model, 'output_device', None) + mem = torch.cuda.max_memory_allocated(device=device) + mem_mb = torch.tensor([mem / (1024 * 1024)], + dtype=torch.int, + device=device) + if runner.world_size > 1: + dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX) + return mem_mb.item() + + def _log_info(self, log_dict, runner): + # print exp name for users to distinguish experiments + # at every ``interval_exp_name`` iterations and the end of each epoch + if runner.meta is not None and 'exp_name' in runner.meta: + if (self.every_n_iters(runner, self.interval_exp_name)) or ( + self.by_epoch and self.end_of_epoch(runner)): + exp_info = f'Exp name: {runner.meta["exp_name"]}' + runner.logger.info(exp_info) + + if log_dict['mode'] == 'train': + if isinstance(log_dict['lr'], dict): + lr_str = [] + for k, val in log_dict['lr'].items(): + lr_str.append(f'lr_{k}: {val:.3e}') + lr_str = ' '.join(lr_str) + else: + lr_str = f'lr: {log_dict["lr"]:.3e}' + + # by epoch: Epoch [4][100/1000] + # by iter: Iter [100/100000] + if self.by_epoch: + log_str = f'Epoch [{log_dict["epoch"]}]' \ + f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t' + else: + log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t' + log_str += f'{lr_str}, ' + + if 'time' in log_dict.keys(): + self.time_sec_tot += (log_dict['time'] * self.interval) 
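The `out_dir` behaviour described in the docstring (logs copied to `out_dir` joined with the last directory of `work_dir`) can be illustrated with a standalone sketch; the paths are made up:

```python
import os.path as osp


def resolve_out_dir(out_dir, work_dir):
    # Mirrors before_run: the destination is out_dir/<basename of work_dir>.
    basename = osp.basename(work_dir.rstrip(osp.sep))
    return osp.join(out_dir, basename)


print(resolve_out_dir('/mnt/logs', '/home/user/exps/faster_rcnn_r50/'))
# -> /mnt/logs/faster_rcnn_r50
```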
+ time_sec_avg = self.time_sec_tot / ( + runner.iter - self.start_iter + 1) + eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1) + eta_str = str(datetime.timedelta(seconds=int(eta_sec))) + log_str += f'eta: {eta_str}, ' + log_str += f'time: {log_dict["time"]:.3f}, ' \ + f'data_time: {log_dict["data_time"]:.3f}, ' + # statistic memory + if torch.cuda.is_available(): + log_str += f'memory: {log_dict["memory"]}, ' + else: + # val/test time + # here 1000 is the length of the val dataloader + # by epoch: Epoch[val] [4][1000] + # by iter: Iter[val] [1000] + if self.by_epoch: + log_str = f'Epoch({log_dict["mode"]}) ' \ + f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t' + else: + log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' + + log_items = [] + for name, val in log_dict.items(): + # TODO: resolve this hack + # these items have been in log_str + if name in [ + 'mode', 'Epoch', 'iter', 'lr', 'time', 'data_time', + 'memory', 'epoch' + ]: + continue + if isinstance(val, float): + val = f'{val:.4f}' + log_items.append(f'{name}: {val}') + log_str += ', '.join(log_items) + + runner.logger.info(log_str) + + def _dump_log(self, log_dict, runner): + # dump log in json format + json_log = OrderedDict() + for k, v in log_dict.items(): + json_log[k] = self._round_float(v) + # only append log at last line + if runner.rank == 0: + with open(self.json_log_path, 'a+') as f: + mmcv.dump(json_log, f, file_format='json') + f.write('\n') + + def _round_float(self, items): + if isinstance(items, list): + return [self._round_float(item) for item in items] + elif isinstance(items, float): + return round(items, 5) + else: + return items + + def log(self, runner): + if 'eval_iter_num' in runner.log_buffer.output: + # this doesn't modify runner.iter and is regardless of by_epoch + cur_iter = runner.log_buffer.output.pop('eval_iter_num') + else: + cur_iter = self.get_iter(runner, inner_iter=True) + + log_dict = OrderedDict( + mode=self.get_mode(runner), + epoch=self.get_epoch(runner), + iter=cur_iter) + + # only record lr of the first param group + cur_lr = runner.current_lr() + if isinstance(cur_lr, list): + log_dict['lr'] = cur_lr[0] + else: + assert isinstance(cur_lr, dict) + log_dict['lr'] = {} + for k, lr_ in cur_lr.items(): + assert isinstance(lr_, list) + log_dict['lr'].update({k: lr_[0]}) + + if 'time' in runner.log_buffer.output: + # statistic memory + if torch.cuda.is_available(): + log_dict['memory'] = self._get_max_memory(runner) + + log_dict = dict(log_dict, **runner.log_buffer.output) + + self._log_info(log_dict, runner) + self._dump_log(log_dict, runner) + return log_dict + + def after_run(self, runner): + # copy or upload logs to self.out_dir + if self.out_dir is not None: + for filename in scandir(runner.work_dir, self.out_suffix, True): + local_filepath = osp.join(runner.work_dir, filename) + out_filepath = self.file_client.join_path( + self.out_dir, filename) + with open(local_filepath, 'r') as f: + self.file_client.put_text(f.read(), out_filepath) + + runner.logger.info( + (f'The file {local_filepath} has been uploaded to ' + f'{out_filepath}.')) + + if not self.keep_local: + os.remove(local_filepath) + runner.logger.info( + (f'{local_filepath} was removed due to the ' + '`self.keep_local=False`')) diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/logger/wandb.py b/custom_mmpkg/custom_mmcv/runner/hooks/logger/wandb.py new file mode 100644 index 0000000000000000000000000000000000000000..9f6808462eb79ab2b04806a5d9f0d3dd079b5ea9 --- /dev/null +++ 
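The ETA printed by `_log_info` is a simple extrapolation of the average measured per-iteration time; a standalone sketch with made-up numbers:

```python
import datetime


def eta_string(time_sec_tot, cur_iter, start_iter, max_iters):
    # Average time per iteration so far, extrapolated to the remaining iters.
    time_sec_avg = time_sec_tot / (cur_iter - start_iter + 1)
    eta_sec = time_sec_avg * (max_iters - cur_iter - 1)
    return str(datetime.timedelta(seconds=int(eta_sec)))


print(eta_string(3600.0, 999, 0, 10000))  # -> 9:00:00
```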
b/custom_mmpkg/custom_mmcv/runner/hooks/logger/wandb.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ...dist_utils import master_only +from ..hook import HOOKS +from .base import LoggerHook + + +@HOOKS.register_module() +class WandbLoggerHook(LoggerHook): + + def __init__(self, + init_kwargs=None, + interval=10, + ignore_last=True, + reset_flag=False, + commit=True, + by_epoch=True, + with_step=True): + super(WandbLoggerHook, self).__init__(interval, ignore_last, + reset_flag, by_epoch) + self.import_wandb() + self.init_kwargs = init_kwargs + self.commit = commit + self.with_step = with_step + + def import_wandb(self): + try: + import wandb + except ImportError: + raise ImportError( + 'Please run "pip install wandb" to install wandb') + self.wandb = wandb + + @master_only + def before_run(self, runner): + super(WandbLoggerHook, self).before_run(runner) + if self.wandb is None: + self.import_wandb() + if self.init_kwargs: + self.wandb.init(**self.init_kwargs) + else: + self.wandb.init() + + @master_only + def log(self, runner): + tags = self.get_loggable_tags(runner) + if tags: + if self.with_step: + self.wandb.log( + tags, step=self.get_iter(runner), commit=self.commit) + else: + tags['global_step'] = self.get_iter(runner) + self.wandb.log(tags, commit=self.commit) + + @master_only + def after_run(self, runner): + self.wandb.join() diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/lr_updater.py b/custom_mmpkg/custom_mmcv/runner/hooks/lr_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..f375932319cdbce2d50a7fc60b68ea750a60bb70 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/lr_updater.py @@ -0,0 +1,670 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import numbers +from math import cos, pi + +import custom_mmpkg.custom_mmcv as mmcv +from .hook import HOOKS, Hook + + +class LrUpdaterHook(Hook): + """LR Scheduler in MMCV. + + Args: + by_epoch (bool): LR changes epoch by epoch + warmup (string): Type of warmup used. 
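The `with_step` switch in WandbLoggerHook changes how the iteration counter reaches W&B. A hypothetical illustration, assuming `wandb` is installed (offline mode so it runs without an account; the project name and values are made up):

```python
import wandb

wandb.init(project='demo', mode='offline')
tags = {'train/loss': 0.42, 'learning_rate': 1e-3}
cur_iter = 100

# with_step=True: use the runner's iteration counter as the wandb step axis
wandb.log(tags, step=cur_iter, commit=True)

# with_step=False: record the iteration as an ordinary metric instead
wandb.log({**tags, 'global_step': cur_iter}, commit=True)
```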
It can be None(use no warmup), + 'constant', 'linear' or 'exp' + warmup_iters (int): The number of iterations or epochs that warmup + lasts + warmup_ratio (float): LR used at the beginning of warmup equals to + warmup_ratio * initial_lr + warmup_by_epoch (bool): When warmup_by_epoch == True, warmup_iters + means the number of epochs that warmup lasts, otherwise means the + number of iteration that warmup lasts + """ + + def __init__(self, + by_epoch=True, + warmup=None, + warmup_iters=0, + warmup_ratio=0.1, + warmup_by_epoch=False): + # validate the "warmup" argument + if warmup is not None: + if warmup not in ['constant', 'linear', 'exp']: + raise ValueError( + f'"{warmup}" is not a supported type for warming up, valid' + ' types are "constant" and "linear"') + if warmup is not None: + assert warmup_iters > 0, \ + '"warmup_iters" must be a positive integer' + assert 0 < warmup_ratio <= 1.0, \ + '"warmup_ratio" must be in range (0,1]' + + self.by_epoch = by_epoch + self.warmup = warmup + self.warmup_iters = warmup_iters + self.warmup_ratio = warmup_ratio + self.warmup_by_epoch = warmup_by_epoch + + if self.warmup_by_epoch: + self.warmup_epochs = self.warmup_iters + self.warmup_iters = None + else: + self.warmup_epochs = None + + self.base_lr = [] # initial lr for all param groups + self.regular_lr = [] # expected lr if no warming up is performed + + def _set_lr(self, runner, lr_groups): + if isinstance(runner.optimizer, dict): + for k, optim in runner.optimizer.items(): + for param_group, lr in zip(optim.param_groups, lr_groups[k]): + param_group['lr'] = lr + else: + for param_group, lr in zip(runner.optimizer.param_groups, + lr_groups): + param_group['lr'] = lr + + def get_lr(self, runner, base_lr): + raise NotImplementedError + + def get_regular_lr(self, runner): + if isinstance(runner.optimizer, dict): + lr_groups = {} + for k in runner.optimizer.keys(): + _lr_group = [ + self.get_lr(runner, _base_lr) + for _base_lr in self.base_lr[k] + ] + lr_groups.update({k: _lr_group}) + + return lr_groups + else: + return [self.get_lr(runner, _base_lr) for _base_lr in self.base_lr] + + def get_warmup_lr(self, cur_iters): + + def _get_warmup_lr(cur_iters, regular_lr): + if self.warmup == 'constant': + warmup_lr = [_lr * self.warmup_ratio for _lr in regular_lr] + elif self.warmup == 'linear': + k = (1 - cur_iters / self.warmup_iters) * (1 - + self.warmup_ratio) + warmup_lr = [_lr * (1 - k) for _lr in regular_lr] + elif self.warmup == 'exp': + k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters) + warmup_lr = [_lr * k for _lr in regular_lr] + return warmup_lr + + if isinstance(self.regular_lr, dict): + lr_groups = {} + for key, regular_lr in self.regular_lr.items(): + lr_groups[key] = _get_warmup_lr(cur_iters, regular_lr) + return lr_groups + else: + return _get_warmup_lr(cur_iters, self.regular_lr) + + def before_run(self, runner): + # NOTE: when resuming from a checkpoint, if 'initial_lr' is not saved, + # it will be set according to the optimizer params + if isinstance(runner.optimizer, dict): + self.base_lr = {} + for k, optim in runner.optimizer.items(): + for group in optim.param_groups: + group.setdefault('initial_lr', group['lr']) + _base_lr = [ + group['initial_lr'] for group in optim.param_groups + ] + self.base_lr.update({k: _base_lr}) + else: + for group in runner.optimizer.param_groups: + group.setdefault('initial_lr', group['lr']) + self.base_lr = [ + group['initial_lr'] for group in runner.optimizer.param_groups + ] + + def before_train_epoch(self, runner): + if self.warmup_iters 
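The three warmup rules handled by `get_warmup_lr` can be checked in isolation. A sketch for a single parameter group, with made-up numbers:

```python
def warmup_lr(mode, cur_iter, warmup_iters, warmup_ratio, regular_lr):
    # Same formulas as _get_warmup_lr above, applied to one lr value.
    if mode == 'constant':
        return regular_lr * warmup_ratio
    if mode == 'linear':
        k = (1 - cur_iter / warmup_iters) * (1 - warmup_ratio)
        return regular_lr * (1 - k)
    if mode == 'exp':
        return regular_lr * warmup_ratio ** (1 - cur_iter / warmup_iters)
    raise ValueError(mode)


# linear warmup from 10% of the lr to the full lr over 500 iterations
print(warmup_lr('linear', 0, 500, 0.1, 0.02))    # ~0.002
print(warmup_lr('linear', 250, 500, 0.1, 0.02))  # ~0.011
print(warmup_lr('linear', 500, 500, 0.1, 0.02))  # 0.02
```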
is None: + epoch_len = len(runner.data_loader) + self.warmup_iters = self.warmup_epochs * epoch_len + + if not self.by_epoch: + return + + self.regular_lr = self.get_regular_lr(runner) + self._set_lr(runner, self.regular_lr) + + def before_train_iter(self, runner): + cur_iter = runner.iter + if not self.by_epoch: + self.regular_lr = self.get_regular_lr(runner) + if self.warmup is None or cur_iter >= self.warmup_iters: + self._set_lr(runner, self.regular_lr) + else: + warmup_lr = self.get_warmup_lr(cur_iter) + self._set_lr(runner, warmup_lr) + elif self.by_epoch: + if self.warmup is None or cur_iter > self.warmup_iters: + return + elif cur_iter == self.warmup_iters: + self._set_lr(runner, self.regular_lr) + else: + warmup_lr = self.get_warmup_lr(cur_iter) + self._set_lr(runner, warmup_lr) + + +@HOOKS.register_module() +class FixedLrUpdaterHook(LrUpdaterHook): + + def __init__(self, **kwargs): + super(FixedLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + return base_lr + + +@HOOKS.register_module() +class StepLrUpdaterHook(LrUpdaterHook): + """Step LR scheduler with min_lr clipping. + + Args: + step (int | list[int]): Step to decay the LR. If an int value is given, + regard it as the decay interval. If a list is given, decay LR at + these steps. + gamma (float, optional): Decay LR ratio. Default: 0.1. + min_lr (float, optional): Minimum LR value to keep. If LR after decay + is lower than `min_lr`, it will be clipped to this value. If None + is given, we don't perform lr clipping. Default: None. + """ + + def __init__(self, step, gamma=0.1, min_lr=None, **kwargs): + if isinstance(step, list): + assert mmcv.is_list_of(step, int) + assert all([s > 0 for s in step]) + elif isinstance(step, int): + assert step > 0 + else: + raise TypeError('"step" must be a list or integer') + self.step = step + self.gamma = gamma + self.min_lr = min_lr + super(StepLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + progress = runner.epoch if self.by_epoch else runner.iter + + # calculate exponential term + if isinstance(self.step, int): + exp = progress // self.step + else: + exp = len(self.step) + for i, s in enumerate(self.step): + if progress < s: + exp = i + break + + lr = base_lr * (self.gamma**exp) + if self.min_lr is not None: + # clip to a minimum value + lr = max(lr, self.min_lr) + return lr + + +@HOOKS.register_module() +class ExpLrUpdaterHook(LrUpdaterHook): + + def __init__(self, gamma, **kwargs): + self.gamma = gamma + super(ExpLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + progress = runner.epoch if self.by_epoch else runner.iter + return base_lr * self.gamma**progress + + +@HOOKS.register_module() +class PolyLrUpdaterHook(LrUpdaterHook): + + def __init__(self, power=1., min_lr=0., **kwargs): + self.power = power + self.min_lr = min_lr + super(PolyLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + if self.by_epoch: + progress = runner.epoch + max_progress = runner.max_epochs + else: + progress = runner.iter + max_progress = runner.max_iters + coeff = (1 - progress / max_progress)**self.power + return (base_lr - self.min_lr) * coeff + self.min_lr + + +@HOOKS.register_module() +class InvLrUpdaterHook(LrUpdaterHook): + + def __init__(self, gamma, power=1., **kwargs): + self.gamma = gamma + self.power = power + super(InvLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + progress = runner.epoch if self.by_epoch else runner.iter + return base_lr * (1 + 
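In practice these LR hooks are usually selected through an `lr_config` entry whose `policy` name maps to the matching `*LrUpdaterHook`; that convention is an assumption here, not shown in this file. A hypothetical sketch for step decay with linear warmup:

```python
# Hypothetical config sketch (values are typical but made up): decay the lr
# by 10x at epochs 8 and 11, with 500 iterations of linear warmup.
lr_config = dict(
    policy='step',          # resolved to StepLrUpdaterHook
    step=[8, 11],
    gamma=0.1,
    warmup='linear',
    warmup_iters=500,
    warmup_ratio=0.001)
```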
self.gamma * progress)**(-self.power) + + +@HOOKS.register_module() +class CosineAnnealingLrUpdaterHook(LrUpdaterHook): + + def __init__(self, min_lr=None, min_lr_ratio=None, **kwargs): + assert (min_lr is None) ^ (min_lr_ratio is None) + self.min_lr = min_lr + self.min_lr_ratio = min_lr_ratio + super(CosineAnnealingLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + if self.by_epoch: + progress = runner.epoch + max_progress = runner.max_epochs + else: + progress = runner.iter + max_progress = runner.max_iters + + if self.min_lr_ratio is not None: + target_lr = base_lr * self.min_lr_ratio + else: + target_lr = self.min_lr + return annealing_cos(base_lr, target_lr, progress / max_progress) + + +@HOOKS.register_module() +class FlatCosineAnnealingLrUpdaterHook(LrUpdaterHook): + """Flat + Cosine lr schedule. + + Modified from https://github.com/fastai/fastai/blob/master/fastai/callback/schedule.py#L128 # noqa: E501 + + Args: + start_percent (float): When to start annealing the learning rate + after the percentage of the total training steps. + The value should be in range [0, 1). + Default: 0.75 + min_lr (float, optional): The minimum lr. Default: None. + min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. + Either `min_lr` or `min_lr_ratio` should be specified. + Default: None. + """ + + def __init__(self, + start_percent=0.75, + min_lr=None, + min_lr_ratio=None, + **kwargs): + assert (min_lr is None) ^ (min_lr_ratio is None) + if start_percent < 0 or start_percent > 1 or not isinstance( + start_percent, float): + raise ValueError( + 'expected float between 0 and 1 start_percent, but ' + f'got {start_percent}') + self.start_percent = start_percent + self.min_lr = min_lr + self.min_lr_ratio = min_lr_ratio + super(FlatCosineAnnealingLrUpdaterHook, self).__init__(**kwargs) + + def get_lr(self, runner, base_lr): + if self.by_epoch: + start = round(runner.max_epochs * self.start_percent) + progress = runner.epoch - start + max_progress = runner.max_epochs - start + else: + start = round(runner.max_iters * self.start_percent) + progress = runner.iter - start + max_progress = runner.max_iters - start + + if self.min_lr_ratio is not None: + target_lr = base_lr * self.min_lr_ratio + else: + target_lr = self.min_lr + + if progress < 0: + return base_lr + else: + return annealing_cos(base_lr, target_lr, progress / max_progress) + + +@HOOKS.register_module() +class CosineRestartLrUpdaterHook(LrUpdaterHook): + """Cosine annealing with restarts learning rate scheme. + + Args: + periods (list[int]): Periods for each cosine anneling cycle. + restart_weights (list[float], optional): Restart weights at each + restart iteration. Default: [1]. + min_lr (float, optional): The minimum lr. Default: None. + min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. + Either `min_lr` or `min_lr_ratio` should be specified. + Default: None. + """ + + def __init__(self, + periods, + restart_weights=[1], + min_lr=None, + min_lr_ratio=None, + **kwargs): + assert (min_lr is None) ^ (min_lr_ratio is None) + self.periods = periods + self.min_lr = min_lr + self.min_lr_ratio = min_lr_ratio + self.restart_weights = restart_weights + assert (len(self.periods) == len(self.restart_weights) + ), 'periods and restart_weights should have the same length.' 
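`CosineAnnealingLrUpdaterHook` delegates to `annealing_cos`, defined at the end of this file; a quick numeric check with the formula reproduced locally and made-up rates:

```python
from math import cos, pi


def annealing_cos(start, end, factor, weight=1):
    # Copied from the helper at the bottom of lr_updater.py.
    cos_out = cos(pi * factor) + 1
    return end + 0.5 * weight * (start - end) * cos_out


base_lr, target_lr = 0.1, 1e-4
print(annealing_cos(base_lr, target_lr, 0.0))  # 0.1     (start of training)
print(annealing_cos(base_lr, target_lr, 0.5))  # ~0.05   (halfway)
print(annealing_cos(base_lr, target_lr, 1.0))  # 0.0001  (end of training)
```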
+ super(CosineRestartLrUpdaterHook, self).__init__(**kwargs) + + self.cumulative_periods = [ + sum(self.periods[0:i + 1]) for i in range(0, len(self.periods)) + ] + + def get_lr(self, runner, base_lr): + if self.by_epoch: + progress = runner.epoch + else: + progress = runner.iter + + if self.min_lr_ratio is not None: + target_lr = base_lr * self.min_lr_ratio + else: + target_lr = self.min_lr + + idx = get_position_from_periods(progress, self.cumulative_periods) + current_weight = self.restart_weights[idx] + nearest_restart = 0 if idx == 0 else self.cumulative_periods[idx - 1] + current_periods = self.periods[idx] + + alpha = min((progress - nearest_restart) / current_periods, 1) + return annealing_cos(base_lr, target_lr, alpha, current_weight) + + +def get_position_from_periods(iteration, cumulative_periods): + """Get the position from a period list. + + It will return the index of the right-closest number in the period list. + For example, the cumulative_periods = [100, 200, 300, 400], + if iteration == 50, return 0; + if iteration == 210, return 2; + if iteration == 300, return 3. + + Args: + iteration (int): Current iteration. + cumulative_periods (list[int]): Cumulative period list. + + Returns: + int: The position of the right-closest number in the period list. + """ + for i, period in enumerate(cumulative_periods): + if iteration < period: + return i + raise ValueError(f'Current iteration {iteration} exceeds ' + f'cumulative_periods {cumulative_periods}') + + +@HOOKS.register_module() +class CyclicLrUpdaterHook(LrUpdaterHook): + """Cyclic LR Scheduler. + + Implement the cyclical learning rate policy (CLR) described in + https://arxiv.org/pdf/1506.01186.pdf + + Different from the original paper, we use cosine annealing rather than + triangular policy inside a cycle. This improves the performance in the + 3D detection area. + + Args: + by_epoch (bool): Whether to update LR by epoch. + target_ratio (tuple[float]): Relative ratio of the highest LR and the + lowest LR to the initial LR. + cyclic_times (int): Number of cycles during training + step_ratio_up (float): The ratio of the increasing process of LR in + the total cycle. + anneal_strategy (str): {'cos', 'linear'} + Specifies the annealing strategy: 'cos' for cosine annealing, + 'linear' for linear annealing. Default: 'cos'. 
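A worked example of the restart bookkeeping in `CosineRestartLrUpdaterHook.get_lr`, with made-up periods:

```python
# periods=[4000, 6000] -> cumulative_periods=[4000, 10000]; an iteration in
# the second cycle is rescaled against its own period before cosine annealing.
periods = [4000, 6000]
cumulative_periods = [sum(periods[:i + 1]) for i in range(len(periods))]

progress = 7000
idx = next(i for i, p in enumerate(cumulative_periods) if progress < p)  # 1
nearest_restart = 0 if idx == 0 else cumulative_periods[idx - 1]         # 4000
alpha = min((progress - nearest_restart) / periods[idx], 1)              # 0.5
```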
+ """ + + def __init__(self, + by_epoch=False, + target_ratio=(10, 1e-4), + cyclic_times=1, + step_ratio_up=0.4, + anneal_strategy='cos', + **kwargs): + if isinstance(target_ratio, float): + target_ratio = (target_ratio, target_ratio / 1e5) + elif isinstance(target_ratio, tuple): + target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ + if len(target_ratio) == 1 else target_ratio + else: + raise ValueError('target_ratio should be either float ' + f'or tuple, got {type(target_ratio)}') + + assert len(target_ratio) == 2, \ + '"target_ratio" must be list or tuple of two floats' + assert 0 <= step_ratio_up < 1.0, \ + '"step_ratio_up" must be in range [0,1)' + + self.target_ratio = target_ratio + self.cyclic_times = cyclic_times + self.step_ratio_up = step_ratio_up + self.lr_phases = [] # init lr_phases + # validate anneal_strategy + if anneal_strategy not in ['cos', 'linear']: + raise ValueError('anneal_strategy must be one of "cos" or ' + f'"linear", instead got {anneal_strategy}') + elif anneal_strategy == 'cos': + self.anneal_func = annealing_cos + elif anneal_strategy == 'linear': + self.anneal_func = annealing_linear + + assert not by_epoch, \ + 'currently only support "by_epoch" = False' + super(CyclicLrUpdaterHook, self).__init__(by_epoch, **kwargs) + + def before_run(self, runner): + super(CyclicLrUpdaterHook, self).before_run(runner) + # initiate lr_phases + # total lr_phases are separated as up and down + max_iter_per_phase = runner.max_iters // self.cyclic_times + iter_up_phase = int(self.step_ratio_up * max_iter_per_phase) + self.lr_phases.append( + [0, iter_up_phase, max_iter_per_phase, 1, self.target_ratio[0]]) + self.lr_phases.append([ + iter_up_phase, max_iter_per_phase, max_iter_per_phase, + self.target_ratio[0], self.target_ratio[1] + ]) + + def get_lr(self, runner, base_lr): + curr_iter = runner.iter + for (start_iter, end_iter, max_iter_per_phase, start_ratio, + end_ratio) in self.lr_phases: + curr_iter %= max_iter_per_phase + if start_iter <= curr_iter < end_iter: + progress = curr_iter - start_iter + return self.anneal_func(base_lr * start_ratio, + base_lr * end_ratio, + progress / (end_iter - start_iter)) + + +@HOOKS.register_module() +class OneCycleLrUpdaterHook(LrUpdaterHook): + """One Cycle LR Scheduler. + + The 1cycle learning rate policy changes the learning rate after every + batch. The one cycle learning rate policy is described in + https://arxiv.org/pdf/1708.07120.pdf + + Args: + max_lr (float or list): Upper learning rate boundaries in the cycle + for each parameter group. + total_steps (int, optional): The total number of steps in the cycle. + Note that if a value is not provided here, it will be the max_iter + of runner. Default: None. + pct_start (float): The percentage of the cycle (in number of steps) + spent increasing the learning rate. + Default: 0.3 + anneal_strategy (str): {'cos', 'linear'} + Specifies the annealing strategy: 'cos' for cosine annealing, + 'linear' for linear annealing. + Default: 'cos' + div_factor (float): Determines the initial learning rate via + initial_lr = max_lr/div_factor + Default: 25 + final_div_factor (float): Determines the minimum learning rate via + min_lr = initial_lr/final_div_factor + Default: 1e4 + three_phase (bool): If three_phase is True, use a third phase of the + schedule to annihilate the learning rate according to + final_div_factor instead of modifying the second phase (the first + two phases will be symmetrical about the step indicated by + pct_start). 
+ Default: False + """ + + def __init__(self, + max_lr, + total_steps=None, + pct_start=0.3, + anneal_strategy='cos', + div_factor=25, + final_div_factor=1e4, + three_phase=False, + **kwargs): + # validate by_epoch, currently only support by_epoch = False + if 'by_epoch' not in kwargs: + kwargs['by_epoch'] = False + else: + assert not kwargs['by_epoch'], \ + 'currently only support "by_epoch" = False' + if not isinstance(max_lr, (numbers.Number, list, dict)): + raise ValueError('the type of max_lr must be the one of list or ' + f'dict, but got {type(max_lr)}') + self._max_lr = max_lr + if total_steps is not None: + if not isinstance(total_steps, int): + raise ValueError('the type of total_steps must be int, but' + f'got {type(total_steps)}') + self.total_steps = total_steps + # validate pct_start + if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): + raise ValueError('expected float between 0 and 1 pct_start, but ' + f'got {pct_start}') + self.pct_start = pct_start + # validate anneal_strategy + if anneal_strategy not in ['cos', 'linear']: + raise ValueError('anneal_strategy must be one of "cos" or ' + f'"linear", instead got {anneal_strategy}') + elif anneal_strategy == 'cos': + self.anneal_func = annealing_cos + elif anneal_strategy == 'linear': + self.anneal_func = annealing_linear + self.div_factor = div_factor + self.final_div_factor = final_div_factor + self.three_phase = three_phase + self.lr_phases = [] # init lr_phases + super(OneCycleLrUpdaterHook, self).__init__(**kwargs) + + def before_run(self, runner): + if hasattr(self, 'total_steps'): + total_steps = self.total_steps + else: + total_steps = runner.max_iters + if total_steps < runner.max_iters: + raise ValueError( + 'The total steps must be greater than or equal to max ' + f'iterations {runner.max_iters} of runner, but total steps ' + f'is {total_steps}.') + + if isinstance(runner.optimizer, dict): + self.base_lr = {} + for k, optim in runner.optimizer.items(): + _max_lr = format_param(k, optim, self._max_lr) + self.base_lr[k] = [lr / self.div_factor for lr in _max_lr] + for group, lr in zip(optim.param_groups, self.base_lr[k]): + group.setdefault('initial_lr', lr) + else: + k = type(runner.optimizer).__name__ + _max_lr = format_param(k, runner.optimizer, self._max_lr) + self.base_lr = [lr / self.div_factor for lr in _max_lr] + for group, lr in zip(runner.optimizer.param_groups, self.base_lr): + group.setdefault('initial_lr', lr) + + if self.three_phase: + self.lr_phases.append( + [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) + self.lr_phases.append([ + float(2 * self.pct_start * total_steps) - 2, self.div_factor, 1 + ]) + self.lr_phases.append( + [total_steps - 1, 1, 1 / self.final_div_factor]) + else: + self.lr_phases.append( + [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) + self.lr_phases.append( + [total_steps - 1, self.div_factor, 1 / self.final_div_factor]) + + def get_lr(self, runner, base_lr): + curr_iter = runner.iter + start_iter = 0 + for i, (end_iter, start_lr, end_lr) in enumerate(self.lr_phases): + if curr_iter <= end_iter: + pct = (curr_iter - start_iter) / (end_iter - start_iter) + lr = self.anneal_func(base_lr * start_lr, base_lr * end_lr, + pct) + break + start_iter = end_iter + return lr + + +def annealing_cos(start, end, factor, weight=1): + """Calculate annealing cos learning rate. + + Cosine anneal from `weight * start + (1 - weight) * end` to `end` as + percentage goes from 0.0 to 1.0. 
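A worked example of the two-phase boundaries set up in `OneCycleLrUpdaterHook.before_run` (made-up values):

```python
max_lr, div_factor, final_div_factor = 0.01, 25, 1e4
total_steps, pct_start = 1000, 0.3

initial_lr = max_lr / div_factor        # 0.0004  (lr at iteration 0)
min_lr = initial_lr / final_div_factor  # 4e-08   (lr at the last iteration)

lr_phases = [
    # [end iter, start multiplier, end multiplier] applied to initial_lr
    [pct_start * total_steps - 1, 1, div_factor],         # 299: ramp up to max_lr
    [total_steps - 1, div_factor, 1 / final_div_factor],  # 999: anneal to min_lr
]
```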
+ + Args: + start (float): The starting learning rate of the cosine annealing. + end (float): The ending learing rate of the cosine annealing. + factor (float): The coefficient of `pi` when calculating the current + percentage. Range from 0.0 to 1.0. + weight (float, optional): The combination factor of `start` and `end` + when calculating the actual starting learning rate. Default to 1. + """ + cos_out = cos(pi * factor) + 1 + return end + 0.5 * weight * (start - end) * cos_out + + +def annealing_linear(start, end, factor): + """Calculate annealing linear learning rate. + + Linear anneal from `start` to `end` as percentage goes from 0.0 to 1.0. + + Args: + start (float): The starting learning rate of the linear annealing. + end (float): The ending learing rate of the linear annealing. + factor (float): The coefficient of `pi` when calculating the current + percentage. Range from 0.0 to 1.0. + """ + return start + (end - start) * factor + + +def format_param(name, optim, param): + if isinstance(param, numbers.Number): + return [param] * len(optim.param_groups) + elif isinstance(param, (list, tuple)): # multi param groups + if len(param) != len(optim.param_groups): + raise ValueError(f'expected {len(optim.param_groups)} ' + f'values for {name}, got {len(param)}') + return param + else: # multi optimizers + if name not in param: + raise KeyError(f'{name} is not found in {param.keys()}') + return param[name] diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/memory.py b/custom_mmpkg/custom_mmcv/runner/hooks/memory.py new file mode 100644 index 0000000000000000000000000000000000000000..70cf9a838fb314e3bd3c07aadbc00921a81e83ed --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/memory.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class EmptyCacheHook(Hook): + + def __init__(self, before_epoch=False, after_epoch=True, after_iter=False): + self._before_epoch = before_epoch + self._after_epoch = after_epoch + self._after_iter = after_iter + + def after_iter(self, runner): + if self._after_iter: + torch.cuda.empty_cache() + + def before_epoch(self, runner): + if self._before_epoch: + torch.cuda.empty_cache() + + def after_epoch(self, runner): + if self._after_epoch: + torch.cuda.empty_cache() diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/momentum_updater.py b/custom_mmpkg/custom_mmcv/runner/hooks/momentum_updater.py new file mode 100644 index 0000000000000000000000000000000000000000..29b6c7c531a24603cbfee463f23e0c310cbfff41 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/momentum_updater.py @@ -0,0 +1,493 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
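`format_param` (defined just above) accepts a scalar, a per-group list, or a dict keyed by optimizer name. A small illustration using a two-group SGD optimizer; the import path follows the vendored module location in this diff:

```python
import torch
from custom_mmpkg.custom_mmcv.runner.hooks.lr_updater import format_param

model = torch.nn.Linear(4, 2)
optim = torch.optim.SGD(
    [{'params': [model.weight]}, {'params': [model.bias]}], lr=0.1)

print(format_param('SGD', optim, 0.01))           # [0.01, 0.01] (broadcast)
print(format_param('SGD', optim, [0.01, 0.001]))  # [0.01, 0.001] (per group)
print(format_param('SGD', optim, {'SGD': 0.01}))  # 0.01 (looked up by name)
```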
+import custom_mmpkg.custom_mmcv as mmcv +from .hook import HOOKS, Hook +from .lr_updater import annealing_cos, annealing_linear, format_param + + +class MomentumUpdaterHook(Hook): + + def __init__(self, + by_epoch=True, + warmup=None, + warmup_iters=0, + warmup_ratio=0.9): + # validate the "warmup" argument + if warmup is not None: + if warmup not in ['constant', 'linear', 'exp']: + raise ValueError( + f'"{warmup}" is not a supported type for warming up, valid' + ' types are "constant" and "linear"') + if warmup is not None: + assert warmup_iters > 0, \ + '"warmup_iters" must be a positive integer' + assert 0 < warmup_ratio <= 1.0, \ + '"warmup_momentum" must be in range (0,1]' + + self.by_epoch = by_epoch + self.warmup = warmup + self.warmup_iters = warmup_iters + self.warmup_ratio = warmup_ratio + + self.base_momentum = [] # initial momentum for all param groups + self.regular_momentum = [ + ] # expected momentum if no warming up is performed + + def _set_momentum(self, runner, momentum_groups): + if isinstance(runner.optimizer, dict): + for k, optim in runner.optimizer.items(): + for param_group, mom in zip(optim.param_groups, + momentum_groups[k]): + if 'momentum' in param_group.keys(): + param_group['momentum'] = mom + elif 'betas' in param_group.keys(): + param_group['betas'] = (mom, param_group['betas'][1]) + else: + for param_group, mom in zip(runner.optimizer.param_groups, + momentum_groups): + if 'momentum' in param_group.keys(): + param_group['momentum'] = mom + elif 'betas' in param_group.keys(): + param_group['betas'] = (mom, param_group['betas'][1]) + + def get_momentum(self, runner, base_momentum): + raise NotImplementedError + + def get_regular_momentum(self, runner): + if isinstance(runner.optimizer, dict): + momentum_groups = {} + for k in runner.optimizer.keys(): + _momentum_group = [ + self.get_momentum(runner, _base_momentum) + for _base_momentum in self.base_momentum[k] + ] + momentum_groups.update({k: _momentum_group}) + return momentum_groups + else: + return [ + self.get_momentum(runner, _base_momentum) + for _base_momentum in self.base_momentum + ] + + def get_warmup_momentum(self, cur_iters): + + def _get_warmup_momentum(cur_iters, regular_momentum): + if self.warmup == 'constant': + warmup_momentum = [ + _momentum / self.warmup_ratio + for _momentum in self.regular_momentum + ] + elif self.warmup == 'linear': + k = (1 - cur_iters / self.warmup_iters) * (1 - + self.warmup_ratio) + warmup_momentum = [ + _momentum / (1 - k) for _momentum in self.regular_mom + ] + elif self.warmup == 'exp': + k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters) + warmup_momentum = [ + _momentum / k for _momentum in self.regular_mom + ] + return warmup_momentum + + if isinstance(self.regular_momentum, dict): + momentum_groups = {} + for key, regular_momentum in self.regular_momentum.items(): + momentum_groups[key] = _get_warmup_momentum( + cur_iters, regular_momentum) + return momentum_groups + else: + return _get_warmup_momentum(cur_iters, self.regular_momentum) + + def before_run(self, runner): + # NOTE: when resuming from a checkpoint, + # if 'initial_momentum' is not saved, + # it will be set according to the optimizer params + if isinstance(runner.optimizer, dict): + self.base_momentum = {} + for k, optim in runner.optimizer.items(): + for group in optim.param_groups: + if 'momentum' in group.keys(): + group.setdefault('initial_momentum', group['momentum']) + else: + group.setdefault('initial_momentum', group['betas'][0]) + _base_momentum = [ + group['initial_momentum'] 
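The momentum setters above have to handle both SGD-style `momentum` and Adam-style `betas`; the core of that branching as a standalone sketch:

```python
def set_group_momentum(param_group, mom):
    # SGD exposes 'momentum' directly; Adam-family optimizers keep the
    # equivalent value as the first element of 'betas'.
    if 'momentum' in param_group:
        param_group['momentum'] = mom
    elif 'betas' in param_group:
        param_group['betas'] = (mom, param_group['betas'][1])
```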
for group in optim.param_groups + ] + self.base_momentum.update({k: _base_momentum}) + else: + for group in runner.optimizer.param_groups: + if 'momentum' in group.keys(): + group.setdefault('initial_momentum', group['momentum']) + else: + group.setdefault('initial_momentum', group['betas'][0]) + self.base_momentum = [ + group['initial_momentum'] + for group in runner.optimizer.param_groups + ] + + def before_train_epoch(self, runner): + if not self.by_epoch: + return + self.regular_mom = self.get_regular_momentum(runner) + self._set_momentum(runner, self.regular_mom) + + def before_train_iter(self, runner): + cur_iter = runner.iter + if not self.by_epoch: + self.regular_mom = self.get_regular_momentum(runner) + if self.warmup is None or cur_iter >= self.warmup_iters: + self._set_momentum(runner, self.regular_mom) + else: + warmup_momentum = self.get_warmup_momentum(cur_iter) + self._set_momentum(runner, warmup_momentum) + elif self.by_epoch: + if self.warmup is None or cur_iter > self.warmup_iters: + return + elif cur_iter == self.warmup_iters: + self._set_momentum(runner, self.regular_mom) + else: + warmup_momentum = self.get_warmup_momentum(cur_iter) + self._set_momentum(runner, warmup_momentum) + + +@HOOKS.register_module() +class StepMomentumUpdaterHook(MomentumUpdaterHook): + """Step momentum scheduler with min value clipping. + + Args: + step (int | list[int]): Step to decay the momentum. If an int value is + given, regard it as the decay interval. If a list is given, decay + momentum at these steps. + gamma (float, optional): Decay momentum ratio. Default: 0.5. + min_momentum (float, optional): Minimum momentum value to keep. If + momentum after decay is lower than this value, it will be clipped + accordingly. If None is given, we don't perform lr clipping. + Default: None. 
+ """ + + def __init__(self, step, gamma=0.5, min_momentum=None, **kwargs): + if isinstance(step, list): + assert mmcv.is_list_of(step, int) + assert all([s > 0 for s in step]) + elif isinstance(step, int): + assert step > 0 + else: + raise TypeError('"step" must be a list or integer') + self.step = step + self.gamma = gamma + self.min_momentum = min_momentum + super(StepMomentumUpdaterHook, self).__init__(**kwargs) + + def get_momentum(self, runner, base_momentum): + progress = runner.epoch if self.by_epoch else runner.iter + + # calculate exponential term + if isinstance(self.step, int): + exp = progress // self.step + else: + exp = len(self.step) + for i, s in enumerate(self.step): + if progress < s: + exp = i + break + + momentum = base_momentum * (self.gamma**exp) + if self.min_momentum is not None: + # clip to a minimum value + momentum = max(momentum, self.min_momentum) + return momentum + + +@HOOKS.register_module() +class CosineAnnealingMomentumUpdaterHook(MomentumUpdaterHook): + + def __init__(self, min_momentum=None, min_momentum_ratio=None, **kwargs): + assert (min_momentum is None) ^ (min_momentum_ratio is None) + self.min_momentum = min_momentum + self.min_momentum_ratio = min_momentum_ratio + super(CosineAnnealingMomentumUpdaterHook, self).__init__(**kwargs) + + def get_momentum(self, runner, base_momentum): + if self.by_epoch: + progress = runner.epoch + max_progress = runner.max_epochs + else: + progress = runner.iter + max_progress = runner.max_iters + if self.min_momentum_ratio is not None: + target_momentum = base_momentum * self.min_momentum_ratio + else: + target_momentum = self.min_momentum + return annealing_cos(base_momentum, target_momentum, + progress / max_progress) + + +@HOOKS.register_module() +class CyclicMomentumUpdaterHook(MomentumUpdaterHook): + """Cyclic momentum Scheduler. + + Implement the cyclical momentum scheduler policy described in + https://arxiv.org/pdf/1708.07120.pdf + + This momentum scheduler usually used together with the CyclicLRUpdater + to improve the performance in the 3D detection area. + + Attributes: + target_ratio (tuple[float]): Relative ratio of the lowest momentum and + the highest momentum to the initial momentum. + cyclic_times (int): Number of cycles during training + step_ratio_up (float): The ratio of the increasing process of momentum + in the total cycle. + by_epoch (bool): Whether to update momentum by epoch. 
+ """ + + def __init__(self, + by_epoch=False, + target_ratio=(0.85 / 0.95, 1), + cyclic_times=1, + step_ratio_up=0.4, + **kwargs): + if isinstance(target_ratio, float): + target_ratio = (target_ratio, target_ratio / 1e5) + elif isinstance(target_ratio, tuple): + target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ + if len(target_ratio) == 1 else target_ratio + else: + raise ValueError('target_ratio should be either float ' + f'or tuple, got {type(target_ratio)}') + + assert len(target_ratio) == 2, \ + '"target_ratio" must be list or tuple of two floats' + assert 0 <= step_ratio_up < 1.0, \ + '"step_ratio_up" must be in range [0,1)' + + self.target_ratio = target_ratio + self.cyclic_times = cyclic_times + self.step_ratio_up = step_ratio_up + self.momentum_phases = [] # init momentum_phases + # currently only support by_epoch=False + assert not by_epoch, \ + 'currently only support "by_epoch" = False' + super(CyclicMomentumUpdaterHook, self).__init__(by_epoch, **kwargs) + + def before_run(self, runner): + super(CyclicMomentumUpdaterHook, self).before_run(runner) + # initiate momentum_phases + # total momentum_phases are separated as up and down + max_iter_per_phase = runner.max_iters // self.cyclic_times + iter_up_phase = int(self.step_ratio_up * max_iter_per_phase) + self.momentum_phases.append( + [0, iter_up_phase, max_iter_per_phase, 1, self.target_ratio[0]]) + self.momentum_phases.append([ + iter_up_phase, max_iter_per_phase, max_iter_per_phase, + self.target_ratio[0], self.target_ratio[1] + ]) + + def get_momentum(self, runner, base_momentum): + curr_iter = runner.iter + for (start_iter, end_iter, max_iter_per_phase, start_ratio, + end_ratio) in self.momentum_phases: + curr_iter %= max_iter_per_phase + if start_iter <= curr_iter < end_iter: + progress = curr_iter - start_iter + return annealing_cos(base_momentum * start_ratio, + base_momentum * end_ratio, + progress / (end_iter - start_iter)) + + +@HOOKS.register_module() +class OneCycleMomentumUpdaterHook(MomentumUpdaterHook): + """OneCycle momentum Scheduler. + + This momentum scheduler usually used together with the OneCycleLrUpdater + to improve the performance. + + Args: + base_momentum (float or list): Lower momentum boundaries in the cycle + for each parameter group. Note that momentum is cycled inversely + to learning rate; at the peak of a cycle, momentum is + 'base_momentum' and learning rate is 'max_lr'. + Default: 0.85 + max_momentum (float or list): Upper momentum boundaries in the cycle + for each parameter group. Functionally, + it defines the cycle amplitude (max_momentum - base_momentum). + Note that momentum is cycled inversely + to learning rate; at the start of a cycle, momentum is + 'max_momentum' and learning rate is 'base_lr' + Default: 0.95 + pct_start (float): The percentage of the cycle (in number of steps) + spent increasing the learning rate. + Default: 0.3 + anneal_strategy (str): {'cos', 'linear'} + Specifies the annealing strategy: 'cos' for cosine annealing, + 'linear' for linear annealing. + Default: 'cos' + three_phase (bool): If three_phase is True, use a third phase of the + schedule to annihilate the learning rate according to + final_div_factor instead of modifying the second phase (the first + two phases will be symmetrical about the step indicated by + pct_start). 
+ Default: False + """ + + def __init__(self, + base_momentum=0.85, + max_momentum=0.95, + pct_start=0.3, + anneal_strategy='cos', + three_phase=False, + **kwargs): + # validate by_epoch, currently only support by_epoch=False + if 'by_epoch' not in kwargs: + kwargs['by_epoch'] = False + else: + assert not kwargs['by_epoch'], \ + 'currently only support "by_epoch" = False' + if not isinstance(base_momentum, (float, list, dict)): + raise ValueError('base_momentum must be the type among of float,' + 'list or dict.') + self._base_momentum = base_momentum + if not isinstance(max_momentum, (float, list, dict)): + raise ValueError('max_momentum must be the type among of float,' + 'list or dict.') + self._max_momentum = max_momentum + # validate pct_start + if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): + raise ValueError('Expected float between 0 and 1 pct_start, but ' + f'got {pct_start}') + self.pct_start = pct_start + # validate anneal_strategy + if anneal_strategy not in ['cos', 'linear']: + raise ValueError('anneal_strategy must by one of "cos" or ' + f'"linear", instead got {anneal_strategy}') + elif anneal_strategy == 'cos': + self.anneal_func = annealing_cos + elif anneal_strategy == 'linear': + self.anneal_func = annealing_linear + self.three_phase = three_phase + self.momentum_phases = [] # init momentum_phases + super(OneCycleMomentumUpdaterHook, self).__init__(**kwargs) + + def before_run(self, runner): + if isinstance(runner.optimizer, dict): + for k, optim in runner.optimizer.items(): + if ('momentum' not in optim.defaults + and 'betas' not in optim.defaults): + raise ValueError('optimizer must support momentum with' + 'option enabled') + self.use_beta1 = 'betas' in optim.defaults + _base_momentum = format_param(k, optim, self._base_momentum) + _max_momentum = format_param(k, optim, self._max_momentum) + for group, b_momentum, m_momentum in zip( + optim.param_groups, _base_momentum, _max_momentum): + if self.use_beta1: + _, beta2 = group['betas'] + group['betas'] = (m_momentum, beta2) + else: + group['momentum'] = m_momentum + group['base_momentum'] = b_momentum + group['max_momentum'] = m_momentum + else: + optim = runner.optimizer + if ('momentum' not in optim.defaults + and 'betas' not in optim.defaults): + raise ValueError('optimizer must support momentum with' + 'option enabled') + self.use_beta1 = 'betas' in optim.defaults + k = type(optim).__name__ + _base_momentum = format_param(k, optim, self._base_momentum) + _max_momentum = format_param(k, optim, self._max_momentum) + for group, b_momentum, m_momentum in zip(optim.param_groups, + _base_momentum, + _max_momentum): + if self.use_beta1: + _, beta2 = group['betas'] + group['betas'] = (m_momentum, beta2) + else: + group['momentum'] = m_momentum + group['base_momentum'] = b_momentum + group['max_momentum'] = m_momentum + + if self.three_phase: + self.momentum_phases.append({ + 'end_iter': + float(self.pct_start * runner.max_iters) - 1, + 'start_momentum': + 'max_momentum', + 'end_momentum': + 'base_momentum' + }) + self.momentum_phases.append({ + 'end_iter': + float(2 * self.pct_start * runner.max_iters) - 2, + 'start_momentum': + 'base_momentum', + 'end_momentum': + 'max_momentum' + }) + self.momentum_phases.append({ + 'end_iter': runner.max_iters - 1, + 'start_momentum': 'max_momentum', + 'end_momentum': 'max_momentum' + }) + else: + self.momentum_phases.append({ + 'end_iter': + float(self.pct_start * runner.max_iters) - 1, + 'start_momentum': + 'max_momentum', + 'end_momentum': + 'base_momentum' + }) 
+ self.momentum_phases.append({ + 'end_iter': runner.max_iters - 1, + 'start_momentum': 'base_momentum', + 'end_momentum': 'max_momentum' + }) + + def _set_momentum(self, runner, momentum_groups): + if isinstance(runner.optimizer, dict): + for k, optim in runner.optimizer.items(): + for param_group, mom in zip(optim.param_groups, + momentum_groups[k]): + if 'momentum' in param_group.keys(): + param_group['momentum'] = mom + elif 'betas' in param_group.keys(): + param_group['betas'] = (mom, param_group['betas'][1]) + else: + for param_group, mom in zip(runner.optimizer.param_groups, + momentum_groups): + if 'momentum' in param_group.keys(): + param_group['momentum'] = mom + elif 'betas' in param_group.keys(): + param_group['betas'] = (mom, param_group['betas'][1]) + + def get_momentum(self, runner, param_group): + curr_iter = runner.iter + start_iter = 0 + for i, phase in enumerate(self.momentum_phases): + end_iter = phase['end_iter'] + if curr_iter <= end_iter or i == len(self.momentum_phases) - 1: + pct = (curr_iter - start_iter) / (end_iter - start_iter) + momentum = self.anneal_func( + param_group[phase['start_momentum']], + param_group[phase['end_momentum']], pct) + break + start_iter = end_iter + return momentum + + def get_regular_momentum(self, runner): + if isinstance(runner.optimizer, dict): + momentum_groups = {} + for k, optim in runner.optimizer.items(): + _momentum_group = [ + self.get_momentum(runner, param_group) + for param_group in optim.param_groups + ] + momentum_groups.update({k: _momentum_group}) + return momentum_groups + else: + momentum_groups = [] + for param_group in runner.optimizer.param_groups: + momentum_groups.append(self.get_momentum(runner, param_group)) + return momentum_groups diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/optimizer.py b/custom_mmpkg/custom_mmcv/runner/hooks/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..c7f111733b6d37a86dc396442e39b67a8880c99a --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/optimizer.py @@ -0,0 +1,508 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +from collections import defaultdict +from itertools import chain + +from torch.nn.utils import clip_grad + +from custom_mmpkg.custom_mmcv.utils import TORCH_VERSION, _BatchNorm, digit_version +from ..dist_utils import allreduce_grads +from ..fp16_utils import LossScaler, wrap_fp16_model +from .hook import HOOKS, Hook + +try: + # If PyTorch version >= 1.6.0, torch.cuda.amp.GradScaler would be imported + # and used; otherwise, auto fp16 will adopt mmcv's implementation. + from torch.cuda.amp import GradScaler +except ImportError: + pass + + +@HOOKS.register_module() +class OptimizerHook(Hook): + + def __init__(self, grad_clip=None): + self.grad_clip = grad_clip + + def clip_grads(self, params): + params = list( + filter(lambda p: p.requires_grad and p.grad is not None, params)) + if len(params) > 0: + return clip_grad.clip_grad_norm_(params, **self.grad_clip) + + def after_train_iter(self, runner): + runner.optimizer.zero_grad() + runner.outputs['loss'].backward() + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update({'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + runner.optimizer.step() + + +@HOOKS.register_module() +class GradientCumulativeOptimizerHook(OptimizerHook): + """Optimizer Hook implements multi-iters gradient cumulating. 
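OptimizerHook's `grad_clip` argument is forwarded as keyword arguments to `torch.nn.utils.clip_grad_norm_`; a hypothetical config sketch enabling global-norm clipping (the `optimizer_config` name is an assumed convention):

```python
# Clip gradients to a global norm of 35; grad_clip=None (the default)
# disables clipping entirely.
optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))
```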
+ + Args: + cumulative_iters (int, optional): Num of gradient cumulative iters. + The optimizer will step every `cumulative_iters` iters. + Defaults to 1. + + Examples: + >>> # Use cumulative_iters to simulate a large batch size + >>> # It is helpful when the hardware cannot handle a large batch size. + >>> loader = DataLoader(data, batch_size=64) + >>> optim_hook = GradientCumulativeOptimizerHook(cumulative_iters=4) + >>> # almost equals to + >>> loader = DataLoader(data, batch_size=256) + >>> optim_hook = OptimizerHook() + """ + + def __init__(self, cumulative_iters=1, **kwargs): + super(GradientCumulativeOptimizerHook, self).__init__(**kwargs) + + assert isinstance(cumulative_iters, int) and cumulative_iters > 0, \ + f'cumulative_iters only accepts positive int, but got ' \ + f'{type(cumulative_iters)} instead.' + + self.cumulative_iters = cumulative_iters + self.divisible_iters = 0 + self.remainder_iters = 0 + self.initialized = False + + def has_batch_norm(self, module): + if isinstance(module, _BatchNorm): + return True + for m in module.children(): + if self.has_batch_norm(m): + return True + return False + + def _init(self, runner): + if runner.iter % self.cumulative_iters != 0: + runner.logger.warning( + 'Resume iter number is not divisible by cumulative_iters in ' + 'GradientCumulativeOptimizerHook, which means the gradient of ' + 'some iters is lost and the result may be influenced slightly.' + ) + + if self.has_batch_norm(runner.model) and self.cumulative_iters > 1: + runner.logger.warning( + 'GradientCumulativeOptimizerHook may slightly decrease ' + 'performance if the model has BatchNorm layers.') + + residual_iters = runner.max_iters - runner.iter + + self.divisible_iters = ( + residual_iters // self.cumulative_iters * self.cumulative_iters) + self.remainder_iters = residual_iters - self.divisible_iters + + self.initialized = True + + def after_train_iter(self, runner): + if not self.initialized: + self._init(runner) + + if runner.iter < self.divisible_iters: + loss_factor = self.cumulative_iters + else: + loss_factor = self.remainder_iters + loss = runner.outputs['loss'] + loss = loss / loss_factor + loss.backward() + + if (self.every_n_iters(runner, self.cumulative_iters) + or self.is_last_iter(runner)): + + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update({'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + runner.optimizer.step() + runner.optimizer.zero_grad() + + +if (TORCH_VERSION != 'parrots' + and digit_version(TORCH_VERSION) >= digit_version('1.6.0')): + + @HOOKS.register_module() + class Fp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook (using PyTorch's implementation). + + If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, + to take care of the optimization procedure. + + Args: + loss_scale (float | str | dict): Scale factor configuration. + If loss_scale is a float, static loss scaling will be used with + the specified scale. If loss_scale is a string, it must be + 'dynamic', then dynamic loss scaling will be used. + It can also be a dict containing arguments of GradScalar. + Defaults to 512. For Pytorch >= 1.6, mmcv uses official + implementation of GradScaler. If you use a dict version of + loss_scale to create GradScaler, please refer to: + https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler + for the parameters. + + Examples: + >>> loss_scale = dict( + ... 
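The docstring's "simulate a large batch size" claim corresponds to the following plain training-loop pattern; the toy model and data below are made up for illustration:

```python
import torch

model = torch.nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loader = [(torch.randn(16, 8), torch.randn(16, 1)) for _ in range(8)]  # toy data
cumulative_iters = 4  # effective batch size becomes 16 * 4 = 64

for i, (x, y) in enumerate(loader):
    loss = torch.nn.functional.mse_loss(model(x), y) / cumulative_iters
    loss.backward()                        # gradients accumulate across iters
    if (i + 1) % cumulative_iters == 0:
        optimizer.step()
        optimizer.zero_grad()
```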
init_scale=65536.0, + ... growth_factor=2.0, + ... backoff_factor=0.5, + ... growth_interval=2000 + ... ) + >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale) + """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + self.distributed = distributed + self._scale_update_param = None + if loss_scale == 'dynamic': + self.loss_scaler = GradScaler() + elif isinstance(loss_scale, float): + self._scale_update_param = loss_scale + self.loss_scaler = GradScaler(init_scale=loss_scale) + elif isinstance(loss_scale, dict): + self.loss_scaler = GradScaler(**loss_scale) + else: + raise ValueError('loss_scale must be of type float, dict, or ' + f'"dynamic", got {loss_scale}') + + def before_run(self, runner): + """Preparing steps before Mixed Precision Training.""" + # wrap model mode to fp16 + wrap_fp16_model(runner.model) + # resume from state dict + if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: + scaler_state_dict = runner.meta['fp16']['loss_scaler'] + self.loss_scaler.load_state_dict(scaler_state_dict) + + def copy_grads_to_fp32(self, fp16_net, fp32_weights): + """Copy gradients from fp16 model to fp32 weight copy.""" + for fp32_param, fp16_param in zip(fp32_weights, + fp16_net.parameters()): + if fp16_param.grad is not None: + if fp32_param.grad is None: + fp32_param.grad = fp32_param.data.new( + fp32_param.size()) + fp32_param.grad.copy_(fp16_param.grad) + + def copy_params_to_fp16(self, fp16_net, fp32_weights): + """Copy updated params from fp32 weight copy to fp16 model.""" + for fp16_param, fp32_param in zip(fp16_net.parameters(), + fp32_weights): + fp16_param.data.copy_(fp32_param.data) + + def after_train_iter(self, runner): + """Backward optimization steps for Mixed Precision Training. For + dynamic loss scaling, please refer to + https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler. + + 1. Scale the loss by a scale factor. + 2. Backward the loss to obtain the gradients. + 3. Unscale the optimizer’s gradient tensors. + 4. Call optimizer.step() and update scale factor. + 5. Save loss_scaler state_dict for resume purpose. + """ + # clear grads of last iteration + runner.model.zero_grad() + runner.optimizer.zero_grad() + + self.loss_scaler.scale(runner.outputs['loss']).backward() + self.loss_scaler.unscale_(runner.optimizer) + # grad clip + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update({'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + # backward and update scaler + self.loss_scaler.step(runner.optimizer) + self.loss_scaler.update(self._scale_update_param) + + # save state_dict of loss_scaler + runner.meta.setdefault( + 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() + + @HOOKS.register_module() + class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook, + Fp16OptimizerHook): + """Fp16 optimizer Hook (using PyTorch's implementation) implements + multi-iters gradient cumulating. + + If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, + to take care of the optimization procedure. 
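The scale, backward, unscale, clip, step, update sequence performed in `after_train_iter` matches the standard `torch.cuda.amp` recipe; a standalone sketch in which the model, optimizer and loss are assumed placeholders (real fp16 requires a CUDA device):

```python
import torch

scaler = torch.cuda.amp.GradScaler(init_scale=512.)


def amp_train_step(model, optimizer, loss, max_norm=35.0):
    optimizer.zero_grad()
    scaler.scale(loss).backward()     # steps 1-2: scale the loss, backprop
    scaler.unscale_(optimizer)        # step 3: bring grads back to fp32 scale
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
    scaler.step(optimizer)            # step 4: skipped if grads are inf/nan
    scaler.update()                   # ...then the loss scale is adjusted
```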
+ """ + + def __init__(self, *args, **kwargs): + super(GradientCumulativeFp16OptimizerHook, + self).__init__(*args, **kwargs) + + def after_train_iter(self, runner): + if not self.initialized: + self._init(runner) + + if runner.iter < self.divisible_iters: + loss_factor = self.cumulative_iters + else: + loss_factor = self.remainder_iters + loss = runner.outputs['loss'] + loss = loss / loss_factor + + self.loss_scaler.scale(loss).backward() + + if (self.every_n_iters(runner, self.cumulative_iters) + or self.is_last_iter(runner)): + + # copy fp16 grads in the model to fp32 params in the optimizer + self.loss_scaler.unscale_(runner.optimizer) + + if self.grad_clip is not None: + grad_norm = self.clip_grads(runner.model.parameters()) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update( + {'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + + # backward and update scaler + self.loss_scaler.step(runner.optimizer) + self.loss_scaler.update(self._scale_update_param) + + # save state_dict of loss_scaler + runner.meta.setdefault( + 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() + + # clear grads + runner.model.zero_grad() + runner.optimizer.zero_grad() + +else: + + @HOOKS.register_module() + class Fp16OptimizerHook(OptimizerHook): + """FP16 optimizer hook (mmcv's implementation). + + The steps of fp16 optimizer is as follows. + 1. Scale the loss value. + 2. BP in the fp16 model. + 2. Copy gradients from fp16 model to fp32 weights. + 3. Update fp32 weights. + 4. Copy updated parameters from fp32 weights to fp16 model. + + Refer to https://arxiv.org/abs/1710.03740 for more details. + + Args: + loss_scale (float | str | dict): Scale factor configuration. + If loss_scale is a float, static loss scaling will be used with + the specified scale. If loss_scale is a string, it must be + 'dynamic', then dynamic loss scaling will be used. + It can also be a dict containing arguments of LossScaler. + Defaults to 512. + """ + + def __init__(self, + grad_clip=None, + coalesce=True, + bucket_size_mb=-1, + loss_scale=512., + distributed=True): + self.grad_clip = grad_clip + self.coalesce = coalesce + self.bucket_size_mb = bucket_size_mb + self.distributed = distributed + if loss_scale == 'dynamic': + self.loss_scaler = LossScaler(mode='dynamic') + elif isinstance(loss_scale, float): + self.loss_scaler = LossScaler( + init_scale=loss_scale, mode='static') + elif isinstance(loss_scale, dict): + self.loss_scaler = LossScaler(**loss_scale) + else: + raise ValueError('loss_scale must be of type float, dict, or ' + f'"dynamic", got {loss_scale}') + + def before_run(self, runner): + """Preparing steps before Mixed Precision Training. + + 1. Make a master copy of fp32 weights for optimization. + 2. Convert the main model from fp32 to fp16. 
+ """ + # keep a copy of fp32 weights + old_groups = runner.optimizer.param_groups + runner.optimizer.param_groups = copy.deepcopy( + runner.optimizer.param_groups) + state = defaultdict(dict) + p_map = { + old_p: p + for old_p, p in zip( + chain(*(g['params'] for g in old_groups)), + chain(*(g['params'] + for g in runner.optimizer.param_groups))) + } + for k, v in runner.optimizer.state.items(): + state[p_map[k]] = v + runner.optimizer.state = state + # convert model to fp16 + wrap_fp16_model(runner.model) + # resume from state dict + if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: + scaler_state_dict = runner.meta['fp16']['loss_scaler'] + self.loss_scaler.load_state_dict(scaler_state_dict) + + def copy_grads_to_fp32(self, fp16_net, fp32_weights): + """Copy gradients from fp16 model to fp32 weight copy.""" + for fp32_param, fp16_param in zip(fp32_weights, + fp16_net.parameters()): + if fp16_param.grad is not None: + if fp32_param.grad is None: + fp32_param.grad = fp32_param.data.new( + fp32_param.size()) + fp32_param.grad.copy_(fp16_param.grad) + + def copy_params_to_fp16(self, fp16_net, fp32_weights): + """Copy updated params from fp32 weight copy to fp16 model.""" + for fp16_param, fp32_param in zip(fp16_net.parameters(), + fp32_weights): + fp16_param.data.copy_(fp32_param.data) + + def after_train_iter(self, runner): + """Backward optimization steps for Mixed Precision Training. For + dynamic loss scaling, please refer `loss_scalar.py` + + 1. Scale the loss by a scale factor. + 2. Backward the loss to obtain the gradients (fp16). + 3. Copy gradients from the model to the fp32 weight copy. + 4. Scale the gradients back and update the fp32 weight copy. + 5. Copy back the params from fp32 weight copy to the fp16 model. + 6. Save loss_scaler state_dict for resume purpose. 
+ """ + # clear grads of last iteration + runner.model.zero_grad() + runner.optimizer.zero_grad() + # scale the loss value + scaled_loss = runner.outputs['loss'] * self.loss_scaler.loss_scale + scaled_loss.backward() + # copy fp16 grads in the model to fp32 params in the optimizer + + fp32_weights = [] + for param_group in runner.optimizer.param_groups: + fp32_weights += param_group['params'] + self.copy_grads_to_fp32(runner.model, fp32_weights) + # allreduce grads + if self.distributed: + allreduce_grads(fp32_weights, self.coalesce, + self.bucket_size_mb) + + has_overflow = self.loss_scaler.has_overflow(fp32_weights) + # if has overflow, skip this iteration + if not has_overflow: + # scale the gradients back + for param in fp32_weights: + if param.grad is not None: + param.grad.div_(self.loss_scaler.loss_scale) + if self.grad_clip is not None: + grad_norm = self.clip_grads(fp32_weights) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update( + {'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + # update fp32 params + runner.optimizer.step() + # copy fp32 params to the fp16 model + self.copy_params_to_fp16(runner.model, fp32_weights) + self.loss_scaler.update_scale(has_overflow) + if has_overflow: + runner.logger.warning('Check overflow, downscale loss scale ' + f'to {self.loss_scaler.cur_scale}') + + # save state_dict of loss_scaler + runner.meta.setdefault( + 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() + + @HOOKS.register_module() + class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook, + Fp16OptimizerHook): + """Fp16 optimizer Hook (using mmcv implementation) implements multi- + iters gradient cumulating.""" + + def __init__(self, *args, **kwargs): + super(GradientCumulativeFp16OptimizerHook, + self).__init__(*args, **kwargs) + + def after_train_iter(self, runner): + if not self.initialized: + self._init(runner) + + if runner.iter < self.divisible_iters: + loss_factor = self.cumulative_iters + else: + loss_factor = self.remainder_iters + + loss = runner.outputs['loss'] + loss = loss / loss_factor + + # scale the loss value + scaled_loss = loss * self.loss_scaler.loss_scale + scaled_loss.backward() + + if (self.every_n_iters(runner, self.cumulative_iters) + or self.is_last_iter(runner)): + + # copy fp16 grads in the model to fp32 params in the optimizer + fp32_weights = [] + for param_group in runner.optimizer.param_groups: + fp32_weights += param_group['params'] + self.copy_grads_to_fp32(runner.model, fp32_weights) + # allreduce grads + if self.distributed: + allreduce_grads(fp32_weights, self.coalesce, + self.bucket_size_mb) + + has_overflow = self.loss_scaler.has_overflow(fp32_weights) + # if has overflow, skip this iteration + if not has_overflow: + # scale the gradients back + for param in fp32_weights: + if param.grad is not None: + param.grad.div_(self.loss_scaler.loss_scale) + if self.grad_clip is not None: + grad_norm = self.clip_grads(fp32_weights) + if grad_norm is not None: + # Add grad norm to the logger + runner.log_buffer.update( + {'grad_norm': float(grad_norm)}, + runner.outputs['num_samples']) + # update fp32 params + runner.optimizer.step() + # copy fp32 params to the fp16 model + self.copy_params_to_fp16(runner.model, fp32_weights) + else: + runner.logger.warning( + 'Check overflow, downscale loss scale ' + f'to {self.loss_scaler.cur_scale}') + + self.loss_scaler.update_scale(has_overflow) + + # save state_dict of loss_scaler + runner.meta.setdefault( + 'fp16', {})['loss_scaler'] = 
self.loss_scaler.state_dict() + + # clear grads + runner.model.zero_grad() + runner.optimizer.zero_grad() diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/profiler.py b/custom_mmpkg/custom_mmcv/runner/hooks/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..b70236997eec59c2209ef351ae38863b4112d0ec --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/profiler.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings +from typing import Callable, List, Optional, Union + +import torch + +from ..dist_utils import master_only +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class ProfilerHook(Hook): + """Profiler to analyze performance during training. + + PyTorch Profiler is a tool that allows the collection of the performance + metrics during the training. More details on Profiler can be found at + https://pytorch.org/docs/1.8.1/profiler.html#torch.profiler.profile + + Args: + by_epoch (bool): Profile performance by epoch or by iteration. + Default: True. + profile_iters (int): Number of iterations for profiling. + If ``by_epoch=True``, profile_iters indicates that they are the + first profile_iters epochs at the beginning of the + training, otherwise it indicates the first profile_iters + iterations. Default: 1. + activities (list[str]): List of activity groups (CPU, CUDA) to use in + profiling. Default: ['cpu', 'cuda']. + schedule (dict, optional): Config of generating the callable schedule. + if schedule is None, profiler will not add step markers into the + trace and table view. Default: None. + on_trace_ready (callable, dict): Either a handler or a dict of generate + handler. Default: None. + record_shapes (bool): Save information about operator's input shapes. + Default: False. + profile_memory (bool): Track tensor memory allocation/deallocation. + Default: False. + with_stack (bool): Record source information (file and line number) + for the ops. Default: False. + with_flops (bool): Use formula to estimate the FLOPS of specific + operators (matrix multiplication and 2D convolution). + Default: False. + json_trace_path (str, optional): Exports the collected trace in Chrome + JSON format. Default: None. + + Example: + >>> runner = ... # instantiate a Runner + >>> # tensorboard trace + >>> trace_config = dict(type='tb_trace', dir_name='work_dir') + >>> profiler_config = dict(on_trace_ready=trace_config) + >>> runner.register_profiler_hook(profiler_config) + >>> runner.run(data_loaders=[trainloader], workflow=[('train', 1)]) + """ + + def __init__(self, + by_epoch: bool = True, + profile_iters: int = 1, + activities: List[str] = ['cpu', 'cuda'], + schedule: Optional[dict] = None, + on_trace_ready: Optional[Union[Callable, dict]] = None, + record_shapes: bool = False, + profile_memory: bool = False, + with_stack: bool = False, + with_flops: bool = False, + json_trace_path: Optional[str] = None) -> None: + try: + from torch import profiler # torch version >= 1.8.1 + except ImportError: + raise ImportError('profiler is the new feature of torch1.8.1, ' + f'but your version is {torch.__version__}') + + assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.' 
+ self.by_epoch = by_epoch + + if profile_iters < 1: + raise ValueError('profile_iters should be greater than 0, but got ' + f'{profile_iters}') + self.profile_iters = profile_iters + + if not isinstance(activities, list): + raise ValueError( + f'activities should be list, but got {type(activities)}') + self.activities = [] + for activity in activities: + activity = activity.lower() + if activity == 'cpu': + self.activities.append(profiler.ProfilerActivity.CPU) + elif activity == 'cuda': + self.activities.append(profiler.ProfilerActivity.CUDA) + else: + raise ValueError( + f'activity should be "cpu" or "cuda", but got {activity}') + + if schedule is not None: + self.schedule = profiler.schedule(**schedule) + else: + self.schedule = None + + self.on_trace_ready = on_trace_ready + self.record_shapes = record_shapes + self.profile_memory = profile_memory + self.with_stack = with_stack + self.with_flops = with_flops + self.json_trace_path = json_trace_path + + @master_only + def before_run(self, runner): + if self.by_epoch and runner.max_epochs < self.profile_iters: + raise ValueError('self.profile_iters should not be greater than ' + f'{runner.max_epochs}') + + if not self.by_epoch and runner.max_iters < self.profile_iters: + raise ValueError('self.profile_iters should not be greater than ' + f'{runner.max_iters}') + + if callable(self.on_trace_ready): # handler + _on_trace_ready = self.on_trace_ready + elif isinstance(self.on_trace_ready, dict): # config of handler + trace_cfg = self.on_trace_ready.copy() + trace_type = trace_cfg.pop('type') # log_trace handler + if trace_type == 'log_trace': + + def _log_handler(prof): + print(prof.key_averages().table(**trace_cfg)) + + _on_trace_ready = _log_handler + elif trace_type == 'tb_trace': # tensorboard_trace handler + try: + import torch_tb_profiler # noqa: F401 + except ImportError: + raise ImportError('please run "pip install ' + 'torch-tb-profiler" to install ' + 'torch_tb_profiler') + _on_trace_ready = torch.profiler.tensorboard_trace_handler( + **trace_cfg) + else: + raise ValueError('trace_type should be "log_trace" or ' + f'"tb_trace", but got {trace_type}') + elif self.on_trace_ready is None: + _on_trace_ready = None # type: ignore + else: + raise ValueError('on_trace_ready should be handler, dict or None, ' + f'but got {type(self.on_trace_ready)}') + + if runner.max_epochs > 1: + warnings.warn(f'profiler will profile {runner.max_epochs} epochs ' + 'instead of 1 epoch. Since profiler will slow down ' + 'the training, it is recommended to train 1 epoch ' + 'with ProfilerHook and adjust your setting according' + ' to the profiler summary. 
During normal training ' + '(epoch > 1), you may disable the ProfilerHook.') + + self.profiler = torch.profiler.profile( + activities=self.activities, + schedule=self.schedule, + on_trace_ready=_on_trace_ready, + record_shapes=self.record_shapes, + profile_memory=self.profile_memory, + with_stack=self.with_stack, + with_flops=self.with_flops) + + self.profiler.__enter__() + runner.logger.info('profiler is profiling...') + + @master_only + def after_train_epoch(self, runner): + if self.by_epoch and runner.epoch == self.profile_iters - 1: + runner.logger.info('profiler may take a few minutes...') + self.profiler.__exit__(None, None, None) + if self.json_trace_path is not None: + self.profiler.export_chrome_trace(self.json_trace_path) + + @master_only + def after_train_iter(self, runner): + self.profiler.step() + if not self.by_epoch and runner.iter == self.profile_iters - 1: + runner.logger.info('profiler may take a few minutes...') + self.profiler.__exit__(None, None, None) + if self.json_trace_path is not None: + self.profiler.export_chrome_trace(self.json_trace_path) diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/sampler_seed.py b/custom_mmpkg/custom_mmcv/runner/hooks/sampler_seed.py new file mode 100644 index 0000000000000000000000000000000000000000..ee0dc6bdd8df5775857028aaed5444c0f59caf80 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/sampler_seed.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class DistSamplerSeedHook(Hook): + """Data-loading sampler for distributed training. + + When distributed training, it is only useful in conjunction with + :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same + purpose with :obj:`IterLoader`. + """ + + def before_epoch(self, runner): + if hasattr(runner.data_loader.sampler, 'set_epoch'): + # in case the data loader uses `SequentialSampler` in Pytorch + runner.data_loader.sampler.set_epoch(runner.epoch) + elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'): + # batch sampler in pytorch warps the sampler as its attributes. + runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch) diff --git a/custom_mmpkg/custom_mmcv/runner/hooks/sync_buffer.py b/custom_mmpkg/custom_mmcv/runner/hooks/sync_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..6376b7ff894280cb2782243b25e8973650591577 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/hooks/sync_buffer.py @@ -0,0 +1,22 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from ..dist_utils import allreduce_params +from .hook import HOOKS, Hook + + +@HOOKS.register_module() +class SyncBuffersHook(Hook): + """Synchronize model buffers such as running_mean and running_var in BN at + the end of each epoch. + + Args: + distributed (bool): Whether distributed training is used. It is + effective only for distributed training. Defaults to True. + """ + + def __init__(self, distributed=True): + self.distributed = distributed + + def after_epoch(self, runner): + """All-reduce model buffers at the end of each epoch.""" + if self.distributed: + allreduce_params(runner.model.buffers()) diff --git a/custom_mmpkg/custom_mmcv/runner/iter_based_runner.py b/custom_mmpkg/custom_mmcv/runner/iter_based_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..075e4b93996c7e5c267a1cd01afd439a5ac06e53 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/iter_based_runner.py @@ -0,0 +1,273 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
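+# Usage sketch (illustrative only): the constructor keywords below follow the
+# BaseRunner signature and are assumptions for illustration, while run() and
+# register_training_hooks() are the methods defined later in this file; the
+# 'step' lr policy shown is likewise an assumed hook config.
+#
+#     runner = IterBasedRunner(model, optimizer=optimizer, work_dir='./work',
+#                              logger=logger, max_iters=10000)
+#     runner.register_training_hooks(
+#         lr_config=dict(policy='step', step=[8000]))
+#     runner.run([train_loader], workflow=[('train', 10000)])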
+import os.path as osp +import platform +import shutil +import time +import warnings + +import torch +from torch.optim import Optimizer + +import custom_mmpkg.custom_mmcv as mmcv +from .base_runner import BaseRunner +from .builder import RUNNERS +from .checkpoint import save_checkpoint +from .hooks import IterTimerHook +from .utils import get_host_info + + +class IterLoader: + + def __init__(self, dataloader): + self._dataloader = dataloader + self.iter_loader = iter(self._dataloader) + self._epoch = 0 + + @property + def epoch(self): + return self._epoch + + def __next__(self): + try: + data = next(self.iter_loader) + except StopIteration: + self._epoch += 1 + if hasattr(self._dataloader.sampler, 'set_epoch'): + self._dataloader.sampler.set_epoch(self._epoch) + time.sleep(2) # Prevent possible deadlock during epoch transition + self.iter_loader = iter(self._dataloader) + data = next(self.iter_loader) + + return data + + def __len__(self): + return len(self._dataloader) + + +@RUNNERS.register_module() +class IterBasedRunner(BaseRunner): + """Iteration-based Runner. + + This runner train models iteration by iteration. + """ + + def train(self, data_loader, **kwargs): + self.model.train() + self.mode = 'train' + self.data_loader = data_loader + self._epoch = data_loader.epoch + data_batch = next(data_loader) + self.call_hook('before_train_iter') + outputs = self.model.train_step(data_batch, self.optimizer, **kwargs) + if not isinstance(outputs, dict): + raise TypeError('model.train_step() must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) + self.outputs = outputs + self.call_hook('after_train_iter') + self._inner_iter += 1 + self._iter += 1 + + @torch.no_grad() + def val(self, data_loader, **kwargs): + self.model.eval() + self.mode = 'val' + self.data_loader = data_loader + data_batch = next(data_loader) + self.call_hook('before_val_iter') + outputs = self.model.val_step(data_batch, **kwargs) + if not isinstance(outputs, dict): + raise TypeError('model.val_step() must return a dict') + if 'log_vars' in outputs: + self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) + self.outputs = outputs + self.call_hook('after_val_iter') + self._inner_iter += 1 + + def run(self, data_loaders, workflow, max_iters=None, **kwargs): + """Start running. + + Args: + data_loaders (list[:obj:`DataLoader`]): Dataloaders for training + and validation. + workflow (list[tuple]): A list of (phase, iters) to specify the + running order and iterations. E.g, [('train', 10000), + ('val', 1000)] means running 10000 iterations for training and + 1000 iterations for validation, iteratively. 
+ """ + assert isinstance(data_loaders, list) + assert mmcv.is_list_of(workflow, tuple) + assert len(data_loaders) == len(workflow) + if max_iters is not None: + warnings.warn( + 'setting max_iters in run is deprecated, ' + 'please set max_iters in runner_config', DeprecationWarning) + self._max_iters = max_iters + assert self._max_iters is not None, ( + 'max_iters must be specified during instantiation') + + work_dir = self.work_dir if self.work_dir is not None else 'NONE' + self.logger.info('Start running, host: %s, work_dir: %s', + get_host_info(), work_dir) + self.logger.info('Hooks will be executed in the following order:\n%s', + self.get_hook_info()) + self.logger.info('workflow: %s, max: %d iters', workflow, + self._max_iters) + self.call_hook('before_run') + + iter_loaders = [IterLoader(x) for x in data_loaders] + + self.call_hook('before_epoch') + + while self.iter < self._max_iters: + for i, flow in enumerate(workflow): + self._inner_iter = 0 + mode, iters = flow + if not isinstance(mode, str) or not hasattr(self, mode): + raise ValueError( + 'runner has no method named "{}" to run a workflow'. + format(mode)) + iter_runner = getattr(self, mode) + for _ in range(iters): + if mode == 'train' and self.iter >= self._max_iters: + break + iter_runner(iter_loaders[i], **kwargs) + + time.sleep(1) # wait for some hooks like loggers to finish + self.call_hook('after_epoch') + self.call_hook('after_run') + + def resume(self, + checkpoint, + resume_optimizer=True, + map_location='default'): + """Resume model from checkpoint. + + Args: + checkpoint (str): Checkpoint to resume from. + resume_optimizer (bool, optional): Whether resume the optimizer(s) + if the checkpoint file includes optimizer(s). Default to True. + map_location (str, optional): Same as :func:`torch.load`. + Default to 'default'. + """ + if map_location == 'default': + device_id = torch.cuda.current_device() + checkpoint = self.load_checkpoint( + checkpoint, + map_location=lambda storage, loc: storage.cuda(device_id)) + else: + checkpoint = self.load_checkpoint( + checkpoint, map_location=map_location) + + self._epoch = checkpoint['meta']['epoch'] + self._iter = checkpoint['meta']['iter'] + self._inner_iter = checkpoint['meta']['iter'] + if 'optimizer' in checkpoint and resume_optimizer: + if isinstance(self.optimizer, Optimizer): + self.optimizer.load_state_dict(checkpoint['optimizer']) + elif isinstance(self.optimizer, dict): + for k in self.optimizer.keys(): + self.optimizer[k].load_state_dict( + checkpoint['optimizer'][k]) + else: + raise TypeError( + 'Optimizer should be dict or torch.optim.Optimizer ' + f'but got {type(self.optimizer)}') + + self.logger.info(f'resumed from epoch: {self.epoch}, iter {self.iter}') + + def save_checkpoint(self, + out_dir, + filename_tmpl='iter_{}.pth', + meta=None, + save_optimizer=True, + create_symlink=True): + """Save checkpoint to file. + + Args: + out_dir (str): Directory to save checkpoint files. + filename_tmpl (str, optional): Checkpoint file template. + Defaults to 'iter_{}.pth'. + meta (dict, optional): Metadata to be saved in checkpoint. + Defaults to None. + save_optimizer (bool, optional): Whether save optimizer. + Defaults to True. + create_symlink (bool, optional): Whether create symlink to the + latest checkpoint file. Defaults to True. 
+ """ + if meta is None: + meta = {} + elif not isinstance(meta, dict): + raise TypeError( + f'meta should be a dict or None, but got {type(meta)}') + if self.meta is not None: + meta.update(self.meta) + # Note: meta.update(self.meta) should be done before + # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise + # there will be problems with resumed checkpoints. + # More details in https://github.com/open-mmlab/mmcv/pull/1108 + meta.update(epoch=self.epoch + 1, iter=self.iter) + + filename = filename_tmpl.format(self.iter + 1) + filepath = osp.join(out_dir, filename) + optimizer = self.optimizer if save_optimizer else None + save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) + # in some environments, `os.symlink` is not supported, you may need to + # set `create_symlink` to False + if create_symlink: + dst_file = osp.join(out_dir, 'latest.pth') + if platform.system() != 'Windows': + mmcv.symlink(filename, dst_file) + else: + shutil.copy(filepath, dst_file) + + def register_training_hooks(self, + lr_config, + optimizer_config=None, + checkpoint_config=None, + log_config=None, + momentum_config=None, + custom_hooks_config=None): + """Register default hooks for iter-based training. + + Checkpoint hook, optimizer stepper hook and logger hooks will be set to + `by_epoch=False` by default. + + Default hooks include: + + +----------------------+-------------------------+ + | Hooks | Priority | + +======================+=========================+ + | LrUpdaterHook | VERY_HIGH (10) | + +----------------------+-------------------------+ + | MomentumUpdaterHook | HIGH (30) | + +----------------------+-------------------------+ + | OptimizerStepperHook | ABOVE_NORMAL (40) | + +----------------------+-------------------------+ + | CheckpointSaverHook | NORMAL (50) | + +----------------------+-------------------------+ + | IterTimerHook | LOW (70) | + +----------------------+-------------------------+ + | LoggerHook(s) | VERY_LOW (90) | + +----------------------+-------------------------+ + | CustomHook(s) | defaults to NORMAL (50) | + +----------------------+-------------------------+ + + If custom hooks have same priority with default hooks, custom hooks + will be triggered after default hooks. + """ + if checkpoint_config is not None: + checkpoint_config.setdefault('by_epoch', False) + if lr_config is not None: + lr_config.setdefault('by_epoch', False) + if log_config is not None: + for info in log_config['hooks']: + info.setdefault('by_epoch', False) + super(IterBasedRunner, self).register_training_hooks( + lr_config=lr_config, + momentum_config=momentum_config, + optimizer_config=optimizer_config, + checkpoint_config=checkpoint_config, + log_config=log_config, + timer_config=IterTimerHook(), + custom_hooks_config=custom_hooks_config) diff --git a/custom_mmpkg/custom_mmcv/runner/log_buffer.py b/custom_mmpkg/custom_mmcv/runner/log_buffer.py new file mode 100644 index 0000000000000000000000000000000000000000..d949e2941c5400088c7cd8a1dc893d8b233ae785 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/log_buffer.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
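+# Usage sketch (illustrative only, with made-up values): the runner pushes
+# per-iteration values into the buffer and logger hooks read the averaged
+# output computed by average().
+#
+#     buf = LogBuffer()
+#     buf.update({'loss': 0.42, 'acc': 0.91}, count=16)  # count = num samples
+#     buf.update({'loss': 0.38, 'acc': 0.93}, count=16)
+#     buf.average(n=2)       # weighted average over the last 2 records
+#     print(buf.output)      # {'loss': 0.40, 'acc': 0.92}
+#     buf.clear_output()     # reset output/ready before the next log interval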
+from collections import OrderedDict + +import numpy as np + + +class LogBuffer: + + def __init__(self): + self.val_history = OrderedDict() + self.n_history = OrderedDict() + self.output = OrderedDict() + self.ready = False + + def clear(self): + self.val_history.clear() + self.n_history.clear() + self.clear_output() + + def clear_output(self): + self.output.clear() + self.ready = False + + def update(self, vars, count=1): + assert isinstance(vars, dict) + for key, var in vars.items(): + if key not in self.val_history: + self.val_history[key] = [] + self.n_history[key] = [] + self.val_history[key].append(var) + self.n_history[key].append(count) + + def average(self, n=0): + """Average latest n values or all values.""" + assert n >= 0 + for key in self.val_history: + values = np.array(self.val_history[key][-n:]) + nums = np.array(self.n_history[key][-n:]) + avg = np.sum(values * nums) / np.sum(nums) + self.output[key] = avg + self.ready = True diff --git a/custom_mmpkg/custom_mmcv/runner/optimizer/__init__.py b/custom_mmpkg/custom_mmcv/runner/optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..53c34d0470992cbc374f29681fdd00dc0e57968d --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/optimizer/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer, + build_optimizer_constructor) +from .default_constructor import DefaultOptimizerConstructor + +__all__ = [ + 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', + 'build_optimizer', 'build_optimizer_constructor' +] diff --git a/custom_mmpkg/custom_mmcv/runner/optimizer/builder.py b/custom_mmpkg/custom_mmcv/runner/optimizer/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..f9234eed8f1f186d9d8dfda34562157ee39bdb3a --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/optimizer/builder.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy +import inspect + +import torch + +from ...utils import Registry, build_from_cfg + +OPTIMIZERS = Registry('optimizer') +OPTIMIZER_BUILDERS = Registry('optimizer builder') + + +def register_torch_optimizers(): + torch_optimizers = [] + for module_name in dir(torch.optim): + if module_name.startswith('__'): + continue + _optim = getattr(torch.optim, module_name) + if inspect.isclass(_optim) and issubclass(_optim, + torch.optim.Optimizer): + OPTIMIZERS.register_module()(_optim) + torch_optimizers.append(module_name) + return torch_optimizers + + +TORCH_OPTIMIZERS = register_torch_optimizers() + + +def build_optimizer_constructor(cfg): + return build_from_cfg(cfg, OPTIMIZER_BUILDERS) + + +def build_optimizer(model, cfg): + optimizer_cfg = copy.deepcopy(cfg) + constructor_type = optimizer_cfg.pop('constructor', + 'DefaultOptimizerConstructor') + paramwise_cfg = optimizer_cfg.pop('paramwise_cfg', None) + optim_constructor = build_optimizer_constructor( + dict( + type=constructor_type, + optimizer_cfg=optimizer_cfg, + paramwise_cfg=paramwise_cfg)) + optimizer = optim_constructor(model) + return optimizer diff --git a/custom_mmpkg/custom_mmcv/runner/optimizer/default_constructor.py b/custom_mmpkg/custom_mmcv/runner/optimizer/default_constructor.py new file mode 100644 index 0000000000000000000000000000000000000000..5901955857ab2d650907a284312c0a989de7b9a7 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/optimizer/default_constructor.py @@ -0,0 +1,249 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
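+# Config sketch (assumed values, for illustration only) showing how this
+# constructor is typically reached through build_optimizer() from ..builder:
+#
+#     optimizer = build_optimizer(model, dict(
+#         type='SGD', lr=0.02, momentum=0.9, weight_decay=1e-4,
+#         paramwise_cfg=dict(norm_decay_mult=0., bias_lr_mult=2.)))
+#
+# build_optimizer() pops 'constructor' (defaulting to
+# 'DefaultOptimizerConstructor') and 'paramwise_cfg' from the dict; the
+# remaining keys form optimizer_cfg, and paramwise_cfg is applied
+# parameter-by-parameter in add_params() below.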
+import warnings + +import torch +from torch.nn import GroupNorm, LayerNorm + +from custom_mmpkg.custom_mmcv.utils import _BatchNorm, _InstanceNorm, build_from_cfg, is_list_of +from custom_mmpkg.custom_mmcv.utils.ext_loader import check_ops_exist +from .builder import OPTIMIZER_BUILDERS, OPTIMIZERS + + +@OPTIMIZER_BUILDERS.register_module() +class DefaultOptimizerConstructor: + """Default constructor for optimizers. + + By default each parameter share the same optimizer settings, and we + provide an argument ``paramwise_cfg`` to specify parameter-wise settings. + It is a dict and may contain the following fields: + + - ``custom_keys`` (dict): Specified parameters-wise settings by keys. If + one of the keys in ``custom_keys`` is a substring of the name of one + parameter, then the setting of the parameter will be specified by + ``custom_keys[key]`` and other setting like ``bias_lr_mult`` etc. will + be ignored. It should be noted that the aforementioned ``key`` is the + longest key that is a substring of the name of the parameter. If there + are multiple matched keys with the same length, then the key with lower + alphabet order will be chosen. + ``custom_keys[key]`` should be a dict and may contain fields ``lr_mult`` + and ``decay_mult``. See Example 2 below. + - ``bias_lr_mult`` (float): It will be multiplied to the learning + rate for all bias parameters (except for those in normalization + layers and offset layers of DCN). + - ``bias_decay_mult`` (float): It will be multiplied to the weight + decay for all bias parameters (except for those in + normalization layers, depthwise conv layers, offset layers of DCN). + - ``norm_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of normalization + layers. + - ``dwconv_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of depthwise conv + layers. + - ``dcn_offset_lr_mult`` (float): It will be multiplied to the learning + rate for parameters of offset layer in the deformable convs + of a model. + - ``bypass_duplicate`` (bool): If true, the duplicate parameters + would not be added into optimizer. Default: False. + + Note: + 1. If the option ``dcn_offset_lr_mult`` is used, the constructor will + override the effect of ``bias_lr_mult`` in the bias of offset + layer. So be careful when using both ``bias_lr_mult`` and + ``dcn_offset_lr_mult``. If you wish to apply both of them to the + offset layer in deformable convs, set ``dcn_offset_lr_mult`` + to the original ``dcn_offset_lr_mult`` * ``bias_lr_mult``. + 2. If the option ``dcn_offset_lr_mult`` is used, the constructor will + apply it to all the DCN layers in the model. So be careful when + the model contains multiple DCN layers in places other than + backbone. + + Args: + model (:obj:`nn.Module`): The model with parameters to be optimized. + optimizer_cfg (dict): The config dict of the optimizer. + Positional fields are + + - `type`: class name of the optimizer. + + Optional fields are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + paramwise_cfg (dict, optional): Parameter-wise options. + + Example 1: + >>> model = torch.nn.modules.Conv1d(1, 1, 1) + >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9, + >>> weight_decay=0.0001) + >>> paramwise_cfg = dict(norm_decay_mult=0.) 
+ >>> optim_builder = DefaultOptimizerConstructor( + >>> optimizer_cfg, paramwise_cfg) + >>> optimizer = optim_builder(model) + + Example 2: + >>> # assume model have attribute model.backbone and model.cls_head + >>> optimizer_cfg = dict(type='SGD', lr=0.01, weight_decay=0.95) + >>> paramwise_cfg = dict(custom_keys={ + '.backbone': dict(lr_mult=0.1, decay_mult=0.9)}) + >>> optim_builder = DefaultOptimizerConstructor( + >>> optimizer_cfg, paramwise_cfg) + >>> optimizer = optim_builder(model) + >>> # Then the `lr` and `weight_decay` for model.backbone is + >>> # (0.01 * 0.1, 0.95 * 0.9). `lr` and `weight_decay` for + >>> # model.cls_head is (0.01, 0.95). + """ + + def __init__(self, optimizer_cfg, paramwise_cfg=None): + if not isinstance(optimizer_cfg, dict): + raise TypeError('optimizer_cfg should be a dict', + f'but got {type(optimizer_cfg)}') + self.optimizer_cfg = optimizer_cfg + self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg + self.base_lr = optimizer_cfg.get('lr', None) + self.base_wd = optimizer_cfg.get('weight_decay', None) + self._validate_cfg() + + def _validate_cfg(self): + if not isinstance(self.paramwise_cfg, dict): + raise TypeError('paramwise_cfg should be None or a dict, ' + f'but got {type(self.paramwise_cfg)}') + + if 'custom_keys' in self.paramwise_cfg: + if not isinstance(self.paramwise_cfg['custom_keys'], dict): + raise TypeError( + 'If specified, custom_keys must be a dict, ' + f'but got {type(self.paramwise_cfg["custom_keys"])}') + if self.base_wd is None: + for key in self.paramwise_cfg['custom_keys']: + if 'decay_mult' in self.paramwise_cfg['custom_keys'][key]: + raise ValueError('base_wd should not be None') + + # get base lr and weight decay + # weight_decay must be explicitly specified if mult is specified + if ('bias_decay_mult' in self.paramwise_cfg + or 'norm_decay_mult' in self.paramwise_cfg + or 'dwconv_decay_mult' in self.paramwise_cfg): + if self.base_wd is None: + raise ValueError('base_wd should not be None') + + def _is_in(self, param_group, param_group_list): + assert is_list_of(param_group_list, dict) + param = set(param_group['params']) + param_set = set() + for group in param_group_list: + param_set.update(set(group['params'])) + + return not param.isdisjoint(param_set) + + def add_params(self, params, module, prefix='', is_dcn_module=None): + """Add all parameters of module to the params list. + + The parameters of the given module will be added to the list of param + groups, with specific rules defined by paramwise_cfg. + + Args: + params (list[dict]): A list of param groups, it will be modified + in place. + module (nn.Module): The module to be added. + prefix (str): The prefix of the module + is_dcn_module (int|float|None): If the current module is a + submodule of DCN, `is_dcn_module` will be passed to + control conv_offset layer's learning rate. Defaults to None. + """ + # get param-wise options + custom_keys = self.paramwise_cfg.get('custom_keys', {}) + # first sort with alphabet order and then sort with reversed len of str + sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) + + bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.) + bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.) + norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.) + dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', 1.) + bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) + dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', 1.) 
+ + # special rules for norm layers and depth-wise conv layers + is_norm = isinstance(module, + (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) + is_dwconv = ( + isinstance(module, torch.nn.Conv2d) + and module.in_channels == module.groups) + + for name, param in module.named_parameters(recurse=False): + param_group = {'params': [param]} + if not param.requires_grad: + params.append(param_group) + continue + if bypass_duplicate and self._is_in(param_group, params): + warnings.warn(f'{prefix} is duplicate. It is skipped since ' + f'bypass_duplicate={bypass_duplicate}') + continue + # if the parameter match one of the custom keys, ignore other rules + is_custom = False + for key in sorted_keys: + if key in f'{prefix}.{name}': + is_custom = True + lr_mult = custom_keys[key].get('lr_mult', 1.) + param_group['lr'] = self.base_lr * lr_mult + if self.base_wd is not None: + decay_mult = custom_keys[key].get('decay_mult', 1.) + param_group['weight_decay'] = self.base_wd * decay_mult + break + + if not is_custom: + # bias_lr_mult affects all bias parameters + # except for norm.bias dcn.conv_offset.bias + if name == 'bias' and not (is_norm or is_dcn_module): + param_group['lr'] = self.base_lr * bias_lr_mult + + if (prefix.find('conv_offset') != -1 and is_dcn_module + and isinstance(module, torch.nn.Conv2d)): + # deal with both dcn_offset's bias & weight + param_group['lr'] = self.base_lr * dcn_offset_lr_mult + + # apply weight decay policies + if self.base_wd is not None: + # norm decay + if is_norm: + param_group[ + 'weight_decay'] = self.base_wd * norm_decay_mult + # depth-wise conv + elif is_dwconv: + param_group[ + 'weight_decay'] = self.base_wd * dwconv_decay_mult + # bias lr and decay + elif name == 'bias' and not is_dcn_module: + # TODO: current bias_decay_mult will have affect on DCN + param_group[ + 'weight_decay'] = self.base_wd * bias_decay_mult + params.append(param_group) + + if check_ops_exist(): + from custom_mmpkg.custom_mmcv.ops import DeformConv2d, ModulatedDeformConv2d + is_dcn_module = isinstance(module, + (DeformConv2d, ModulatedDeformConv2d)) + else: + is_dcn_module = False + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}.{child_name}' if prefix else child_name + self.add_params( + params, + child_mod, + prefix=child_prefix, + is_dcn_module=is_dcn_module) + + def __call__(self, model): + if hasattr(model, 'module'): + model = model.module + + optimizer_cfg = self.optimizer_cfg.copy() + # if no paramwise option is specified, just use the global setting + if not self.paramwise_cfg: + optimizer_cfg['params'] = model.parameters() + return build_from_cfg(optimizer_cfg, OPTIMIZERS) + + # set param-wise lr and weight decay recursively + params = [] + self.add_params(params, model) + optimizer_cfg['params'] = params + + return build_from_cfg(optimizer_cfg, OPTIMIZERS) diff --git a/custom_mmpkg/custom_mmcv/runner/priority.py b/custom_mmpkg/custom_mmcv/runner/priority.py new file mode 100644 index 0000000000000000000000000000000000000000..64cc4e3a05f8d5b89ab6eb32461e6e80f1d62e67 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/priority.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from enum import Enum + + +class Priority(Enum): + """Hook priority levels. 
+ + +--------------+------------+ + | Level | Value | + +==============+============+ + | HIGHEST | 0 | + +--------------+------------+ + | VERY_HIGH | 10 | + +--------------+------------+ + | HIGH | 30 | + +--------------+------------+ + | ABOVE_NORMAL | 40 | + +--------------+------------+ + | NORMAL | 50 | + +--------------+------------+ + | BELOW_NORMAL | 60 | + +--------------+------------+ + | LOW | 70 | + +--------------+------------+ + | VERY_LOW | 90 | + +--------------+------------+ + | LOWEST | 100 | + +--------------+------------+ + """ + + HIGHEST = 0 + VERY_HIGH = 10 + HIGH = 30 + ABOVE_NORMAL = 40 + NORMAL = 50 + BELOW_NORMAL = 60 + LOW = 70 + VERY_LOW = 90 + LOWEST = 100 + + +def get_priority(priority): + """Get priority value. + + Args: + priority (int or str or :obj:`Priority`): Priority. + + Returns: + int: The priority value. + """ + if isinstance(priority, int): + if priority < 0 or priority > 100: + raise ValueError('priority must be between 0 and 100') + return priority + elif isinstance(priority, Priority): + return priority.value + elif isinstance(priority, str): + return Priority[priority.upper()].value + else: + raise TypeError('priority must be an integer or Priority enum value') diff --git a/custom_mmpkg/custom_mmcv/runner/utils.py b/custom_mmpkg/custom_mmcv/runner/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..32fa4a7297f2cb10f7f2824470434aa34d8de0bb --- /dev/null +++ b/custom_mmpkg/custom_mmcv/runner/utils.py @@ -0,0 +1,93 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import random +import sys +import time +import warnings +from getpass import getuser +from socket import gethostname + +import numpy as np +import torch + +import custom_mmpkg.custom_mmcv as mmcv + + +def get_host_info(): + """Get hostname and username. + + Return empty string if exception raised, e.g. ``getpass.getuser()`` will + lead to error in docker container + """ + host = '' + try: + host = f'{getuser()}@{gethostname()}' + except Exception as e: + warnings.warn(f'Host or user not found: {str(e)}') + finally: + return host + + +def get_time_str(): + return time.strftime('%Y%m%d_%H%M%S', time.localtime()) + + +def obj_from_dict(info, parent=None, default_args=None): + """Initialize an object from dict. + + The dict must contain the key "type", which indicates the object type, it + can be either a string or type, such as "list" or ``list``. Remaining + fields are treated as the arguments for constructing the object. + + Args: + info (dict): Object types and arguments. + parent (:class:`module`): Module which may containing expected object + classes. + default_args (dict, optional): Default arguments for initializing the + object. + + Returns: + any type: Object built from the dict. + """ + assert isinstance(info, dict) and 'type' in info + assert isinstance(default_args, dict) or default_args is None + args = info.copy() + obj_type = args.pop('type') + if mmcv.is_str(obj_type): + if parent is not None: + obj_type = getattr(parent, obj_type) + else: + obj_type = sys.modules[obj_type] + elif not isinstance(obj_type, type): + raise TypeError('type must be a str or valid type, but ' + f'got {type(obj_type)}') + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + return obj_type(**args) + + +def set_random_seed(seed, deterministic=False, use_rank_shift=False): + """Set random seed. + + Args: + seed (int): Seed to be used. 
+ deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Default: False. + rank_shift (bool): Whether to add rank number to the random seed to + have different random seed in different threads. Default: False. + """ + if use_rank_shift: + rank, _ = mmcv.runner.get_dist_info() + seed += rank + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + os.environ['PYTHONHASHSEED'] = str(seed) + if deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False diff --git a/custom_mmpkg/custom_mmcv/utils/__init__.py b/custom_mmpkg/custom_mmcv/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..378a0068432a371af364de9d73785901c0f83383 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/__init__.py @@ -0,0 +1,69 @@ +# flake8: noqa +# Copyright (c) OpenMMLab. All rights reserved. +from .config import Config, ConfigDict, DictAction +from .misc import (check_prerequisites, concat_list, deprecated_api_warning, + has_method, import_modules_from_strings, is_list_of, + is_method_overridden, is_seq_of, is_str, is_tuple_of, + iter_cast, list_cast, requires_executable, requires_package, + slice_list, to_1tuple, to_2tuple, to_3tuple, to_4tuple, + to_ntuple, tuple_cast) +from .path import (check_file_exist, fopen, is_filepath, mkdir_or_exist, + scandir, symlink) +from .progressbar import (ProgressBar, track_iter_progress, + track_parallel_progress, track_progress) +from .testing import (assert_attrs_equal, assert_dict_contains_subset, + assert_dict_has_keys, assert_is_norm_layer, + assert_keys_equal, assert_params_all_zeros, + check_python_script) +from .timer import Timer, TimerError, check_time +from .version_utils import digit_version, get_git_hash + +try: + import torch +except ImportError: + __all__ = [ + 'Config', 'ConfigDict', 'DictAction', 'is_str', 'iter_cast', + 'list_cast', 'tuple_cast', 'is_seq_of', 'is_list_of', 'is_tuple_of', + 'slice_list', 'concat_list', 'check_prerequisites', 'requires_package', + 'requires_executable', 'is_filepath', 'fopen', 'check_file_exist', + 'mkdir_or_exist', 'symlink', 'scandir', 'ProgressBar', + 'track_progress', 'track_iter_progress', 'track_parallel_progress', + 'Timer', 'TimerError', 'check_time', 'deprecated_api_warning', + 'digit_version', 'get_git_hash', 'import_modules_from_strings', + 'assert_dict_contains_subset', 'assert_attrs_equal', + 'assert_dict_has_keys', 'assert_keys_equal', 'check_python_script', + 'to_1tuple', 'to_2tuple', 'to_3tuple', 'to_4tuple', 'to_ntuple', + 'is_method_overridden', 'has_method' + ] +else: + from .env import collect_env + from .logging import get_logger, print_log + from .parrots_jit import jit, skip_no_elena + from .parrots_wrapper import ( + TORCH_VERSION, BuildExtension, CppExtension, CUDAExtension, DataLoader, + PoolDataLoader, SyncBatchNorm, _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, + _AvgPoolNd, _BatchNorm, _ConvNd, _ConvTransposeMixin, _InstanceNorm, + _MaxPoolNd, get_build_config, is_rocm_pytorch, _get_cuda_home) + from .registry import Registry, build_from_cfg + from .trace import is_jit_tracing + __all__ = [ + 'Config', 'ConfigDict', 'DictAction', 'collect_env', 'get_logger', + 'print_log', 'is_str', 'iter_cast', 'list_cast', 'tuple_cast', + 'is_seq_of', 'is_list_of', 'is_tuple_of', 'slice_list', 'concat_list', + 'check_prerequisites', 
'requires_package', 'requires_executable', + 'is_filepath', 'fopen', 'check_file_exist', 'mkdir_or_exist', + 'symlink', 'scandir', 'ProgressBar', 'track_progress', + 'track_iter_progress', 'track_parallel_progress', 'Registry', + 'build_from_cfg', 'Timer', 'TimerError', 'check_time', 'SyncBatchNorm', + '_AdaptiveAvgPoolNd', '_AdaptiveMaxPoolNd', '_AvgPoolNd', '_BatchNorm', + '_ConvNd', '_ConvTransposeMixin', '_InstanceNorm', '_MaxPoolNd', + 'get_build_config', 'BuildExtension', 'CppExtension', 'CUDAExtension', + 'DataLoader', 'PoolDataLoader', 'TORCH_VERSION', + 'deprecated_api_warning', 'digit_version', 'get_git_hash', + 'import_modules_from_strings', 'jit', 'skip_no_elena', + 'assert_dict_contains_subset', 'assert_attrs_equal', + 'assert_dict_has_keys', 'assert_keys_equal', 'assert_is_norm_layer', + 'assert_params_all_zeros', 'check_python_script', + 'is_method_overridden', 'is_jit_tracing', 'is_rocm_pytorch', + '_get_cuda_home', 'has_method' + ] diff --git a/custom_mmpkg/custom_mmcv/utils/config.py b/custom_mmpkg/custom_mmcv/utils/config.py new file mode 100644 index 0000000000000000000000000000000000000000..098a706764a1c18fee26bdaae6d5898d9af23282 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/config.py @@ -0,0 +1,688 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import ast +import copy +import os +import os.path as osp +import platform +import shutil +import sys +import tempfile +import uuid +import warnings +from argparse import Action, ArgumentParser +from collections import abc +from importlib import import_module + +from addict import Dict +from yapf.yapflib.yapf_api import FormatCode + +from .misc import import_modules_from_strings +from .path import check_file_exist + +if platform.system() == 'Windows': + import regex as re +else: + import re + +BASE_KEY = '_base_' +DELETE_KEY = '_delete_' +DEPRECATION_KEY = '_deprecation_' +RESERVED_KEYS = ['filename', 'text', 'pretty_text'] + + +class ConfigDict(Dict): + + def __missing__(self, name): + raise KeyError(name) + + def __getattr__(self, name): + try: + value = super(ConfigDict, self).__getattr__(name) + except KeyError: + ex = AttributeError(f"'{self.__class__.__name__}' object has no " + f"attribute '{name}'") + except Exception as e: + ex = e + else: + return value + raise ex + + +def add_args(parser, cfg, prefix=''): + for k, v in cfg.items(): + if isinstance(v, str): + parser.add_argument('--' + prefix + k) + elif isinstance(v, int): + parser.add_argument('--' + prefix + k, type=int) + elif isinstance(v, float): + parser.add_argument('--' + prefix + k, type=float) + elif isinstance(v, bool): + parser.add_argument('--' + prefix + k, action='store_true') + elif isinstance(v, dict): + add_args(parser, v, prefix + k + '.') + elif isinstance(v, abc.Iterable): + parser.add_argument('--' + prefix + k, type=type(v[0]), nargs='+') + else: + print(f'cannot parse key {prefix + k} of type {type(v)}') + return parser + + +class Config: + """A facility for config and config files. + + It supports common file formats as configs: python/json/yaml. The interface + is the same as a dict object and also allows access config values as + attributes. 
+ + Example: + >>> cfg = Config(dict(a=1, b=dict(b1=[0, 1]))) + >>> cfg.a + 1 + >>> cfg.b + {'b1': [0, 1]} + >>> cfg.b.b1 + [0, 1] + >>> cfg = Config.fromfile('tests/data/config/a.py') + >>> cfg.filename + "/home/kchen/projects/mmcv/tests/data/config/a.py" + >>> cfg.item4 + 'test' + >>> cfg + "Config [path: /home/kchen/projects/mmcv/tests/data/config/a.py]: " + "{'item1': [1, 2], 'item2': {'a': 0}, 'item3': True, 'item4': 'test'}" + """ + + @staticmethod + def _validate_py_syntax(filename): + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + content = f.read() + try: + ast.parse(content) + except SyntaxError as e: + raise SyntaxError('There are syntax errors in config ' + f'file {filename}: {e}') + + @staticmethod + def _substitute_predefined_vars(filename, temp_config_name): + file_dirname = osp.dirname(filename) + file_basename = osp.basename(filename) + file_basename_no_extension = osp.splitext(file_basename)[0] + file_extname = osp.splitext(filename)[1] + support_templates = dict( + fileDirname=file_dirname, + fileBasename=file_basename, + fileBasenameNoExtension=file_basename_no_extension, + fileExtname=file_extname) + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + config_file = f.read() + for key, value in support_templates.items(): + regexp = r'\{\{\s*' + str(key) + r'\s*\}\}' + value = value.replace('\\', '/') + config_file = re.sub(regexp, value, config_file) + with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: + tmp_config_file.write(config_file) + + @staticmethod + def _pre_substitute_base_vars(filename, temp_config_name): + """Substitute base variable placehoders to string, so that parsing + would work.""" + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + config_file = f.read() + base_var_dict = {} + regexp = r'\{\{\s*' + BASE_KEY + r'\.([\w\.]+)\s*\}\}' + base_vars = set(re.findall(regexp, config_file)) + for base_var in base_vars: + randstr = f'_{base_var}_{uuid.uuid4().hex.lower()[:6]}' + base_var_dict[randstr] = base_var + regexp = r'\{\{\s*' + BASE_KEY + r'\.' 
+ base_var + r'\s*\}\}' + config_file = re.sub(regexp, f'"{randstr}"', config_file) + with open(temp_config_name, 'w', encoding='utf-8') as tmp_config_file: + tmp_config_file.write(config_file) + return base_var_dict + + @staticmethod + def _substitute_base_vars(cfg, base_var_dict, base_cfg): + """Substitute variable strings to their actual values.""" + cfg = copy.deepcopy(cfg) + + if isinstance(cfg, dict): + for k, v in cfg.items(): + if isinstance(v, str) and v in base_var_dict: + new_v = base_cfg + for new_k in base_var_dict[v].split('.'): + new_v = new_v[new_k] + cfg[k] = new_v + elif isinstance(v, (list, tuple, dict)): + cfg[k] = Config._substitute_base_vars( + v, base_var_dict, base_cfg) + elif isinstance(cfg, tuple): + cfg = tuple( + Config._substitute_base_vars(c, base_var_dict, base_cfg) + for c in cfg) + elif isinstance(cfg, list): + cfg = [ + Config._substitute_base_vars(c, base_var_dict, base_cfg) + for c in cfg + ] + elif isinstance(cfg, str) and cfg in base_var_dict: + new_v = base_cfg + for new_k in base_var_dict[cfg].split('.'): + new_v = new_v[new_k] + cfg = new_v + + return cfg + + @staticmethod + def _file2dict(filename, use_predefined_variables=True): + filename = osp.abspath(osp.expanduser(filename)) + check_file_exist(filename) + fileExtname = osp.splitext(filename)[1] + if fileExtname not in ['.py', '.json', '.yaml', '.yml']: + raise IOError('Only py/yml/yaml/json type are supported now!') + + with tempfile.TemporaryDirectory() as temp_config_dir: + temp_config_file = tempfile.NamedTemporaryFile( + dir=temp_config_dir, suffix=fileExtname) + if platform.system() == 'Windows': + temp_config_file.close() + temp_config_name = osp.basename(temp_config_file.name) + # Substitute predefined variables + if use_predefined_variables: + Config._substitute_predefined_vars(filename, + temp_config_file.name) + else: + shutil.copyfile(filename, temp_config_file.name) + # Substitute base variables from placeholders to strings + base_var_dict = Config._pre_substitute_base_vars( + temp_config_file.name, temp_config_file.name) + + if filename.endswith('.py'): + temp_module_name = osp.splitext(temp_config_name)[0] + sys.path.insert(0, temp_config_dir) + Config._validate_py_syntax(filename) + mod = import_module(temp_module_name) + sys.path.pop(0) + cfg_dict = { + name: value + for name, value in mod.__dict__.items() + if not name.startswith('__') + } + # delete imported module + del sys.modules[temp_module_name] + elif filename.endswith(('.yml', '.yaml', '.json')): + import custom_mmpkg.custom_mmcv as mmcv + cfg_dict = mmcv.load(temp_config_file.name) + # close temp file + temp_config_file.close() + + # check deprecation information + if DEPRECATION_KEY in cfg_dict: + deprecation_info = cfg_dict.pop(DEPRECATION_KEY) + warning_msg = f'The config file {filename} will be deprecated ' \ + 'in the future.' + if 'expected' in deprecation_info: + warning_msg += f' Please use {deprecation_info["expected"]} ' \ + 'instead.' 
+ if 'reference' in deprecation_info: + warning_msg += ' More information can be found at ' \ + f'{deprecation_info["reference"]}' + warnings.warn(warning_msg) + + cfg_text = filename + '\n' + with open(filename, 'r', encoding='utf-8') as f: + # Setting encoding explicitly to resolve coding issue on windows + cfg_text += f.read() + + if BASE_KEY in cfg_dict: + cfg_dir = osp.dirname(filename) + base_filename = cfg_dict.pop(BASE_KEY) + base_filename = base_filename if isinstance( + base_filename, list) else [base_filename] + + cfg_dict_list = list() + cfg_text_list = list() + for f in base_filename: + _cfg_dict, _cfg_text = Config._file2dict(osp.join(cfg_dir, f)) + cfg_dict_list.append(_cfg_dict) + cfg_text_list.append(_cfg_text) + + base_cfg_dict = dict() + for c in cfg_dict_list: + duplicate_keys = base_cfg_dict.keys() & c.keys() + if len(duplicate_keys) > 0: + raise KeyError('Duplicate key is not allowed among bases. ' + f'Duplicate keys: {duplicate_keys}') + base_cfg_dict.update(c) + + # Substitute base variables from strings to their actual values + cfg_dict = Config._substitute_base_vars(cfg_dict, base_var_dict, + base_cfg_dict) + + base_cfg_dict = Config._merge_a_into_b(cfg_dict, base_cfg_dict) + cfg_dict = base_cfg_dict + + # merge cfg_text + cfg_text_list.append(cfg_text) + cfg_text = '\n'.join(cfg_text_list) + + return cfg_dict, cfg_text + + @staticmethod + def _merge_a_into_b(a, b, allow_list_keys=False): + """merge dict ``a`` into dict ``b`` (non-inplace). + + Values in ``a`` will overwrite ``b``. ``b`` is copied first to avoid + in-place modifications. + + Args: + a (dict): The source dict to be merged into ``b``. + b (dict): The origin dict to be fetch keys from ``a``. + allow_list_keys (bool): If True, int string keys (e.g. '0', '1') + are allowed in source ``a`` and will replace the element of the + corresponding index in b if b is a list. Default: False. + + Returns: + dict: The modified dict of ``b`` using ``a``. + + Examples: + # Normally merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # Delete b first and merge a into b. + >>> Config._merge_a_into_b( + ... dict(obj=dict(_delete_=True, a=2)), dict(obj=dict(a=1))) + {'obj': {'a': 2}} + + # b is a list + >>> Config._merge_a_into_b( + ... {'0': dict(a=2)}, [dict(a=1), dict(b=2)], True) + [{'a': 2}, {'b': 2}] + """ + b = b.copy() + for k, v in a.items(): + if allow_list_keys and k.isdigit() and isinstance(b, list): + k = int(k) + if len(b) <= k: + raise KeyError(f'Index {k} exceeds the length of list {b}') + b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) + elif isinstance(v, + dict) and k in b and not v.pop(DELETE_KEY, False): + allowed_types = (dict, list) if allow_list_keys else dict + if not isinstance(b[k], allowed_types): + raise TypeError( + f'{k}={v} in child config cannot inherit from base ' + f'because {k} is a dict in the child config but is of ' + f'type {type(b[k])} in base config. 
You may set ' + f'`{DELETE_KEY}=True` to ignore the base config') + b[k] = Config._merge_a_into_b(v, b[k], allow_list_keys) + else: + b[k] = v + return b + + @staticmethod + def fromfile(filename, + use_predefined_variables=True, + import_custom_modules=True): + cfg_dict, cfg_text = Config._file2dict(filename, + use_predefined_variables) + if import_custom_modules and cfg_dict.get('custom_imports', None): + import_modules_from_strings(**cfg_dict['custom_imports']) + return Config(cfg_dict, cfg_text=cfg_text, filename=filename) + + @staticmethod + def fromstring(cfg_str, file_format): + """Generate config from config str. + + Args: + cfg_str (str): Config str. + file_format (str): Config file format corresponding to the + config str. Only py/yml/yaml/json type are supported now! + + Returns: + obj:`Config`: Config obj. + """ + if file_format not in ['.py', '.json', '.yaml', '.yml']: + raise IOError('Only py/yml/yaml/json type are supported now!') + if file_format != '.py' and 'dict(' in cfg_str: + # check if users specify a wrong suffix for python + warnings.warn( + 'Please check "file_format", the file format may be .py') + with tempfile.NamedTemporaryFile( + 'w', encoding='utf-8', suffix=file_format, + delete=False) as temp_file: + temp_file.write(cfg_str) + # on windows, previous implementation cause error + # see PR 1077 for details + cfg = Config.fromfile(temp_file.name) + os.remove(temp_file.name) + return cfg + + @staticmethod + def auto_argparser(description=None): + """Generate argparser from config file automatically (experimental)""" + partial_parser = ArgumentParser(description=description) + partial_parser.add_argument('config', help='config file path') + cfg_file = partial_parser.parse_known_args()[0].config + cfg = Config.fromfile(cfg_file) + parser = ArgumentParser(description=description) + parser.add_argument('config', help='config file path') + add_args(parser, cfg) + return parser, cfg + + def __init__(self, cfg_dict=None, cfg_text=None, filename=None): + if cfg_dict is None: + cfg_dict = dict() + elif not isinstance(cfg_dict, dict): + raise TypeError('cfg_dict must be a dict, but ' + f'got {type(cfg_dict)}') + for key in cfg_dict: + if key in RESERVED_KEYS: + raise KeyError(f'{key} is reserved for config file') + + super(Config, self).__setattr__('_cfg_dict', ConfigDict(cfg_dict)) + super(Config, self).__setattr__('_filename', filename) + if cfg_text: + text = cfg_text + elif filename: + with open(filename, 'r') as f: + text = f.read() + else: + text = '' + super(Config, self).__setattr__('_text', text) + + @property + def filename(self): + return self._filename + + @property + def text(self): + return self._text + + @property + def pretty_text(self): + + indent = 4 + + def _indent(s_, num_spaces): + s = s_.split('\n') + if len(s) == 1: + return s_ + first = s.pop(0) + s = [(num_spaces * ' ') + line for line in s] + s = '\n'.join(s) + s = first + '\n' + s + return s + + def _format_basic_types(k, v, use_mapping=False): + if isinstance(v, str): + v_str = f"'{v}'" + else: + v_str = str(v) + + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + + return attr_str + + def _format_list(k, v, use_mapping=False): + # check if all items in the list are dict + if all(isinstance(_, dict) for _ in v): + v_str = '[\n' + v_str += '\n'.join( + f'dict({_indent(_format_dict(v_), indent)}),' + for v_ in v).rstrip(',') + if use_mapping: + k_str = f"'{k}'" if 
isinstance(k, str) else str(k) + attr_str = f'{k_str}: {v_str}' + else: + attr_str = f'{str(k)}={v_str}' + attr_str = _indent(attr_str, indent) + ']' + else: + attr_str = _format_basic_types(k, v, use_mapping) + return attr_str + + def _contain_invalid_identifier(dict_str): + contain_invalid_identifier = False + for key_name in dict_str: + contain_invalid_identifier |= \ + (not str(key_name).isidentifier()) + return contain_invalid_identifier + + def _format_dict(input_dict, outest_level=False): + r = '' + s = [] + + use_mapping = _contain_invalid_identifier(input_dict) + if use_mapping: + r += '{' + for idx, (k, v) in enumerate(input_dict.items()): + is_last = idx >= len(input_dict) - 1 + end = '' if outest_level or is_last else ',' + if isinstance(v, dict): + v_str = '\n' + _format_dict(v) + if use_mapping: + k_str = f"'{k}'" if isinstance(k, str) else str(k) + attr_str = f'{k_str}: dict({v_str}' + else: + attr_str = f'{str(k)}=dict({v_str}' + attr_str = _indent(attr_str, indent) + ')' + end + elif isinstance(v, list): + attr_str = _format_list(k, v, use_mapping) + end + else: + attr_str = _format_basic_types(k, v, use_mapping) + end + + s.append(attr_str) + r += '\n'.join(s) + if use_mapping: + r += '}' + return r + + cfg_dict = self._cfg_dict.to_dict() + text = _format_dict(cfg_dict, outest_level=True) + # copied from setup.cfg + yapf_style = dict( + based_on_style='pep8', + blank_line_before_nested_class_or_def=True, + split_before_expression_after_opening_paren=True) + text, _ = FormatCode(text, style_config=yapf_style, verify=True) + + return text + + def __repr__(self): + return f'Config (path: {self.filename}): {self._cfg_dict.__repr__()}' + + def __len__(self): + return len(self._cfg_dict) + + def __getattr__(self, name): + return getattr(self._cfg_dict, name) + + def __getitem__(self, name): + return self._cfg_dict.__getitem__(name) + + def __setattr__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setattr__(name, value) + + def __setitem__(self, name, value): + if isinstance(value, dict): + value = ConfigDict(value) + self._cfg_dict.__setitem__(name, value) + + def __iter__(self): + return iter(self._cfg_dict) + + def __getstate__(self): + return (self._cfg_dict, self._filename, self._text) + + def __setstate__(self, state): + _cfg_dict, _filename, _text = state + super(Config, self).__setattr__('_cfg_dict', _cfg_dict) + super(Config, self).__setattr__('_filename', _filename) + super(Config, self).__setattr__('_text', _text) + + def dump(self, file=None): + cfg_dict = super(Config, self).__getattribute__('_cfg_dict').to_dict() + if self.filename.endswith('.py'): + if file is None: + return self.pretty_text + else: + with open(file, 'w', encoding='utf-8') as f: + f.write(self.pretty_text) + else: + import custom_mmpkg.custom_mmcv as mmcv + if file is None: + file_format = self.filename.split('.')[-1] + return mmcv.dump(cfg_dict, file_format=file_format) + else: + mmcv.dump(cfg_dict, file) + + def merge_from_dict(self, options, allow_list_keys=True): + """Merge list into cfg_dict. + + Merge the dict parsed by MultipleKVAction into this cfg. + + Examples: + >>> options = {'model.backbone.depth': 50, + ... 'model.backbone.with_cp':True} + >>> cfg = Config(dict(model=dict(backbone=dict(type='ResNet')))) + >>> cfg.merge_from_dict(options) + >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') + >>> assert cfg_dict == dict( + ... 
model=dict(backbone=dict(depth=50, with_cp=True))) + + # Merge list element + >>> cfg = Config(dict(pipeline=[ + ... dict(type='LoadImage'), dict(type='LoadAnnotations')])) + >>> options = dict(pipeline={'0': dict(type='SelfLoadImage')}) + >>> cfg.merge_from_dict(options, allow_list_keys=True) + >>> cfg_dict = super(Config, self).__getattribute__('_cfg_dict') + >>> assert cfg_dict == dict(pipeline=[ + ... dict(type='SelfLoadImage'), dict(type='LoadAnnotations')]) + + Args: + options (dict): dict of configs to merge from. + allow_list_keys (bool): If True, int string keys (e.g. '0', '1') + are allowed in ``options`` and will replace the element of the + corresponding index in the config if the config is a list. + Default: True. + """ + option_cfg_dict = {} + for full_key, v in options.items(): + d = option_cfg_dict + key_list = full_key.split('.') + for subkey in key_list[:-1]: + d.setdefault(subkey, ConfigDict()) + d = d[subkey] + subkey = key_list[-1] + d[subkey] = v + + cfg_dict = super(Config, self).__getattribute__('_cfg_dict') + super(Config, self).__setattr__( + '_cfg_dict', + Config._merge_a_into_b( + option_cfg_dict, cfg_dict, allow_list_keys=allow_list_keys)) + + +class DictAction(Action): + """ + argparse action to split an argument into KEY=VALUE form + on the first = and append to a dictionary. List options can + be passed as comma separated values, i.e 'KEY=V1,V2,V3', or with explicit + brackets, i.e. 'KEY=[V1,V2,V3]'. It also support nested brackets to build + list/tuple values. e.g. 'KEY=[(V1,V2),(V3,V4)]' + """ + + @staticmethod + def _parse_int_float_bool(val): + try: + return int(val) + except ValueError: + pass + try: + return float(val) + except ValueError: + pass + if val.lower() in ['true', 'false']: + return True if val.lower() == 'true' else False + return val + + @staticmethod + def _parse_iterable(val): + """Parse iterable values in the string. + + All elements inside '()' or '[]' are treated as iterable values. + + Args: + val (str): Value string. + + Returns: + list | tuple: The expanded list or tuple from the string. + + Examples: + >>> DictAction._parse_iterable('1,2,3') + [1, 2, 3] + >>> DictAction._parse_iterable('[a, b, c]') + ['a', 'b', 'c'] + >>> DictAction._parse_iterable('[(1, 2, 3), [a, b], c]') + [(1, 2, 3), ['a', 'b'], 'c'] + """ + + def find_next_comma(string): + """Find the position of next comma in the string. + + If no ',' is found in the string, return the string length. All + chars inside '()' and '[]' are treated as one element and thus ',' + inside these brackets are ignored. + """ + assert (string.count('(') == string.count(')')) and ( + string.count('[') == string.count(']')), \ + f'Imbalanced brackets exist in {string}' + end = len(string) + for idx, char in enumerate(string): + pre = string[:idx] + # The string before this ',' is balanced + if ((char == ',') and (pre.count('(') == pre.count(')')) + and (pre.count('[') == pre.count(']'))): + end = idx + break + return end + + # Strip ' and " characters and replace whitespace. 
+ val = val.strip('\'\"').replace(' ', '') + is_tuple = False + if val.startswith('(') and val.endswith(')'): + is_tuple = True + val = val[1:-1] + elif val.startswith('[') and val.endswith(']'): + val = val[1:-1] + elif ',' not in val: + # val is a single value + return DictAction._parse_int_float_bool(val) + + values = [] + while len(val) > 0: + comma_idx = find_next_comma(val) + element = DictAction._parse_iterable(val[:comma_idx]) + values.append(element) + val = val[comma_idx + 1:] + if is_tuple: + values = tuple(values) + return values + + def __call__(self, parser, namespace, values, option_string=None): + options = {} + for kv in values: + key, val = kv.split('=', maxsplit=1) + options[key] = self._parse_iterable(val) + setattr(namespace, self.dest, options) diff --git a/custom_mmpkg/custom_mmcv/utils/env.py b/custom_mmpkg/custom_mmcv/utils/env.py new file mode 100644 index 0000000000000000000000000000000000000000..ffc2e44d2d272d81c74fb2333849265011cd5fec --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/env.py @@ -0,0 +1,95 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""This file holding some environment constant for sharing by other files.""" + +import os.path as osp +import subprocess +import sys +from collections import defaultdict + +import cv2 +import torch + +import custom_mmpkg.custom_mmcv as mmcv +from .parrots_wrapper import get_build_config + + +def collect_env(): + """Collect the information of the running environments. + + Returns: + dict: The environment information. The following fields are contained. + + - sys.platform: The variable of ``sys.platform``. + - Python: Python version. + - CUDA available: Bool, indicating if CUDA is available. + - GPU devices: Device type of each GPU. + - CUDA_HOME (optional): The env var ``CUDA_HOME``. + - NVCC (optional): NVCC version. + - GCC: GCC version, "n/a" if GCC is not installed. + - PyTorch: PyTorch version. + - PyTorch compiling details: The output of \ + ``torch.__config__.show()``. + - TorchVision (optional): TorchVision version. + - OpenCV: OpenCV version. + - MMCV: MMCV version. + - MMCV Compiler: The GCC version for compiling MMCV ops. + - MMCV CUDA Compiler: The CUDA version for compiling MMCV ops. 
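+
+    Example (illustrative usage sketch; the reported values depend on the
+    local machine):
+        >>> env_info = collect_env()
+        >>> for name, val in env_info.items():
+        >>>     print(f'{name}: {val}')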
+ """ + env_info = {} + env_info['sys.platform'] = sys.platform + env_info['Python'] = sys.version.replace('\n', '') + + cuda_available = torch.cuda.is_available() + env_info['CUDA available'] = cuda_available + + if cuda_available: + devices = defaultdict(list) + for k in range(torch.cuda.device_count()): + devices[torch.cuda.get_device_name(k)].append(str(k)) + for name, device_ids in devices.items(): + env_info['GPU ' + ','.join(device_ids)] = name + + from custom_mmpkg.custom_mmcv.utils.parrots_wrapper import _get_cuda_home + CUDA_HOME = _get_cuda_home() + env_info['CUDA_HOME'] = CUDA_HOME + + if CUDA_HOME is not None and osp.isdir(CUDA_HOME): + try: + nvcc = osp.join(CUDA_HOME, 'bin/nvcc') + nvcc = subprocess.check_output( + f'"{nvcc}" -V | tail -n1', shell=True) + nvcc = nvcc.decode('utf-8').strip() + except subprocess.SubprocessError: + nvcc = 'Not Available' + env_info['NVCC'] = nvcc + + try: + gcc = subprocess.check_output('gcc --version | head -n1', shell=True) + gcc = gcc.decode('utf-8').strip() + env_info['GCC'] = gcc + except subprocess.CalledProcessError: # gcc is unavailable + env_info['GCC'] = 'n/a' + + env_info['PyTorch'] = torch.__version__ + env_info['PyTorch compiling details'] = get_build_config() + + try: + import torchvision + env_info['TorchVision'] = torchvision.__version__ + except ModuleNotFoundError: + pass + + env_info['OpenCV'] = cv2.__version__ + + env_info['MMCV'] = mmcv.__version__ + + try: + from custom_mmpkg.custom_mmcv.ops import get_compiler_version, get_compiling_cuda_version + except ModuleNotFoundError: + env_info['MMCV Compiler'] = 'n/a' + env_info['MMCV CUDA Compiler'] = 'n/a' + else: + env_info['MMCV Compiler'] = get_compiler_version() + env_info['MMCV CUDA Compiler'] = get_compiling_cuda_version() + + return env_info diff --git a/custom_mmpkg/custom_mmcv/utils/ext_loader.py b/custom_mmpkg/custom_mmcv/utils/ext_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..9fbdddf85818f8c6f2fb8b121c9fdc26259a64b8 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/ext_loader.py @@ -0,0 +1,71 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import importlib +import os +import pkgutil +import warnings +from collections import namedtuple + +import torch + +if torch.__version__ != 'parrots': + + def load_ext(name, funcs): + ext = importlib.import_module('custom_mmcv.' 
+ name) + for fun in funcs: + assert hasattr(ext, fun), f'{fun} miss in module {name}' + return ext +else: + from parrots import extension + from parrots.base import ParrotsException + + has_return_value_ops = [ + 'nms', + 'softnms', + 'nms_match', + 'nms_rotated', + 'top_pool_forward', + 'top_pool_backward', + 'bottom_pool_forward', + 'bottom_pool_backward', + 'left_pool_forward', + 'left_pool_backward', + 'right_pool_forward', + 'right_pool_backward', + 'fused_bias_leakyrelu', + 'upfirdn2d', + 'ms_deform_attn_forward', + 'pixel_group', + 'contour_expand', + ] + + def get_fake_func(name, e): + + def fake_func(*args, **kwargs): + warnings.warn(f'{name} is not supported in parrots now') + raise e + + return fake_func + + def load_ext(name, funcs): + ExtModule = namedtuple('ExtModule', funcs) + ext_list = [] + lib_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) + for fun in funcs: + try: + ext_fun = extension.load(fun, name, lib_dir=lib_root) + except ParrotsException as e: + if 'No element registered' not in e.message: + warnings.warn(e.message) + ext_fun = get_fake_func(fun, e) + ext_list.append(ext_fun) + else: + if fun in has_return_value_ops: + ext_list.append(ext_fun.op) + else: + ext_list.append(ext_fun.op_) + return ExtModule(*ext_list) + + +def check_ops_exist(): + ext_loader = pkgutil.find_loader('mmcv._ext') + return ext_loader is not None diff --git a/custom_mmpkg/custom_mmcv/utils/logging.py b/custom_mmpkg/custom_mmcv/utils/logging.py new file mode 100644 index 0000000000000000000000000000000000000000..4aa0e04bb9b3ab2a4bfbc4def50404ccbac2c6e6 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/logging.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging + +import torch.distributed as dist + +logger_initialized = {} + + +def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'): + """Initialize and get a logger by name. + + If the logger has not been initialized, this method will initialize the + logger by adding one or two handlers, otherwise the initialized logger will + be directly returned. During initialization, a StreamHandler will always be + added. If `log_file` is specified and the process rank is 0, a FileHandler + will also be added. + + Args: + name (str): Logger name. + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the logger. + log_level (int): The logger level. Note that only the process of + rank 0 is affected, and other processes will set the level to + "Error" thus be silent most of the time. + file_mode (str): The file mode used in opening log file. + Defaults to 'w'. + + Returns: + logging.Logger: The expected logger. + """ + logger = logging.getLogger(name) + if name in logger_initialized: + return logger + # handle hierarchical names + # e.g., logger "a" is initialized, then logger "a.b" will skip the + # initialization since it is a child of "a". + for logger_name in logger_initialized: + if name.startswith(logger_name): + return logger + + # handle duplicate logs to the console + # Starting in 1.8.0, PyTorch DDP attaches a StreamHandler (NOTSET) + # to the root logger. As logger.propagate is True by default, this root + # level handler causes logging messages from rank>0 processes to + # unexpectedly show up on the console, creating much unwanted clutter. + # To fix this issue, we set the root logger's StreamHandler, if any, to log + # at the ERROR level. 
+ for handler in logger.root.handlers: + if type(handler) is logging.StreamHandler: + handler.setLevel(logging.ERROR) + + stream_handler = logging.StreamHandler() + handlers = [stream_handler] + + if dist.is_available() and dist.is_initialized(): + rank = dist.get_rank() + else: + rank = 0 + + # only rank 0 will add a FileHandler + if rank == 0 and log_file is not None: + # Here, the default behaviour of the official logger is 'a'. Thus, we + # provide an interface to change the file mode to the default + # behaviour. + file_handler = logging.FileHandler(log_file, file_mode) + handlers.append(file_handler) + + formatter = logging.Formatter( + '%(asctime)s - %(name)s - %(levelname)s - %(message)s') + for handler in handlers: + handler.setFormatter(formatter) + handler.setLevel(log_level) + logger.addHandler(handler) + + if rank == 0: + logger.setLevel(log_level) + else: + logger.setLevel(logging.ERROR) + + logger_initialized[name] = True + + return logger + + +def print_log(msg, logger=None, level=logging.INFO): + """Print a log message. + + Args: + msg (str): The message to be logged. + logger (logging.Logger | str | None): The logger to be used. + Some special loggers are: + - "silent": no message will be printed. + - other str: the logger obtained with `get_root_logger(logger)`. + - None: The `print()` method will be used to print log messages. + level (int): Logging level. Only available when `logger` is a Logger + object or "root". + """ + if logger is None: + print(msg) + elif isinstance(logger, logging.Logger): + logger.log(level, msg) + elif logger == 'silent': + pass + elif isinstance(logger, str): + _logger = get_logger(logger) + _logger.log(level, msg) + else: + raise TypeError( + 'logger should be either a logging.Logger object, str, ' + f'"silent" or None, but got {type(logger)}') diff --git a/custom_mmpkg/custom_mmcv/utils/misc.py b/custom_mmpkg/custom_mmcv/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..2c58d0d7fee9fe3d4519270ad8c1e998d0d8a18c --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/misc.py @@ -0,0 +1,377 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import collections.abc +import functools +import itertools +import subprocess +import warnings +from collections import abc +from importlib import import_module +from inspect import getfullargspec +from itertools import repeat + + +# From PyTorch internals +def _ntuple(n): + + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +def is_str(x): + """Whether the input is an string instance. + + Note: This method is deprecated since python 2 is no longer supported. + """ + return isinstance(x, str) + + +def import_modules_from_strings(imports, allow_failed_imports=False): + """Import modules from the given list of strings. + + Args: + imports (list | str | None): The given module names to be imported. + allow_failed_imports (bool): If True, the failed imports will return + None. Otherwise, an ImportError is raise. Default: False. + + Returns: + list[module] | module | None: The imported modules. + + Examples: + >>> osp, sys = import_modules_from_strings( + ... 
['os.path', 'sys']) + >>> import os.path as osp_ + >>> import sys as sys_ + >>> assert osp == osp_ + >>> assert sys == sys_ + """ + if not imports: + return + single_import = False + if isinstance(imports, str): + single_import = True + imports = [imports] + if not isinstance(imports, list): + raise TypeError( + f'custom_imports must be a list but got type {type(imports)}') + imported = [] + for imp in imports: + if not isinstance(imp, str): + raise TypeError( + f'{imp} is of type {type(imp)} and cannot be imported.') + try: + imported_tmp = import_module(imp) + except ImportError: + if allow_failed_imports: + warnings.warn(f'{imp} failed to import and is ignored.', + UserWarning) + imported_tmp = None + else: + raise ImportError + imported.append(imported_tmp) + if single_import: + imported = imported[0] + return imported + + +def iter_cast(inputs, dst_type, return_type=None): + """Cast elements of an iterable object into some type. + + Args: + inputs (Iterable): The input object. + dst_type (type): Destination type. + return_type (type, optional): If specified, the output object will be + converted to this type, otherwise an iterator. + + Returns: + iterator or specified type: The converted object. + """ + if not isinstance(inputs, abc.Iterable): + raise TypeError('inputs must be an iterable object') + if not isinstance(dst_type, type): + raise TypeError('"dst_type" must be a valid type') + + out_iterable = map(dst_type, inputs) + + if return_type is None: + return out_iterable + else: + return return_type(out_iterable) + + +def list_cast(inputs, dst_type): + """Cast elements of an iterable object into a list of some type. + + A partial method of :func:`iter_cast`. + """ + return iter_cast(inputs, dst_type, return_type=list) + + +def tuple_cast(inputs, dst_type): + """Cast elements of an iterable object into a tuple of some type. + + A partial method of :func:`iter_cast`. + """ + return iter_cast(inputs, dst_type, return_type=tuple) + + +def is_seq_of(seq, expected_type, seq_type=None): + """Check whether it is a sequence of some type. + + Args: + seq (Sequence): The sequence to be checked. + expected_type (type): Expected type of sequence items. + seq_type (type, optional): Expected sequence type. + + Returns: + bool: Whether the sequence is valid. + """ + if seq_type is None: + exp_seq_type = abc.Sequence + else: + assert isinstance(seq_type, type) + exp_seq_type = seq_type + if not isinstance(seq, exp_seq_type): + return False + for item in seq: + if not isinstance(item, expected_type): + return False + return True + + +def is_list_of(seq, expected_type): + """Check whether it is a list of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=list) + + +def is_tuple_of(seq, expected_type): + """Check whether it is a tuple of some type. + + A partial method of :func:`is_seq_of`. + """ + return is_seq_of(seq, expected_type, seq_type=tuple) + + +def slice_list(in_list, lens): + """Slice a list into several sub lists by a list of given length. + + Args: + in_list (list): The list to be sliced. + lens(int or list): The expected length of each out list. + + Returns: + list: A list of sliced list. 
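+
+    Example (illustrative sketch with arbitrary numbers):
+        >>> slice_list([1, 2, 3, 4, 5, 6], [2, 4])
+        [[1, 2], [3, 4, 5, 6]]
+        >>> slice_list([1, 2, 3, 4], 2)
+        [[1, 2], [3, 4]]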
+ """ + if isinstance(lens, int): + assert len(in_list) % lens == 0 + lens = [lens] * int(len(in_list) / lens) + if not isinstance(lens, list): + raise TypeError('"indices" must be an integer or a list of integers') + elif sum(lens) != len(in_list): + raise ValueError('sum of lens and list length does not ' + f'match: {sum(lens)} != {len(in_list)}') + out_list = [] + idx = 0 + for i in range(len(lens)): + out_list.append(in_list[idx:idx + lens[i]]) + idx += lens[i] + return out_list + + +def concat_list(in_list): + """Concatenate a list of list into a single list. + + Args: + in_list (list): The list of list to be merged. + + Returns: + list: The concatenated flat list. + """ + return list(itertools.chain(*in_list)) + + +def check_prerequisites( + prerequisites, + checker, + msg_tmpl='Prerequisites "{}" are required in method "{}" but not ' + 'found, please install them first.'): # yapf: disable + """A decorator factory to check if prerequisites are satisfied. + + Args: + prerequisites (str of list[str]): Prerequisites to be checked. + checker (callable): The checker method that returns True if a + prerequisite is meet, False otherwise. + msg_tmpl (str): The message template with two variables. + + Returns: + decorator: A specific decorator. + """ + + def wrap(func): + + @functools.wraps(func) + def wrapped_func(*args, **kwargs): + requirements = [prerequisites] if isinstance( + prerequisites, str) else prerequisites + missing = [] + for item in requirements: + if not checker(item): + missing.append(item) + if missing: + print(msg_tmpl.format(', '.join(missing), func.__name__)) + raise RuntimeError('Prerequisites not meet.') + else: + return func(*args, **kwargs) + + return wrapped_func + + return wrap + + +def _check_py_package(package): + try: + import_module(package) + except ImportError: + return False + else: + return True + + +def _check_executable(cmd): + if subprocess.call(f'which {cmd}', shell=True) != 0: + return False + else: + return True + + +def requires_package(prerequisites): + """A decorator to check if some python packages are installed. + + Example: + >>> @requires_package('numpy') + >>> func(arg1, args): + >>> return numpy.zeros(1) + array([0.]) + >>> @requires_package(['numpy', 'non_package']) + >>> func(arg1, args): + >>> return numpy.zeros(1) + ImportError + """ + return check_prerequisites(prerequisites, checker=_check_py_package) + + +def requires_executable(prerequisites): + """A decorator to check if some executable files are installed. + + Example: + >>> @requires_executable('ffmpeg') + >>> func(arg1, args): + >>> print(1) + 1 + """ + return check_prerequisites(prerequisites, checker=_check_executable) + + +def deprecated_api_warning(name_dict, cls_name=None): + """A decorator to check if some arguments are deprecate and try to replace + deprecate src_arg_name to dst_arg_name. + + Args: + name_dict(dict): + key (str): Deprecate argument names. + val (str): Expected argument names. + + Returns: + func: New function. 
+ """ + + def api_warning_wrapper(old_func): + + @functools.wraps(old_func) + def new_func(*args, **kwargs): + # get the arg spec of the decorated method + args_info = getfullargspec(old_func) + # get name of the function + func_name = old_func.__name__ + if cls_name is not None: + func_name = f'{cls_name}.{func_name}' + if args: + arg_names = args_info.args[:len(args)] + for src_arg_name, dst_arg_name in name_dict.items(): + if src_arg_name in arg_names: + warnings.warn( + f'"{src_arg_name}" is deprecated in ' + f'`{func_name}`, please use "{dst_arg_name}" ' + 'instead') + arg_names[arg_names.index(src_arg_name)] = dst_arg_name + if kwargs: + for src_arg_name, dst_arg_name in name_dict.items(): + if src_arg_name in kwargs: + + assert dst_arg_name not in kwargs, ( + f'The expected behavior is to replace ' + f'the deprecated key `{src_arg_name}` to ' + f'new key `{dst_arg_name}`, but got them ' + f'in the arguments at the same time, which ' + f'is confusing. `{src_arg_name} will be ' + f'deprecated in the future, please ' + f'use `{dst_arg_name}` instead.') + + warnings.warn( + f'"{src_arg_name}" is deprecated in ' + f'`{func_name}`, please use "{dst_arg_name}" ' + 'instead') + kwargs[dst_arg_name] = kwargs.pop(src_arg_name) + + # apply converted arguments to the decorated method + output = old_func(*args, **kwargs) + return output + + return new_func + + return api_warning_wrapper + + +def is_method_overridden(method, base_class, derived_class): + """Check if a method of base class is overridden in derived class. + + Args: + method (str): the method name to check. + base_class (type): the class of the base class. + derived_class (type | Any): the class or instance of the derived class. + """ + assert isinstance(base_class, type), \ + "base_class doesn't accept instance, Please pass class instead." + + if not isinstance(derived_class, type): + derived_class = derived_class.__class__ + + base_method = getattr(base_class, method) + derived_method = getattr(derived_class, method) + return derived_method != base_method + + +def has_method(obj: object, method: str) -> bool: + """Check whether the object has a method. + + Args: + method (str): The method name to check. + obj (object): The object to check. + + Returns: + bool: True if the object has the method else False. + """ + return hasattr(obj, method) and callable(getattr(obj, method)) diff --git a/custom_mmpkg/custom_mmcv/utils/parrots_jit.py b/custom_mmpkg/custom_mmcv/utils/parrots_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..61873f6dbb9b10ed972c90aa8faa321e3cb3249e --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/parrots_jit.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+import os + +from .parrots_wrapper import TORCH_VERSION + +parrots_jit_option = os.getenv('PARROTS_JIT_OPTION') + +if TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON': + from parrots.jit import pat as jit +else: + + def jit(func=None, + check_input=None, + full_shape=True, + derivate=False, + coderize=False, + optimize=False): + + def wrapper(func): + + def wrapper_inner(*args, **kargs): + return func(*args, **kargs) + + return wrapper_inner + + if func is None: + return wrapper + else: + return func + + +if TORCH_VERSION == 'parrots': + from parrots.utils.tester import skip_no_elena +else: + + def skip_no_elena(func): + + def wrapper(*args, **kargs): + return func(*args, **kargs) + + return wrapper diff --git a/custom_mmpkg/custom_mmcv/utils/parrots_wrapper.py b/custom_mmpkg/custom_mmcv/utils/parrots_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..93c97640d4b9ed088ca82cfe03e6efebfcfa9dbf --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/parrots_wrapper.py @@ -0,0 +1,107 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial + +import torch + +TORCH_VERSION = torch.__version__ + + +def is_rocm_pytorch() -> bool: + is_rocm = False + if TORCH_VERSION != 'parrots': + try: + from torch.utils.cpp_extension import ROCM_HOME + is_rocm = True if ((torch.version.hip is not None) and + (ROCM_HOME is not None)) else False + except ImportError: + pass + return is_rocm + + +def _get_cuda_home(): + if TORCH_VERSION == 'parrots': + from parrots.utils.build_extension import CUDA_HOME + else: + if is_rocm_pytorch(): + from torch.utils.cpp_extension import ROCM_HOME + CUDA_HOME = ROCM_HOME + else: + from torch.utils.cpp_extension import CUDA_HOME + return CUDA_HOME + + +def get_build_config(): + if TORCH_VERSION == 'parrots': + from parrots.config import get_build_info + return get_build_info() + else: + return torch.__config__.show() + + +def _get_conv(): + if TORCH_VERSION == 'parrots': + from parrots.nn.modules.conv import _ConvNd, _ConvTransposeMixin + else: + from torch.nn.modules.conv import _ConvNd, _ConvTransposeMixin + return _ConvNd, _ConvTransposeMixin + + +def _get_dataloader(): + if TORCH_VERSION == 'parrots': + from torch.utils.data import DataLoader, PoolDataLoader + else: + from torch.utils.data import DataLoader + PoolDataLoader = DataLoader + return DataLoader, PoolDataLoader + + +def _get_extension(): + if TORCH_VERSION == 'parrots': + from parrots.utils.build_extension import BuildExtension, Extension + CppExtension = partial(Extension, cuda=False) + CUDAExtension = partial(Extension, cuda=True) + else: + from torch.utils.cpp_extension import (BuildExtension, CppExtension, + CUDAExtension) + return BuildExtension, CppExtension, CUDAExtension + + +def _get_pool(): + if TORCH_VERSION == 'parrots': + from parrots.nn.modules.pool import (_AdaptiveAvgPoolNd, + _AdaptiveMaxPoolNd, _AvgPoolNd, + _MaxPoolNd) + else: + from torch.nn.modules.pooling import (_AdaptiveAvgPoolNd, + _AdaptiveMaxPoolNd, _AvgPoolNd, + _MaxPoolNd) + return _AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd + + +def _get_norm(): + if TORCH_VERSION == 'parrots': + from parrots.nn.modules.batchnorm import _BatchNorm, _InstanceNorm + SyncBatchNorm_ = torch.nn.SyncBatchNorm2d + else: + from torch.nn.modules.instancenorm import _InstanceNorm + from torch.nn.modules.batchnorm import _BatchNorm + SyncBatchNorm_ = torch.nn.SyncBatchNorm + return _BatchNorm, _InstanceNorm, SyncBatchNorm_ + + +_ConvNd, _ConvTransposeMixin = _get_conv() +DataLoader, 
PoolDataLoader = _get_dataloader() +BuildExtension, CppExtension, CUDAExtension = _get_extension() +_BatchNorm, _InstanceNorm, SyncBatchNorm_ = _get_norm() +_AdaptiveAvgPoolNd, _AdaptiveMaxPoolNd, _AvgPoolNd, _MaxPoolNd = _get_pool() + + +class SyncBatchNorm(SyncBatchNorm_): + + def _check_input_dim(self, input): + if TORCH_VERSION == 'parrots': + if input.dim() < 2: + raise ValueError( + f'expected at least 2D input (got {input.dim()}D input)') + else: + super()._check_input_dim(input) diff --git a/custom_mmpkg/custom_mmcv/utils/path.py b/custom_mmpkg/custom_mmcv/utils/path.py new file mode 100644 index 0000000000000000000000000000000000000000..7dab4b3041413b1432b0f434b8b14783097d33c6 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/path.py @@ -0,0 +1,101 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +from pathlib import Path + +from .misc import is_str + + +def is_filepath(x): + return is_str(x) or isinstance(x, Path) + + +def fopen(filepath, *args, **kwargs): + if is_str(filepath): + return open(filepath, *args, **kwargs) + elif isinstance(filepath, Path): + return filepath.open(*args, **kwargs) + raise ValueError('`filepath` should be a string or a Path') + + +def check_file_exist(filename, msg_tmpl='file "{}" does not exist'): + if not osp.isfile(filename): + raise FileNotFoundError(msg_tmpl.format(filename)) + + +def mkdir_or_exist(dir_name, mode=0o777): + if dir_name == '': + return + dir_name = osp.expanduser(dir_name) + os.makedirs(dir_name, mode=mode, exist_ok=True) + + +def symlink(src, dst, overwrite=True, **kwargs): + if os.path.lexists(dst) and overwrite: + os.remove(dst) + os.symlink(src, dst, **kwargs) + + +def scandir(dir_path, suffix=None, recursive=False, case_sensitive=True): + """Scan a directory to find the interested files. + + Args: + dir_path (str | obj:`Path`): Path of the directory. + suffix (str | tuple(str), optional): File suffix that we are + interested in. Default: None. + recursive (bool, optional): If set to True, recursively scan the + directory. Default: False. + case_sensitive (bool, optional) : If set to False, ignore the case of + suffix. Default: True. + + Returns: + A generator for all the interested files with relative paths. + """ + if isinstance(dir_path, (str, Path)): + dir_path = str(dir_path) + else: + raise TypeError('"dir_path" must be a string or Path object') + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('"suffix" must be a string or tuple of strings') + + if suffix is not None and not case_sensitive: + suffix = suffix.lower() if isinstance(suffix, str) else tuple( + item.lower() for item in suffix) + + root = dir_path + + def _scandir(dir_path, suffix, recursive, case_sensitive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + rel_path = osp.relpath(entry.path, root) + _rel_path = rel_path if case_sensitive else rel_path.lower() + if suffix is None or _rel_path.endswith(suffix): + yield rel_path + elif recursive and os.path.isdir(entry.path): + # scan recursively if entry.path is a directory + yield from _scandir(entry.path, suffix, recursive, + case_sensitive) + + return _scandir(dir_path, suffix, recursive, case_sensitive) + + +def find_vcs_root(path, markers=('.git', )): + """Finds the root directory (including itself) of specified markers. + + Args: + path (str): Path of directory or file. + markers (list[str], optional): List of file or directory names. 
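+            Default: ('.git', ).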
+ + Returns: + The directory contained one of the markers or None if not found. + """ + if osp.isfile(path): + path = osp.dirname(path) + + prev, cur = None, osp.abspath(osp.expanduser(path)) + while cur != prev: + if any(osp.exists(osp.join(cur, marker)) for marker in markers): + return cur + prev, cur = cur, osp.split(cur)[0] + return None diff --git a/custom_mmpkg/custom_mmcv/utils/progressbar.py b/custom_mmpkg/custom_mmcv/utils/progressbar.py new file mode 100644 index 0000000000000000000000000000000000000000..0062f670dd94fa9da559ab26ef85517dcf5211c7 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/progressbar.py @@ -0,0 +1,208 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import sys +from collections.abc import Iterable +from multiprocessing import Pool +from shutil import get_terminal_size + +from .timer import Timer + + +class ProgressBar: + """A progress bar which can print the progress.""" + + def __init__(self, task_num=0, bar_width=50, start=True, file=sys.stdout): + self.task_num = task_num + self.bar_width = bar_width + self.completed = 0 + self.file = file + if start: + self.start() + + @property + def terminal_width(self): + width, _ = get_terminal_size() + return width + + def start(self): + if self.task_num > 0: + self.file.write(f'[{" " * self.bar_width}] 0/{self.task_num}, ' + 'elapsed: 0s, ETA:') + else: + self.file.write('completed: 0, elapsed: 0s') + self.file.flush() + self.timer = Timer() + + def update(self, num_tasks=1): + assert num_tasks > 0 + self.completed += num_tasks + elapsed = self.timer.since_start() + if elapsed > 0: + fps = self.completed / elapsed + else: + fps = float('inf') + if self.task_num > 0: + percentage = self.completed / float(self.task_num) + eta = int(elapsed * (1 - percentage) / percentage + 0.5) + msg = f'\r[{{}}] {self.completed}/{self.task_num}, ' \ + f'{fps:.1f} task/s, elapsed: {int(elapsed + 0.5)}s, ' \ + f'ETA: {eta:5}s' + + bar_width = min(self.bar_width, + int(self.terminal_width - len(msg)) + 2, + int(self.terminal_width * 0.6)) + bar_width = max(2, bar_width) + mark_width = int(bar_width * percentage) + bar_chars = '>' * mark_width + ' ' * (bar_width - mark_width) + self.file.write(msg.format(bar_chars)) + else: + self.file.write( + f'completed: {self.completed}, elapsed: {int(elapsed + 0.5)}s,' + f' {fps:.1f} tasks/s') + self.file.flush() + + +def track_progress(func, tasks, bar_width=50, file=sys.stdout, **kwargs): + """Track the progress of tasks execution with a progress bar. + + Tasks are done with a simple for-loop. + + Args: + func (callable): The function to be applied to each task. + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + bar_width (int): Width of progress bar. + + Returns: + list: The task results. 
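+
+    Example (illustrative sketch with a trivial task function):
+        >>> def square(x):
+        >>>     return x * x
+        >>> results = track_progress(square, [1, 2, 3])  # also prints the bar
+        >>> results
+        [1, 4, 9]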
+ """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] + elif isinstance(tasks, Iterable): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be an iterable object or a (iterator, int) tuple') + prog_bar = ProgressBar(task_num, bar_width, file=file) + results = [] + for task in tasks: + results.append(func(task, **kwargs)) + prog_bar.update() + prog_bar.file.write('\n') + return results + + +def init_pool(process_num, initializer=None, initargs=None): + if initializer is None: + return Pool(process_num) + elif initargs is None: + return Pool(process_num, initializer) + else: + if not isinstance(initargs, tuple): + raise TypeError('"initargs" must be a tuple') + return Pool(process_num, initializer, initargs) + + +def track_parallel_progress(func, + tasks, + nproc, + initializer=None, + initargs=None, + bar_width=50, + chunksize=1, + skip_first=False, + keep_order=True, + file=sys.stdout): + """Track the progress of parallel task execution with a progress bar. + + The built-in :mod:`multiprocessing` module is used for process pools and + tasks are done with :func:`Pool.map` or :func:`Pool.imap_unordered`. + + Args: + func (callable): The function to be applied to each task. + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + nproc (int): Process (worker) number. + initializer (None or callable): Refer to :class:`multiprocessing.Pool` + for details. + initargs (None or tuple): Refer to :class:`multiprocessing.Pool` for + details. + chunksize (int): Refer to :class:`multiprocessing.Pool` for details. + bar_width (int): Width of progress bar. + skip_first (bool): Whether to skip the first sample for each worker + when estimating fps, since the initialization step may takes + longer. + keep_order (bool): If True, :func:`Pool.imap` is used, otherwise + :func:`Pool.imap_unordered` is used. + + Returns: + list: The task results. + """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] + elif isinstance(tasks, Iterable): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be an iterable object or a (iterator, int) tuple') + pool = init_pool(nproc, initializer, initargs) + start = not skip_first + task_num -= nproc * chunksize * int(skip_first) + prog_bar = ProgressBar(task_num, bar_width, start, file=file) + results = [] + if keep_order: + gen = pool.imap(func, tasks, chunksize) + else: + gen = pool.imap_unordered(func, tasks, chunksize) + for result in gen: + results.append(result) + if skip_first: + if len(results) < nproc * chunksize: + continue + elif len(results) == nproc * chunksize: + prog_bar.start() + continue + prog_bar.update() + prog_bar.file.write('\n') + pool.close() + pool.join() + return results + + +def track_iter_progress(tasks, bar_width=50, file=sys.stdout): + """Track the progress of tasks iteration or enumeration with a progress + bar. + + Tasks are yielded with a simple for-loop. + + Args: + tasks (list or tuple[Iterable, int]): A list of tasks or + (tasks, total num). + bar_width (int): Width of progress bar. + + Yields: + list: The task results. 
+ """ + if isinstance(tasks, tuple): + assert len(tasks) == 2 + assert isinstance(tasks[0], Iterable) + assert isinstance(tasks[1], int) + task_num = tasks[1] + tasks = tasks[0] + elif isinstance(tasks, Iterable): + task_num = len(tasks) + else: + raise TypeError( + '"tasks" must be an iterable object or a (iterator, int) tuple') + prog_bar = ProgressBar(task_num, bar_width, file=file) + for task in tasks: + yield task + prog_bar.update() + prog_bar.file.write('\n') diff --git a/custom_mmpkg/custom_mmcv/utils/registry.py b/custom_mmpkg/custom_mmcv/utils/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..fa9df39bc9f3d8d568361e7250ab35468f2b74e0 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/registry.py @@ -0,0 +1,315 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import inspect +import warnings +from functools import partial + +from .misc import is_seq_of + + +def build_from_cfg(cfg, registry, default_args=None): + """Build a module from config dict. + + Args: + cfg (dict): Config dict. It should at least contain the key "type". + registry (:obj:`Registry`): The registry to search the type from. + default_args (dict, optional): Default initialization arguments. + + Returns: + object: The constructed object. + """ + if not isinstance(cfg, dict): + raise TypeError(f'cfg must be a dict, but got {type(cfg)}') + if 'type' not in cfg: + if default_args is None or 'type' not in default_args: + raise KeyError( + '`cfg` or `default_args` must contain the key "type", ' + f'but got {cfg}\n{default_args}') + if not isinstance(registry, Registry): + raise TypeError('registry must be an mmcv.Registry object, ' + f'but got {type(registry)}') + if not (isinstance(default_args, dict) or default_args is None): + raise TypeError('default_args must be a dict or None, ' + f'but got {type(default_args)}') + + args = cfg.copy() + + if default_args is not None: + for name, value in default_args.items(): + args.setdefault(name, value) + + obj_type = args.pop('type') + if isinstance(obj_type, str): + obj_cls = registry.get(obj_type) + if obj_cls is None: + raise KeyError( + f'{obj_type} is not in the {registry.name} registry') + elif inspect.isclass(obj_type): + obj_cls = obj_type + else: + raise TypeError( + f'type must be a str or valid type, but got {type(obj_type)}') + try: + return obj_cls(**args) + except Exception as e: + # Normal TypeError does not print class name. + raise type(e)(f'{obj_cls.__name__}: {e}') + + +class Registry: + """A registry to map strings to classes. + + Registered object could be built from registry. + Example: + >>> MODELS = Registry('models') + >>> @MODELS.register_module() + >>> class ResNet: + >>> pass + >>> resnet = MODELS.build(dict(type='ResNet')) + + Please refer to + https://mmcv.readthedocs.io/en/latest/understand_mmcv/registry.html for + advanced usage. + + Args: + name (str): Registry name. + build_func(func, optional): Build function to construct instance from + Registry, func:`build_from_cfg` is used if neither ``parent`` or + ``build_func`` is specified. If ``parent`` is specified and + ``build_func`` is not given, ``build_func`` will be inherited + from ``parent``. Default: None. + parent (Registry, optional): Parent registry. The class registered in + children registry could be built from parent. Default: None. + scope (str, optional): The scope of registry. It is the key to search + for children registry. If not specified, scope will be the name of + the package where class is defined, e.g. mmdet, mmcls, mmseg. + Default: None. 
+ """ + + def __init__(self, name, build_func=None, parent=None, scope=None): + self._name = name + self._module_dict = dict() + self._children = dict() + self._scope = self.infer_scope() if scope is None else scope + + # self.build_func will be set with the following priority: + # 1. build_func + # 2. parent.build_func + # 3. build_from_cfg + if build_func is None: + if parent is not None: + self.build_func = parent.build_func + else: + self.build_func = build_from_cfg + else: + self.build_func = build_func + if parent is not None: + assert isinstance(parent, Registry) + parent._add_children(self) + self.parent = parent + else: + self.parent = None + + def __len__(self): + return len(self._module_dict) + + def __contains__(self, key): + return self.get(key) is not None + + def __repr__(self): + format_str = self.__class__.__name__ + \ + f'(name={self._name}, ' \ + f'items={self._module_dict})' + return format_str + + @staticmethod + def infer_scope(): + """Infer the scope of registry. + + The name of the package where registry is defined will be returned. + + Example: + # in mmdet/models/backbone/resnet.py + >>> MODELS = Registry('models') + >>> @MODELS.register_module() + >>> class ResNet: + >>> pass + The scope of ``ResNet`` will be ``mmdet``. + + + Returns: + scope (str): The inferred scope name. + """ + # inspect.stack() trace where this function is called, the index-2 + # indicates the frame where `infer_scope()` is called + filename = inspect.getmodule(inspect.stack()[2][0]).__name__ + split_filename = filename.split('.') + return split_filename[0] + + @staticmethod + def split_scope_key(key): + """Split scope and key. + + The first scope will be split from key. + + Examples: + >>> Registry.split_scope_key('mmdet.ResNet') + 'mmdet', 'ResNet' + >>> Registry.split_scope_key('ResNet') + None, 'ResNet' + + Return: + scope (str, None): The first scope. + key (str): The remaining key. + """ + split_index = key.find('.') + if split_index != -1: + return key[:split_index], key[split_index + 1:] + else: + return None, key + + @property + def name(self): + return self._name + + @property + def scope(self): + return self._scope + + @property + def module_dict(self): + return self._module_dict + + @property + def children(self): + return self._children + + def get(self, key): + """Get the registry record. + + Args: + key (str): The class name in string format. + + Returns: + class: The corresponding class. + """ + scope, real_key = self.split_scope_key(key) + if scope is None or scope == self._scope: + # get from self + if real_key in self._module_dict: + return self._module_dict[real_key] + else: + # get from self._children + if scope in self._children: + return self._children[scope].get(real_key) + else: + # goto root + parent = self.parent + while parent.parent is not None: + parent = parent.parent + return parent.get(key) + + def build(self, *args, **kwargs): + return self.build_func(*args, **kwargs, registry=self) + + def _add_children(self, registry): + """Add children for a registry. + + The ``registry`` will be added as children based on its scope. + The parent registry could build objects from children registry. 
+ + Example: + >>> models = Registry('models') + >>> mmdet_models = Registry('models', parent=models) + >>> @mmdet_models.register_module() + >>> class ResNet: + >>> pass + >>> resnet = models.build(dict(type='mmdet.ResNet')) + """ + + assert isinstance(registry, Registry) + assert registry.scope is not None + assert registry.scope not in self.children, \ + f'scope {registry.scope} exists in {self.name} registry' + self.children[registry.scope] = registry + + def _register_module(self, module_class, module_name=None, force=False): + if not inspect.isclass(module_class): + raise TypeError('module must be a class, ' + f'but got {type(module_class)}') + + if module_name is None: + module_name = module_class.__name__ + if isinstance(module_name, str): + module_name = [module_name] + for name in module_name: + if not force and name in self._module_dict: + raise KeyError(f'{name} is already registered ' + f'in {self.name}') + self._module_dict[name] = module_class + + def deprecated_register_module(self, cls=None, force=False): + warnings.warn( + 'The old API of register_module(module, force=False) ' + 'is deprecated and will be removed, please use the new API ' + 'register_module(name=None, force=False, module=None) instead.') + if cls is None: + return partial(self.deprecated_register_module, force=force) + self._register_module(cls, force=force) + return cls + + def register_module(self, name=None, force=False, module=None): + """Register a module. + + A record will be added to `self._module_dict`, whose key is the class + name or the specified name, and value is the class itself. + It can be used as a decorator or a normal function. + + Example: + >>> backbones = Registry('backbone') + >>> @backbones.register_module() + >>> class ResNet: + >>> pass + + >>> backbones = Registry('backbone') + >>> @backbones.register_module(name='mnet') + >>> class MobileNet: + >>> pass + + >>> backbones = Registry('backbone') + >>> class ResNet: + >>> pass + >>> backbones.register_module(ResNet) + + Args: + name (str | None): The module name to be registered. If not + specified, the class name will be used. + force (bool, optional): Whether to override an existing class with + the same name. Default: False. + module (type): Module class to be registered. + """ + if not isinstance(force, bool): + raise TypeError(f'force must be a boolean, but got {type(force)}') + # NOTE: This is a walkaround to be compatible with the old api, + # while it may introduce unexpected bugs. + if isinstance(name, type): + return self.deprecated_register_module(name, force=force) + + # raise the error ahead of time + if not (name is None or isinstance(name, str) or is_seq_of(name, str)): + raise TypeError( + 'name must be either of None, an instance of str or a sequence' + f' of str, but got {type(name)}') + + # use it as a normal method: x.register_module(module=SomeClass) + if module is not None: + self._register_module( + module_class=module, module_name=name, force=force) + return module + + # use it as a decorator: @x.register_module() + def _register(cls): + self._register_module( + module_class=cls, module_name=name, force=force) + return cls + + return _register diff --git a/custom_mmpkg/custom_mmcv/utils/testing.py b/custom_mmpkg/custom_mmcv/utils/testing.py new file mode 100644 index 0000000000000000000000000000000000000000..a27f936da8ec14bac18562ede0a79d476d82f797 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/testing.py @@ -0,0 +1,140 @@ +# Copyright (c) Open-MMLab. 
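+"""Helpers for unit tests: run a script as ``__main__`` (so coverage tools can
+trace it) and assert properties of dicts, object attributes and modules."""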
+import sys +from collections.abc import Iterable +from runpy import run_path +from shlex import split +from typing import Any, Dict, List +from unittest.mock import patch + + +def check_python_script(cmd): + """Run the python cmd script with `__main__`. The difference between + `os.system` is that, this function exectues code in the current process, so + that it can be tracked by coverage tools. Currently it supports two forms: + + - ./tests/data/scripts/hello.py zz + - python tests/data/scripts/hello.py zz + """ + args = split(cmd) + if args[0] == 'python': + args = args[1:] + with patch.object(sys, 'argv', args): + run_path(args[0], run_name='__main__') + + +def _any(judge_result): + """Since built-in ``any`` works only when the element of iterable is not + iterable, implement the function.""" + if not isinstance(judge_result, Iterable): + return judge_result + + try: + for element in judge_result: + if _any(element): + return True + except TypeError: + # Maybe encounter the case: torch.tensor(True) | torch.tensor(False) + if judge_result: + return True + return False + + +def assert_dict_contains_subset(dict_obj: Dict[Any, Any], + expected_subset: Dict[Any, Any]) -> bool: + """Check if the dict_obj contains the expected_subset. + + Args: + dict_obj (Dict[Any, Any]): Dict object to be checked. + expected_subset (Dict[Any, Any]): Subset expected to be contained in + dict_obj. + + Returns: + bool: Whether the dict_obj contains the expected_subset. + """ + + for key, value in expected_subset.items(): + if key not in dict_obj.keys() or _any(dict_obj[key] != value): + return False + return True + + +def assert_attrs_equal(obj: Any, expected_attrs: Dict[str, Any]) -> bool: + """Check if attribute of class object is correct. + + Args: + obj (object): Class object to be checked. + expected_attrs (Dict[str, Any]): Dict of the expected attrs. + + Returns: + bool: Whether the attribute of class object is correct. + """ + for attr, value in expected_attrs.items(): + if not hasattr(obj, attr) or _any(getattr(obj, attr) != value): + return False + return True + + +def assert_dict_has_keys(obj: Dict[str, Any], + expected_keys: List[str]) -> bool: + """Check if the obj has all the expected_keys. + + Args: + obj (Dict[str, Any]): Object to be checked. + expected_keys (List[str]): Keys expected to contained in the keys of + the obj. + + Returns: + bool: Whether the obj has the expected keys. + """ + return set(expected_keys).issubset(set(obj.keys())) + + +def assert_keys_equal(result_keys: List[str], target_keys: List[str]) -> bool: + """Check if target_keys is equal to result_keys. + + Args: + result_keys (List[str]): Result keys to be checked. + target_keys (List[str]): Target keys to be checked. + + Returns: + bool: Whether target_keys is equal to result_keys. + """ + return set(result_keys) == set(target_keys) + + +def assert_is_norm_layer(module) -> bool: + """Check if the module is a norm layer. + + Args: + module (nn.Module): The module to be checked. + + Returns: + bool: Whether the module is a norm layer. + """ + from .parrots_wrapper import _BatchNorm, _InstanceNorm + from torch.nn import GroupNorm, LayerNorm + norm_layer_candidates = (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm) + return isinstance(module, norm_layer_candidates) + + +def assert_params_all_zeros(module) -> bool: + """Check if the parameters of the module is all zeros. + + Args: + module (nn.Module): The module to be checked. + + Returns: + bool: Whether the parameters of the module is all zeros. 
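+
+    Example (illustrative sketch using a zero-initialized conv layer):
+        >>> import torch.nn as nn
+        >>> conv = nn.Conv2d(3, 3, 1)
+        >>> _ = nn.init.constant_(conv.weight, 0)
+        >>> _ = nn.init.constant_(conv.bias, 0)
+        >>> assert_params_all_zeros(conv)
+        True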
+ """ + weight_data = module.weight.data + is_weight_zero = weight_data.allclose( + weight_data.new_zeros(weight_data.size())) + + if hasattr(module, 'bias') and module.bias is not None: + bias_data = module.bias.data + is_bias_zero = bias_data.allclose( + bias_data.new_zeros(bias_data.size())) + else: + is_bias_zero = True + + return is_weight_zero and is_bias_zero diff --git a/custom_mmpkg/custom_mmcv/utils/timer.py b/custom_mmpkg/custom_mmcv/utils/timer.py new file mode 100644 index 0000000000000000000000000000000000000000..5907e0edfdee7ab002e41d151e4c4386e1d9f294 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/timer.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from time import time + + +class TimerError(Exception): + + def __init__(self, message): + self.message = message + super(TimerError, self).__init__(message) + + +class Timer: + """A flexible Timer class. + + :Example: + + >>> import time + >>> import custom_mmpkg.custom_mmcv as mmcv + >>> with mmcv.Timer(): + >>> # simulate a code block that will run for 1s + >>> time.sleep(1) + 1.000 + >>> with mmcv.Timer(print_tmpl='it takes {:.1f} seconds'): + >>> # simulate a code block that will run for 1s + >>> time.sleep(1) + it takes 1.0 seconds + >>> timer = mmcv.Timer() + >>> time.sleep(0.5) + >>> print(timer.since_start()) + 0.500 + >>> time.sleep(0.5) + >>> print(timer.since_last_check()) + 0.500 + >>> print(timer.since_start()) + 1.000 + """ + + def __init__(self, start=True, print_tmpl=None): + self._is_running = False + self.print_tmpl = print_tmpl if print_tmpl else '{:.3f}' + if start: + self.start() + + @property + def is_running(self): + """bool: indicate whether the timer is running""" + return self._is_running + + def __enter__(self): + self.start() + return self + + def __exit__(self, type, value, traceback): + print(self.print_tmpl.format(self.since_last_check())) + self._is_running = False + + def start(self): + """Start the timer.""" + if not self._is_running: + self._t_start = time() + self._is_running = True + self._t_last = time() + + def since_start(self): + """Total time since the timer is started. + + Returns (float): Time in seconds. + """ + if not self._is_running: + raise TimerError('timer is not running') + self._t_last = time() + return self._t_last - self._t_start + + def since_last_check(self): + """Time since the last checking. + + Either :func:`since_start` or :func:`since_last_check` is a checking + operation. + + Returns (float): Time in seconds. + """ + if not self._is_running: + raise TimerError('timer is not running') + dur = time() - self._t_last + self._t_last = time() + return dur + + +_g_timers = {} # global timers + + +def check_time(timer_id): + """Add check points in a single line. + + This method is suitable for running a task on a list of items. A timer will + be registered when the method is called for the first time. + + :Example: + + >>> import time + >>> import custom_mmpkg.custom_mmcv as mmcv + >>> for i in range(1, 6): + >>> # simulate a code block + >>> time.sleep(i) + >>> mmcv.check_time('task1') + 2.000 + 3.000 + 4.000 + 5.000 + + Args: + timer_id (str): Timer identifier. 
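+
+    Returns:
+        float: Time in seconds since the last check for this ``timer_id``;
+            0 on the first call, when the timer is only registered.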
+ """ + if timer_id not in _g_timers: + _g_timers[timer_id] = Timer() + return 0 + else: + return _g_timers[timer_id].since_last_check() diff --git a/custom_mmpkg/custom_mmcv/utils/trace.py b/custom_mmpkg/custom_mmcv/utils/trace.py new file mode 100644 index 0000000000000000000000000000000000000000..3907185bf82775e8ed4c2bf4cd4667c5c623d188 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/trace.py @@ -0,0 +1,23 @@ +import warnings + +import torch + +from custom_mmpkg.custom_mmcv.utils import digit_version + + +def is_jit_tracing() -> bool: + if (torch.__version__ != 'parrots' + and digit_version(torch.__version__) >= digit_version('1.6.0')): + on_trace = torch.jit.is_tracing() + # In PyTorch 1.6, torch.jit.is_tracing has a bug. + # Refers to https://github.com/pytorch/pytorch/issues/42448 + if isinstance(on_trace, bool): + return on_trace + else: + return torch._C._is_tracing() + else: + warnings.warn( + 'torch.jit.is_tracing is only supported after v1.6.0. ' + 'Therefore is_tracing returns False automatically. Please ' + 'set on_trace manually if you are using trace.', UserWarning) + return False diff --git a/custom_mmpkg/custom_mmcv/utils/version_utils.py b/custom_mmpkg/custom_mmcv/utils/version_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..963c45a2e8a86a88413ab6c18c22481fb9831985 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/utils/version_utils.py @@ -0,0 +1,90 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import subprocess +import warnings + +from packaging.version import parse + + +def digit_version(version_str: str, length: int = 4): + """Convert a version string into a tuple of integers. + + This method is usually used for comparing two versions. For pre-release + versions: alpha < beta < rc. + + Args: + version_str (str): The version string. + length (int): The maximum number of version levels. Default: 4. + + Returns: + tuple[int]: The version info in digits (integers). + """ + assert 'parrots' not in version_str + version = parse(version_str) + assert version.release, f'failed to parse version {version_str}' + release = list(version.release) + release = release[:length] + if len(release) < length: + release = release + [0] * (length - len(release)) + if version.is_prerelease: + mapping = {'a': -3, 'b': -2, 'rc': -1} + val = -4 + # version.pre can be None + if version.pre: + if version.pre[0] not in mapping: + warnings.warn(f'unknown prerelease version {version.pre[0]}, ' + 'version checking may go wrong') + else: + val = mapping[version.pre[0]] + release.extend([val, version.pre[-1]]) + else: + release.extend([val, 0]) + + elif version.is_postrelease: + release.extend([1, version.post]) + else: + release.extend([0, 0]) + return tuple(release) + + +def _minimal_ext_cmd(cmd): + # construct minimal environment + env = {} + for k in ['SYSTEMROOT', 'PATH', 'HOME']: + v = os.environ.get(k) + if v is not None: + env[k] = v + # LANGUAGE is used on win32 + env['LANGUAGE'] = 'C' + env['LANG'] = 'C' + env['LC_ALL'] = 'C' + out = subprocess.Popen( + cmd, stdout=subprocess.PIPE, env=env).communicate()[0] + return out + + +def get_git_hash(fallback='unknown', digits=None): + """Get the git hash of the current repo. + + Args: + fallback (str, optional): The fallback string when git hash is + unavailable. Defaults to 'unknown'. + digits (int, optional): kept digits of the hash. Defaults to None, + meaning all digits are kept. + + Returns: + str: Git commit hash. 
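+
+    Example (illustrative sketch; the printed hash is hypothetical):
+        >>> get_git_hash(digits=7)
+        '871c2a9'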
+ """ + + if digits is not None and not isinstance(digits, int): + raise TypeError('digits must be None or an integer') + + try: + out = _minimal_ext_cmd(['git', 'rev-parse', 'HEAD']) + sha = out.strip().decode('ascii') + if digits is not None: + sha = sha[:digits] + except OSError: + sha = fallback + + return sha diff --git a/custom_mmpkg/custom_mmcv/version.py b/custom_mmpkg/custom_mmcv/version.py new file mode 100644 index 0000000000000000000000000000000000000000..1cce4e50bd692d4002e3cac3c545a3fb2efe95d0 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/version.py @@ -0,0 +1,35 @@ +# Copyright (c) OpenMMLab. All rights reserved. +__version__ = '1.3.17' + + +def parse_version_info(version_str: str, length: int = 4) -> tuple: + """Parse a version string into a tuple. + + Args: + version_str (str): The version string. + length (int): The maximum number of version levels. Default: 4. + + Returns: + tuple[int | str]: The version info, e.g., "1.3.0" is parsed into + (1, 3, 0, 0, 0, 0), and "2.0.0rc1" is parsed into + (2, 0, 0, 0, 'rc', 1) (when length is set to 4). + """ + from packaging.version import parse + version = parse(version_str) + assert version.release, f'failed to parse version {version_str}' + release = list(version.release) + release = release[:length] + if len(release) < length: + release = release + [0] * (length - len(release)) + if version.is_prerelease: + release.extend(list(version.pre)) + elif version.is_postrelease: + release.extend(list(version.post)) + else: + release.extend([0, 0]) + return tuple(release) + + +version_info = tuple(int(x) for x in __version__.split('.')[:3]) + +__all__ = ['__version__', 'version_info', 'parse_version_info'] diff --git a/custom_mmpkg/custom_mmcv/video/__init__.py b/custom_mmpkg/custom_mmcv/video/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..73199b01dec52820dc6ca0139903536344d5a1eb --- /dev/null +++ b/custom_mmpkg/custom_mmcv/video/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .io import Cache, VideoReader, frames2video +from .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread, + flowwrite, quantize_flow, sparse_flow_from_bytes) +from .processing import concat_video, convert_video, cut_video, resize_video + +__all__ = [ + 'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video', + 'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow', + 'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes' +] diff --git a/custom_mmpkg/custom_mmcv/video/io.py b/custom_mmpkg/custom_mmcv/video/io.py new file mode 100644 index 0000000000000000000000000000000000000000..f9c20cee37aec3e36413300b88fbdb0156bfa8a4 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/video/io.py @@ -0,0 +1,318 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
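+# Illustrative usage sketch of the FIFO ``Cache`` defined below (added for
+# clarity, not part of the upstream source):
+#   cache = Cache(capacity=2)
+#   cache.put('a', 1)
+#   cache.put('b', 2)
+#   cache.put('c', 3)   # capacity reached, the oldest key 'a' is evicted
+#   cache.get('a')      # -> None (the default), 'a' is no longer cached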
+import os.path as osp +from collections import OrderedDict + +import cv2 +from cv2 import (CAP_PROP_FOURCC, CAP_PROP_FPS, CAP_PROP_FRAME_COUNT, + CAP_PROP_FRAME_HEIGHT, CAP_PROP_FRAME_WIDTH, + CAP_PROP_POS_FRAMES, VideoWriter_fourcc) + +from custom_mmpkg.custom_mmcv.utils import (check_file_exist, mkdir_or_exist, scandir, + track_progress) + + +class Cache: + + def __init__(self, capacity): + self._cache = OrderedDict() + self._capacity = int(capacity) + if capacity <= 0: + raise ValueError('capacity must be a positive integer') + + @property + def capacity(self): + return self._capacity + + @property + def size(self): + return len(self._cache) + + def put(self, key, val): + if key in self._cache: + return + if len(self._cache) >= self.capacity: + self._cache.popitem(last=False) + self._cache[key] = val + + def get(self, key, default=None): + val = self._cache[key] if key in self._cache else default + return val + + +class VideoReader: + """Video class with similar usage to a list object. + + This video warpper class provides convenient apis to access frames. + There exists an issue of OpenCV's VideoCapture class that jumping to a + certain frame may be inaccurate. It is fixed in this class by checking + the position after jumping each time. + Cache is used when decoding videos. So if the same frame is visited for + the second time, there is no need to decode again if it is stored in the + cache. + + :Example: + + >>> import custom_mmpkg.custom_mmcv as mmcv + >>> v = mmcv.VideoReader('sample.mp4') + >>> len(v) # get the total frame number with `len()` + 120 + >>> for img in v: # v is iterable + >>> mmcv.imshow(img) + >>> v[5] # get the 6th frame + """ + + def __init__(self, filename, cache_capacity=10): + # Check whether the video path is a url + if not filename.startswith(('https://', 'http://')): + check_file_exist(filename, 'Video file not found: ' + filename) + self._vcap = cv2.VideoCapture(filename) + assert cache_capacity > 0 + self._cache = Cache(cache_capacity) + self._position = 0 + # get basic info + self._width = int(self._vcap.get(CAP_PROP_FRAME_WIDTH)) + self._height = int(self._vcap.get(CAP_PROP_FRAME_HEIGHT)) + self._fps = self._vcap.get(CAP_PROP_FPS) + self._frame_cnt = int(self._vcap.get(CAP_PROP_FRAME_COUNT)) + self._fourcc = self._vcap.get(CAP_PROP_FOURCC) + + @property + def vcap(self): + """:obj:`cv2.VideoCapture`: The raw VideoCapture object.""" + return self._vcap + + @property + def opened(self): + """bool: Indicate whether the video is opened.""" + return self._vcap.isOpened() + + @property + def width(self): + """int: Width of video frames.""" + return self._width + + @property + def height(self): + """int: Height of video frames.""" + return self._height + + @property + def resolution(self): + """tuple: Video resolution (width, height).""" + return (self._width, self._height) + + @property + def fps(self): + """float: FPS of the video.""" + return self._fps + + @property + def frame_cnt(self): + """int: Total frames of the video.""" + return self._frame_cnt + + @property + def fourcc(self): + """str: "Four character code" of the video.""" + return self._fourcc + + @property + def position(self): + """int: Current cursor position, indicating frame decoded.""" + return self._position + + def _get_real_position(self): + return int(round(self._vcap.get(CAP_PROP_POS_FRAMES))) + + def _set_real_position(self, frame_id): + self._vcap.set(CAP_PROP_POS_FRAMES, frame_id) + pos = self._get_real_position() + for _ in range(frame_id - pos): + self._vcap.read() + 
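+        # The loop above compensates for backends that land short of the
+        # requested frame when CAP_PROP_POS_FRAMES is set; at this point the
+        # capture is positioned exactly at ``frame_id``.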
self._position = frame_id + + def read(self): + """Read the next frame. + + If the next frame have been decoded before and in the cache, then + return it directly, otherwise decode, cache and return it. + + Returns: + ndarray or None: Return the frame if successful, otherwise None. + """ + # pos = self._position + if self._cache: + img = self._cache.get(self._position) + if img is not None: + ret = True + else: + if self._position != self._get_real_position(): + self._set_real_position(self._position) + ret, img = self._vcap.read() + if ret: + self._cache.put(self._position, img) + else: + ret, img = self._vcap.read() + if ret: + self._position += 1 + return img + + def get_frame(self, frame_id): + """Get frame by index. + + Args: + frame_id (int): Index of the expected frame, 0-based. + + Returns: + ndarray or None: Return the frame if successful, otherwise None. + """ + if frame_id < 0 or frame_id >= self._frame_cnt: + raise IndexError( + f'"frame_id" must be between 0 and {self._frame_cnt - 1}') + if frame_id == self._position: + return self.read() + if self._cache: + img = self._cache.get(frame_id) + if img is not None: + self._position = frame_id + 1 + return img + self._set_real_position(frame_id) + ret, img = self._vcap.read() + if ret: + if self._cache: + self._cache.put(self._position, img) + self._position += 1 + return img + + def current_frame(self): + """Get the current frame (frame that is just visited). + + Returns: + ndarray or None: If the video is fresh, return None, otherwise + return the frame. + """ + if self._position == 0: + return None + return self._cache.get(self._position - 1) + + def cvt2frames(self, + frame_dir, + file_start=0, + filename_tmpl='{:06d}.jpg', + start=0, + max_num=0, + show_progress=True): + """Convert a video to frame images. + + Args: + frame_dir (str): Output directory to store all the frame images. + file_start (int): Filenames will start from the specified number. + filename_tmpl (str): Filename template with the index as the + placeholder. + start (int): The starting frame index. + max_num (int): Maximum number of frames to be written. + show_progress (bool): Whether to show a progress bar. 
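+
+        Example (illustrative sketch; ``sample.mp4`` and ``./frames`` are
+        placeholder paths):
+            >>> v = VideoReader('sample.mp4')
+            >>> v.cvt2frames('./frames', filename_tmpl='{:05d}.jpg')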
+ """ + mkdir_or_exist(frame_dir) + if max_num == 0: + task_num = self.frame_cnt - start + else: + task_num = min(self.frame_cnt - start, max_num) + if task_num <= 0: + raise ValueError('start must be less than total frame number') + if start > 0: + self._set_real_position(start) + + def write_frame(file_idx): + img = self.read() + if img is None: + return + filename = osp.join(frame_dir, filename_tmpl.format(file_idx)) + cv2.imwrite(filename, img) + + if show_progress: + track_progress(write_frame, range(file_start, + file_start + task_num)) + else: + for i in range(task_num): + write_frame(file_start + i) + + def __len__(self): + return self.frame_cnt + + def __getitem__(self, index): + if isinstance(index, slice): + return [ + self.get_frame(i) + for i in range(*index.indices(self.frame_cnt)) + ] + # support negative indexing + if index < 0: + index += self.frame_cnt + if index < 0: + raise IndexError('index out of range') + return self.get_frame(index) + + def __iter__(self): + self._set_real_position(0) + return self + + def __next__(self): + img = self.read() + if img is not None: + return img + else: + raise StopIteration + + next = __next__ + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc_value, traceback): + self._vcap.release() + + +def frames2video(frame_dir, + video_file, + fps=30, + fourcc='XVID', + filename_tmpl='{:06d}.jpg', + start=0, + end=0, + show_progress=True): + """Read the frame images from a directory and join them as a video. + + Args: + frame_dir (str): The directory containing video frames. + video_file (str): Output filename. + fps (float): FPS of the output video. + fourcc (str): Fourcc of the output video, this should be compatible + with the output file type. + filename_tmpl (str): Filename template with the index as the variable. + start (int): Starting frame index. + end (int): Ending frame index. + show_progress (bool): Whether to show a progress bar. + """ + if end == 0: + ext = filename_tmpl.split('.')[-1] + end = len([name for name in scandir(frame_dir, ext)]) + first_file = osp.join(frame_dir, filename_tmpl.format(start)) + check_file_exist(first_file, 'The start frame not found: ' + first_file) + img = cv2.imread(first_file) + height, width = img.shape[:2] + resolution = (width, height) + vwriter = cv2.VideoWriter(video_file, VideoWriter_fourcc(*fourcc), fps, + resolution) + + def write_frame(file_idx): + filename = osp.join(frame_dir, filename_tmpl.format(file_idx)) + img = cv2.imread(filename) + vwriter.write(img) + + if show_progress: + track_progress(write_frame, range(start, end)) + else: + for i in range(start, end): + write_frame(i) + vwriter.release() diff --git a/custom_mmpkg/custom_mmcv/video/optflow.py b/custom_mmpkg/custom_mmcv/video/optflow.py new file mode 100644 index 0000000000000000000000000000000000000000..71c7cc1c48a896191e36d159680df29ac1d70dc4 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/video/optflow.py @@ -0,0 +1,254 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import cv2 +import numpy as np + +from custom_mmpkg.custom_mmcv.arraymisc import dequantize, quantize +from custom_mmpkg.custom_mmcv.image import imread, imwrite +from custom_mmpkg.custom_mmcv.utils import is_str + + +def flowread(flow_or_path, quantize=False, concat_axis=0, *args, **kwargs): + """Read an optical flow map. + + Args: + flow_or_path (ndarray or str): A flow map or filepath. + quantize (bool): whether to read quantized pair, if set to True, + remaining args will be passed to :func:`dequantize_flow`. 
+ concat_axis (int): The axis that dx and dy are concatenated, + can be either 0 or 1. Ignored if quantize is False. + + Returns: + ndarray: Optical flow represented as a (h, w, 2) numpy array + """ + if isinstance(flow_or_path, np.ndarray): + if (flow_or_path.ndim != 3) or (flow_or_path.shape[-1] != 2): + raise ValueError(f'Invalid flow with shape {flow_or_path.shape}') + return flow_or_path + elif not is_str(flow_or_path): + raise TypeError(f'"flow_or_path" must be a filename or numpy array, ' + f'not {type(flow_or_path)}') + + if not quantize: + with open(flow_or_path, 'rb') as f: + try: + header = f.read(4).decode('utf-8') + except Exception: + raise IOError(f'Invalid flow file: {flow_or_path}') + else: + if header != 'PIEH': + raise IOError(f'Invalid flow file: {flow_or_path}, ' + 'header does not contain PIEH') + + w = np.fromfile(f, np.int32, 1).squeeze() + h = np.fromfile(f, np.int32, 1).squeeze() + flow = np.fromfile(f, np.float32, w * h * 2).reshape((h, w, 2)) + else: + assert concat_axis in [0, 1] + cat_flow = imread(flow_or_path, flag='unchanged') + if cat_flow.ndim != 2: + raise IOError( + f'{flow_or_path} is not a valid quantized flow file, ' + f'its dimension is {cat_flow.ndim}.') + assert cat_flow.shape[concat_axis] % 2 == 0 + dx, dy = np.split(cat_flow, 2, axis=concat_axis) + flow = dequantize_flow(dx, dy, *args, **kwargs) + + return flow.astype(np.float32) + + +def flowwrite(flow, filename, quantize=False, concat_axis=0, *args, **kwargs): + """Write optical flow to file. + + If the flow is not quantized, it will be saved as a .flo file losslessly, + otherwise a jpeg image which is lossy but of much smaller size. (dx and dy + will be concatenated horizontally into a single image if quantize is True.) + + Args: + flow (ndarray): (h, w, 2) array of optical flow. + filename (str): Output filepath. + quantize (bool): Whether to quantize the flow and save it to 2 jpeg + images. If set to True, remaining args will be passed to + :func:`quantize_flow`. + concat_axis (int): The axis that dx and dy are concatenated, + can be either 0 or 1. Ignored if quantize is False. + """ + if not quantize: + with open(filename, 'wb') as f: + f.write('PIEH'.encode('utf-8')) + np.array([flow.shape[1], flow.shape[0]], dtype=np.int32).tofile(f) + flow = flow.astype(np.float32) + flow.tofile(f) + f.flush() + else: + assert concat_axis in [0, 1] + dx, dy = quantize_flow(flow, *args, **kwargs) + dxdy = np.concatenate((dx, dy), axis=concat_axis) + imwrite(dxdy, filename) + + +def quantize_flow(flow, max_val=0.02, norm=True): + """Quantize flow to [0, 255]. + + After this step, the size of flow will be much smaller, and can be + dumped as jpeg images. + + Args: + flow (ndarray): (h, w, 2) array of optical flow. + max_val (float): Maximum value of flow, values beyond + [-max_val, max_val] will be truncated. + norm (bool): Whether to divide flow values by image width/height. + + Returns: + tuple[ndarray]: Quantized dx and dy. + """ + h, w, _ = flow.shape + dx = flow[..., 0] + dy = flow[..., 1] + if norm: + dx = dx / w # avoid inplace operations + dy = dy / h + # use 255 levels instead of 256 to make sure 0 is 0 after dequantization. + flow_comps = [ + quantize(d, -max_val, max_val, 255, np.uint8) for d in [dx, dy] + ] + return tuple(flow_comps) + + +def dequantize_flow(dx, dy, max_val=0.02, denorm=True): + """Recover from quantized flow. + + Args: + dx (ndarray): Quantized dx. + dy (ndarray): Quantized dy. + max_val (float): Maximum value used when quantizing. 
+ denorm (bool): Whether to multiply flow values with width/height. + + Returns: + ndarray: Dequantized flow. + """ + assert dx.shape == dy.shape + assert dx.ndim == 2 or (dx.ndim == 3 and dx.shape[-1] == 1) + + dx, dy = [dequantize(d, -max_val, max_val, 255) for d in [dx, dy]] + + if denorm: + dx *= dx.shape[1] + dy *= dx.shape[0] + flow = np.dstack((dx, dy)) + return flow + + +def flow_warp(img, flow, filling_value=0, interpolate_mode='nearest'): + """Use flow to warp img. + + Args: + img (ndarray, float or uint8): Image to be warped. + flow (ndarray, float): Optical Flow. + filling_value (int): The missing pixels will be set with filling_value. + interpolate_mode (str): bilinear -> Bilinear Interpolation; + nearest -> Nearest Neighbor. + + Returns: + ndarray: Warped image with the same shape of img + """ + warnings.warn('This function is just for prototyping and cannot ' + 'guarantee the computational efficiency.') + assert flow.ndim == 3, 'Flow must be in 3D arrays.' + height = flow.shape[0] + width = flow.shape[1] + channels = img.shape[2] + + output = np.ones( + (height, width, channels), dtype=img.dtype) * filling_value + + grid = np.indices((height, width)).swapaxes(0, 1).swapaxes(1, 2) + dx = grid[:, :, 0] + flow[:, :, 1] + dy = grid[:, :, 1] + flow[:, :, 0] + sx = np.floor(dx).astype(int) + sy = np.floor(dy).astype(int) + valid = (sx >= 0) & (sx < height - 1) & (sy >= 0) & (sy < width - 1) + + if interpolate_mode == 'nearest': + output[valid, :] = img[dx[valid].round().astype(int), + dy[valid].round().astype(int), :] + elif interpolate_mode == 'bilinear': + # dirty walkround for integer positions + eps_ = 1e-6 + dx, dy = dx + eps_, dy + eps_ + left_top_ = img[np.floor(dx[valid]).astype(int), + np.floor(dy[valid]).astype(int), :] * ( + np.ceil(dx[valid]) - dx[valid])[:, None] * ( + np.ceil(dy[valid]) - dy[valid])[:, None] + left_down_ = img[np.ceil(dx[valid]).astype(int), + np.floor(dy[valid]).astype(int), :] * ( + dx[valid] - np.floor(dx[valid]))[:, None] * ( + np.ceil(dy[valid]) - dy[valid])[:, None] + right_top_ = img[np.floor(dx[valid]).astype(int), + np.ceil(dy[valid]).astype(int), :] * ( + np.ceil(dx[valid]) - dx[valid])[:, None] * ( + dy[valid] - np.floor(dy[valid]))[:, None] + right_down_ = img[np.ceil(dx[valid]).astype(int), + np.ceil(dy[valid]).astype(int), :] * ( + dx[valid] - np.floor(dx[valid]))[:, None] * ( + dy[valid] - np.floor(dy[valid]))[:, None] + output[valid, :] = left_top_ + left_down_ + right_top_ + right_down_ + else: + raise NotImplementedError( + 'We only support interpolation modes of nearest and bilinear, ' + f'but got {interpolate_mode}.') + return output.astype(img.dtype) + + +def flow_from_bytes(content): + """Read dense optical flow from bytes. + + .. note:: + This load optical flow function works for FlyingChairs, FlyingThings3D, + Sintel, FlyingChairsOcc datasets, but cannot load the data from + ChairsSDHom. + + Args: + content (bytes): Optical flow bytes got from files or other streams. + + Returns: + ndarray: Loaded optical flow with the shape (H, W, 2). 
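+
+    Example (illustrative sketch; ``flow.flo`` is a placeholder path):
+        >>> with open('flow.flo', 'rb') as f:
+        >>>     flow = flow_from_bytes(f.read())
+        >>> flow.shape[-1]
+        2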
+ """ + + # header in first 4 bytes + header = content[:4] + if header.decode('utf-8') != 'PIEH': + raise Exception('Flow file header does not contain PIEH') + # width in second 4 bytes + width = np.frombuffer(content[4:], np.int32, 1).squeeze() + # height in third 4 bytes + height = np.frombuffer(content[8:], np.int32, 1).squeeze() + # after first 12 bytes, all bytes are flow + flow = np.frombuffer(content[12:], np.float32, width * height * 2).reshape( + (height, width, 2)) + + return flow + + +def sparse_flow_from_bytes(content): + """Read the optical flow in KITTI datasets from bytes. + + This function is modified from RAFT load the `KITTI datasets + `_. + + Args: + content (bytes): Optical flow bytes got from files or other streams. + + Returns: + Tuple(ndarray, ndarray): Loaded optical flow with the shape (H, W, 2) + and flow valid mask with the shape (H, W). + """ # nopa + + content = np.frombuffer(content, np.uint8) + flow = cv2.imdecode(content, cv2.IMREAD_ANYDEPTH | cv2.IMREAD_COLOR) + flow = flow[:, :, ::-1].astype(np.float32) + # flow shape (H, W, 2) valid shape (H, W) + flow, valid = flow[:, :, :2], flow[:, :, 2] + flow = (flow - 2**15) / 64.0 + return flow, valid diff --git a/custom_mmpkg/custom_mmcv/video/processing.py b/custom_mmpkg/custom_mmcv/video/processing.py new file mode 100644 index 0000000000000000000000000000000000000000..72865d9041f5d8a9717b41b02beca67fa622fd9a --- /dev/null +++ b/custom_mmpkg/custom_mmcv/video/processing.py @@ -0,0 +1,160 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import subprocess +import tempfile + +from custom_mmpkg.custom_mmcv.utils import requires_executable + + +@requires_executable('ffmpeg') +def convert_video(in_file, + out_file, + print_cmd=False, + pre_options='', + **kwargs): + """Convert a video with ffmpeg. + + This provides a general api to ffmpeg, the executed command is:: + + `ffmpeg -y -i ` + + Options(kwargs) are mapped to ffmpeg commands with the following rules: + + - key=val: "-key val" + - key=True: "-key" + - key=False: "" + + Args: + in_file (str): Input video filename. + out_file (str): Output video filename. + pre_options (str): Options appears before "-i ". + print_cmd (bool): Whether to print the final ffmpeg command. + """ + options = [] + for k, v in kwargs.items(): + if isinstance(v, bool): + if v: + options.append(f'-{k}') + elif k == 'log_level': + assert v in [ + 'quiet', 'panic', 'fatal', 'error', 'warning', 'info', + 'verbose', 'debug', 'trace' + ] + options.append(f'-loglevel {v}') + else: + options.append(f'-{k} {v}') + cmd = f'ffmpeg -y {pre_options} -i {in_file} {" ".join(options)} ' \ + f'{out_file}' + if print_cmd: + print(cmd) + subprocess.call(cmd, shell=True) + + +@requires_executable('ffmpeg') +def resize_video(in_file, + out_file, + size=None, + ratio=None, + keep_ar=False, + log_level='info', + print_cmd=False): + """Resize a video. + + Args: + in_file (str): Input video filename. + out_file (str): Output video filename. + size (tuple): Expected size (w, h), eg, (320, 240) or (320, -1). + ratio (tuple or float): Expected resize ratio, (2, 0.5) means + (w*2, h*0.5). + keep_ar (bool): Whether to keep original aspect ratio. + log_level (str): Logging level of ffmpeg. + print_cmd (bool): Whether to print the final ffmpeg command. 
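+
+    Example (illustrative sketch; file names are placeholders):
+        >>> resize_video('in.mp4', 'half.mp4', ratio=0.5)
+        >>> resize_video('in.mp4', 'small.mp4', size=(320, -1))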
+ """ + if size is None and ratio is None: + raise ValueError('expected size or ratio must be specified') + if size is not None and ratio is not None: + raise ValueError('size and ratio cannot be specified at the same time') + options = {'log_level': log_level} + if size: + if not keep_ar: + options['vf'] = f'scale={size[0]}:{size[1]}' + else: + options['vf'] = f'scale=w={size[0]}:h={size[1]}:' \ + 'force_original_aspect_ratio=decrease' + else: + if not isinstance(ratio, tuple): + ratio = (ratio, ratio) + options['vf'] = f'scale="trunc(iw*{ratio[0]}):trunc(ih*{ratio[1]})"' + convert_video(in_file, out_file, print_cmd, **options) + + +@requires_executable('ffmpeg') +def cut_video(in_file, + out_file, + start=None, + end=None, + vcodec=None, + acodec=None, + log_level='info', + print_cmd=False): + """Cut a clip from a video. + + Args: + in_file (str): Input video filename. + out_file (str): Output video filename. + start (None or float): Start time (in seconds). + end (None or float): End time (in seconds). + vcodec (None or str): Output video codec, None for unchanged. + acodec (None or str): Output audio codec, None for unchanged. + log_level (str): Logging level of ffmpeg. + print_cmd (bool): Whether to print the final ffmpeg command. + """ + options = {'log_level': log_level} + if vcodec is None: + options['vcodec'] = 'copy' + if acodec is None: + options['acodec'] = 'copy' + if start: + options['ss'] = start + else: + start = 0 + if end: + options['t'] = end - start + convert_video(in_file, out_file, print_cmd, **options) + + +@requires_executable('ffmpeg') +def concat_video(video_list, + out_file, + vcodec=None, + acodec=None, + log_level='info', + print_cmd=False): + """Concatenate multiple videos into a single one. + + Args: + video_list (list): A list of video filenames + out_file (str): Output video filename + vcodec (None or str): Output video codec, None for unchanged + acodec (None or str): Output audio codec, None for unchanged + log_level (str): Logging level of ffmpeg. + print_cmd (bool): Whether to print the final ffmpeg command. + """ + tmp_filehandler, tmp_filename = tempfile.mkstemp(suffix='.txt', text=True) + with open(tmp_filename, 'w') as f: + for filename in video_list: + f.write(f'file {osp.abspath(filename)}\n') + options = {'log_level': log_level} + if vcodec is None: + options['vcodec'] = 'copy' + if acodec is None: + options['acodec'] = 'copy' + convert_video( + tmp_filename, + out_file, + print_cmd, + pre_options='-f concat -safe 0', + **options) + os.close(tmp_filehandler) + os.remove(tmp_filename) diff --git a/custom_mmpkg/custom_mmcv/visualization/__init__.py b/custom_mmpkg/custom_mmcv/visualization/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..835df136bdcf69348281d22914d41aa84cdf92b1 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/visualization/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
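+# Illustrative usage sketch (added for clarity, not part of the upstream
+# source):
+#   from custom_mmpkg.custom_mmcv.visualization import color_val
+#   color_val('green')      # -> (0, 255, 0), a BGR tuple
+#   color_val((255, 0, 0))  # -> (255, 0, 0)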
+from .color import Color, color_val +from .image import imshow, imshow_bboxes, imshow_det_bboxes +from .optflow import flow2rgb, flowshow, make_color_wheel + +__all__ = [ + 'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes', + 'flowshow', 'flow2rgb', 'make_color_wheel' +] diff --git a/custom_mmpkg/custom_mmcv/visualization/color.py b/custom_mmpkg/custom_mmcv/visualization/color.py new file mode 100644 index 0000000000000000000000000000000000000000..2bff8a9dc94fc5ff8dbd5425faeea165332ac10a --- /dev/null +++ b/custom_mmpkg/custom_mmcv/visualization/color.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from enum import Enum + +import numpy as np + +from custom_mmpkg.custom_mmcv.utils import is_str + + +class Color(Enum): + """An enum that defines common colors. + + Contains red, green, blue, cyan, yellow, magenta, white and black. + """ + red = (0, 0, 255) + green = (0, 255, 0) + blue = (255, 0, 0) + cyan = (255, 255, 0) + yellow = (0, 255, 255) + magenta = (255, 0, 255) + white = (255, 255, 255) + black = (0, 0, 0) + + +def color_val(color): + """Convert various input to color tuples. + + Args: + color (:obj:`Color`/str/tuple/int/ndarray): Color inputs + + Returns: + tuple[int]: A tuple of 3 integers indicating BGR channels. + """ + if is_str(color): + return Color[color].value + elif isinstance(color, Color): + return color.value + elif isinstance(color, tuple): + assert len(color) == 3 + for channel in color: + assert 0 <= channel <= 255 + return color + elif isinstance(color, int): + assert 0 <= color <= 255 + return color, color, color + elif isinstance(color, np.ndarray): + assert color.ndim == 1 and color.size == 3 + assert np.all((color >= 0) & (color <= 255)) + color = color.astype(np.uint8) + return tuple(color) + else: + raise TypeError(f'Invalid type for color: {type(color)}') diff --git a/custom_mmpkg/custom_mmcv/visualization/image.py b/custom_mmpkg/custom_mmcv/visualization/image.py new file mode 100644 index 0000000000000000000000000000000000000000..3f77c6d1033dd2a5968cedf3a5fe77d91cd948b8 --- /dev/null +++ b/custom_mmpkg/custom_mmcv/visualization/image.py @@ -0,0 +1,152 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import cv2 +import numpy as np + +from custom_mmpkg.custom_mmcv.image import imread, imwrite +from .color import color_val + + +def imshow(img, win_name='', wait_time=0): + """Show an image. + + Args: + img (str or ndarray): The image to be displayed. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + """ + cv2.imshow(win_name, imread(img)) + if wait_time == 0: # prevent from hanging if windows was closed + while True: + ret = cv2.waitKey(1) + + closed = cv2.getWindowProperty(win_name, cv2.WND_PROP_VISIBLE) < 1 + # if user closed window or if some key pressed + if closed or ret != -1: + break + else: + ret = cv2.waitKey(wait_time) + + +def imshow_bboxes(img, + bboxes, + colors='green', + top_k=-1, + thickness=1, + show=True, + win_name='', + wait_time=0, + out_file=None): + """Draw bboxes on an image. + + Args: + img (str or ndarray): The image to be displayed. + bboxes (list or ndarray): A list of ndarray of shape (k, 4). + colors (list[str or tuple or Color]): A list of colors. + top_k (int): Plot the first k bboxes only if set positive. + thickness (int): Thickness of lines. + show (bool): Whether to show the image. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + out_file (str, optional): The filename to write the image. 
+ + Returns: + ndarray: The image with bboxes drawn on it. + """ + img = imread(img) + img = np.ascontiguousarray(img) + + if isinstance(bboxes, np.ndarray): + bboxes = [bboxes] + if not isinstance(colors, list): + colors = [colors for _ in range(len(bboxes))] + colors = [color_val(c) for c in colors] + assert len(bboxes) == len(colors) + + for i, _bboxes in enumerate(bboxes): + _bboxes = _bboxes.astype(np.int32) + if top_k <= 0: + _top_k = _bboxes.shape[0] + else: + _top_k = min(top_k, _bboxes.shape[0]) + for j in range(_top_k): + left_top = (_bboxes[j, 0], _bboxes[j, 1]) + right_bottom = (_bboxes[j, 2], _bboxes[j, 3]) + cv2.rectangle( + img, left_top, right_bottom, colors[i], thickness=thickness) + + if show: + imshow(img, win_name, wait_time) + if out_file is not None: + imwrite(img, out_file) + return img + + +def imshow_det_bboxes(img, + bboxes, + labels, + class_names=None, + score_thr=0, + bbox_color='green', + text_color='green', + thickness=1, + font_scale=0.5, + show=True, + win_name='', + wait_time=0, + out_file=None): + """Draw bboxes and class labels (with scores) on an image. + + Args: + img (str or ndarray): The image to be displayed. + bboxes (ndarray): Bounding boxes (with scores), shaped (n, 4) or + (n, 5). + labels (ndarray): Labels of bboxes. + class_names (list[str]): Names of each classes. + score_thr (float): Minimum score of bboxes to be shown. + bbox_color (str or tuple or :obj:`Color`): Color of bbox lines. + text_color (str or tuple or :obj:`Color`): Color of texts. + thickness (int): Thickness of lines. + font_scale (float): Font scales of texts. + show (bool): Whether to show the image. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + out_file (str or None): The filename to write the image. + + Returns: + ndarray: The image with bboxes drawn on it. + """ + assert bboxes.ndim == 2 + assert labels.ndim == 1 + assert bboxes.shape[0] == labels.shape[0] + assert bboxes.shape[1] == 4 or bboxes.shape[1] == 5 + img = imread(img) + img = np.ascontiguousarray(img) + + if score_thr > 0: + assert bboxes.shape[1] == 5 + scores = bboxes[:, -1] + inds = scores > score_thr + bboxes = bboxes[inds, :] + labels = labels[inds] + + bbox_color = color_val(bbox_color) + text_color = color_val(text_color) + + for bbox, label in zip(bboxes, labels): + bbox_int = bbox.astype(np.int32) + left_top = (bbox_int[0], bbox_int[1]) + right_bottom = (bbox_int[2], bbox_int[3]) + cv2.rectangle( + img, left_top, right_bottom, bbox_color, thickness=thickness) + label_text = class_names[ + label] if class_names is not None else f'cls {label}' + if len(bbox) > 4: + label_text += f'|{bbox[-1]:.02f}' + cv2.putText(img, label_text, (bbox_int[0], bbox_int[1] - 2), + cv2.FONT_HERSHEY_COMPLEX, font_scale, text_color) + + if show: + imshow(img, win_name, wait_time) + if out_file is not None: + imwrite(img, out_file) + return img diff --git a/custom_mmpkg/custom_mmcv/visualization/optflow.py b/custom_mmpkg/custom_mmcv/visualization/optflow.py new file mode 100644 index 0000000000000000000000000000000000000000..8b13b411f7161205eba2653c357a84f8916a353a --- /dev/null +++ b/custom_mmpkg/custom_mmcv/visualization/optflow.py @@ -0,0 +1,112 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from __future__ import division + +import numpy as np + +from custom_mmpkg.custom_mmcv.image import rgb2bgr +from custom_mmpkg.custom_mmcv.video import flowread +from .image import imshow + + +def flowshow(flow, win_name='', wait_time=0): + """Show optical flow. 
+ + Args: + flow (ndarray or str): The optical flow to be displayed. + win_name (str): The window name. + wait_time (int): Value of waitKey param. + """ + flow = flowread(flow) + flow_img = flow2rgb(flow) + imshow(rgb2bgr(flow_img), win_name, wait_time) + + +def flow2rgb(flow, color_wheel=None, unknown_thr=1e6): + """Convert flow map to RGB image. + + Args: + flow (ndarray): Array of optical flow. + color_wheel (ndarray or None): Color wheel used to map flow field to + RGB colorspace. Default color wheel will be used if not specified. + unknown_thr (str): Values above this threshold will be marked as + unknown and thus ignored. + + Returns: + ndarray: RGB image that can be visualized. + """ + assert flow.ndim == 3 and flow.shape[-1] == 2 + if color_wheel is None: + color_wheel = make_color_wheel() + assert color_wheel.ndim == 2 and color_wheel.shape[1] == 3 + num_bins = color_wheel.shape[0] + + dx = flow[:, :, 0].copy() + dy = flow[:, :, 1].copy() + + ignore_inds = ( + np.isnan(dx) | np.isnan(dy) | (np.abs(dx) > unknown_thr) | + (np.abs(dy) > unknown_thr)) + dx[ignore_inds] = 0 + dy[ignore_inds] = 0 + + rad = np.sqrt(dx**2 + dy**2) + if np.any(rad > np.finfo(float).eps): + max_rad = np.max(rad) + dx /= max_rad + dy /= max_rad + + rad = np.sqrt(dx**2 + dy**2) + angle = np.arctan2(-dy, -dx) / np.pi + + bin_real = (angle + 1) / 2 * (num_bins - 1) + bin_left = np.floor(bin_real).astype(int) + bin_right = (bin_left + 1) % num_bins + w = (bin_real - bin_left.astype(np.float32))[..., None] + flow_img = (1 - + w) * color_wheel[bin_left, :] + w * color_wheel[bin_right, :] + small_ind = rad <= 1 + flow_img[small_ind] = 1 - rad[small_ind, None] * (1 - flow_img[small_ind]) + flow_img[np.logical_not(small_ind)] *= 0.75 + + flow_img[ignore_inds, :] = 0 + + return flow_img + + +def make_color_wheel(bins=None): + """Build a color wheel. + + Args: + bins(list or tuple, optional): Specify the number of bins for each + color range, corresponding to six ranges: red -> yellow, + yellow -> green, green -> cyan, cyan -> blue, blue -> magenta, + magenta -> red. [15, 6, 4, 11, 13, 6] is used for default + (see Middlebury). + + Returns: + ndarray: Color wheel of shape (total_bins, 3). 
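+
+    Example (illustrative sketch):
+        >>> wheel = make_color_wheel()
+        >>> wheel.shape  # 15 + 6 + 4 + 11 + 13 + 6 bins by default
+        (55, 3)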
+ """ + if bins is None: + bins = [15, 6, 4, 11, 13, 6] + assert len(bins) == 6 + + RY, YG, GC, CB, BM, MR = tuple(bins) + + ry = [1, np.arange(RY) / RY, 0] + yg = [1 - np.arange(YG) / YG, 1, 0] + gc = [0, 1, np.arange(GC) / GC] + cb = [0, 1 - np.arange(CB) / CB, 1] + bm = [np.arange(BM) / BM, 0, 1] + mr = [1, 0, 1 - np.arange(MR) / MR] + + num_bins = RY + YG + GC + CB + BM + MR + + color_wheel = np.zeros((3, num_bins), dtype=np.float32) + + col = 0 + for i, color in enumerate([ry, yg, gc, cb, bm, mr]): + for j in range(3): + color_wheel[j, col:col + bins[i]] = color[j] + col += bins[i] + + return color_wheel.T diff --git a/custom_mmpkg/custom_mmseg/apis/__init__.py b/custom_mmpkg/custom_mmseg/apis/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..170724be38de42daf2bc1a1910e181d68818f165 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/apis/__init__.py @@ -0,0 +1,9 @@ +from .inference import inference_segmentor, init_segmentor, show_result_pyplot +from .test import multi_gpu_test, single_gpu_test +from .train import get_root_logger, set_random_seed, train_segmentor + +__all__ = [ + 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor', + 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test', + 'show_result_pyplot' +] diff --git a/custom_mmpkg/custom_mmseg/apis/inference.py b/custom_mmpkg/custom_mmseg/apis/inference.py new file mode 100644 index 0000000000000000000000000000000000000000..0ee57d61e59f67926c7be6a139d057805026b816 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/apis/inference.py @@ -0,0 +1,137 @@ +import matplotlib.pyplot as plt +import custom_mmpkg.custom_mmcv as mmcv +import torch +from custom_mmpkg.custom_mmcv.parallel import collate, scatter +from custom_mmpkg.custom_mmcv.runner import load_checkpoint + +from custom_mmpkg.custom_mmseg.datasets.pipelines import Compose +from custom_mmpkg.custom_mmseg.models import build_segmentor + + +def init_segmentor(config, checkpoint=None, device="cpu"): + """Initialize a segmentor from config file. + + Args: + config (str or :obj:`mmcv.Config`): Config file path or the config + object. + checkpoint (str, optional): Checkpoint path. If left as None, the model + will not load any weights. + device (str, optional) CPU/CUDA device option. Default 'cuda:0'. + Use 'cpu' for loading model on CPU. + Returns: + nn.Module: The constructed segmentor. + """ + if isinstance(config, str): + config = mmcv.Config.fromfile(config) + elif not isinstance(config, mmcv.Config): + raise TypeError('config must be a filename or Config object, ' + 'but got {}'.format(type(config))) + config.model.pretrained = None + config.model.train_cfg = None + model = build_segmentor(config.model, test_cfg=config.get('test_cfg')) + if checkpoint is not None: + checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') + model.CLASSES = checkpoint['meta']['CLASSES'] + model.PALETTE = checkpoint['meta']['PALETTE'] + model.cfg = config # save the config in the model for convenience + model.to(device) + model.eval() + return model + + +class LoadImage: + """A simple pipeline to load image.""" + + def __call__(self, results): + """Call function to load images into results. + + Args: + results (dict): A result dict contains the file name + of the image to be read. + + Returns: + dict: ``results`` will be returned containing loaded image. 
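+
+        Example (illustrative sketch; ``demo.png`` is a placeholder path):
+            >>> results = LoadImage()(dict(img='demo.png'))
+            >>> sorted(results.keys())
+            ['filename', 'img', 'img_shape', 'ori_filename', 'ori_shape']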
+ """ + + if isinstance(results['img'], str): + results['filename'] = results['img'] + results['ori_filename'] = results['img'] + else: + results['filename'] = None + results['ori_filename'] = None + img = mmcv.imread(results['img']) + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + return results + + +def inference_segmentor(model, img): + """Inference image(s) with the segmentor. + + Args: + model (nn.Module): The loaded segmentor. + imgs (str/ndarray or list[str/ndarray]): Either image files or loaded + images. + + Returns: + (list[Tensor]): The segmentation result. + """ + cfg = model.cfg + device = next(model.parameters()).device # model device + # build the data pipeline + test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:] + test_pipeline = Compose(test_pipeline) + # prepare data + data = dict(img=img) + data = test_pipeline(data) + data = collate([data], samples_per_gpu=1) + if next(model.parameters()).is_cuda: + # scatter to specified GPU + data = scatter(data, [device])[0] + else: + data['img'][0] = data['img'][0].to(device) + data['img_metas'] = [i.data[0] for i in data['img_metas']] + + # forward the model + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + return result + + +def show_result_pyplot(model, + img, + result, + palette=None, + fig_size=(15, 10), + opacity=0.5, + title='', + block=True): + """Visualize the segmentation results on the image. + + Args: + model (nn.Module): The loaded segmentor. + img (str or np.ndarray): Image filename or loaded image. + result (list): The segmentation result. + palette (list[list[int]]] | None): The palette of segmentation + map. If None is given, random palette will be generated. + Default: None + fig_size (tuple): Figure size of the pyplot figure. + opacity(float): Opacity of painted segmentation map. + Default 0.5. + Must be in (0, 1] range. + title (str): The title of pyplot figure. + Default is ''. + block (bool): Whether to block the pyplot figure. + Default is True. + """ + if hasattr(model, 'module'): + model = model.module + img = model.show_result( + img, result, palette=palette, show=False, opacity=opacity) + # plt.figure(figsize=fig_size) + # plt.imshow(mmcv.bgr2rgb(img)) + # plt.title(title) + # plt.tight_layout() + # plt.show(block=block) + return mmcv.bgr2rgb(img) diff --git a/custom_mmpkg/custom_mmseg/apis/test.py b/custom_mmpkg/custom_mmseg/apis/test.py new file mode 100644 index 0000000000000000000000000000000000000000..7d0078b5b52eca53ddb0c4bb28adb7b1afe59728 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/apis/test.py @@ -0,0 +1,238 @@ +import os.path as osp +import pickle +import shutil +import tempfile + +import custom_mmpkg.custom_mmcv as mmcv +import numpy as np +import torch +import torch.distributed as dist +from custom_mmpkg.custom_mmcv.image import tensor2imgs +from custom_mmpkg.custom_mmcv.runner import get_dist_info + + +def np2tmp(array, temp_file_name=None): + """Save ndarray to local numpy file. + + Args: + array (ndarray): Ndarray to save. + temp_file_name (str): Numpy file name. If 'temp_file_name=None', this + function will generate a file name with tempfile.NamedTemporaryFile + to save ndarray. Default: None. + + Returns: + str: The numpy file name. 
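+
+    Example (illustrative sketch):
+        >>> import numpy as np
+        >>> fname = np2tmp(np.zeros((2, 2)))
+        >>> fname.endswith('.npy')
+        True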
+ """ + + if temp_file_name is None: + temp_file_name = tempfile.NamedTemporaryFile( + suffix='.npy', delete=False).name + np.save(temp_file_name, array) + return temp_file_name + + +def single_gpu_test(model, + data_loader, + show=False, + out_dir=None, + efficient_test=False, + opacity=0.5): + """Test with single GPU. + + Args: + model (nn.Module): Model to be tested. + data_loader (utils.data.Dataloader): Pytorch data loader. + show (bool): Whether show results during inference. Default: False. + out_dir (str, optional): If specified, the results will be dumped into + the directory to save output results. + efficient_test (bool): Whether save the results as local numpy files to + save CPU memory during evaluation. Default: False. + opacity(float): Opacity of painted segmentation map. + Default 0.5. + Must be in (0, 1] range. + Returns: + list: The prediction results. + """ + + model.eval() + results = [] + dataset = data_loader.dataset + prog_bar = mmcv.ProgressBar(len(dataset)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, **data) + + if show or out_dir: + img_tensor = data['img'][0] + img_metas = data['img_metas'][0].data[0] + imgs = tensor2imgs(img_tensor, **img_metas[0]['img_norm_cfg']) + assert len(imgs) == len(img_metas) + + for img, img_meta in zip(imgs, img_metas): + h, w, _ = img_meta['img_shape'] + img_show = img[:h, :w, :] + + ori_h, ori_w = img_meta['ori_shape'][:-1] + img_show = mmcv.imresize(img_show, (ori_w, ori_h)) + + if out_dir: + out_file = osp.join(out_dir, img_meta['ori_filename']) + else: + out_file = None + + model.module.show_result( + img_show, + result, + palette=dataset.PALETTE, + show=show, + out_file=out_file, + opacity=opacity) + + if isinstance(result, list): + if efficient_test: + result = [np2tmp(_) for _ in result] + results.extend(result) + else: + if efficient_test: + result = np2tmp(result) + results.append(result) + + batch_size = len(result) + for _ in range(batch_size): + prog_bar.update() + return results + + +def multi_gpu_test(model, + data_loader, + tmpdir=None, + gpu_collect=False, + efficient_test=False): + """Test model with multiple gpus. + + This method tests model with multiple gpus and collects the results + under two different modes: gpu and cpu modes. By setting 'gpu_collect=True' + it encodes results to gpu tensors and use gpu communication for results + collection. On cpu mode it saves the results on different gpus to 'tmpdir' + and collects them by the rank 0 worker. + + Args: + model (nn.Module): Model to be tested. + data_loader (utils.data.Dataloader): Pytorch data loader. + tmpdir (str): Path of directory to save the temporary results from + different gpus under cpu mode. + gpu_collect (bool): Option to use either gpu or cpu to collect results. + efficient_test (bool): Whether save the results as local numpy files to + save CPU memory during evaluation. Default: False. + + Returns: + list: The prediction results. 
+ """ + + model.eval() + results = [] + dataset = data_loader.dataset + rank, world_size = get_dist_info() + if rank == 0: + prog_bar = mmcv.ProgressBar(len(dataset)) + for i, data in enumerate(data_loader): + with torch.no_grad(): + result = model(return_loss=False, rescale=True, **data) + + if isinstance(result, list): + if efficient_test: + result = [np2tmp(_) for _ in result] + results.extend(result) + else: + if efficient_test: + result = np2tmp(result) + results.append(result) + + if rank == 0: + batch_size = data['img'][0].size(0) + for _ in range(batch_size * world_size): + prog_bar.update() + + # collect results from all ranks + if gpu_collect: + results = collect_results_gpu(results, len(dataset)) + else: + results = collect_results_cpu(results, len(dataset), tmpdir) + return results + + +def collect_results_cpu(result_part, size, tmpdir=None): + """Collect results with CPU.""" + rank, world_size = get_dist_info() + # create a tmp dir if it is not specified + if tmpdir is None: + MAX_LEN = 512 + # 32 is whitespace + dir_tensor = torch.full((MAX_LEN, ), + 32, + dtype=torch.uint8, + device='cuda') + if rank == 0: + tmpdir = tempfile.mkdtemp() + tmpdir = torch.tensor( + bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') + dir_tensor[:len(tmpdir)] = tmpdir + dist.broadcast(dir_tensor, 0) + tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() + else: + mmcv.mkdir_or_exist(tmpdir) + # dump the part result to the dir + mmcv.dump(result_part, osp.join(tmpdir, 'part_{}.pkl'.format(rank))) + dist.barrier() + # collect all parts + if rank != 0: + return None + else: + # load results of all parts from tmp dir + part_list = [] + for i in range(world_size): + part_file = osp.join(tmpdir, 'part_{}.pkl'.format(i)) + part_list.append(mmcv.load(part_file)) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + # remove tmp dir + shutil.rmtree(tmpdir) + return ordered_results + + +def collect_results_gpu(result_part, size): + """Collect results with GPU.""" + rank, world_size = get_dist_info() + # dump result part to tensor with pickle + part_tensor = torch.tensor( + bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') + # gather all result part tensor shape + shape_tensor = torch.tensor(part_tensor.shape, device='cuda') + shape_list = [shape_tensor.clone() for _ in range(world_size)] + dist.all_gather(shape_list, shape_tensor) + # padding result part tensor to max length + shape_max = torch.tensor(shape_list).max() + part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') + part_send[:shape_tensor[0]] = part_tensor + part_recv_list = [ + part_tensor.new_zeros(shape_max) for _ in range(world_size) + ] + # gather all result part + dist.all_gather(part_recv_list, part_send) + + if rank == 0: + part_list = [] + for recv, shape in zip(part_recv_list, shape_list): + part_list.append( + pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())) + # sort the results + ordered_results = [] + for res in zip(*part_list): + ordered_results.extend(list(res)) + # the dataloader may pad some samples + ordered_results = ordered_results[:size] + return ordered_results diff --git a/custom_mmpkg/custom_mmseg/apis/train.py b/custom_mmpkg/custom_mmseg/apis/train.py new file mode 100644 index 0000000000000000000000000000000000000000..61eb4768b375cf8e3cd5323d5533221e8238c4c8 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/apis/train.py 
@@ -0,0 +1,116 @@ +import random +import warnings + +import numpy as np +import torch +from custom_mmpkg.custom_mmcv.parallel import MMDataParallel, MMDistributedDataParallel +from custom_mmpkg.custom_mmcv.runner import build_optimizer, build_runner + +from custom_mmpkg.custom_mmseg.core import DistEvalHook, EvalHook +from custom_mmpkg.custom_mmseg.datasets import build_dataloader, build_dataset +from custom_mmpkg.custom_mmseg.utils import get_root_logger + + +def set_random_seed(seed, deterministic=False): + """Set random seed. + + Args: + seed (int): Seed to be used. + deterministic (bool): Whether to set the deterministic option for + CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` + to True and `torch.backends.cudnn.benchmark` to False. + Default: False. + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + if deterministic: + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + +def train_segmentor(model, + dataset, + cfg, + distributed=False, + validate=False, + timestamp=None, + meta=None): + """Launch segmentor training.""" + logger = get_root_logger(cfg.log_level) + + # prepare data loaders + dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset] + data_loaders = [ + build_dataloader( + ds, + cfg.data.samples_per_gpu, + cfg.data.workers_per_gpu, + # cfg.gpus will be ignored if distributed + len(cfg.gpu_ids), + dist=distributed, + seed=cfg.seed, + drop_last=True) for ds in dataset + ] + + # put model on gpus + if distributed: + find_unused_parameters = cfg.get('find_unused_parameters', False) + # Sets the `find_unused_parameters` parameter in + # torch.nn.parallel.DistributedDataParallel + model = MMDistributedDataParallel( + model.cuda(), + device_ids=[torch.cuda.current_device()], + broadcast_buffers=False, + find_unused_parameters=find_unused_parameters) + else: + model = MMDataParallel( + model.cuda(cfg.gpu_ids[0]), device_ids=cfg.gpu_ids) + + # build runner + optimizer = build_optimizer(model, cfg.optimizer) + + if cfg.get('runner') is None: + cfg.runner = {'type': 'IterBasedRunner', 'max_iters': cfg.total_iters} + warnings.warn( + 'config is now expected to have a `runner` section, ' + 'please set `runner` in your config.', UserWarning) + + runner = build_runner( + cfg.runner, + default_args=dict( + model=model, + batch_processor=None, + optimizer=optimizer, + work_dir=cfg.work_dir, + logger=logger, + meta=meta)) + + # register hooks + runner.register_training_hooks(cfg.lr_config, cfg.optimizer_config, + cfg.checkpoint_config, cfg.log_config, + cfg.get('momentum_config', None)) + + # an ugly walkaround to make the .log and .log.json filenames the same + runner.timestamp = timestamp + + # register eval hooks + if validate: + val_dataset = build_dataset(cfg.data.val, dict(test_mode=True)) + val_dataloader = build_dataloader( + val_dataset, + samples_per_gpu=1, + workers_per_gpu=cfg.data.workers_per_gpu, + dist=distributed, + shuffle=False) + eval_cfg = cfg.get('evaluation', {}) + eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner' + eval_hook = DistEvalHook if distributed else EvalHook + runner.register_hook(eval_hook(val_dataloader, **eval_cfg), priority='LOW') + + if cfg.resume_from: + runner.resume(cfg.resume_from) + elif cfg.load_from: + runner.load_checkpoint(cfg.load_from) + runner.run(data_loaders, cfg.workflow) diff --git a/custom_mmpkg/custom_mmseg/core/__init__.py b/custom_mmpkg/custom_mmseg/core/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..965605587211b7bf0bd6bc3acdbb33dd49cab023 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/__init__.py @@ -0,0 +1,3 @@ +from .evaluation import * # noqa: F401, F403 +from .seg import * # noqa: F401, F403 +from .utils import * # noqa: F401, F403 diff --git a/custom_mmpkg/custom_mmseg/core/evaluation/__init__.py b/custom_mmpkg/custom_mmseg/core/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f7cc4b23413a0639e9de00eeb0bf600632d2c6cd --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/evaluation/__init__.py @@ -0,0 +1,8 @@ +from .class_names import get_classes, get_palette +from .eval_hooks import DistEvalHook, EvalHook +from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou + +__all__ = [ + 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore', + 'eval_metrics', 'get_classes', 'get_palette' +] diff --git a/custom_mmpkg/custom_mmseg/core/evaluation/class_names.py b/custom_mmpkg/custom_mmseg/core/evaluation/class_names.py new file mode 100644 index 0000000000000000000000000000000000000000..3e79082966879d06da504a8105646257f103a07c --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/evaluation/class_names.py @@ -0,0 +1,152 @@ +import custom_mmpkg.custom_mmcv as mmcv + + +def cityscapes_classes(): + """Cityscapes class names for external use.""" + return [ + 'road', 'sidewalk', 'building', 'wall', 'fence', 'pole', + 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle' + ] + + +def ade_classes(): + """ADE20K class names for external use.""" + return [ + 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed ', + 'windowpane', 'grass', 'cabinet', 'sidewalk', 'person', 'earth', + 'door', 'table', 'mountain', 'plant', 'curtain', 'chair', 'car', + 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug', + 'field', 'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe', + 'lamp', 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', + 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', + 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', 'path', + 'stairs', 'runway', 'case', 'pool table', 'pillow', 'screen door', + 'stairway', 'river', 'bridge', 'bookcase', 'blind', 'coffee table', + 'toilet', 'flower', 'book', 'hill', 'bench', 'countertop', 'stove', + 'palm', 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', + 'arcade machine', 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', + 'chandelier', 'awning', 'streetlight', 'booth', 'television receiver', + 'airplane', 'dirt track', 'apparel', 'pole', 'land', 'bannister', + 'escalator', 'ottoman', 'bottle', 'buffet', 'poster', 'stage', 'van', + 'ship', 'fountain', 'conveyer belt', 'canopy', 'washer', 'plaything', + 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', 'tent', + 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', 'step', 'tank', + 'trade name', 'microwave', 'pot', 'animal', 'bicycle', 'lake', + 'dishwasher', 'screen', 'blanket', 'sculpture', 'hood', 'sconce', + 'vase', 'traffic light', 'tray', 'ashcan', 'fan', 'pier', 'crt screen', + 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', + 'clock', 'flag' + ] + + +def voc_classes(): + """Pascal VOC class names for external use.""" + return [ + 'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', + 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', 
'sheep', 'sofa', 'train', + 'tvmonitor' + ] + + +def cityscapes_palette(): + """Cityscapes palette for external use.""" + return [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60], + [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100], + [0, 0, 230], [119, 11, 32]] + + +def ade_palette(): + """ADE20K palette for external use.""" + return [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], + [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], + [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], + [102, 255, 0], [92, 0, 255]] + + +def voc_palette(): + """Pascal VOC palette for external use.""" + return [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], + [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], + [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], + [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], + [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] + + +dataset_aliases = { + 'cityscapes': ['cityscapes'], + 'ade': ['ade', 'ade20k'], + 'voc': ['voc', 'pascal_voc', 'voc12', 'voc12aug'] +} + + +def get_classes(dataset): + """Get class names of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if mmcv.is_str(dataset): + if dataset in 
alias2name: + labels = eval(alias2name[dataset] + '_classes()') + else: + raise ValueError(f'Unrecognized dataset: {dataset}') + else: + raise TypeError(f'dataset must a str, but got {type(dataset)}') + return labels + + +def get_palette(dataset): + """Get class palette (RGB) of a dataset.""" + alias2name = {} + for name, aliases in dataset_aliases.items(): + for alias in aliases: + alias2name[alias] = name + + if mmcv.is_str(dataset): + if dataset in alias2name: + labels = eval(alias2name[dataset] + '_palette()') + else: + raise ValueError(f'Unrecognized dataset: {dataset}') + else: + raise TypeError(f'dataset must a str, but got {type(dataset)}') + return labels diff --git a/custom_mmpkg/custom_mmseg/core/evaluation/eval_hooks.py b/custom_mmpkg/custom_mmseg/core/evaluation/eval_hooks.py new file mode 100644 index 0000000000000000000000000000000000000000..684fd6c291bae6255cd835ba3d32c1cacca536c8 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/evaluation/eval_hooks.py @@ -0,0 +1,109 @@ +import os.path as osp + +from custom_mmpkg.custom_mmcv.runner import DistEvalHook as _DistEvalHook +from custom_mmpkg.custom_mmcv.runner import EvalHook as _EvalHook + + +class EvalHook(_EvalHook): + """Single GPU EvalHook, with efficient test support. + + Args: + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + Default: False. + efficient_test (bool): Whether save the results as local numpy files to + save CPU memory during evaluation. Default: False. + Returns: + list: The prediction results. + """ + + greater_keys = ['mIoU', 'mAcc', 'aAcc'] + + def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): + super().__init__(*args, by_epoch=by_epoch, **kwargs) + self.efficient_test = efficient_test + + def after_train_iter(self, runner): + """After train epoch hook. + + Override default ``single_gpu_test``. + """ + if self.by_epoch or not self.every_n_iters(runner, self.interval): + return + from custom_mmpkg.custom_mmseg.apis import single_gpu_test + runner.log_buffer.clear() + results = single_gpu_test( + runner.model, + self.dataloader, + show=False, + efficient_test=self.efficient_test) + self.evaluate(runner, results) + + def after_train_epoch(self, runner): + """After train epoch hook. + + Override default ``single_gpu_test``. + """ + if not self.by_epoch or not self.every_n_epochs(runner, self.interval): + return + from custom_mmpkg.custom_mmseg.apis import single_gpu_test + runner.log_buffer.clear() + results = single_gpu_test(runner.model, self.dataloader, show=False) + self.evaluate(runner, results) + + +class DistEvalHook(_DistEvalHook): + """Distributed EvalHook, with efficient test support. + + Args: + by_epoch (bool): Determine perform evaluation by epoch or by iteration. + If set to True, it will perform by epoch. Otherwise, by iteration. + Default: False. + efficient_test (bool): Whether save the results as local numpy files to + save CPU memory during evaluation. Default: False. + Returns: + list: The prediction results. + """ + + greater_keys = ['mIoU', 'mAcc', 'aAcc'] + + def __init__(self, *args, by_epoch=False, efficient_test=False, **kwargs): + super().__init__(*args, by_epoch=by_epoch, **kwargs) + self.efficient_test = efficient_test + + def after_train_iter(self, runner): + """After train epoch hook. + + Override default ``multi_gpu_test``. 
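`get_classes` and `get_palette` above resolve a dataset alias and then call the matching `*_classes()` / `*_palette()` helper via `eval`. A standalone sketch of the same alias lookup using an explicit dispatch table instead; the class lists are truncated here, the full ones are defined above.

```python
def cityscapes_classes():
    return ['road', 'sidewalk', 'building']          # truncated for the demo


def voc_classes():
    return ['background', 'aeroplane', 'bicycle']    # truncated for the demo


dataset_aliases = {
    'cityscapes': ['cityscapes'],
    'voc': ['voc', 'pascal_voc', 'voc12', 'voc12aug'],
}
_class_fns = {'cityscapes': cityscapes_classes, 'voc': voc_classes}


def get_classes(dataset):
    # invert the alias table: every alias points at its canonical name
    alias2name = {alias: name
                  for name, aliases in dataset_aliases.items()
                  for alias in aliases}
    if dataset not in alias2name:
        raise ValueError(f'Unrecognized dataset: {dataset}')
    return _class_fns[alias2name[dataset]]()


print(get_classes('pascal_voc'))  # ['background', 'aeroplane', 'bicycle']
```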
+ """ + if self.by_epoch or not self.every_n_iters(runner, self.interval): + return + from custom_mmpkg.custom_mmseg.apis import multi_gpu_test + runner.log_buffer.clear() + results = multi_gpu_test( + runner.model, + self.dataloader, + tmpdir=osp.join(runner.work_dir, '.eval_hook'), + gpu_collect=self.gpu_collect, + efficient_test=self.efficient_test) + if runner.rank == 0: + print('\n') + self.evaluate(runner, results) + + def after_train_epoch(self, runner): + """After train epoch hook. + + Override default ``multi_gpu_test``. + """ + if not self.by_epoch or not self.every_n_epochs(runner, self.interval): + return + from custom_mmpkg.custom_mmseg.apis import multi_gpu_test + runner.log_buffer.clear() + results = multi_gpu_test( + runner.model, + self.dataloader, + tmpdir=osp.join(runner.work_dir, '.eval_hook'), + gpu_collect=self.gpu_collect) + if runner.rank == 0: + print('\n') + self.evaluate(runner, results) diff --git a/custom_mmpkg/custom_mmseg/core/evaluation/metrics.py b/custom_mmpkg/custom_mmseg/core/evaluation/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..db4b29f6c277ce43e4a0f39c3898a2938e11dba8 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/evaluation/metrics.py @@ -0,0 +1,326 @@ +from collections import OrderedDict + +import custom_mmpkg.custom_mmcv as mmcv +import numpy as np +import torch + + +def f_score(precision, recall, beta=1): + """calcuate the f-score value. + + Args: + precision (float | torch.Tensor): The precision value. + recall (float | torch.Tensor): The recall value. + beta (int): Determines the weight of recall in the combined score. + Default: False. + + Returns: + [torch.tensor]: The f-score value. + """ + score = (1 + beta**2) * (precision * recall) / ( + (beta**2 * precision) + recall) + return score + + +def intersect_and_union(pred_label, + label, + num_classes, + ignore_index, + label_map=dict(), + reduce_zero_label=False): + """Calculate intersection and Union. + + Args: + pred_label (ndarray | str): Prediction segmentation map + or predict result filename. + label (ndarray | str): Ground truth segmentation map + or label filename. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + label_map (dict): Mapping old labels to new labels. The parameter will + work only when label is str. Default: dict(). + reduce_zero_label (bool): Whether ignore zero label. The parameter will + work only when label is str. Default: False. + + Returns: + torch.Tensor: The intersection of prediction and ground truth + histogram on all classes. + torch.Tensor: The union of prediction and ground truth histogram on + all classes. + torch.Tensor: The prediction histogram on all classes. + torch.Tensor: The ground truth histogram on all classes. 
+ """ + + if isinstance(pred_label, str): + pred_label = torch.from_numpy(np.load(pred_label)) + else: + pred_label = torch.from_numpy((pred_label)) + + if isinstance(label, str): + label = torch.from_numpy( + mmcv.imread(label, flag='unchanged', backend='pillow')) + else: + label = torch.from_numpy(label) + + if label_map is not None: + for old_id, new_id in label_map.items(): + label[label == old_id] = new_id + if reduce_zero_label: + label[label == 0] = 255 + label = label - 1 + label[label == 254] = 255 + + mask = (label != ignore_index) + pred_label = pred_label[mask] + label = label[mask] + + intersect = pred_label[pred_label == label] + area_intersect = torch.histc( + intersect.float(), bins=(num_classes), min=0, max=num_classes - 1) + area_pred_label = torch.histc( + pred_label.float(), bins=(num_classes), min=0, max=num_classes - 1) + area_label = torch.histc( + label.float(), bins=(num_classes), min=0, max=num_classes - 1) + area_union = area_pred_label + area_label - area_intersect + return area_intersect, area_union, area_pred_label, area_label + + +def total_intersect_and_union(results, + gt_seg_maps, + num_classes, + ignore_index, + label_map=dict(), + reduce_zero_label=False): + """Calculate Total Intersection and Union. + + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + label_map (dict): Mapping old labels to new labels. Default: dict(). + reduce_zero_label (bool): Whether ignore zero label. Default: False. + + Returns: + ndarray: The intersection of prediction and ground truth histogram + on all classes. + ndarray: The union of prediction and ground truth histogram on all + classes. + ndarray: The prediction histogram on all classes. + ndarray: The ground truth histogram on all classes. + """ + num_imgs = len(results) + assert len(gt_seg_maps) == num_imgs + total_area_intersect = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_union = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_pred_label = torch.zeros((num_classes, ), dtype=torch.float64) + total_area_label = torch.zeros((num_classes, ), dtype=torch.float64) + for i in range(num_imgs): + area_intersect, area_union, area_pred_label, area_label = \ + intersect_and_union( + results[i], gt_seg_maps[i], num_classes, ignore_index, + label_map, reduce_zero_label) + total_area_intersect += area_intersect + total_area_union += area_union + total_area_pred_label += area_pred_label + total_area_label += area_label + return total_area_intersect, total_area_union, total_area_pred_label, \ + total_area_label + + +def mean_iou(results, + gt_seg_maps, + num_classes, + ignore_index, + nan_to_num=None, + label_map=dict(), + reduce_zero_label=False): + """Calculate Mean Intersection and Union (mIoU) + + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + label_map (dict): Mapping old labels to new labels. 
Default: dict(). + reduce_zero_label (bool): Whether ignore zero label. Default: False. + + Returns: + dict[str, float | ndarray]: + float: Overall accuracy on all images. + ndarray: Per category accuracy, shape (num_classes, ). + ndarray: Per category IoU, shape (num_classes, ). + """ + iou_result = eval_metrics( + results=results, + gt_seg_maps=gt_seg_maps, + num_classes=num_classes, + ignore_index=ignore_index, + metrics=['mIoU'], + nan_to_num=nan_to_num, + label_map=label_map, + reduce_zero_label=reduce_zero_label) + return iou_result + + +def mean_dice(results, + gt_seg_maps, + num_classes, + ignore_index, + nan_to_num=None, + label_map=dict(), + reduce_zero_label=False): + """Calculate Mean Dice (mDice) + + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + label_map (dict): Mapping old labels to new labels. Default: dict(). + reduce_zero_label (bool): Whether ignore zero label. Default: False. + + Returns: + dict[str, float | ndarray]: Default metrics. + float: Overall accuracy on all images. + ndarray: Per category accuracy, shape (num_classes, ). + ndarray: Per category dice, shape (num_classes, ). + """ + + dice_result = eval_metrics( + results=results, + gt_seg_maps=gt_seg_maps, + num_classes=num_classes, + ignore_index=ignore_index, + metrics=['mDice'], + nan_to_num=nan_to_num, + label_map=label_map, + reduce_zero_label=reduce_zero_label) + return dice_result + + +def mean_fscore(results, + gt_seg_maps, + num_classes, + ignore_index, + nan_to_num=None, + label_map=dict(), + reduce_zero_label=False, + beta=1): + """Calculate Mean Intersection and Union (mIoU) + + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + label_map (dict): Mapping old labels to new labels. Default: dict(). + reduce_zero_label (bool): Whether ignore zero label. Default: False. + beta (int): Determines the weight of recall in the combined score. + Default: False. + + + Returns: + dict[str, float | ndarray]: Default metrics. + float: Overall accuracy on all images. + ndarray: Per category recall, shape (num_classes, ). + ndarray: Per category precision, shape (num_classes, ). + ndarray: Per category f-score, shape (num_classes, ). 
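`mean_iou` and `mean_dice` both delegate to `eval_metrics`; the only difference is the final ratio taken over the accumulated areas. Continuing the small worked example above:

```python
import torch

# Area histograms from the two-class example above.
area_intersect  = torch.tensor([1., 2.])
area_union      = torch.tensor([3., 4.])
area_pred_label = torch.tensor([2., 3.])
area_label      = torch.tensor([2., 3.])

iou  = area_intersect / area_union                          # per-class IoU
dice = 2 * area_intersect / (area_pred_label + area_label)  # per-class Dice
acc  = area_intersect / area_label                          # per-class accuracy

print(iou, iou.mean())    # tensor([0.3333, 0.5000]) -> mIoU  = 0.4167
print(dice, dice.mean())  # tensor([0.5000, 0.6667]) -> mDice = 0.5833
print(acc)                # tensor([0.5000, 0.6667])
```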
+ """ + fscore_result = eval_metrics( + results=results, + gt_seg_maps=gt_seg_maps, + num_classes=num_classes, + ignore_index=ignore_index, + metrics=['mFscore'], + nan_to_num=nan_to_num, + label_map=label_map, + reduce_zero_label=reduce_zero_label, + beta=beta) + return fscore_result + + +def eval_metrics(results, + gt_seg_maps, + num_classes, + ignore_index, + metrics=['mIoU'], + nan_to_num=None, + label_map=dict(), + reduce_zero_label=False, + beta=1): + """Calculate evaluation metrics + Args: + results (list[ndarray] | list[str]): List of prediction segmentation + maps or list of prediction result filenames. + gt_seg_maps (list[ndarray] | list[str]): list of ground truth + segmentation maps or list of label filenames. + num_classes (int): Number of categories. + ignore_index (int): Index that will be ignored in evaluation. + metrics (list[str] | str): Metrics to be evaluated, 'mIoU' and 'mDice'. + nan_to_num (int, optional): If specified, NaN values will be replaced + by the numbers defined by the user. Default: None. + label_map (dict): Mapping old labels to new labels. Default: dict(). + reduce_zero_label (bool): Whether ignore zero label. Default: False. + Returns: + float: Overall accuracy on all images. + ndarray: Per category accuracy, shape (num_classes, ). + ndarray: Per category evaluation metrics, shape (num_classes, ). + """ + if isinstance(metrics, str): + metrics = [metrics] + allowed_metrics = ['mIoU', 'mDice', 'mFscore'] + if not set(metrics).issubset(set(allowed_metrics)): + raise KeyError('metrics {} is not supported'.format(metrics)) + + total_area_intersect, total_area_union, total_area_pred_label, \ + total_area_label = total_intersect_and_union( + results, gt_seg_maps, num_classes, ignore_index, label_map, + reduce_zero_label) + all_acc = total_area_intersect.sum() / total_area_label.sum() + ret_metrics = OrderedDict({'aAcc': all_acc}) + for metric in metrics: + if metric == 'mIoU': + iou = total_area_intersect / total_area_union + acc = total_area_intersect / total_area_label + ret_metrics['IoU'] = iou + ret_metrics['Acc'] = acc + elif metric == 'mDice': + dice = 2 * total_area_intersect / ( + total_area_pred_label + total_area_label) + acc = total_area_intersect / total_area_label + ret_metrics['Dice'] = dice + ret_metrics['Acc'] = acc + elif metric == 'mFscore': + precision = total_area_intersect / total_area_pred_label + recall = total_area_intersect / total_area_label + f_value = torch.tensor( + [f_score(x[0], x[1], beta) for x in zip(precision, recall)]) + ret_metrics['Fscore'] = f_value + ret_metrics['Precision'] = precision + ret_metrics['Recall'] = recall + + ret_metrics = { + metric: value.numpy() + for metric, value in ret_metrics.items() + } + if nan_to_num is not None: + ret_metrics = OrderedDict({ + metric: np.nan_to_num(metric_value, nan=nan_to_num) + for metric, metric_value in ret_metrics.items() + }) + return ret_metrics diff --git a/custom_mmpkg/custom_mmseg/core/seg/__init__.py b/custom_mmpkg/custom_mmseg/core/seg/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..93bc129b685e4a3efca2cc891729981b2865900d --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/seg/__init__.py @@ -0,0 +1,4 @@ +from .builder import build_pixel_sampler +from .sampler import BasePixelSampler, OHEMPixelSampler + +__all__ = ['build_pixel_sampler', 'BasePixelSampler', 'OHEMPixelSampler'] diff --git a/custom_mmpkg/custom_mmseg/core/seg/builder.py b/custom_mmpkg/custom_mmseg/core/seg/builder.py new file mode 100644 index 
0000000000000000000000000000000000000000..8c6971fce1e60b12c521413bf62127da76f441d4 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/seg/builder.py @@ -0,0 +1,8 @@ +from custom_mmpkg.custom_mmcv.utils import Registry, build_from_cfg + +PIXEL_SAMPLERS = Registry('pixel sampler') + + +def build_pixel_sampler(cfg, **default_args): + """Build pixel sampler for segmentation map.""" + return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args) diff --git a/custom_mmpkg/custom_mmseg/core/seg/sampler/__init__.py b/custom_mmpkg/custom_mmseg/core/seg/sampler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..332b242c03d1c5e80d4577df442a9a037b1816e1 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/seg/sampler/__init__.py @@ -0,0 +1,4 @@ +from .base_pixel_sampler import BasePixelSampler +from .ohem_pixel_sampler import OHEMPixelSampler + +__all__ = ['BasePixelSampler', 'OHEMPixelSampler'] diff --git a/custom_mmpkg/custom_mmseg/core/seg/sampler/base_pixel_sampler.py b/custom_mmpkg/custom_mmseg/core/seg/sampler/base_pixel_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..b75b1566c9f18169cee51d4b55d75e0357b69c57 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/seg/sampler/base_pixel_sampler.py @@ -0,0 +1,12 @@ +from abc import ABCMeta, abstractmethod + + +class BasePixelSampler(metaclass=ABCMeta): + """Base class of pixel sampler.""" + + def __init__(self, **kwargs): + pass + + @abstractmethod + def sample(self, seg_logit, seg_label): + """Placeholder for sample function.""" diff --git a/custom_mmpkg/custom_mmseg/core/seg/sampler/ohem_pixel_sampler.py b/custom_mmpkg/custom_mmseg/core/seg/sampler/ohem_pixel_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..88bb10d44026ba9f21756eaea9e550841cd59b9f --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/seg/sampler/ohem_pixel_sampler.py @@ -0,0 +1,76 @@ +import torch +import torch.nn.functional as F + +from ..builder import PIXEL_SAMPLERS +from .base_pixel_sampler import BasePixelSampler + + +@PIXEL_SAMPLERS.register_module() +class OHEMPixelSampler(BasePixelSampler): + """Online Hard Example Mining Sampler for segmentation. + + Args: + context (nn.Module): The context of sampler, subclass of + :obj:`BaseDecodeHead`. + thresh (float, optional): The threshold for hard example selection. + Below which, are prediction with low confidence. If not + specified, the hard examples will be pixels of top ``min_kept`` + loss. Default: None. + min_kept (int, optional): The minimum number of predictions to keep. + Default: 100000. + """ + + def __init__(self, context, thresh=None, min_kept=100000): + super(OHEMPixelSampler, self).__init__() + self.context = context + assert min_kept > 1 + self.thresh = thresh + self.min_kept = min_kept + + def sample(self, seg_logit, seg_label): + """Sample pixels that have high loss or with low prediction confidence. 
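`PIXEL_SAMPLERS` above is an mmcv `Registry`, and `build_pixel_sampler` turns a config dict such as `dict(type='OHEMPixelSampler', thresh=0.7)` into an instance. The sketch below is a much-reduced stand-in for that pattern (mmcv's `Registry` and `build_from_cfg` do more, e.g. scope handling and error reporting); it only shows the register-by-name plus build-from-dict idea.

```python
class Registry:
    """Toy registry: maps a class name to the class object."""

    def __init__(self, name):
        self.name = name
        self._module_dict = {}

    def register_module(self):
        def _register(cls):
            self._module_dict[cls.__name__] = cls
            return cls
        return _register

    def build(self, cfg, **default_args):
        args = dict(cfg)                       # don't mutate the caller's cfg
        cls = self._module_dict[args.pop('type')]
        args.update(default_args)
        return cls(**args)


PIXEL_SAMPLERS = Registry('pixel sampler')


@PIXEL_SAMPLERS.register_module()
class OHEMPixelSampler:
    def __init__(self, context=None, thresh=None, min_kept=100000):
        self.context, self.thresh, self.min_kept = context, thresh, min_kept


sampler = PIXEL_SAMPLERS.build(dict(type='OHEMPixelSampler', thresh=0.7),
                               context=None)
print(type(sampler).__name__, sampler.thresh)  # OHEMPixelSampler 0.7
```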
+ + Args: + seg_logit (torch.Tensor): segmentation logits, shape (N, C, H, W) + seg_label (torch.Tensor): segmentation label, shape (N, 1, H, W) + + Returns: + torch.Tensor: segmentation weight, shape (N, H, W) + """ + with torch.no_grad(): + assert seg_logit.shape[2:] == seg_label.shape[2:] + assert seg_label.shape[1] == 1 + seg_label = seg_label.squeeze(1).long() + batch_kept = self.min_kept * seg_label.size(0) + valid_mask = seg_label != self.context.ignore_index + seg_weight = seg_logit.new_zeros(size=seg_label.size()) + valid_seg_weight = seg_weight[valid_mask] + if self.thresh is not None: + seg_prob = F.softmax(seg_logit, dim=1) + + tmp_seg_label = seg_label.clone().unsqueeze(1) + tmp_seg_label[tmp_seg_label == self.context.ignore_index] = 0 + seg_prob = seg_prob.gather(1, tmp_seg_label).squeeze(1) + sort_prob, sort_indices = seg_prob[valid_mask].sort() + + if sort_prob.numel() > 0: + min_threshold = sort_prob[min(batch_kept, + sort_prob.numel() - 1)] + else: + min_threshold = 0.0 + threshold = max(min_threshold, self.thresh) + valid_seg_weight[seg_prob[valid_mask] < threshold] = 1. + else: + losses = self.context.loss_decode( + seg_logit, + seg_label, + weight=None, + ignore_index=self.context.ignore_index, + reduction_override='none') + # faster than topk according to https://github.com/pytorch/pytorch/issues/22812 # noqa + _, sort_indices = losses[valid_mask].sort(descending=True) + valid_seg_weight[sort_indices[:batch_kept]] = 1. + + seg_weight[valid_mask] = valid_seg_weight + + return seg_weight diff --git a/custom_mmpkg/custom_mmseg/core/utils/__init__.py b/custom_mmpkg/custom_mmseg/core/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f2678b321c295bcceaef945111ac3524be19d6e4 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/utils/__init__.py @@ -0,0 +1,3 @@ +from .misc import add_prefix + +__all__ = ['add_prefix'] diff --git a/custom_mmpkg/custom_mmseg/core/utils/misc.py b/custom_mmpkg/custom_mmseg/core/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..eb862a82bd47c8624db3dd5c6fb6ad8a03b62466 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/core/utils/misc.py @@ -0,0 +1,17 @@ +def add_prefix(inputs, prefix): + """Add prefix for dict. + + Args: + inputs (dict): The input dict with str keys. + prefix (str): The prefix to add. + + Returns: + + dict: The dict with keys updated with ``prefix``. 
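The threshold branch of `sample` above keeps pixels whose ground-truth-class probability falls below `max(dynamic_threshold, thresh)`, where the dynamic threshold guarantees that roughly `min_kept` pixels per image survive. A self-contained rehearsal of that branch on random tensors, with the label shape simplified to `(N, H, W)`:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
ignore_index, thresh, min_kept = 255, 0.7, 4

seg_logit = torch.randn(1, 3, 4, 4)          # (N, C, H, W)
seg_label = torch.randint(0, 3, (1, 4, 4))   # (N, H, W)
seg_label[0, 0, 0] = ignore_index            # one ignored pixel

valid_mask = seg_label != ignore_index
batch_kept = min_kept * seg_label.size(0)

seg_prob = F.softmax(seg_logit, dim=1)
tmp_label = seg_label.clone().unsqueeze(1)
tmp_label[tmp_label == ignore_index] = 0          # any valid class id works here
gt_prob = seg_prob.gather(1, tmp_label).squeeze(1)  # probability of the GT class

sort_prob, _ = gt_prob[valid_mask].sort()
min_threshold = sort_prob[min(batch_kept, sort_prob.numel() - 1)]
threshold = max(min_threshold, thresh)             # keep at least ~min_kept pixels

seg_weight = torch.zeros_like(gt_prob)
seg_weight[valid_mask] = (gt_prob[valid_mask] < threshold).float()
print(int(seg_weight.sum()), 'of', int(valid_mask.sum()), 'valid pixels selected')
```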
+ """ + + outputs = dict() + for name, value in inputs.items(): + outputs[f'{prefix}.{name}'] = value + + return outputs diff --git a/custom_mmpkg/custom_mmseg/datasets/__init__.py b/custom_mmpkg/custom_mmseg/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebeaef4a28ef655e43578552a8aef6b77f13a636 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/__init__.py @@ -0,0 +1,19 @@ +from .ade import ADE20KDataset +from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset +from .chase_db1 import ChaseDB1Dataset +from .cityscapes import CityscapesDataset +from .custom import CustomDataset +from .dataset_wrappers import ConcatDataset, RepeatDataset +from .drive import DRIVEDataset +from .hrf import HRFDataset +from .pascal_context import PascalContextDataset, PascalContextDataset59 +from .stare import STAREDataset +from .voc import PascalVOCDataset + +__all__ = [ + 'CustomDataset', 'build_dataloader', 'ConcatDataset', 'RepeatDataset', + 'DATASETS', 'build_dataset', 'PIPELINES', 'CityscapesDataset', + 'PascalVOCDataset', 'ADE20KDataset', 'PascalContextDataset', + 'PascalContextDataset59', 'ChaseDB1Dataset', 'DRIVEDataset', 'HRFDataset', + 'STAREDataset' +] diff --git a/custom_mmpkg/custom_mmseg/datasets/ade.py b/custom_mmpkg/custom_mmseg/datasets/ade.py new file mode 100644 index 0000000000000000000000000000000000000000..5913e43775ed4920b6934c855eb5a37c54218ebf --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/ade.py @@ -0,0 +1,84 @@ +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class ADE20KDataset(CustomDataset): + """ADE20K dataset. + + In segmentation map annotation for ADE20K, 0 stands for background, which + is not included in 150 categories. ``reduce_zero_label`` is fixed to True. + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is fixed to + '.png'. 
+ """ + CLASSES = ( + 'wall', 'building', 'sky', 'floor', 'tree', 'ceiling', 'road', 'bed ', + 'windowpane', 'grass', 'cabinet', 'sidewalk', 'person', 'earth', + 'door', 'table', 'mountain', 'plant', 'curtain', 'chair', 'car', + 'water', 'painting', 'sofa', 'shelf', 'house', 'sea', 'mirror', 'rug', + 'field', 'armchair', 'seat', 'fence', 'desk', 'rock', 'wardrobe', + 'lamp', 'bathtub', 'railing', 'cushion', 'base', 'box', 'column', + 'signboard', 'chest of drawers', 'counter', 'sand', 'sink', + 'skyscraper', 'fireplace', 'refrigerator', 'grandstand', 'path', + 'stairs', 'runway', 'case', 'pool table', 'pillow', 'screen door', + 'stairway', 'river', 'bridge', 'bookcase', 'blind', 'coffee table', + 'toilet', 'flower', 'book', 'hill', 'bench', 'countertop', 'stove', + 'palm', 'kitchen island', 'computer', 'swivel chair', 'boat', 'bar', + 'arcade machine', 'hovel', 'bus', 'towel', 'light', 'truck', 'tower', + 'chandelier', 'awning', 'streetlight', 'booth', 'television receiver', + 'airplane', 'dirt track', 'apparel', 'pole', 'land', 'bannister', + 'escalator', 'ottoman', 'bottle', 'buffet', 'poster', 'stage', 'van', + 'ship', 'fountain', 'conveyer belt', 'canopy', 'washer', 'plaything', + 'swimming pool', 'stool', 'barrel', 'basket', 'waterfall', 'tent', + 'bag', 'minibike', 'cradle', 'oven', 'ball', 'food', 'step', 'tank', + 'trade name', 'microwave', 'pot', 'animal', 'bicycle', 'lake', + 'dishwasher', 'screen', 'blanket', 'sculpture', 'hood', 'sconce', + 'vase', 'traffic light', 'tray', 'ashcan', 'fan', 'pier', 'crt screen', + 'plate', 'monitor', 'bulletin board', 'shower', 'radiator', 'glass', + 'clock', 'flag') + + PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], + 
[255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], + [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], + [102, 255, 0], [92, 0, 255]] + + def __init__(self, **kwargs): + super(ADE20KDataset, self).__init__( + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) diff --git a/custom_mmpkg/custom_mmseg/datasets/builder.py b/custom_mmpkg/custom_mmseg/datasets/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..6d09e961eafb5301c98fd3defeb558f9b7e938e7 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/builder.py @@ -0,0 +1,161 @@ +import copy +import platform +import random +from functools import partial + +import numpy as np +from custom_mmpkg.custom_mmcv.parallel import collate +from custom_mmpkg.custom_mmcv.runner import get_dist_info +from custom_mmpkg.custom_mmcv.utils import Registry, build_from_cfg +from custom_mmpkg.custom_mmcv.utils.parrots_wrapper import DataLoader, PoolDataLoader +from torch.utils.data import DistributedSampler + +DATASETS = Registry('dataset') +PIPELINES = Registry('pipeline') + + +def _concat_dataset(cfg, default_args=None): + """Build :obj:`ConcatDataset by.""" + from .dataset_wrappers import ConcatDataset + img_dir = cfg['img_dir'] + ann_dir = cfg.get('ann_dir', None) + split = cfg.get('split', None) + num_img_dir = len(img_dir) if isinstance(img_dir, (list, tuple)) else 1 + if ann_dir is not None: + num_ann_dir = len(ann_dir) if isinstance(ann_dir, (list, tuple)) else 1 + else: + num_ann_dir = 0 + if split is not None: + num_split = len(split) if isinstance(split, (list, tuple)) else 1 + else: + num_split = 0 + if num_img_dir > 1: + assert num_img_dir == num_ann_dir or num_ann_dir == 0 + assert num_img_dir == num_split or num_split == 0 + else: + assert num_split == num_ann_dir or num_ann_dir <= 1 + num_dset = max(num_split, num_img_dir) + + datasets = [] + for i in range(num_dset): + data_cfg = copy.deepcopy(cfg) + if isinstance(img_dir, (list, tuple)): + data_cfg['img_dir'] = img_dir[i] + if isinstance(ann_dir, (list, tuple)): + data_cfg['ann_dir'] = ann_dir[i] + if isinstance(split, (list, tuple)): + data_cfg['split'] = split[i] + datasets.append(build_dataset(data_cfg, default_args)) + + return ConcatDataset(datasets) + + +def build_dataset(cfg, default_args=None): + """Build datasets.""" + from .dataset_wrappers import ConcatDataset, RepeatDataset + if isinstance(cfg, (list, tuple)): + dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg]) + elif cfg['type'] == 'RepeatDataset': + dataset = RepeatDataset( + build_dataset(cfg['dataset'], default_args), cfg['times']) + elif isinstance(cfg.get('img_dir'), (list, tuple)) or isinstance( + cfg.get('split', None), (list, tuple)): + dataset = _concat_dataset(cfg, default_args) + else: + dataset = build_from_cfg(cfg, DATASETS, default_args) + + return dataset + + +def build_dataloader(dataset, + samples_per_gpu, + workers_per_gpu, + num_gpus=1, + dist=True, + shuffle=True, + seed=None, + drop_last=False, + pin_memory=True, + dataloader_type='PoolDataLoader', + **kwargs): + """Build PyTorch DataLoader. + + In distributed training, each GPU/process has a dataloader. + In non-distributed training, there is only one dataloader for all GPUs. 
+ + Args: + dataset (Dataset): A PyTorch dataset. + samples_per_gpu (int): Number of training samples on each GPU, i.e., + batch size of each GPU. + workers_per_gpu (int): How many subprocesses to use for data loading + for each GPU. + num_gpus (int): Number of GPUs. Only used in non-distributed training. + dist (bool): Distributed training/test or not. Default: True. + shuffle (bool): Whether to shuffle the data at every epoch. + Default: True. + seed (int | None): Seed to be used. Default: None. + drop_last (bool): Whether to drop the last incomplete batch in epoch. + Default: False + pin_memory (bool): Whether to use pin_memory in DataLoader. + Default: True + dataloader_type (str): Type of dataloader. Default: 'PoolDataLoader' + kwargs: any keyword argument to be used to initialize DataLoader + + Returns: + DataLoader: A PyTorch dataloader. + """ + rank, world_size = get_dist_info() + if dist: + sampler = DistributedSampler( + dataset, world_size, rank, shuffle=shuffle) + shuffle = False + batch_size = samples_per_gpu + num_workers = workers_per_gpu + else: + sampler = None + batch_size = num_gpus * samples_per_gpu + num_workers = num_gpus * workers_per_gpu + + init_fn = partial( + worker_init_fn, num_workers=num_workers, rank=rank, + seed=seed) if seed is not None else None + + assert dataloader_type in ( + 'DataLoader', + 'PoolDataLoader'), f'unsupported dataloader {dataloader_type}' + + if dataloader_type == 'PoolDataLoader': + dataloader = PoolDataLoader + elif dataloader_type == 'DataLoader': + dataloader = DataLoader + + data_loader = dataloader( + dataset, + batch_size=batch_size, + sampler=sampler, + num_workers=num_workers, + collate_fn=partial(collate, samples_per_gpu=samples_per_gpu), + pin_memory=pin_memory, + shuffle=shuffle, + worker_init_fn=init_fn, + drop_last=drop_last, + **kwargs) + + return data_loader + + +def worker_init_fn(worker_id, num_workers, rank, seed): + """Worker init func for dataloader. + + The seed of each worker equals to num_worker * rank + worker_id + user_seed + + Args: + worker_id (int): Worker id. + num_workers (int): Number of workers. + rank (int): The rank of current process. + seed (int): The random seed to use. + """ + + worker_seed = num_workers * rank + worker_id + seed + np.random.seed(worker_seed) + random.seed(worker_seed) diff --git a/custom_mmpkg/custom_mmseg/datasets/chase_db1.py b/custom_mmpkg/custom_mmseg/datasets/chase_db1.py new file mode 100644 index 0000000000000000000000000000000000000000..8bc29bea14704a4407f83474610cbc3bef32c708 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/chase_db1.py @@ -0,0 +1,27 @@ +import os.path as osp + +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class ChaseDB1Dataset(CustomDataset): + """Chase_db1 dataset. + + In segmentation map annotation for Chase_db1, 0 stands for background, + which is included in 2 categories. ``reduce_zero_label`` is fixed to False. + The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '_1stHO.png'. 
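`worker_init_fn` above derives one seed per (rank, worker) pair, `num_workers * rank + worker_id + seed`, so dataloader workers draw different augmentation randomness while the run as a whole stays reproducible. A quick standalone check:

```python
import random

import numpy as np


def worker_init_fn(worker_id, num_workers, rank, seed):
    """Same seeding rule as above: unique seed per (rank, worker) pair."""
    worker_seed = num_workers * rank + worker_id + seed
    np.random.seed(worker_seed)
    random.seed(worker_seed)


for rank in range(2):
    for worker_id in range(2):
        worker_init_fn(worker_id, num_workers=2, rank=rank, seed=1000)
        # seeds 1000..1003 -> each worker produces a different stream
        print(rank, worker_id, np.random.randint(0, 100))
```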
+ """ + + CLASSES = ('background', 'vessel') + + PALETTE = [[120, 120, 120], [6, 230, 230]] + + def __init__(self, **kwargs): + super(ChaseDB1Dataset, self).__init__( + img_suffix='.png', + seg_map_suffix='_1stHO.png', + reduce_zero_label=False, + **kwargs) + assert osp.exists(self.img_dir) diff --git a/custom_mmpkg/custom_mmseg/datasets/cityscapes.py b/custom_mmpkg/custom_mmseg/datasets/cityscapes.py new file mode 100644 index 0000000000000000000000000000000000000000..4c4d09372290d8d1d35fc75846a2802417d6b0db --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/cityscapes.py @@ -0,0 +1,217 @@ +import os.path as osp +import tempfile + +import custom_mmpkg.custom_mmcv as mmcv +import numpy as np +from custom_mmpkg.custom_mmcv.utils import print_log +from PIL import Image + +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class CityscapesDataset(CustomDataset): + """Cityscapes dataset. + + The ``img_suffix`` is fixed to '_leftImg8bit.png' and ``seg_map_suffix`` is + fixed to '_gtFine_labelTrainIds.png' for Cityscapes dataset. + """ + + CLASSES = ('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', + 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle') + + PALETTE = [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60], + [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], + [0, 80, 100], [0, 0, 230], [119, 11, 32]] + + def __init__(self, **kwargs): + super(CityscapesDataset, self).__init__( + img_suffix='_leftImg8bit.png', + seg_map_suffix='_gtFine_labelTrainIds.png', + **kwargs) + + @staticmethod + def _convert_to_label_id(result): + """Convert trainId to id for cityscapes.""" + if isinstance(result, str): + result = np.load(result) + import cityscapesscripts.helpers.labels as CSLabels + result_copy = result.copy() + for trainId, label in CSLabels.trainId2label.items(): + result_copy[result == trainId] = label.id + + return result_copy + + def results2img(self, results, imgfile_prefix, to_label_id): + """Write the segmentation results to images. + + Args: + results (list[list | tuple | ndarray]): Testing results of the + dataset. + imgfile_prefix (str): The filename prefix of the png files. + If the prefix is "somepath/xxx", + the png files will be named "somepath/xxx.png". + to_label_id (bool): whether convert output to label_id for + submission + + Returns: + list[str: str]: result txt files which contains corresponding + semantic segmentation images. 
+ """ + mmcv.mkdir_or_exist(imgfile_prefix) + result_files = [] + prog_bar = mmcv.ProgressBar(len(self)) + for idx in range(len(self)): + result = results[idx] + if to_label_id: + result = self._convert_to_label_id(result) + filename = self.img_infos[idx]['filename'] + basename = osp.splitext(osp.basename(filename))[0] + + png_filename = osp.join(imgfile_prefix, f'{basename}.png') + + output = Image.fromarray(result.astype(np.uint8)).convert('P') + import cityscapesscripts.helpers.labels as CSLabels + palette = np.zeros((len(CSLabels.id2label), 3), dtype=np.uint8) + for label_id, label in CSLabels.id2label.items(): + palette[label_id] = label.color + + output.putpalette(palette) + output.save(png_filename) + result_files.append(png_filename) + prog_bar.update() + + return result_files + + def format_results(self, results, imgfile_prefix=None, to_label_id=True): + """Format the results into dir (standard format for Cityscapes + evaluation). + + Args: + results (list): Testing results of the dataset. + imgfile_prefix (str | None): The prefix of images files. It + includes the file path and the prefix of filename, e.g., + "a/b/prefix". If not specified, a temp file will be created. + Default: None. + to_label_id (bool): whether convert output to label_id for + submission. Default: False + + Returns: + tuple: (result_files, tmp_dir), result_files is a list containing + the image paths, tmp_dir is the temporal directory created + for saving json/png files when img_prefix is not specified. + """ + + assert isinstance(results, list), 'results must be a list' + assert len(results) == len(self), ( + 'The length of results is not equal to the dataset len: ' + f'{len(results)} != {len(self)}') + + if imgfile_prefix is None: + tmp_dir = tempfile.TemporaryDirectory() + imgfile_prefix = tmp_dir.name + else: + tmp_dir = None + result_files = self.results2img(results, imgfile_prefix, to_label_id) + + return result_files, tmp_dir + + def evaluate(self, + results, + metric='mIoU', + logger=None, + imgfile_prefix=None, + efficient_test=False): + """Evaluation in Cityscapes/default protocol. + + Args: + results (list): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. + logger (logging.Logger | None | str): Logger used for printing + related information during evaluation. Default: None. + imgfile_prefix (str | None): The prefix of output image file, + for cityscapes evaluation only. It includes the file path and + the prefix of filename, e.g., "a/b/prefix". + If results are evaluated with cityscapes protocol, it would be + the prefix of output png files. The output files would be + png images under folder "a/b/prefix/xxx.png", where "xxx" is + the image name of cityscapes. If not specified, a temp file + will be created for evaluation. + Default: None. + + Returns: + dict[str, float]: Cityscapes/default metrics. + """ + + eval_results = dict() + metrics = metric.copy() if isinstance(metric, list) else [metric] + if 'cityscapes' in metrics: + eval_results.update( + self._evaluate_cityscapes(results, logger, imgfile_prefix)) + metrics.remove('cityscapes') + if len(metrics) > 0: + eval_results.update( + super(CityscapesDataset, + self).evaluate(results, metrics, logger, efficient_test)) + + return eval_results + + def _evaluate_cityscapes(self, results, logger, imgfile_prefix): + """Evaluation in Cityscapes protocol. + + Args: + results (list): Testing results of the dataset. 
+ logger (logging.Logger | str | None): Logger used for printing + related information during evaluation. Default: None. + imgfile_prefix (str | None): The prefix of output image file + + Returns: + dict[str: float]: Cityscapes evaluation results. + """ + try: + import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as CSEval # noqa + except ImportError: + raise ImportError('Please run "pip install cityscapesscripts" to ' + 'install cityscapesscripts first.') + msg = 'Evaluating in Cityscapes style' + if logger is None: + msg = '\n' + msg + print_log(msg, logger=logger) + + result_files, tmp_dir = self.format_results(results, imgfile_prefix) + + if tmp_dir is None: + result_dir = imgfile_prefix + else: + result_dir = tmp_dir.name + + eval_results = dict() + print_log(f'Evaluating results under {result_dir} ...', logger=logger) + + CSEval.args.evalInstLevelScore = True + CSEval.args.predictionPath = osp.abspath(result_dir) + CSEval.args.evalPixelAccuracy = True + CSEval.args.JSONOutput = False + + seg_map_list = [] + pred_list = [] + + # when evaluating with official cityscapesscripts, + # **_gtFine_labelIds.png is used + for seg_map in mmcv.scandir( + self.ann_dir, 'gtFine_labelIds.png', recursive=True): + seg_map_list.append(osp.join(self.ann_dir, seg_map)) + pred_list.append(CSEval.getPrediction(CSEval.args, seg_map)) + + eval_results.update( + CSEval.evaluateImgLists(pred_list, seg_map_list, CSEval.args)) + + if tmp_dir is not None: + tmp_dir.cleanup() + + return eval_results diff --git a/custom_mmpkg/custom_mmseg/datasets/custom.py b/custom_mmpkg/custom_mmseg/datasets/custom.py new file mode 100644 index 0000000000000000000000000000000000000000..5096a1d718784fcfcc6ae0b30aa256dfb57bc768 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/custom.py @@ -0,0 +1,403 @@ +import os +import os.path as osp +from collections import OrderedDict +from functools import reduce + +import custom_mmpkg.custom_mmcv as mmcv +import numpy as np +from custom_mmpkg.custom_mmcv.utils import print_log +from torch.utils.data import Dataset + +from custom_mmpkg.custom_mmseg.core import eval_metrics +from custom_mmpkg.custom_mmseg.utils import get_root_logger +from .builder import DATASETS +from .pipelines import Compose + + +@DATASETS.register_module() +class CustomDataset(Dataset): + """Custom dataset for semantic segmentation. An example of file structure + is as followed. + + .. code-block:: none + + ├── data + │ ├── my_dataset + │ │ ├── img_dir + │ │ │ ├── train + │ │ │ │ ├── xxx{img_suffix} + │ │ │ │ ├── yyy{img_suffix} + │ │ │ │ ├── zzz{img_suffix} + │ │ │ ├── val + │ │ ├── ann_dir + │ │ │ ├── train + │ │ │ │ ├── xxx{seg_map_suffix} + │ │ │ │ ├── yyy{seg_map_suffix} + │ │ │ │ ├── zzz{seg_map_suffix} + │ │ │ ├── val + + The img/gt_semantic_seg pair of CustomDataset should be of the same + except suffix. A valid img/gt_semantic_seg filename pair should be like + ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (extension is also included + in the suffix). If split is given, then ``xxx`` is specified in txt file. + Otherwise, all files in ``img_dir/``and ``ann_dir`` will be loaded. + Please refer to ``docs/tutorials/new_dataset.md`` for more details. + + + Args: + pipeline (list[dict]): Processing pipeline + img_dir (str): Path to image directory + img_suffix (str): Suffix of images. Default: '.jpg' + ann_dir (str, optional): Path to annotation directory. Default: None + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + split (str, optional): Split txt file. 
If split is specified, only + file with suffix in the splits will be loaded. Otherwise, all + images in img_dir/ann_dir will be loaded. Default: None + data_root (str, optional): Data root for img_dir/ann_dir. Default: + None. + test_mode (bool): If test_mode=True, gt wouldn't be loaded. + ignore_index (int): The label index to be ignored. Default: 255 + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default: False + classes (str | Sequence[str], optional): Specify classes to load. + If is None, ``cls.CLASSES`` will be used. Default: None. + palette (Sequence[Sequence[int]]] | np.ndarray | None): + The palette of segmentation map. If None is given, and + self.PALETTE is None, random palette will be generated. + Default: None + """ + + CLASSES = None + + PALETTE = None + + def __init__(self, + pipeline, + img_dir, + img_suffix='.jpg', + ann_dir=None, + seg_map_suffix='.png', + split=None, + data_root=None, + test_mode=False, + ignore_index=255, + reduce_zero_label=False, + classes=None, + palette=None): + self.pipeline = Compose(pipeline) + self.img_dir = img_dir + self.img_suffix = img_suffix + self.ann_dir = ann_dir + self.seg_map_suffix = seg_map_suffix + self.split = split + self.data_root = data_root + self.test_mode = test_mode + self.ignore_index = ignore_index + self.reduce_zero_label = reduce_zero_label + self.label_map = None + self.CLASSES, self.PALETTE = self.get_classes_and_palette( + classes, palette) + + # join paths if data_root is specified + if self.data_root is not None: + if not osp.isabs(self.img_dir): + self.img_dir = osp.join(self.data_root, self.img_dir) + if not (self.ann_dir is None or osp.isabs(self.ann_dir)): + self.ann_dir = osp.join(self.data_root, self.ann_dir) + if not (self.split is None or osp.isabs(self.split)): + self.split = osp.join(self.data_root, self.split) + + # load annotations + self.img_infos = self.load_annotations(self.img_dir, self.img_suffix, + self.ann_dir, + self.seg_map_suffix, self.split) + + def __len__(self): + """Total number of samples of data.""" + return len(self.img_infos) + + def load_annotations(self, img_dir, img_suffix, ann_dir, seg_map_suffix, + split): + """Load annotation from directory. + + Args: + img_dir (str): Path to image directory + img_suffix (str): Suffix of images. + ann_dir (str|None): Path to annotation directory. + seg_map_suffix (str|None): Suffix of segmentation maps. + split (str|None): Split txt file. If split is specified, only file + with suffix in the splits will be loaded. Otherwise, all images + in img_dir/ann_dir will be loaded. Default: None + + Returns: + list[dict]: All image info of dataset. + """ + + img_infos = [] + if split is not None: + with open(split) as f: + for line in f: + img_name = line.strip() + img_info = dict(filename=img_name + img_suffix) + if ann_dir is not None: + seg_map = img_name + seg_map_suffix + img_info['ann'] = dict(seg_map=seg_map) + img_infos.append(img_info) + else: + for img in mmcv.scandir(img_dir, img_suffix, recursive=True): + img_info = dict(filename=img) + if ann_dir is not None: + seg_map = img.replace(img_suffix, seg_map_suffix) + img_info['ann'] = dict(seg_map=seg_map) + img_infos.append(img_info) + + print_log(f'Loaded {len(img_infos)} images', logger=get_root_logger()) + return img_infos + + def get_ann_info(self, idx): + """Get annotation by index. + + Args: + idx (int): Index of data. + + Returns: + dict: Annotation info of specified index. 
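Without a split file, `load_annotations` above pairs every `xxx{img_suffix}` under `img_dir` with `xxx{seg_map_suffix}` under `ann_dir`. A small end-to-end check against a temporary directory, using `os.listdir` as a stand-in for `mmcv.scandir`:

```python
import os
import os.path as osp
import tempfile

img_suffix, seg_map_suffix = '.jpg', '.png'

root = tempfile.mkdtemp()
img_dir, ann_dir = osp.join(root, 'img_dir'), osp.join(root, 'ann_dir')
os.makedirs(img_dir)
os.makedirs(ann_dir)
for name in ('0001', '0002'):                      # create empty placeholder files
    open(osp.join(img_dir, name + img_suffix), 'w').close()
    open(osp.join(ann_dir, name + seg_map_suffix), 'w').close()

img_infos = []
for img in sorted(os.listdir(img_dir)):
    if img.endswith(img_suffix):
        seg_map = img.replace(img_suffix, seg_map_suffix)
        img_infos.append(dict(filename=img, ann=dict(seg_map=seg_map)))

print(img_infos)
# [{'filename': '0001.jpg', 'ann': {'seg_map': '0001.png'}},
#  {'filename': '0002.jpg', 'ann': {'seg_map': '0002.png'}}]
```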
+ """ + + return self.img_infos[idx]['ann'] + + def pre_pipeline(self, results): + """Prepare results dict for pipeline.""" + results['seg_fields'] = [] + results['img_prefix'] = self.img_dir + results['seg_prefix'] = self.ann_dir + if self.custom_classes: + results['label_map'] = self.label_map + + def __getitem__(self, idx): + """Get training/test data after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Training/test data (with annotation if `test_mode` is set + False). + """ + + if self.test_mode: + return self.prepare_test_img(idx) + else: + return self.prepare_train_img(idx) + + def prepare_train_img(self, idx): + """Get training data and annotations after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Training data and annotation after pipeline with new keys + introduced by pipeline. + """ + + img_info = self.img_infos[idx] + ann_info = self.get_ann_info(idx) + results = dict(img_info=img_info, ann_info=ann_info) + self.pre_pipeline(results) + return self.pipeline(results) + + def prepare_test_img(self, idx): + """Get testing data after pipeline. + + Args: + idx (int): Index of data. + + Returns: + dict: Testing data after pipeline with new keys introduced by + pipeline. + """ + + img_info = self.img_infos[idx] + results = dict(img_info=img_info) + self.pre_pipeline(results) + return self.pipeline(results) + + def format_results(self, results, **kwargs): + """Place holder to format result to dataset specific output.""" + + def get_gt_seg_maps(self, efficient_test=False): + """Get ground truth segmentation maps for evaluation.""" + gt_seg_maps = [] + for img_info in self.img_infos: + seg_map = osp.join(self.ann_dir, img_info['ann']['seg_map']) + if efficient_test: + gt_seg_map = seg_map + else: + gt_seg_map = mmcv.imread( + seg_map, flag='unchanged', backend='pillow') + gt_seg_maps.append(gt_seg_map) + return gt_seg_maps + + def get_classes_and_palette(self, classes=None, palette=None): + """Get class names of current dataset. + + Args: + classes (Sequence[str] | str | None): If classes is None, use + default CLASSES defined by builtin dataset. If classes is a + string, take it as a file name. The file contains the name of + classes where each line contains one class name. If classes is + a tuple or list, override the CLASSES defined by the dataset. + palette (Sequence[Sequence[int]]] | np.ndarray | None): + The palette of segmentation map. If None is given, random + palette will be generated. Default: None + """ + if classes is None: + self.custom_classes = False + return self.CLASSES, self.PALETTE + + self.custom_classes = True + if isinstance(classes, str): + # take it as a file path + class_names = mmcv.list_from_file(classes) + elif isinstance(classes, (tuple, list)): + class_names = classes + else: + raise ValueError(f'Unsupported type {type(classes)} of classes.') + + if self.CLASSES: + if not set(classes).issubset(self.CLASSES): + raise ValueError('classes is not a subset of CLASSES.') + + # dictionary, its keys are the old label ids and its values + # are the new label ids. + # used for changing pixel labels in load_annotations. 
+ self.label_map = {} + for i, c in enumerate(self.CLASSES): + if c not in class_names: + self.label_map[i] = -1 + else: + self.label_map[i] = classes.index(c) + + palette = self.get_palette_for_custom_classes(class_names, palette) + + return class_names, palette + + def get_palette_for_custom_classes(self, class_names, palette=None): + + if self.label_map is not None: + # return subset of palette + palette = [] + for old_id, new_id in sorted( + self.label_map.items(), key=lambda x: x[1]): + if new_id != -1: + palette.append(self.PALETTE[old_id]) + palette = type(self.PALETTE)(palette) + + elif palette is None: + if self.PALETTE is None: + palette = np.random.randint(0, 255, size=(len(class_names), 3)) + else: + palette = self.PALETTE + + return palette + + def evaluate(self, + results, + metric='mIoU', + logger=None, + efficient_test=False, + **kwargs): + """Evaluate the dataset. + + Args: + results (list): Testing results of the dataset. + metric (str | list[str]): Metrics to be evaluated. 'mIoU', + 'mDice' and 'mFscore' are supported. + logger (logging.Logger | None | str): Logger used for printing + related information during evaluation. Default: None. + + Returns: + dict[str, float]: Default metrics. + """ + + if isinstance(metric, str): + metric = [metric] + allowed_metrics = ['mIoU', 'mDice', 'mFscore'] + if not set(metric).issubset(set(allowed_metrics)): + raise KeyError('metric {} is not supported'.format(metric)) + eval_results = {} + gt_seg_maps = self.get_gt_seg_maps(efficient_test) + if self.CLASSES is None: + num_classes = len( + reduce(np.union1d, [np.unique(_) for _ in gt_seg_maps])) + else: + num_classes = len(self.CLASSES) + ret_metrics = eval_metrics( + results, + gt_seg_maps, + num_classes, + self.ignore_index, + metric, + label_map=self.label_map, + reduce_zero_label=self.reduce_zero_label) + + if self.CLASSES is None: + class_names = tuple(range(num_classes)) + else: + class_names = self.CLASSES + + # summary table + ret_metrics_summary = OrderedDict({ + ret_metric: np.round(np.nanmean(ret_metric_value) * 100, 2) + for ret_metric, ret_metric_value in ret_metrics.items() + }) + + # each class table + ret_metrics.pop('aAcc', None) + ret_metrics_class = OrderedDict({ + ret_metric: np.round(ret_metric_value * 100, 2) + for ret_metric, ret_metric_value in ret_metrics.items() + }) + ret_metrics_class.update({'Class': class_names}) + ret_metrics_class.move_to_end('Class', last=False) + + try: + from prettytable import PrettyTable + # for logger + class_table_data = PrettyTable() + for key, val in ret_metrics_class.items(): + class_table_data.add_column(key, val) + + summary_table_data = PrettyTable() + for key, val in ret_metrics_summary.items(): + if key == 'aAcc': + summary_table_data.add_column(key, [val]) + else: + summary_table_data.add_column('m' + key, [val]) + + print_log('per class results:', logger) + print_log('\n' + class_table_data.get_string(), logger=logger) + print_log('Summary:', logger) + print_log('\n' + summary_table_data.get_string(), logger=logger) + except ImportError: # prettytable is not installed + pass + + # each metric dict + for key, value in ret_metrics_summary.items(): + if key == 'aAcc': + eval_results[key] = value / 100.0 + else: + eval_results['m' + key] = value / 100.0 + + ret_metrics_class.pop('Class', None) + for key, value in ret_metrics_class.items(): + eval_results.update({ + key + '.' 
+ str(name): value[idx] / 100.0 + for idx, name in enumerate(class_names) + }) + + if mmcv.is_list_of(results, str): + for file_name in results: + os.remove(file_name) + return eval_results diff --git a/custom_mmpkg/custom_mmseg/datasets/dataset_wrappers.py b/custom_mmpkg/custom_mmseg/datasets/dataset_wrappers.py new file mode 100644 index 0000000000000000000000000000000000000000..d6a5e957ec3b44465432617cf6e8f0b86a8a5efa --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/dataset_wrappers.py @@ -0,0 +1,50 @@ +from torch.utils.data.dataset import ConcatDataset as _ConcatDataset + +from .builder import DATASETS + + +@DATASETS.register_module() +class ConcatDataset(_ConcatDataset): + """A wrapper of concatenated dataset. + + Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but + concat the group flag for image aspect ratio. + + Args: + datasets (list[:obj:`Dataset`]): A list of datasets. + """ + + def __init__(self, datasets): + super(ConcatDataset, self).__init__(datasets) + self.CLASSES = datasets[0].CLASSES + self.PALETTE = datasets[0].PALETTE + + +@DATASETS.register_module() +class RepeatDataset(object): + """A wrapper of repeated dataset. + + The length of repeated dataset will be `times` larger than the original + dataset. This is useful when the data loading time is long but the dataset + is small. Using RepeatDataset can reduce the data loading time between + epochs. + + Args: + dataset (:obj:`Dataset`): The dataset to be repeated. + times (int): Repeat times. + """ + + def __init__(self, dataset, times): + self.dataset = dataset + self.times = times + self.CLASSES = dataset.CLASSES + self.PALETTE = dataset.PALETTE + self._ori_len = len(self.dataset) + + def __getitem__(self, idx): + """Get item from original dataset.""" + return self.dataset[idx % self._ori_len] + + def __len__(self): + """The length is multiplied by ``times``""" + return self.times * self._ori_len diff --git a/custom_mmpkg/custom_mmseg/datasets/drive.py b/custom_mmpkg/custom_mmseg/datasets/drive.py new file mode 100644 index 0000000000000000000000000000000000000000..3cbfda8ae74bdf26c5aef197ff2866a7c7ad0cfd --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/drive.py @@ -0,0 +1,27 @@ +import os.path as osp + +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class DRIVEDataset(CustomDataset): + """DRIVE dataset. + + In segmentation map annotation for DRIVE, 0 stands for background, which is + included in 2 categories. ``reduce_zero_label`` is fixed to False. The + ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '_manual1.png'. + """ + + CLASSES = ('background', 'vessel') + + PALETTE = [[120, 120, 120], [6, 230, 230]] + + def __init__(self, **kwargs): + super(DRIVEDataset, self).__init__( + img_suffix='.png', + seg_map_suffix='_manual1.png', + reduce_zero_label=False, + **kwargs) + assert osp.exists(self.img_dir) diff --git a/custom_mmpkg/custom_mmseg/datasets/hrf.py b/custom_mmpkg/custom_mmseg/datasets/hrf.py new file mode 100644 index 0000000000000000000000000000000000000000..923203b51377f9344277fc561803d7a78bd2c684 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/hrf.py @@ -0,0 +1,27 @@ +import os.path as osp + +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class HRFDataset(CustomDataset): + """HRF dataset. + + In segmentation map annotation for HRF, 0 stands for background, which is + included in 2 categories. ``reduce_zero_label`` is fixed to False. 
The + ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '.png'. + """ + + CLASSES = ('background', 'vessel') + + PALETTE = [[120, 120, 120], [6, 230, 230]] + + def __init__(self, **kwargs): + super(HRFDataset, self).__init__( + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) + assert osp.exists(self.img_dir) diff --git a/custom_mmpkg/custom_mmseg/datasets/pascal_context.py b/custom_mmpkg/custom_mmseg/datasets/pascal_context.py new file mode 100644 index 0000000000000000000000000000000000000000..541a63c66a13fb16fd52921e755715ad8d078fdd --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/pascal_context.py @@ -0,0 +1,103 @@ +import os.path as osp + +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class PascalContextDataset(CustomDataset): + """PascalContext dataset. + + In segmentation map annotation for PascalContext, 0 stands for background, + which is included in 60 categories. ``reduce_zero_label`` is fixed to + False. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png'. + + Args: + split (str): Split txt file for PascalContext. + """ + + CLASSES = ('background', 'aeroplane', 'bag', 'bed', 'bedclothes', 'bench', + 'bicycle', 'bird', 'boat', 'book', 'bottle', 'building', 'bus', + 'cabinet', 'car', 'cat', 'ceiling', 'chair', 'cloth', + 'computer', 'cow', 'cup', 'curtain', 'dog', 'door', 'fence', + 'floor', 'flower', 'food', 'grass', 'ground', 'horse', + 'keyboard', 'light', 'motorbike', 'mountain', 'mouse', 'person', + 'plate', 'platform', 'pottedplant', 'road', 'rock', 'sheep', + 'shelves', 'sidewalk', 'sign', 'sky', 'snow', 'sofa', 'table', + 'track', 'train', 'tree', 'truck', 'tvmonitor', 'wall', 'water', + 'window', 'wood') + + PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]] + + def __init__(self, split, **kwargs): + super(PascalContextDataset, self).__init__( + img_suffix='.jpg', + seg_map_suffix='.png', + split=split, + reduce_zero_label=False, + **kwargs) + assert osp.exists(self.img_dir) and self.split is not None + + +@DATASETS.register_module() +class PascalContextDataset59(CustomDataset): + """PascalContext dataset. + + In segmentation map annotation for PascalContext, 0 stands for background, + which is included in 60 categories. ``reduce_zero_label`` is fixed to + False. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png'. + + Args: + split (str): Split txt file for PascalContext. 
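`PascalContextDataset59` drops the background class by setting `reduce_zero_label=True` in its `__init__` below. The effect on a raw label map, as implemented by `LoadAnnotations` later in this diff, is sketched here with plain NumPy on a made-up array:

```python
import numpy as np

# Hypothetical 2x3 PascalContext label map: 0 = background, 1..59 = objects.
gt = np.array([[0, 1, 2],
               [59, 0, 3]], dtype=np.uint8)

# reduce_zero_label=True (as in LoadAnnotations further below):
# background becomes the ignore value 255, every other id shifts down by one,
# and anything that wrapped to 254 is folded back into 255.
gt[gt == 0] = 255
gt = gt - 1
gt[gt == 254] = 255

print(gt)
# [[255   0   1]
#  [ 58 255   2]]
```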
+ """ + + CLASSES = ('aeroplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle', + 'bird', 'boat', 'book', 'bottle', 'building', 'bus', 'cabinet', + 'car', 'cat', 'ceiling', 'chair', 'cloth', 'computer', 'cow', + 'cup', 'curtain', 'dog', 'door', 'fence', 'floor', 'flower', + 'food', 'grass', 'ground', 'horse', 'keyboard', 'light', + 'motorbike', 'mountain', 'mouse', 'person', 'plate', 'platform', + 'pottedplant', 'road', 'rock', 'sheep', 'shelves', 'sidewalk', + 'sign', 'sky', 'snow', 'sofa', 'table', 'track', 'train', + 'tree', 'truck', 'tvmonitor', 'wall', 'water', 'window', 'wood') + + PALETTE = [[180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3], + [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230], + [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61], + [120, 120, 70], [8, 255, 51], [255, 6, 82], [143, 255, 140], + [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 102, 200], + [61, 230, 250], [255, 6, 51], [11, 102, 255], [255, 7, 71], + [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92], + [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6], + [10, 255, 71], [255, 41, 10], [7, 255, 255], [224, 255, 8], + [102, 8, 255], [255, 61, 6], [255, 194, 7], [255, 122, 8], + [0, 255, 20], [255, 8, 41], [255, 5, 153], [6, 51, 255], + [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140], + [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0], + [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], + [0, 235, 255], [0, 173, 255], [31, 0, 255]] + + def __init__(self, split, **kwargs): + super(PascalContextDataset59, self).__init__( + img_suffix='.jpg', + seg_map_suffix='.png', + split=split, + reduce_zero_label=True, + **kwargs) + assert osp.exists(self.img_dir) and self.split is not None diff --git a/custom_mmpkg/custom_mmseg/datasets/pipelines/__init__.py b/custom_mmpkg/custom_mmseg/datasets/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b9046b07bb4ddea7a707a392b42e72db7c9df67 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/pipelines/__init__.py @@ -0,0 +1,16 @@ +from .compose import Compose +from .formating import (Collect, ImageToTensor, ToDataContainer, ToTensor, + Transpose, to_tensor) +from .loading import LoadAnnotations, LoadImageFromFile +from .test_time_aug import MultiScaleFlipAug +from .transforms import (CLAHE, AdjustGamma, Normalize, Pad, + PhotoMetricDistortion, RandomCrop, RandomFlip, + RandomRotate, Rerange, Resize, RGB2Gray, SegRescale) + +__all__ = [ + 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', + 'Transpose', 'Collect', 'LoadAnnotations', 'LoadImageFromFile', + 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', + 'Normalize', 'SegRescale', 'PhotoMetricDistortion', 'RandomRotate', + 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray' +] diff --git a/custom_mmpkg/custom_mmseg/datasets/pipelines/compose.py b/custom_mmpkg/custom_mmseg/datasets/pipelines/compose.py new file mode 100644 index 0000000000000000000000000000000000000000..a9c8027c235140c6d1cca510bb4d2c81baf439c2 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/pipelines/compose.py @@ -0,0 +1,51 @@ +import collections + +from custom_mmpkg.custom_mmcv.utils import build_from_cfg + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class Compose(object): + """Compose multiple transforms sequentially. + + Args: + transforms (Sequence[dict | callable]): Sequence of transform object or + config dict to be composed. 
+ """ + + def __init__(self, transforms): + assert isinstance(transforms, collections.abc.Sequence) + self.transforms = [] + for transform in transforms: + if isinstance(transform, dict): + transform = build_from_cfg(transform, PIPELINES) + self.transforms.append(transform) + elif callable(transform): + self.transforms.append(transform) + else: + raise TypeError('transform must be callable or a dict') + + def __call__(self, data): + """Call function to apply transforms sequentially. + + Args: + data (dict): A result dict contains the data to transform. + + Returns: + dict: Transformed data. + """ + + for t in self.transforms: + data = t(data) + if data is None: + return None + return data + + def __repr__(self): + format_string = self.__class__.__name__ + '(' + for t in self.transforms: + format_string += '\n' + format_string += f' {t}' + format_string += '\n)' + return format_string diff --git a/custom_mmpkg/custom_mmseg/datasets/pipelines/formating.py b/custom_mmpkg/custom_mmseg/datasets/pipelines/formating.py new file mode 100644 index 0000000000000000000000000000000000000000..0c259f185c9a55faf083dc3bec6d571902125e2d --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/pipelines/formating.py @@ -0,0 +1,288 @@ +from collections.abc import Sequence + +import custom_mmpkg.custom_mmcv as mmcv +import numpy as np +import torch +from custom_mmpkg.custom_mmcv.parallel import DataContainer as DC + +from ..builder import PIPELINES + + +def to_tensor(data): + """Convert objects of various python types to :obj:`torch.Tensor`. + + Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`, + :class:`Sequence`, :class:`int` and :class:`float`. + + Args: + data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to + be converted. + """ + + if isinstance(data, torch.Tensor): + return data + elif isinstance(data, np.ndarray): + return torch.from_numpy(data) + elif isinstance(data, Sequence) and not mmcv.is_str(data): + return torch.tensor(data) + elif isinstance(data, int): + return torch.LongTensor([data]) + elif isinstance(data, float): + return torch.FloatTensor([data]) + else: + raise TypeError(f'type {type(data)} cannot be converted to tensor.') + + +@PIPELINES.register_module() +class ToTensor(object): + """Convert some results to :obj:`torch.Tensor` by given keys. + + Args: + keys (Sequence[str]): Keys that need to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert data in results to :obj:`torch.Tensor`. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data converted + to :obj:`torch.Tensor`. + """ + + for key in self.keys: + results[key] = to_tensor(results[key]) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PIPELINES.register_module() +class ImageToTensor(object): + """Convert image to :obj:`torch.Tensor` by given keys. + + The dimension order of input image is (H, W, C). The pipeline will convert + it to (C, H, W). If only 2 dimension (H, W) is given, the output would be + (1, H, W). + + Args: + keys (Sequence[str]): Key of images to be converted to Tensor. + """ + + def __init__(self, keys): + self.keys = keys + + def __call__(self, results): + """Call function to convert image in results to :obj:`torch.Tensor` and + transpose the channel order. + + Args: + results (dict): Result dict contains the image data to convert. 
+ + Returns: + dict: The result dict contains the image converted + to :obj:`torch.Tensor` and transposed to (C, H, W) order. + """ + + for key in self.keys: + img = results[key] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + results[key] = to_tensor(img.transpose(2, 0, 1)) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(keys={self.keys})' + + +@PIPELINES.register_module() +class Transpose(object): + """Transpose some results by given keys. + + Args: + keys (Sequence[str]): Keys of results to be transposed. + order (Sequence[int]): Order of transpose. + """ + + def __init__(self, keys, order): + self.keys = keys + self.order = order + + def __call__(self, results): + """Call function to convert image in results to :obj:`torch.Tensor` and + transpose the channel order. + + Args: + results (dict): Result dict contains the image data to convert. + + Returns: + dict: The result dict contains the image converted + to :obj:`torch.Tensor` and transposed to (C, H, W) order. + """ + + for key in self.keys: + results[key] = results[key].transpose(self.order) + return results + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, order={self.order})' + + +@PIPELINES.register_module() +class ToDataContainer(object): + """Convert results to :obj:`mmcv.DataContainer` by given fields. + + Args: + fields (Sequence[dict]): Each field is a dict like + ``dict(key='xxx', **kwargs)``. The ``key`` in result will + be converted to :obj:`mmcv.DataContainer` with ``**kwargs``. + Default: ``(dict(key='img', stack=True), + dict(key='gt_semantic_seg'))``. + """ + + def __init__(self, + fields=(dict(key='img', + stack=True), dict(key='gt_semantic_seg'))): + self.fields = fields + + def __call__(self, results): + """Call function to convert data in results to + :obj:`mmcv.DataContainer`. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data converted to + :obj:`mmcv.DataContainer`. + """ + + for field in self.fields: + field = field.copy() + key = field.pop('key') + results[key] = DC(results[key], **field) + return results + + def __repr__(self): + return self.__class__.__name__ + f'(fields={self.fields})' + + +@PIPELINES.register_module() +class DefaultFormatBundle(object): + """Default formatting bundle. + + It simplifies the pipeline of formatting common fields, including "img" + and "gt_semantic_seg". These fields are formatted as follows. + + - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True) + - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, + (3)to DataContainer (stack=True) + """ + + def __call__(self, results): + """Call function to transform and format common fields in results. + + Args: + results (dict): Result dict contains the data to convert. + + Returns: + dict: The result dict contains the data that is formatted with + default bundle. + """ + + if 'img' in results: + img = results['img'] + if len(img.shape) < 3: + img = np.expand_dims(img, -1) + img = np.ascontiguousarray(img.transpose(2, 0, 1)) + results['img'] = DC(to_tensor(img), stack=True) + if 'gt_semantic_seg' in results: + # convert to long + results['gt_semantic_seg'] = DC( + to_tensor(results['gt_semantic_seg'][None, + ...].astype(np.int64)), + stack=True) + return results + + def __repr__(self): + return self.__class__.__name__ + + +@PIPELINES.register_module() +class Collect(object): + """Collect data from the loader relevant to the specific task. 
+ + This is usually the last stage of the data loader pipeline. Typically keys + is set to some subset of "img", "gt_semantic_seg". + + The "img_meta" item is always populated. The contents of the "img_meta" + dictionary depends on "meta_keys". By default this includes: + + - "img_shape": shape of the image input to the network as a tuple + (h, w, c). Note that images may be zero padded on the bottom/right + if the batch tensor is larger than this shape. + + - "scale_factor": a float indicating the preprocessing scale + + - "flip": a boolean indicating if image flip transform was used + + - "filename": path to the image file + + - "ori_shape": original shape of the image as a tuple (h, w, c) + + - "pad_shape": image shape after padding + + - "img_norm_cfg": a dict of normalization information: + - mean - per channel mean subtraction + - std - per channel std divisor + - to_rgb - bool indicating if bgr was converted to rgb + + Args: + keys (Sequence[str]): Keys of results to be collected in ``data``. + meta_keys (Sequence[str], optional): Meta keys to be converted to + ``mmcv.DataContainer`` and collected in ``data[img_metas]``. + Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'img_norm_cfg')`` + """ + + def __init__(self, + keys, + meta_keys=('filename', 'ori_filename', 'ori_shape', + 'img_shape', 'pad_shape', 'scale_factor', 'flip', + 'flip_direction', 'img_norm_cfg')): + self.keys = keys + self.meta_keys = meta_keys + + def __call__(self, results): + """Call function to collect keys in results. The keys in ``meta_keys`` + will be converted to :obj:mmcv.DataContainer. + + Args: + results (dict): Result dict contains the data to collect. + + Returns: + dict: The result dict contains the following keys + - keys in``self.keys`` + - ``img_metas`` + """ + + data = {} + img_meta = {} + for key in self.meta_keys: + img_meta[key] = results[key] + data['img_metas'] = DC(img_meta, cpu_only=True) + for key in self.keys: + data[key] = results[key] + return data + + def __repr__(self): + return self.__class__.__name__ + \ + f'(keys={self.keys}, meta_keys={self.meta_keys})' diff --git a/custom_mmpkg/custom_mmseg/datasets/pipelines/loading.py b/custom_mmpkg/custom_mmseg/datasets/pipelines/loading.py new file mode 100644 index 0000000000000000000000000000000000000000..0ef470c7a4b09deaaee6ca145f5f686610e38497 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/pipelines/loading.py @@ -0,0 +1,153 @@ +import os.path as osp + +import custom_mmpkg.custom_mmcv as mmcv +import numpy as np + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class LoadImageFromFile(object): + """Load an image from file. + + Required keys are "img_prefix" and "img_info" (a dict that must contain the + key "filename"). Added or updated keys are "filename", "img", "img_shape", + "ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`), + "scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1). + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + color_type (str): The flag argument for :func:`mmcv.imfrombytes`. + Defaults to 'color'. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + imdecode_backend (str): Backend for :func:`mmcv.imdecode`. 
Default: + 'cv2' + """ + + def __init__(self, + to_float32=False, + color_type='color', + file_client_args=dict(backend='disk'), + imdecode_backend='cv2'): + self.to_float32 = to_float32 + self.color_type = color_type + self.file_client_args = file_client_args.copy() + self.file_client = None + self.imdecode_backend = imdecode_backend + + def __call__(self, results): + """Call functions to load image and get image meta information. + + Args: + results (dict): Result dict from :obj:`mmseg.CustomDataset`. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + if results.get('img_prefix') is not None: + filename = osp.join(results['img_prefix'], + results['img_info']['filename']) + else: + filename = results['img_info']['filename'] + img_bytes = self.file_client.get(filename) + img = mmcv.imfrombytes( + img_bytes, flag=self.color_type, backend=self.imdecode_backend) + if self.to_float32: + img = img.astype(np.float32) + + results['filename'] = filename + results['ori_filename'] = results['img_info']['filename'] + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + num_channels = 1 if len(img.shape) < 3 else img.shape[2] + results['img_norm_cfg'] = dict( + mean=np.zeros(num_channels, dtype=np.float32), + std=np.ones(num_channels, dtype=np.float32), + to_rgb=False) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(to_float32={self.to_float32},' + repr_str += f"color_type='{self.color_type}'," + repr_str += f"imdecode_backend='{self.imdecode_backend}')" + return repr_str + + +@PIPELINES.register_module() +class LoadAnnotations(object): + """Load annotations for semantic segmentation. + + Args: + reduce_zero_label (bool): Whether reduce all label value by 1. + Usually used for datasets where 0 is background label. + Default: False. + file_client_args (dict): Arguments to instantiate a FileClient. + See :class:`mmcv.fileio.FileClient` for details. + Defaults to ``dict(backend='disk')``. + imdecode_backend (str): Backend for :func:`mmcv.imdecode`. Default: + 'pillow' + """ + + def __init__(self, + reduce_zero_label=False, + file_client_args=dict(backend='disk'), + imdecode_backend='pillow'): + self.reduce_zero_label = reduce_zero_label + self.file_client_args = file_client_args.copy() + self.file_client = None + self.imdecode_backend = imdecode_backend + + def __call__(self, results): + """Call function to load multiple types annotations. + + Args: + results (dict): Result dict from :obj:`mmseg.CustomDataset`. + + Returns: + dict: The dict contains loaded semantic segmentation annotations. 
+ """ + + if self.file_client is None: + self.file_client = mmcv.FileClient(**self.file_client_args) + + if results.get('seg_prefix', None) is not None: + filename = osp.join(results['seg_prefix'], + results['ann_info']['seg_map']) + else: + filename = results['ann_info']['seg_map'] + img_bytes = self.file_client.get(filename) + gt_semantic_seg = mmcv.imfrombytes( + img_bytes, flag='unchanged', + backend=self.imdecode_backend).squeeze().astype(np.uint8) + # modify if custom classes + if results.get('label_map', None) is not None: + for old_id, new_id in results['label_map'].items(): + gt_semantic_seg[gt_semantic_seg == old_id] = new_id + # reduce zero_label + if self.reduce_zero_label: + # avoid using underflow conversion + gt_semantic_seg[gt_semantic_seg == 0] = 255 + gt_semantic_seg = gt_semantic_seg - 1 + gt_semantic_seg[gt_semantic_seg == 254] = 255 + results['gt_semantic_seg'] = gt_semantic_seg + results['seg_fields'].append('gt_semantic_seg') + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(reduce_zero_label={self.reduce_zero_label},' + repr_str += f"imdecode_backend='{self.imdecode_backend}')" + return repr_str diff --git a/custom_mmpkg/custom_mmseg/datasets/pipelines/test_time_aug.py b/custom_mmpkg/custom_mmseg/datasets/pipelines/test_time_aug.py new file mode 100644 index 0000000000000000000000000000000000000000..93fe21433378b9c87d9e45243c550755bcafefe5 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/pipelines/test_time_aug.py @@ -0,0 +1,133 @@ +import warnings + +import custom_mmpkg.custom_mmcv as mmcv + +from ..builder import PIPELINES +from .compose import Compose + + +@PIPELINES.register_module() +class MultiScaleFlipAug(object): + """Test-time augmentation with multiple scales and flipping. + + An example configuration is as followed: + + .. code-block:: + + img_scale=(2048, 1024), + img_ratios=[0.5, 1.0], + flip=True, + transforms=[ + dict(type='Resize', keep_ratio=True), + dict(type='RandomFlip'), + dict(type='Normalize', **img_norm_cfg), + dict(type='Pad', size_divisor=32), + dict(type='ImageToTensor', keys=['img']), + dict(type='Collect', keys=['img']), + ] + + After MultiScaleFLipAug with above configuration, the results are wrapped + into lists of the same length as followed: + + .. code-block:: + + dict( + img=[...], + img_shape=[...], + scale=[(1024, 512), (1024, 512), (2048, 1024), (2048, 1024)] + flip=[False, True, False, True] + ... + ) + + Args: + transforms (list[dict]): Transforms to apply in each augmentation. + img_scale (None | tuple | list[tuple]): Images scales for resizing. + img_ratios (float | list[float]): Image ratios for resizing + flip (bool): Whether apply flip augmentation. Default: False. + flip_direction (str | list[str]): Flip augmentation directions, + options are "horizontal" and "vertical". If flip_direction is list, + multiple flip augmentations will be applied. + It has no effect when flip == False. Default: "horizontal". 
+ """ + + def __init__(self, + transforms, + img_scale, + img_ratios=None, + flip=False, + flip_direction='horizontal'): + self.transforms = Compose(transforms) + if img_ratios is not None: + img_ratios = img_ratios if isinstance(img_ratios, + list) else [img_ratios] + assert mmcv.is_list_of(img_ratios, float) + if img_scale is None: + # mode 1: given img_scale=None and a range of image ratio + self.img_scale = None + assert mmcv.is_list_of(img_ratios, float) + elif isinstance(img_scale, tuple) and mmcv.is_list_of( + img_ratios, float): + assert len(img_scale) == 2 + # mode 2: given a scale and a range of image ratio + self.img_scale = [(int(img_scale[0] * ratio), + int(img_scale[1] * ratio)) + for ratio in img_ratios] + else: + # mode 3: given multiple scales + self.img_scale = img_scale if isinstance(img_scale, + list) else [img_scale] + assert mmcv.is_list_of(self.img_scale, tuple) or self.img_scale is None + self.flip = flip + self.img_ratios = img_ratios + self.flip_direction = flip_direction if isinstance( + flip_direction, list) else [flip_direction] + assert mmcv.is_list_of(self.flip_direction, str) + if not self.flip and self.flip_direction != ['horizontal']: + warnings.warn( + 'flip_direction has no effect when flip is set to False') + if (self.flip + and not any([t['type'] == 'RandomFlip' for t in transforms])): + warnings.warn( + 'flip has no effect when RandomFlip is not in transforms') + + def __call__(self, results): + """Call function to apply test time augment transforms on results. + + Args: + results (dict): Result dict contains the data to transform. + + Returns: + dict[str: list]: The augmented data, where each value is wrapped + into a list. + """ + + aug_data = [] + if self.img_scale is None and mmcv.is_list_of(self.img_ratios, float): + h, w = results['img'].shape[:2] + img_scale = [(int(w * ratio), int(h * ratio)) + for ratio in self.img_ratios] + else: + img_scale = self.img_scale + flip_aug = [False, True] if self.flip else [False] + for scale in img_scale: + for flip in flip_aug: + for direction in self.flip_direction: + _results = results.copy() + _results['scale'] = scale + _results['flip'] = flip + _results['flip_direction'] = direction + data = self.transforms(_results) + aug_data.append(data) + # list of dict to dict of list + aug_data_dict = {key: [] for key in aug_data[0]} + for data in aug_data: + for key, val in data.items(): + aug_data_dict[key].append(val) + return aug_data_dict + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(transforms={self.transforms}, ' + repr_str += f'img_scale={self.img_scale}, flip={self.flip})' + repr_str += f'flip_direction={self.flip_direction}' + return repr_str diff --git a/custom_mmpkg/custom_mmseg/datasets/pipelines/transforms.py b/custom_mmpkg/custom_mmseg/datasets/pipelines/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..677191de984592456c145fe83579a049879443d4 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/pipelines/transforms.py @@ -0,0 +1,889 @@ +import custom_mmpkg.custom_mmcv as mmcv +import numpy as np +from custom_mmpkg.custom_mmcv.utils import deprecated_api_warning, is_tuple_of +from numpy import random + +from ..builder import PIPELINES + + +@PIPELINES.register_module() +class Resize(object): + """Resize images & seg. + + This transform resizes the input image to some scale. If the input dict + contains the key "scale", then the scale in the input dict is used, + otherwise the specified scale in the init method is used. 
+ + ``img_scale`` can be None, a tuple (single-scale) or a list of tuple + (multi-scale). There are 4 multiscale modes: + + - ``ratio_range is not None``: + 1. When img_scale is None, img_scale is the shape of image in results + (img_scale = results['img'].shape[:2]) and the image is resized based + on the original size. (mode 1) + 2. When img_scale is a tuple (single-scale), randomly sample a ratio from + the ratio range and multiply it with the image scale. (mode 2) + + - ``ratio_range is None and multiscale_mode == "range"``: randomly sample a + scale from the a range. (mode 3) + + - ``ratio_range is None and multiscale_mode == "value"``: randomly sample a + scale from multiple scales. (mode 4) + + Args: + img_scale (tuple or list[tuple]): Images scales for resizing. + multiscale_mode (str): Either "range" or "value". + ratio_range (tuple[float]): (min_ratio, max_ratio) + keep_ratio (bool): Whether to keep the aspect ratio when resizing the + image. + """ + + def __init__(self, + img_scale=None, + multiscale_mode='range', + ratio_range=None, + keep_ratio=True): + if img_scale is None: + self.img_scale = None + else: + if isinstance(img_scale, list): + self.img_scale = img_scale + else: + self.img_scale = [img_scale] + assert mmcv.is_list_of(self.img_scale, tuple) + + if ratio_range is not None: + # mode 1: given img_scale=None and a range of image ratio + # mode 2: given a scale and a range of image ratio + assert self.img_scale is None or len(self.img_scale) == 1 + else: + # mode 3 and 4: given multiple scales or a range of scales + assert multiscale_mode in ['value', 'range'] + + self.multiscale_mode = multiscale_mode + self.ratio_range = ratio_range + self.keep_ratio = keep_ratio + + @staticmethod + def random_select(img_scales): + """Randomly select an img_scale from given candidates. + + Args: + img_scales (list[tuple]): Images scales for selection. + + Returns: + (tuple, int): Returns a tuple ``(img_scale, scale_dix)``, + where ``img_scale`` is the selected image scale and + ``scale_idx`` is the selected index in the given candidates. + """ + + assert mmcv.is_list_of(img_scales, tuple) + scale_idx = np.random.randint(len(img_scales)) + img_scale = img_scales[scale_idx] + return img_scale, scale_idx + + @staticmethod + def random_sample(img_scales): + """Randomly sample an img_scale when ``multiscale_mode=='range'``. + + Args: + img_scales (list[tuple]): Images scale range for sampling. + There must be two tuples in img_scales, which specify the lower + and upper bound of image scales. + + Returns: + (tuple, None): Returns a tuple ``(img_scale, None)``, where + ``img_scale`` is sampled scale and None is just a placeholder + to be consistent with :func:`random_select`. + """ + + assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2 + img_scale_long = [max(s) for s in img_scales] + img_scale_short = [min(s) for s in img_scales] + long_edge = np.random.randint( + min(img_scale_long), + max(img_scale_long) + 1) + short_edge = np.random.randint( + min(img_scale_short), + max(img_scale_short) + 1) + img_scale = (long_edge, short_edge) + return img_scale, None + + @staticmethod + def random_sample_ratio(img_scale, ratio_range): + """Randomly sample an img_scale when ``ratio_range`` is specified. + + A ratio will be randomly sampled from the range specified by + ``ratio_range``. Then it would be multiplied with ``img_scale`` to + generate sampled scale. + + Args: + img_scale (tuple): Images scale base to multiply with ratio. 
+ ratio_range (tuple[float]): The minimum and maximum ratio to scale + the ``img_scale``. + + Returns: + (tuple, None): Returns a tuple ``(scale, None)``, where + ``scale`` is sampled ratio multiplied with ``img_scale`` and + None is just a placeholder to be consistent with + :func:`random_select`. + """ + + assert isinstance(img_scale, tuple) and len(img_scale) == 2 + min_ratio, max_ratio = ratio_range + assert min_ratio <= max_ratio + ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio + scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio) + return scale, None + + def _random_scale(self, results): + """Randomly sample an img_scale according to ``ratio_range`` and + ``multiscale_mode``. + + If ``ratio_range`` is specified, a ratio will be sampled and be + multiplied with ``img_scale``. + If multiple scales are specified by ``img_scale``, a scale will be + sampled according to ``multiscale_mode``. + Otherwise, single scale will be used. + + Args: + results (dict): Result dict from :obj:`dataset`. + + Returns: + dict: Two new keys 'scale` and 'scale_idx` are added into + ``results``, which would be used by subsequent pipelines. + """ + + if self.ratio_range is not None: + if self.img_scale is None: + h, w = results['img'].shape[:2] + scale, scale_idx = self.random_sample_ratio((w, h), + self.ratio_range) + else: + scale, scale_idx = self.random_sample_ratio( + self.img_scale[0], self.ratio_range) + elif len(self.img_scale) == 1: + scale, scale_idx = self.img_scale[0], 0 + elif self.multiscale_mode == 'range': + scale, scale_idx = self.random_sample(self.img_scale) + elif self.multiscale_mode == 'value': + scale, scale_idx = self.random_select(self.img_scale) + else: + raise NotImplementedError + + results['scale'] = scale + results['scale_idx'] = scale_idx + + def _resize_img(self, results): + """Resize images with ``results['scale']``.""" + if self.keep_ratio: + img, scale_factor = mmcv.imrescale( + results['img'], results['scale'], return_scale=True) + # the w_scale and h_scale has minor difference + # a real fix should be done in the mmcv.imrescale in the future + new_h, new_w = img.shape[:2] + h, w = results['img'].shape[:2] + w_scale = new_w / w + h_scale = new_h / h + else: + img, w_scale, h_scale = mmcv.imresize( + results['img'], results['scale'], return_scale=True) + scale_factor = np.array([w_scale, h_scale, w_scale, h_scale], + dtype=np.float32) + results['img'] = img + results['img_shape'] = img.shape + results['pad_shape'] = img.shape # in case that there is no padding + results['scale_factor'] = scale_factor + results['keep_ratio'] = self.keep_ratio + + def _resize_seg(self, results): + """Resize semantic segmentation map with ``results['scale']``.""" + for key in results.get('seg_fields', []): + if self.keep_ratio: + gt_seg = mmcv.imrescale( + results[key], results['scale'], interpolation='nearest') + else: + gt_seg = mmcv.imresize( + results[key], results['scale'], interpolation='nearest') + results[key] = gt_seg + + def __call__(self, results): + """Call function to resize images, bounding boxes, masks, semantic + segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', + 'keep_ratio' keys are added into result dict. 
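As a concrete instance of `random_sample_ratio` above (ratio-range mode with a single base scale): a ratio is drawn uniformly from `ratio_range` and multiplied into the base scale. The base scale and range below are placeholders, and the seed is only there to make the sketch deterministic.

```python
import numpy as np

# Sketch of Resize's ratio_range mode: one base scale, a random ratio in
# [min_ratio, max_ratio), scaled width/height truncated to int.
np.random.seed(0)

img_scale = (2048, 512)          # placeholder base scale
min_ratio, max_ratio = 0.5, 2.0

ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
scale = (int(img_scale[0] * ratio), int(img_scale[1] * ratio))
print(ratio, scale)              # ratio lies in [0.5, 2.0); scale follows it
```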
+ """ + + if 'scale' not in results: + self._random_scale(results) + self._resize_img(results) + self._resize_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(img_scale={self.img_scale}, ' + f'multiscale_mode={self.multiscale_mode}, ' + f'ratio_range={self.ratio_range}, ' + f'keep_ratio={self.keep_ratio})') + return repr_str + + +@PIPELINES.register_module() +class RandomFlip(object): + """Flip the image & seg. + + If the input dict contains the key "flip", then the flag will be used, + otherwise it will be randomly decided by a ratio specified in the init + method. + + Args: + prob (float, optional): The flipping probability. Default: None. + direction(str, optional): The flipping direction. Options are + 'horizontal' and 'vertical'. Default: 'horizontal'. + """ + + @deprecated_api_warning({'flip_ratio': 'prob'}, cls_name='RandomFlip') + def __init__(self, prob=None, direction='horizontal'): + self.prob = prob + self.direction = direction + if prob is not None: + assert prob >= 0 and prob <= 1 + assert direction in ['horizontal', 'vertical'] + + def __call__(self, results): + """Call function to flip bounding boxes, masks, semantic segmentation + maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Flipped results, 'flip', 'flip_direction' keys are added into + result dict. + """ + + if 'flip' not in results: + flip = True if np.random.rand() < self.prob else False + results['flip'] = flip + if 'flip_direction' not in results: + results['flip_direction'] = self.direction + if results['flip']: + # flip image + results['img'] = mmcv.imflip( + results['img'], direction=results['flip_direction']) + + # flip segs + for key in results.get('seg_fields', []): + # use copy() to make numpy stride positive + results[key] = mmcv.imflip( + results[key], direction=results['flip_direction']).copy() + return results + + def __repr__(self): + return self.__class__.__name__ + f'(prob={self.prob})' + + +@PIPELINES.register_module() +class Pad(object): + """Pad the image & mask. + + There are two padding modes: (1) pad to a fixed size and (2) pad to the + minimum size that is divisible by some number. + Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor", + + Args: + size (tuple, optional): Fixed padding size. + size_divisor (int, optional): The divisor of padded size. + pad_val (float, optional): Padding value. Default: 0. + seg_pad_val (float, optional): Padding value of segmentation map. + Default: 255. 
+ """ + + def __init__(self, + size=None, + size_divisor=None, + pad_val=0, + seg_pad_val=255): + self.size = size + self.size_divisor = size_divisor + self.pad_val = pad_val + self.seg_pad_val = seg_pad_val + # only one of size and size_divisor should be valid + assert size is not None or size_divisor is not None + assert size is None or size_divisor is None + + def _pad_img(self, results): + """Pad images according to ``self.size``.""" + if self.size is not None: + padded_img = mmcv.impad( + results['img'], shape=self.size, pad_val=self.pad_val) + elif self.size_divisor is not None: + padded_img = mmcv.impad_to_multiple( + results['img'], self.size_divisor, pad_val=self.pad_val) + results['img'] = padded_img + results['pad_shape'] = padded_img.shape + results['pad_fixed_size'] = self.size + results['pad_size_divisor'] = self.size_divisor + + def _pad_seg(self, results): + """Pad masks according to ``results['pad_shape']``.""" + for key in results.get('seg_fields', []): + results[key] = mmcv.impad( + results[key], + shape=results['pad_shape'][:2], + pad_val=self.seg_pad_val) + + def __call__(self, results): + """Call function to pad images, masks, semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Updated result dict. + """ + + self._pad_img(results) + self._pad_seg(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(size={self.size}, size_divisor={self.size_divisor}, ' \ + f'pad_val={self.pad_val})' + return repr_str + + +@PIPELINES.register_module() +class Normalize(object): + """Normalize the image. + + Added key is "img_norm_cfg". + + Args: + mean (sequence): Mean values of 3 channels. + std (sequence): Std values of 3 channels. + to_rgb (bool): Whether to convert the image from BGR to RGB, + default is true. + """ + + def __init__(self, mean, std, to_rgb=True): + self.mean = np.array(mean, dtype=np.float32) + self.std = np.array(std, dtype=np.float32) + self.to_rgb = to_rgb + + def __call__(self, results): + """Call function to normalize images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Normalized results, 'img_norm_cfg' key is added into + result dict. + """ + + results['img'] = mmcv.imnormalize(results['img'], self.mean, self.std, + self.to_rgb) + results['img_norm_cfg'] = dict( + mean=self.mean, std=self.std, to_rgb=self.to_rgb) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(mean={self.mean}, std={self.std}, to_rgb=' \ + f'{self.to_rgb})' + return repr_str + + +@PIPELINES.register_module() +class Rerange(object): + """Rerange the image pixel value. + + Args: + min_value (float or int): Minimum value of the reranged image. + Default: 0. + max_value (float or int): Maximum value of the reranged image. + Default: 255. + """ + + def __init__(self, min_value=0, max_value=255): + assert isinstance(min_value, float) or isinstance(min_value, int) + assert isinstance(max_value, float) or isinstance(max_value, int) + assert min_value < max_value + self.min_value = min_value + self.max_value = max_value + + def __call__(self, results): + """Call function to rerange images. + + Args: + results (dict): Result dict from loading pipeline. + Returns: + dict: Reranged results. 
+ """ + + img = results['img'] + img_min_value = np.min(img) + img_max_value = np.max(img) + + assert img_min_value < img_max_value + # rerange to [0, 1] + img = (img - img_min_value) / (img_max_value - img_min_value) + # rerange to [min_value, max_value] + img = img * (self.max_value - self.min_value) + self.min_value + results['img'] = img + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(min_value={self.min_value}, max_value={self.max_value})' + return repr_str + + +@PIPELINES.register_module() +class CLAHE(object): + """Use CLAHE method to process the image. + + See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. + Graphics Gems, 1994:474-485.` for more information. + + Args: + clip_limit (float): Threshold for contrast limiting. Default: 40.0. + tile_grid_size (tuple[int]): Size of grid for histogram equalization. + Input image will be divided into equally sized rectangular tiles. + It defines the number of tiles in row and column. Default: (8, 8). + """ + + def __init__(self, clip_limit=40.0, tile_grid_size=(8, 8)): + assert isinstance(clip_limit, (float, int)) + self.clip_limit = clip_limit + assert is_tuple_of(tile_grid_size, int) + assert len(tile_grid_size) == 2 + self.tile_grid_size = tile_grid_size + + def __call__(self, results): + """Call function to Use CLAHE method process images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Processed results. + """ + + for i in range(results['img'].shape[2]): + results['img'][:, :, i] = mmcv.clahe( + np.array(results['img'][:, :, i], dtype=np.uint8), + self.clip_limit, self.tile_grid_size) + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(clip_limit={self.clip_limit}, '\ + f'tile_grid_size={self.tile_grid_size})' + return repr_str + + +@PIPELINES.register_module() +class RandomCrop(object): + """Random crop the image & seg. + + Args: + crop_size (tuple): Expected size after cropping, (h, w). + cat_max_ratio (float): The maximum ratio that single category could + occupy. + """ + + def __init__(self, crop_size, cat_max_ratio=1., ignore_index=255): + assert crop_size[0] > 0 and crop_size[1] > 0 + self.crop_size = crop_size + self.cat_max_ratio = cat_max_ratio + self.ignore_index = ignore_index + + def get_crop_bbox(self, img): + """Randomly get a crop bounding box.""" + margin_h = max(img.shape[0] - self.crop_size[0], 0) + margin_w = max(img.shape[1] - self.crop_size[1], 0) + offset_h = np.random.randint(0, margin_h + 1) + offset_w = np.random.randint(0, margin_w + 1) + crop_y1, crop_y2 = offset_h, offset_h + self.crop_size[0] + crop_x1, crop_x2 = offset_w, offset_w + self.crop_size[1] + + return crop_y1, crop_y2, crop_x1, crop_x2 + + def crop(self, img, crop_bbox): + """Crop from ``img``""" + crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox + img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...] + return img + + def __call__(self, results): + """Call function to randomly crop images, semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Randomly cropped results, 'img_shape' key in result dict is + updated according to crop size. 
+ """ + + img = results['img'] + crop_bbox = self.get_crop_bbox(img) + if self.cat_max_ratio < 1.: + # Repeat 10 times + for _ in range(10): + seg_temp = self.crop(results['gt_semantic_seg'], crop_bbox) + labels, cnt = np.unique(seg_temp, return_counts=True) + cnt = cnt[labels != self.ignore_index] + if len(cnt) > 1 and np.max(cnt) / np.sum( + cnt) < self.cat_max_ratio: + break + crop_bbox = self.get_crop_bbox(img) + + # crop the image + img = self.crop(img, crop_bbox) + img_shape = img.shape + results['img'] = img + results['img_shape'] = img_shape + + # crop semantic seg + for key in results.get('seg_fields', []): + results[key] = self.crop(results[key], crop_bbox) + + return results + + def __repr__(self): + return self.__class__.__name__ + f'(crop_size={self.crop_size})' + + +@PIPELINES.register_module() +class RandomRotate(object): + """Rotate the image & seg. + + Args: + prob (float): The rotation probability. + degree (float, tuple[float]): Range of degrees to select from. If + degree is a number instead of tuple like (min, max), + the range of degree will be (``-degree``, ``+degree``) + pad_val (float, optional): Padding value of image. Default: 0. + seg_pad_val (float, optional): Padding value of segmentation map. + Default: 255. + center (tuple[float], optional): Center point (w, h) of the rotation in + the source image. If not specified, the center of the image will be + used. Default: None. + auto_bound (bool): Whether to adjust the image size to cover the whole + rotated image. Default: False + """ + + def __init__(self, + prob, + degree, + pad_val=0, + seg_pad_val=255, + center=None, + auto_bound=False): + self.prob = prob + assert prob >= 0 and prob <= 1 + if isinstance(degree, (float, int)): + assert degree > 0, f'degree {degree} should be positive' + self.degree = (-degree, degree) + else: + self.degree = degree + assert len(self.degree) == 2, f'degree {self.degree} should be a ' \ + f'tuple of (min, max)' + self.pal_val = pad_val + self.seg_pad_val = seg_pad_val + self.center = center + self.auto_bound = auto_bound + + def __call__(self, results): + """Call function to rotate image, semantic segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Rotated results. + """ + + rotate = True if np.random.rand() < self.prob else False + degree = np.random.uniform(min(*self.degree), max(*self.degree)) + if rotate: + # rotate image + results['img'] = mmcv.imrotate( + results['img'], + angle=degree, + border_value=self.pal_val, + center=self.center, + auto_bound=self.auto_bound) + + # rotate segs + for key in results.get('seg_fields', []): + results[key] = mmcv.imrotate( + results[key], + angle=degree, + border_value=self.seg_pad_val, + center=self.center, + auto_bound=self.auto_bound, + interpolation='nearest') + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' \ + f'degree={self.degree}, ' \ + f'pad_val={self.pal_val}, ' \ + f'seg_pad_val={self.seg_pad_val}, ' \ + f'center={self.center}, ' \ + f'auto_bound={self.auto_bound})' + return repr_str + + +@PIPELINES.register_module() +class RGB2Gray(object): + """Convert RGB image to grayscale image. + + This transform calculate the weighted mean of input image channels with + ``weights`` and then expand the channels to ``out_channels``. When + ``out_channels`` is None, the number of output channels is the same as + input channels. + + Args: + out_channels (int): Expected number of output channels after + transforming. 
Default: None. + weights (tuple[float]): The weights to calculate the weighted mean. + Default: (0.299, 0.587, 0.114). + """ + + def __init__(self, out_channels=None, weights=(0.299, 0.587, 0.114)): + assert out_channels is None or out_channels > 0 + self.out_channels = out_channels + assert isinstance(weights, tuple) + for item in weights: + assert isinstance(item, (float, int)) + self.weights = weights + + def __call__(self, results): + """Call function to convert RGB image to grayscale image. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with grayscale image. + """ + img = results['img'] + assert len(img.shape) == 3 + assert img.shape[2] == len(self.weights) + weights = np.array(self.weights).reshape((1, 1, -1)) + img = (img * weights).sum(2, keepdims=True) + if self.out_channels is None: + img = img.repeat(weights.shape[2], axis=2) + else: + img = img.repeat(self.out_channels, axis=2) + + results['img'] = img + results['img_shape'] = img.shape + + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(out_channels={self.out_channels}, ' \ + f'weights={self.weights})' + return repr_str + + +@PIPELINES.register_module() +class AdjustGamma(object): + """Using gamma correction to process the image. + + Args: + gamma (float or int): Gamma value used in gamma correction. + Default: 1.0. + """ + + def __init__(self, gamma=1.0): + assert isinstance(gamma, float) or isinstance(gamma, int) + assert gamma > 0 + self.gamma = gamma + inv_gamma = 1.0 / gamma + self.table = np.array([(i / 255.0)**inv_gamma * 255 + for i in np.arange(256)]).astype('uint8') + + def __call__(self, results): + """Call function to process the image with gamma correction. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Processed results. + """ + + results['img'] = mmcv.lut_transform( + np.array(results['img'], dtype=np.uint8), self.table) + + return results + + def __repr__(self): + return self.__class__.__name__ + f'(gamma={self.gamma})' + + +@PIPELINES.register_module() +class SegRescale(object): + """Rescale semantic segmentation maps. + + Args: + scale_factor (float): The scale factor of the final output. + """ + + def __init__(self, scale_factor=1): + self.scale_factor = scale_factor + + def __call__(self, results): + """Call function to scale the semantic segmentation map. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with semantic segmentation map scaled. + """ + for key in results.get('seg_fields', []): + if self.scale_factor != 1: + results[key] = mmcv.imrescale( + results[key], self.scale_factor, interpolation='nearest') + return results + + def __repr__(self): + return self.__class__.__name__ + f'(scale_factor={self.scale_factor})' + + +@PIPELINES.register_module() +class PhotoMetricDistortion(object): + """Apply photometric distortion to image sequentially, every transformation + is applied with a probability of 0.5. The position of random contrast is in + second or second to last. + + 1. random brightness + 2. random contrast (mode 0) + 3. convert color from BGR to HSV + 4. random saturation + 5. random hue + 6. convert color from HSV to BGR + 7. random contrast (mode 1) + + Args: + brightness_delta (int): delta of brightness. + contrast_range (tuple): range of contrast. + saturation_range (tuple): range of saturation. + hue_delta (int): delta of hue. 
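A small sketch of the randomized ordering described above: contrast is applied either right after brightness or at the very end, chosen by a random mode flag (in the implementation that follows, `mode == 1` applies contrast early and `mode == 0` applies it last), and every step fires independently with probability 0.5.

```python
from numpy import random

# Order sketch for PhotoMetricDistortion. The seed only makes the sketch
# deterministic; it is not part of the transform.
random.seed(0)
mode = random.randint(2)    # 0 or 1, as in the implementation below
order = ['brightness']
if mode == 1:
    order.append('contrast')
order += ['saturation', 'hue']
if mode == 0:
    order.append('contrast')
print(mode, order)
```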
+ """ + + def __init__(self, + brightness_delta=32, + contrast_range=(0.5, 1.5), + saturation_range=(0.5, 1.5), + hue_delta=18): + self.brightness_delta = brightness_delta + self.contrast_lower, self.contrast_upper = contrast_range + self.saturation_lower, self.saturation_upper = saturation_range + self.hue_delta = hue_delta + + def convert(self, img, alpha=1, beta=0): + """Multiple with alpha and add beat with clip.""" + img = img.astype(np.float32) * alpha + beta + img = np.clip(img, 0, 255) + return img.astype(np.uint8) + + def brightness(self, img): + """Brightness distortion.""" + if random.randint(2): + return self.convert( + img, + beta=random.uniform(-self.brightness_delta, + self.brightness_delta)) + return img + + def contrast(self, img): + """Contrast distortion.""" + if random.randint(2): + return self.convert( + img, + alpha=random.uniform(self.contrast_lower, self.contrast_upper)) + return img + + def saturation(self, img): + """Saturation distortion.""" + if random.randint(2): + img = mmcv.bgr2hsv(img) + img[:, :, 1] = self.convert( + img[:, :, 1], + alpha=random.uniform(self.saturation_lower, + self.saturation_upper)) + img = mmcv.hsv2bgr(img) + return img + + def hue(self, img): + """Hue distortion.""" + if random.randint(2): + img = mmcv.bgr2hsv(img) + img[:, :, + 0] = (img[:, :, 0].astype(int) + + random.randint(-self.hue_delta, self.hue_delta)) % 180 + img = mmcv.hsv2bgr(img) + return img + + def __call__(self, results): + """Call function to perform photometric distortion on images. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Result dict with images distorted. + """ + + img = results['img'] + # random brightness + img = self.brightness(img) + + # mode == 0 --> do random contrast first + # mode == 1 --> do random contrast last + mode = random.randint(2) + if mode == 1: + img = self.contrast(img) + + # random saturation + img = self.saturation(img) + + # random hue + img = self.hue(img) + + # random contrast + if mode == 0: + img = self.contrast(img) + + results['img'] = img + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += (f'(brightness_delta={self.brightness_delta}, ' + f'contrast_range=({self.contrast_lower}, ' + f'{self.contrast_upper}), ' + f'saturation_range=({self.saturation_lower}, ' + f'{self.saturation_upper}), ' + f'hue_delta={self.hue_delta})') + return repr_str diff --git a/custom_mmpkg/custom_mmseg/datasets/stare.py b/custom_mmpkg/custom_mmseg/datasets/stare.py new file mode 100644 index 0000000000000000000000000000000000000000..cbd14e0920e7f6a73baff1432e5a32ccfdb0dfae --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/stare.py @@ -0,0 +1,27 @@ +import os.path as osp + +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class STAREDataset(CustomDataset): + """STARE dataset. + + In segmentation map annotation for STARE, 0 stands for background, which is + included in 2 categories. ``reduce_zero_label`` is fixed to False. The + ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '.ah.png'. 
+ """ + + CLASSES = ('background', 'vessel') + + PALETTE = [[120, 120, 120], [6, 230, 230]] + + def __init__(self, **kwargs): + super(STAREDataset, self).__init__( + img_suffix='.png', + seg_map_suffix='.ah.png', + reduce_zero_label=False, + **kwargs) + assert osp.exists(self.img_dir) diff --git a/custom_mmpkg/custom_mmseg/datasets/voc.py b/custom_mmpkg/custom_mmseg/datasets/voc.py new file mode 100644 index 0000000000000000000000000000000000000000..a8855203b14ee0dc4da9099a2945d4aedcffbcd6 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/datasets/voc.py @@ -0,0 +1,29 @@ +import os.path as osp + +from .builder import DATASETS +from .custom import CustomDataset + + +@DATASETS.register_module() +class PascalVOCDataset(CustomDataset): + """Pascal VOC dataset. + + Args: + split (str): Split txt file for Pascal VOC. + """ + + CLASSES = ('background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', + 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', + 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', + 'train', 'tvmonitor') + + PALETTE = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], + [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], + [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], + [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], + [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] + + def __init__(self, split, **kwargs): + super(PascalVOCDataset, self).__init__( + img_suffix='.jpg', seg_map_suffix='.png', split=split, **kwargs) + assert osp.exists(self.img_dir) and self.split is not None diff --git a/custom_mmpkg/custom_mmseg/models/__init__.py b/custom_mmpkg/custom_mmseg/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3cf93f8bec9cf0cef0a3bd76ca3ca92eb188f535 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/__init__.py @@ -0,0 +1,12 @@ +from .backbones import * # noqa: F401,F403 +from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone, + build_head, build_loss, build_segmentor) +from .decode_heads import * # noqa: F401,F403 +from .losses import * # noqa: F401,F403 +from .necks import * # noqa: F401,F403 +from .segmentors import * # noqa: F401,F403 + +__all__ = [ + 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone', + 'build_head', 'build_loss', 'build_segmentor' +] diff --git a/custom_mmpkg/custom_mmseg/models/backbones/__init__.py b/custom_mmpkg/custom_mmseg/models/backbones/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a1116c00a17c8bd9ed7f18743baee22b3b7d3f8d --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/__init__.py @@ -0,0 +1,16 @@ +from .cgnet import CGNet +# from .fast_scnn import FastSCNN +from .hrnet import HRNet +from .mobilenet_v2 import MobileNetV2 +from .mobilenet_v3 import MobileNetV3 +from .resnest import ResNeSt +from .resnet import ResNet, ResNetV1c, ResNetV1d +from .resnext import ResNeXt +from .unet import UNet +from .vit import VisionTransformer + +__all__ = [ + 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', + 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', + 'VisionTransformer' +] diff --git a/custom_mmpkg/custom_mmseg/models/backbones/cgnet.py b/custom_mmpkg/custom_mmseg/models/backbones/cgnet.py new file mode 100644 index 0000000000000000000000000000000000000000..18b158be8dffa5e119c4f73e84d399815ec714ac --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/cgnet.py @@ -0,0 +1,367 @@ +import torch +import torch.nn as nn +import 
torch.utils.checkpoint as cp +from custom_mmpkg.custom_mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer, + constant_init, kaiming_init) +from custom_mmpkg.custom_mmcv.runner import load_checkpoint +from custom_mmpkg.custom_mmcv.utils.parrots_wrapper import _BatchNorm + +from custom_mmpkg.custom_mmseg.utils import get_root_logger +from ..builder import BACKBONES + + +class GlobalContextExtractor(nn.Module): + """Global Context Extractor for CGNet. + + This class is employed to refine the joint feature of both local feature + and surrounding context. + + Args: + channel (int): Number of input feature channels. + reduction (int): Reductions for global context extractor. Default: 16. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, channel, reduction=16, with_cp=False): + super(GlobalContextExtractor, self).__init__() + self.channel = channel + self.reduction = reduction + assert reduction >= 1 and channel >= reduction + self.with_cp = with_cp + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction), nn.ReLU(inplace=True), + nn.Linear(channel // reduction, channel), nn.Sigmoid()) + + def forward(self, x): + + def _inner_forward(x): + num_batch, num_channel = x.size()[:2] + y = self.avg_pool(x).view(num_batch, num_channel) + y = self.fc(y).view(num_batch, num_channel, 1, 1) + return x * y + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class ContextGuidedBlock(nn.Module): + """Context Guided Block for CGNet. + + This class consists of four components: local feature extractor, + surrounding feature extractor, joint feature extractor and global + context extractor. + + Args: + in_channels (int): Number of input feature channels. + out_channels (int): Number of output feature channels. + dilation (int): Dilation rate for surrounding context extractor. + Default: 2. + reduction (int): Reduction for global context extractor. Default: 16. + skip_connect (bool): Add input to output or not. Default: True. + downsample (bool): Downsample the input to 1/2 or not. Default: False. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. 
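+        Example (a minimal sketch assuming the default BN/PReLU configs):
+            >>> import torch
+            >>> block = ContextGuidedBlock(32, 32)
+            >>> block(torch.rand(1, 32, 64, 64)).shape
+            torch.Size([1, 32, 64, 64])
+            >>> down = ContextGuidedBlock(32, 64, downsample=True)
+            >>> down(torch.rand(1, 32, 64, 64)).shape
+            torch.Size([1, 64, 32, 32])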
+ """ + + def __init__(self, + in_channels, + out_channels, + dilation=2, + reduction=16, + skip_connect=True, + downsample=False, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU'), + with_cp=False): + super(ContextGuidedBlock, self).__init__() + self.with_cp = with_cp + self.downsample = downsample + + channels = out_channels if downsample else out_channels // 2 + if 'type' in act_cfg and act_cfg['type'] == 'PReLU': + act_cfg['num_parameters'] = channels + kernel_size = 3 if downsample else 1 + stride = 2 if downsample else 1 + padding = (kernel_size - 1) // 2 + + self.conv1x1 = ConvModule( + in_channels, + channels, + kernel_size, + stride, + padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + self.f_loc = build_conv_layer( + conv_cfg, + channels, + channels, + kernel_size=3, + padding=1, + groups=channels, + bias=False) + self.f_sur = build_conv_layer( + conv_cfg, + channels, + channels, + kernel_size=3, + padding=dilation, + groups=channels, + dilation=dilation, + bias=False) + + self.bn = build_norm_layer(norm_cfg, 2 * channels)[1] + self.activate = nn.PReLU(2 * channels) + + if downsample: + self.bottleneck = build_conv_layer( + conv_cfg, + 2 * channels, + out_channels, + kernel_size=1, + bias=False) + + self.skip_connect = skip_connect and not downsample + self.f_glo = GlobalContextExtractor(out_channels, reduction, with_cp) + + def forward(self, x): + + def _inner_forward(x): + out = self.conv1x1(x) + loc = self.f_loc(out) + sur = self.f_sur(out) + + joi_feat = torch.cat([loc, sur], 1) # the joint feature + joi_feat = self.bn(joi_feat) + joi_feat = self.activate(joi_feat) + if self.downsample: + joi_feat = self.bottleneck(joi_feat) # channel = out_channels + # f_glo is employed to refine the joint feature + out = self.f_glo(joi_feat) + + if self.skip_connect: + return x + out + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class InputInjection(nn.Module): + """Downsampling module for CGNet.""" + + def __init__(self, num_downsampling): + super(InputInjection, self).__init__() + self.pool = nn.ModuleList() + for i in range(num_downsampling): + self.pool.append(nn.AvgPool2d(3, stride=2, padding=1)) + + def forward(self, x): + for pool in self.pool: + x = pool(x) + return x + + +@BACKBONES.register_module() +class CGNet(nn.Module): + """CGNet backbone. + + A Light-weight Context Guided Network for Semantic Segmentation + arXiv: https://arxiv.org/abs/1811.08201 + + Args: + in_channels (int): Number of input image channels. Normally 3. + num_channels (tuple[int]): Numbers of feature channels at each stages. + Default: (32, 64, 128). + num_blocks (tuple[int]): Numbers of CG blocks at stage 1 and stage 2. + Default: (3, 21). + dilations (tuple[int]): Dilation rate for surrounding context + extractors at stage 1 and stage 2. Default: (2, 4). + reductions (tuple[int]): Reductions for global context extractors at + stage 1 and stage 2. Default: (8, 16). + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. 
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + in_channels=3, + num_channels=(32, 64, 128), + num_blocks=(3, 21), + dilations=(2, 4), + reductions=(8, 16), + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + act_cfg=dict(type='PReLU'), + norm_eval=False, + with_cp=False): + + super(CGNet, self).__init__() + self.in_channels = in_channels + self.num_channels = num_channels + assert isinstance(self.num_channels, tuple) and len( + self.num_channels) == 3 + self.num_blocks = num_blocks + assert isinstance(self.num_blocks, tuple) and len(self.num_blocks) == 2 + self.dilations = dilations + assert isinstance(self.dilations, tuple) and len(self.dilations) == 2 + self.reductions = reductions + assert isinstance(self.reductions, tuple) and len(self.reductions) == 2 + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + if 'type' in self.act_cfg and self.act_cfg['type'] == 'PReLU': + self.act_cfg['num_parameters'] = num_channels[0] + self.norm_eval = norm_eval + self.with_cp = with_cp + + cur_channels = in_channels + self.stem = nn.ModuleList() + for i in range(3): + self.stem.append( + ConvModule( + cur_channels, + num_channels[0], + 3, + 2 if i == 0 else 1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + cur_channels = num_channels[0] + + self.inject_2x = InputInjection(1) # down-sample for Input, factor=2 + self.inject_4x = InputInjection(2) # down-sample for Input, factor=4 + + cur_channels += in_channels + self.norm_prelu_0 = nn.Sequential( + build_norm_layer(norm_cfg, cur_channels)[1], + nn.PReLU(cur_channels)) + + # stage 1 + self.level1 = nn.ModuleList() + for i in range(num_blocks[0]): + self.level1.append( + ContextGuidedBlock( + cur_channels if i == 0 else num_channels[1], + num_channels[1], + dilations[0], + reductions[0], + downsample=(i == 0), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) # CG block + + cur_channels = 2 * num_channels[1] + in_channels + self.norm_prelu_1 = nn.Sequential( + build_norm_layer(norm_cfg, cur_channels)[1], + nn.PReLU(cur_channels)) + + # stage 2 + self.level2 = nn.ModuleList() + for i in range(num_blocks[1]): + self.level2.append( + ContextGuidedBlock( + cur_channels if i == 0 else num_channels[2], + num_channels[2], + dilations[1], + reductions[1], + downsample=(i == 0), + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + with_cp=with_cp)) # CG block + + cur_channels = 2 * num_channels[2] + self.norm_prelu_2 = nn.Sequential( + build_norm_layer(norm_cfg, cur_channels)[1], + nn.PReLU(cur_channels)) + + def forward(self, x): + output = [] + + # stage 0 + inp_2x = self.inject_2x(x) + inp_4x = self.inject_4x(x) + for layer in self.stem: + x = layer(x) + x = self.norm_prelu_0(torch.cat([x, inp_2x], 1)) + output.append(x) + + # stage 1 + for i, layer in enumerate(self.level1): + x = layer(x) + if i == 0: + down1 = x + x = self.norm_prelu_1(torch.cat([x, down1, inp_4x], 1)) + output.append(x) + + # stage 2 + for i, layer in enumerate(self.level2): + x = layer(x) + if i == 0: + down2 = x + x = self.norm_prelu_2(torch.cat([down2, x], 1)) + output.append(x) + + return output + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
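+        Example (illustrative only; the checkpoint path is an assumption):
+            >>> model = CGNet()
+            >>> model.init_weights()  # Kaiming/constant init from scratch
+            >>> model.init_weights('checkpoints/cgnet.pth')  # load weights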
+ """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, (nn.Conv2d, nn.Linear)): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + elif isinstance(m, nn.PReLU): + constant_init(m, 0) + else: + raise TypeError('pretrained must be a str or None') + + def train(self, mode=True): + """Convert the model into training mode will keeping the normalization + layer freezed.""" + super(CGNet, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/custom_mmpkg/custom_mmseg/models/backbones/fast_scnn.py b/custom_mmpkg/custom_mmseg/models/backbones/fast_scnn.py new file mode 100644 index 0000000000000000000000000000000000000000..0d06faa7c4e3a0d6e85acaf3f2bd21ec28e1f435 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/fast_scnn.py @@ -0,0 +1,375 @@ +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, constant_init, + kaiming_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from custom_mmpkg.custom_mmseg.models.decode_heads.psp_head import PPM +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import BACKBONES +from ..utils.inverted_residual import InvertedResidual + + +class LearningToDownsample(nn.Module): + """Learning to downsample module. + + Args: + in_channels (int): Number of input channels. + dw_channels (tuple[int]): Number of output channels of the first and + the second depthwise conv (dwconv) layers. + out_channels (int): Number of output channels of the whole + 'learning to downsample' module. + conv_cfg (dict | None): Config of conv layers. Default: None + norm_cfg (dict | None): Config of norm layers. Default: + dict(type='BN') + act_cfg (dict): Config of activation layers. Default: + dict(type='ReLU') + """ + + def __init__(self, + in_channels, + dw_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU')): + super(LearningToDownsample, self).__init__() + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + dw_channels1 = dw_channels[0] + dw_channels2 = dw_channels[1] + + self.conv = ConvModule( + in_channels, + dw_channels1, + 3, + stride=2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.dsconv1 = DepthwiseSeparableConvModule( + dw_channels1, + dw_channels2, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg) + self.dsconv2 = DepthwiseSeparableConvModule( + dw_channels2, + out_channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg) + + def forward(self, x): + x = self.conv(x) + x = self.dsconv1(x) + x = self.dsconv2(x) + return x + + +class GlobalFeatureExtractor(nn.Module): + """Global feature extractor module. + + Args: + in_channels (int): Number of input channels of the GFE module. + Default: 64 + block_channels (tuple[int]): Tuple of ints. Each int specifies the + number of output channels of each Inverted Residual module. + Default: (64, 96, 128) + out_channels(int): Number of output channels of the GFE module. + Default: 128 + expand_ratio (int): Adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + Default: 6 + num_blocks (tuple[int]): Tuple of ints. 
Each int specifies the + number of times each Inverted Residual module is repeated. + The repeated Inverted Residual modules are called a 'group'. + Default: (3, 3, 3) + strides (tuple[int]): Tuple of ints. Each int specifies + the downsampling factor of each 'group'. + Default: (2, 2, 1) + pool_scales (tuple[int]): Tuple of ints. Each int specifies + the parameter required in 'global average pooling' within PPM. + Default: (1, 2, 3, 6) + conv_cfg (dict | None): Config of conv layers. Default: None + norm_cfg (dict | None): Config of norm layers. Default: + dict(type='BN') + act_cfg (dict): Config of activation layers. Default: + dict(type='ReLU') + align_corners (bool): align_corners argument of F.interpolate. + Default: False + """ + + def __init__(self, + in_channels=64, + block_channels=(64, 96, 128), + out_channels=128, + expand_ratio=6, + num_blocks=(3, 3, 3), + strides=(2, 2, 1), + pool_scales=(1, 2, 3, 6), + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + align_corners=False): + super(GlobalFeatureExtractor, self).__init__() + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + assert len(block_channels) == len(num_blocks) == 3 + self.bottleneck1 = self._make_layer(in_channels, block_channels[0], + num_blocks[0], strides[0], + expand_ratio) + self.bottleneck2 = self._make_layer(block_channels[0], + block_channels[1], num_blocks[1], + strides[1], expand_ratio) + self.bottleneck3 = self._make_layer(block_channels[1], + block_channels[2], num_blocks[2], + strides[2], expand_ratio) + self.ppm = PPM( + pool_scales, + block_channels[2], + block_channels[2] // 4, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=align_corners) + self.out = ConvModule( + block_channels[2] * 2, + out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def _make_layer(self, + in_channels, + out_channels, + blocks, + stride=1, + expand_ratio=6): + layers = [ + InvertedResidual( + in_channels, + out_channels, + stride, + expand_ratio, + norm_cfg=self.norm_cfg) + ] + for i in range(1, blocks): + layers.append( + InvertedResidual( + out_channels, + out_channels, + 1, + expand_ratio, + norm_cfg=self.norm_cfg)) + return nn.Sequential(*layers) + + def forward(self, x): + x = self.bottleneck1(x) + x = self.bottleneck2(x) + x = self.bottleneck3(x) + x = torch.cat([x, *self.ppm(x)], dim=1) + x = self.out(x) + return x + + +class FeatureFusionModule(nn.Module): + """Feature fusion module. + + Args: + higher_in_channels (int): Number of input channels of the + higher-resolution branch. + lower_in_channels (int): Number of input channels of the + lower-resolution branch. + out_channels (int): Number of output channels. + conv_cfg (dict | None): Config of conv layers. Default: None + norm_cfg (dict | None): Config of norm layers. Default: + dict(type='BN') + act_cfg (dict): Config of activation layers. Default: + dict(type='ReLU') + align_corners (bool): align_corners argument of F.interpolate. 
+ Default: False + """ + + def __init__(self, + higher_in_channels, + lower_in_channels, + out_channels, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + align_corners=False): + super(FeatureFusionModule, self).__init__() + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.align_corners = align_corners + self.dwconv = ConvModule( + lower_in_channels, + out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.conv_lower_res = ConvModule( + out_channels, + out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None) + self.conv_higher_res = ConvModule( + higher_in_channels, + out_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None) + self.relu = nn.ReLU(True) + + def forward(self, higher_res_feature, lower_res_feature): + lower_res_feature = resize( + lower_res_feature, + size=higher_res_feature.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + lower_res_feature = self.dwconv(lower_res_feature) + lower_res_feature = self.conv_lower_res(lower_res_feature) + + higher_res_feature = self.conv_higher_res(higher_res_feature) + out = higher_res_feature + lower_res_feature + return self.relu(out) + + +@BACKBONES.register_module() +class FastSCNN(nn.Module): + """Fast-SCNN Backbone. + + Args: + in_channels (int): Number of input image channels. Default: 3. + downsample_dw_channels (tuple[int]): Number of output channels after + the first conv layer & the second conv layer in + Learning-To-Downsample (LTD) module. + Default: (32, 48). + global_in_channels (int): Number of input channels of + Global Feature Extractor(GFE). + Equal to number of output channels of LTD. + Default: 64. + global_block_channels (tuple[int]): Tuple of integers that describe + the output channels for each of the MobileNet-v2 bottleneck + residual blocks in GFE. + Default: (64, 96, 128). + global_block_strides (tuple[int]): Tuple of integers + that describe the strides (downsampling factors) for each of the + MobileNet-v2 bottleneck residual blocks in GFE. + Default: (2, 2, 1). + global_out_channels (int): Number of output channels of GFE. + Default: 128. + higher_in_channels (int): Number of input channels of the higher + resolution branch in FFM. + Equal to global_in_channels. + Default: 64. + lower_in_channels (int): Number of input channels of the lower + resolution branch in FFM. + Equal to global_out_channels. + Default: 128. + fusion_out_channels (int): Number of output channels of FFM. + Default: 128. + out_indices (tuple): Tuple of indices of list + [higher_res_features, lower_res_features, fusion_output]. + Often set to (0,1,2) to enable aux. heads. + Default: (0, 1, 2). + conv_cfg (dict | None): Config of conv layers. Default: None + norm_cfg (dict | None): Config of norm layers. Default: + dict(type='BN') + act_cfg (dict): Config of activation layers. Default: + dict(type='ReLU') + align_corners (bool): align_corners argument of F.interpolate. 
+ Default: False + """ + + def __init__(self, + in_channels=3, + downsample_dw_channels=(32, 48), + global_in_channels=64, + global_block_channels=(64, 96, 128), + global_block_strides=(2, 2, 1), + global_out_channels=128, + higher_in_channels=64, + lower_in_channels=128, + fusion_out_channels=128, + out_indices=(0, 1, 2), + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + align_corners=False): + + super(FastSCNN, self).__init__() + if global_in_channels != higher_in_channels: + raise AssertionError('Global Input Channels must be the same \ + with Higher Input Channels!') + elif global_out_channels != lower_in_channels: + raise AssertionError('Global Output Channels must be the same \ + with Lower Input Channels!') + + self.in_channels = in_channels + self.downsample_dw_channels1 = downsample_dw_channels[0] + self.downsample_dw_channels2 = downsample_dw_channels[1] + self.global_in_channels = global_in_channels + self.global_block_channels = global_block_channels + self.global_block_strides = global_block_strides + self.global_out_channels = global_out_channels + self.higher_in_channels = higher_in_channels + self.lower_in_channels = lower_in_channels + self.fusion_out_channels = fusion_out_channels + self.out_indices = out_indices + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.align_corners = align_corners + self.learning_to_downsample = LearningToDownsample( + in_channels, + downsample_dw_channels, + global_in_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.global_feature_extractor = GlobalFeatureExtractor( + global_in_channels, + global_block_channels, + global_out_channels, + strides=self.global_block_strides, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + self.feature_fusion = FeatureFusionModule( + higher_in_channels, + lower_in_channels, + fusion_out_channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + + def init_weights(self, pretrained=None): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + def forward(self, x): + higher_res_features = self.learning_to_downsample(x) + lower_res_features = self.global_feature_extractor(higher_res_features) + fusion_output = self.feature_fusion(higher_res_features, + lower_res_features) + + outs = [higher_res_features, lower_res_features, fusion_output] + outs = [outs[i] for i in self.out_indices] + return tuple(outs) diff --git a/custom_mmpkg/custom_mmseg/models/backbones/hrnet.py b/custom_mmpkg/custom_mmseg/models/backbones/hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..7df19e7bef0ccacbef039633fa5c26344593bf3c --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/hrnet.py @@ -0,0 +1,555 @@ +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import (build_conv_layer, build_norm_layer, constant_init, + kaiming_init) +from custom_mmpkg.custom_mmcv.runner import load_checkpoint +from custom_mmpkg.custom_mmcv.utils.parrots_wrapper import _BatchNorm + +from custom_mmpkg.custom_mmseg.ops import Upsample, resize +from custom_mmpkg.custom_mmseg.utils import get_root_logger +from ..builder import BACKBONES +from .resnet import BasicBlock, Bottleneck + + +class HRModule(nn.Module): + """High-Resolution Module for HRNet. 
+ + In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange + is in this module. + """ + + def __init__(self, + num_branches, + blocks, + num_blocks, + in_channels, + num_channels, + multiscale_output=True, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True)): + super(HRModule, self).__init__() + self._check_branches(num_branches, num_blocks, in_channels, + num_channels) + + self.in_channels = in_channels + self.num_branches = num_branches + + self.multiscale_output = multiscale_output + self.norm_cfg = norm_cfg + self.conv_cfg = conv_cfg + self.with_cp = with_cp + self.branches = self._make_branches(num_branches, blocks, num_blocks, + num_channels) + self.fuse_layers = self._make_fuse_layers() + self.relu = nn.ReLU(inplace=False) + + def _check_branches(self, num_branches, num_blocks, in_channels, + num_channels): + """Check branches configuration.""" + if num_branches != len(num_blocks): + error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_BLOCKS(' \ + f'{len(num_blocks)})' + raise ValueError(error_msg) + + if num_branches != len(num_channels): + error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_CHANNELS(' \ + f'{len(num_channels)})' + raise ValueError(error_msg) + + if num_branches != len(in_channels): + error_msg = f'NUM_BRANCHES({num_branches}) <> NUM_INCHANNELS(' \ + f'{len(in_channels)})' + raise ValueError(error_msg) + + def _make_one_branch(self, + branch_index, + block, + num_blocks, + num_channels, + stride=1): + """Build one branch.""" + downsample = None + if stride != 1 or \ + self.in_channels[branch_index] != \ + num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + self.in_channels[branch_index], + num_channels[branch_index] * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, num_channels[branch_index] * + block.expansion)[1]) + + layers = [] + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + self.in_channels[branch_index] = \ + num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append( + block( + self.in_channels[branch_index], + num_channels[branch_index], + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + """Build multiple branch.""" + branches = [] + + for i in range(num_branches): + branches.append( + self._make_one_branch(i, block, num_blocks, num_channels)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + """Build fuse layer.""" + if self.num_branches == 1: + return None + + num_branches = self.num_branches + in_channels = self.in_channels + fuse_layers = [] + num_out_branches = num_branches if self.multiscale_output else 1 + for i in range(num_out_branches): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=1, + stride=1, + padding=0, + bias=False), + build_norm_layer(self.norm_cfg, in_channels[i])[1], + # we set align_corners=False for HRNet + Upsample( + scale_factor=2**(j - i), + mode='bilinear', + align_corners=False))) + elif j == i: + fuse_layer.append(None) + else: + conv_downsamples = [] + for k in range(i - j): 
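+                        # j < i: downsample branch j by a factor of 2**(i - j)
+                        # with stride-2 3x3 convs; only the last conv maps the
+                        # channels of branch j to those of branch i.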
+ if k == i - j - 1: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[i], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[i])[1])) + else: + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels[j], + in_channels[j], + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, + in_channels[j])[1], + nn.ReLU(inplace=False))) + fuse_layer.append(nn.Sequential(*conv_downsamples)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def forward(self, x): + """Forward function.""" + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i in range(self.num_branches): + x[i] = self.branches[i](x[i]) + + x_fuse = [] + for i in range(len(self.fuse_layers)): + y = 0 + for j in range(self.num_branches): + if i == j: + y += x[j] + elif j > i: + y = y + resize( + self.fuse_layers[i][j](x[j]), + size=x[i].shape[2:], + mode='bilinear', + align_corners=False) + else: + y += self.fuse_layers[i][j](x[j]) + x_fuse.append(self.relu(y)) + return x_fuse + + +@BACKBONES.register_module() +class HRNet(nn.Module): + """HRNet backbone. + + High-Resolution Representations for Labeling Pixels and Regions + arXiv: https://arxiv.org/abs/1904.04514 + + Args: + extra (dict): detailed configuration for each stage of HRNet. + in_channels (int): Number of input image channels. Normally 3. + conv_cfg (dict): dictionary to construct and config conv layer. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from custom_mmpkg.custom_mmseg.models import HRNet + >>> import torch + >>> extra = dict( + >>> stage1=dict( + >>> num_modules=1, + >>> num_branches=1, + >>> block='BOTTLENECK', + >>> num_blocks=(4, ), + >>> num_channels=(64, )), + >>> stage2=dict( + >>> num_modules=1, + >>> num_branches=2, + >>> block='BASIC', + >>> num_blocks=(4, 4), + >>> num_channels=(32, 64)), + >>> stage3=dict( + >>> num_modules=4, + >>> num_branches=3, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4), + >>> num_channels=(32, 64, 128)), + >>> stage4=dict( + >>> num_modules=3, + >>> num_branches=4, + >>> block='BASIC', + >>> num_blocks=(4, 4, 4, 4), + >>> num_channels=(32, 64, 128, 256))) + >>> self = HRNet(extra, in_channels=1) + >>> self.eval() + >>> inputs = torch.rand(1, 1, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 32, 8, 8) + (1, 64, 4, 4) + (1, 128, 2, 2) + (1, 256, 1, 1) + """ + + blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck} + + def __init__(self, + extra, + in_channels=3, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + with_cp=False, + zero_init_residual=False): + super(HRNet, self).__init__() + self.extra = extra + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + self.zero_init_residual = zero_init_residual + + # stem net + self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1) + self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2) + + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + self.conv_cfg, + 64, + 64, + kernel_size=3, + stride=2, + padding=1, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.relu = nn.ReLU(inplace=True) + + # stage 1 + self.stage1_cfg = self.extra['stage1'] + num_channels = self.stage1_cfg['num_channels'][0] + block_type = self.stage1_cfg['block'] + num_blocks = self.stage1_cfg['num_blocks'][0] + + block = self.blocks_dict[block_type] + stage1_out_channels = num_channels * block.expansion + self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) + + # stage 2 + self.stage2_cfg = self.extra['stage2'] + num_channels = self.stage2_cfg['num_channels'] + block_type = self.stage2_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition1 = self._make_transition_layer([stage1_out_channels], + num_channels) + self.stage2, pre_stage_channels = self._make_stage( + self.stage2_cfg, num_channels) + + # stage 3 + self.stage3_cfg = self.extra['stage3'] + num_channels = self.stage3_cfg['num_channels'] + block_type = self.stage3_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition2 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage3, pre_stage_channels = self._make_stage( + self.stage3_cfg, num_channels) + + # stage 4 + self.stage4_cfg = self.extra['stage4'] + num_channels = self.stage4_cfg['num_channels'] + block_type = self.stage4_cfg['block'] + + block = self.blocks_dict[block_type] + num_channels = [channel * block.expansion for channel in num_channels] + self.transition3 = self._make_transition_layer(pre_stage_channels, + num_channels) + self.stage4, pre_stage_channels = self._make_stage( + self.stage4_cfg, num_channels) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: the normalization layer named "norm2" """ + return getattr(self, self.norm2_name) + + def _make_transition_layer(self, num_channels_pre_layer, + num_channels_cur_layer): + """Make transition layer.""" + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + num_channels_pre_layer[i], + num_channels_cur_layer[i], + kernel_size=3, + stride=1, + padding=1, 
+ bias=False), + build_norm_layer(self.norm_cfg, + num_channels_cur_layer[i])[1], + nn.ReLU(inplace=True))) + else: + transition_layers.append(None) + else: + conv_downsamples = [] + for j in range(i + 1 - num_branches_pre): + in_channels = num_channels_pre_layer[-1] + out_channels = num_channels_cur_layer[i] \ + if j == i - num_branches_pre else in_channels + conv_downsamples.append( + nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + out_channels, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, out_channels)[1], + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv_downsamples)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, inplanes, planes, blocks, stride=1): + """Make each layer.""" + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + build_conv_layer( + self.conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) + + layers = [] + layers.append( + block( + inplanes, + planes, + stride, + downsample=downsample, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block( + inplanes, + planes, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, in_channels, multiscale_output=True): + """Make each stage.""" + num_modules = layer_config['num_modules'] + num_branches = layer_config['num_branches'] + num_blocks = layer_config['num_blocks'] + num_channels = layer_config['num_channels'] + block = self.blocks_dict[layer_config['block']] + + hr_modules = [] + for i in range(num_modules): + # multi_scale_output is only used for the last module + if not multiscale_output and i == num_modules - 1: + reset_multiscale_output = False + else: + reset_multiscale_output = True + + hr_modules.append( + HRModule( + num_branches, + block, + num_blocks, + in_channels, + num_channels, + reset_multiscale_output, + with_cp=self.with_cp, + norm_cfg=self.norm_cfg, + conv_cfg=self.conv_cfg)) + + return nn.Sequential(*hr_modules), in_channels + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.relu(x) + x = self.layer1(x) + + x_list = [] + for i in range(self.stage2_cfg['num_branches']): + if self.transition1[i] is not None: + x_list.append(self.transition1[i](x)) + else: + x_list.append(x) + y_list = self.stage2(x_list) + + x_list = [] + for i in range(self.stage3_cfg['num_branches']): + if self.transition2[i] is not None: + x_list.append(self.transition2[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage3(x_list) + + x_list = [] + for i in range(self.stage4_cfg['num_branches']): + if self.transition3[i] is not None: + x_list.append(self.transition3[i](y_list[-1])) + else: + x_list.append(y_list[i]) + y_list = self.stage4(x_list) + + return y_list + + def train(self, mode=True): + """Convert the model into training mode will keeping the normalization + layer freezed.""" + super(HRNet, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() diff --git a/custom_mmpkg/custom_mmseg/models/backbones/mobilenet_v2.py b/custom_mmpkg/custom_mmseg/models/backbones/mobilenet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..bcec93a22124fbc58f84cedd96d11f1e8dd90393 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/mobilenet_v2.py @@ -0,0 +1,180 @@ +import logging + +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule, constant_init, kaiming_init +from custom_mmpkg.custom_mmcv.runner import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from ..utils import InvertedResidual, make_divisible + + +@BACKBONES.register_module() +class MobileNetV2(nn.Module): + """MobileNetV2 backbone. + + Args: + widen_factor (float): Width multiplier, multiply number of + channels in each layer by this amount. Default: 1.0. + strides (Sequence[int], optional): Strides of the first block of each + layer. If not specified, default config in ``arch_setting`` will + be used. + dilations (Sequence[int]): Dilation of each layer. + out_indices (None or Sequence[int]): Output from which stages. + Default: (7, ). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. 
Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + # Parameters to build layers. 3 parameters are needed to construct a + # layer, from left to right: expand_ratio, channel, num_blocks. + arch_settings = [[1, 16, 1], [6, 24, 2], [6, 32, 3], [6, 64, 4], + [6, 96, 3], [6, 160, 3], [6, 320, 1]] + + def __init__(self, + widen_factor=1., + strides=(1, 2, 2, 2, 1, 2, 1), + dilations=(1, 1, 1, 1, 1, 1, 1), + out_indices=(1, 2, 4, 6), + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + norm_eval=False, + with_cp=False): + super(MobileNetV2, self).__init__() + self.widen_factor = widen_factor + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == len(self.arch_settings) + self.out_indices = out_indices + for index in out_indices: + if index not in range(0, 7): + raise ValueError('the item in out_indices must in ' + f'range(0, 8). But received {index}') + + if frozen_stages not in range(-1, 7): + raise ValueError('frozen_stages must be in range(-1, 7). ' + f'But received {frozen_stages}') + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.norm_eval = norm_eval + self.with_cp = with_cp + + self.in_channels = make_divisible(32 * widen_factor, 8) + + self.conv1 = ConvModule( + in_channels=3, + out_channels=self.in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.layers = [] + + for i, layer_cfg in enumerate(self.arch_settings): + expand_ratio, channel, num_blocks = layer_cfg + stride = self.strides[i] + dilation = self.dilations[i] + out_channels = make_divisible(channel * widen_factor, 8) + inverted_res_layer = self.make_layer( + out_channels=out_channels, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + expand_ratio=expand_ratio) + layer_name = f'layer{i + 1}' + self.add_module(layer_name, inverted_res_layer) + self.layers.append(layer_name) + + def make_layer(self, out_channels, num_blocks, stride, dilation, + expand_ratio): + """Stack InvertedResidual blocks to build a layer for MobileNetV2. + + Args: + out_channels (int): out_channels of block. + num_blocks (int): Number of blocks. + stride (int): Stride of the first block. + dilation (int): Dilation of the first block. + expand_ratio (int): Expand the number of channels of the + hidden layer in InvertedResidual by this ratio. 
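+        Returns:
+            nn.Sequential: A stack of ``num_blocks`` InvertedResidual blocks;
+                only the first block uses ``stride`` and ``dilation``, the
+                remaining blocks use stride 1 and dilation 1.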
+ """ + layers = [] + for i in range(num_blocks): + layers.append( + InvertedResidual( + self.in_channels, + out_channels, + stride if i == 0 else 1, + expand_ratio=expand_ratio, + dilation=dilation if i == 0 else 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + with_cp=self.with_cp)) + self.in_channels = out_channels + + return nn.Sequential(*layers) + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + x = self.conv1(x) + + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + + if len(outs) == 1: + return outs[0] + else: + return tuple(outs) + + def _freeze_stages(self): + if self.frozen_stages >= 0: + for param in self.conv1.parameters(): + param.requires_grad = False + for i in range(1, self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(MobileNetV2, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/custom_mmpkg/custom_mmseg/models/backbones/mobilenet_v3.py b/custom_mmpkg/custom_mmseg/models/backbones/mobilenet_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..172103273f385b8dcd4e89a7f8ee0714be87113e --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/mobilenet_v3.py @@ -0,0 +1,255 @@ +import logging + +import custom_mmpkg.custom_mmcv as mmcv +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule, constant_init, kaiming_init +from custom_mmpkg.custom_mmcv.cnn.bricks import Conv2dAdaptivePadding +from custom_mmpkg.custom_mmcv.runner import load_checkpoint +from torch.nn.modules.batchnorm import _BatchNorm + +from ..builder import BACKBONES +from ..utils import InvertedResidualV3 as InvertedResidual + + +@BACKBONES.register_module() +class MobileNetV3(nn.Module): + """MobileNetV3 backbone. + + This backbone is the improved implementation of `Searching for MobileNetV3 + `_. + + Args: + arch (str): Architecture of mobilnetv3, from {'small', 'large'}. + Default: 'small'. + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + out_indices (tuple[int]): Output from which layer. + Default: (0, 1, 12). + frozen_stages (int): Stages to be frozen (all param fixed). + Default: -1, which means not freezing any parameters. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + with_cp (bool): Use checkpoint or not. Using checkpoint will save + some memory while slowing down the training speed. + Default: False. 
+ """ + # Parameters to build each block: + # [kernel size, mid channels, out channels, with_se, act type, stride] + arch_settings = { + 'small': [[3, 16, 16, True, 'ReLU', 2], # block0 layer1 os=4 + [3, 72, 24, False, 'ReLU', 2], # block1 layer2 os=8 + [3, 88, 24, False, 'ReLU', 1], + [5, 96, 40, True, 'HSwish', 2], # block2 layer4 os=16 + [5, 240, 40, True, 'HSwish', 1], + [5, 240, 40, True, 'HSwish', 1], + [5, 120, 48, True, 'HSwish', 1], # block3 layer7 os=16 + [5, 144, 48, True, 'HSwish', 1], + [5, 288, 96, True, 'HSwish', 2], # block4 layer9 os=32 + [5, 576, 96, True, 'HSwish', 1], + [5, 576, 96, True, 'HSwish', 1]], + 'large': [[3, 16, 16, False, 'ReLU', 1], # block0 layer1 os=2 + [3, 64, 24, False, 'ReLU', 2], # block1 layer2 os=4 + [3, 72, 24, False, 'ReLU', 1], + [5, 72, 40, True, 'ReLU', 2], # block2 layer4 os=8 + [5, 120, 40, True, 'ReLU', 1], + [5, 120, 40, True, 'ReLU', 1], + [3, 240, 80, False, 'HSwish', 2], # block3 layer7 os=16 + [3, 200, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 184, 80, False, 'HSwish', 1], + [3, 480, 112, True, 'HSwish', 1], # block4 layer11 os=16 + [3, 672, 112, True, 'HSwish', 1], + [5, 672, 160, True, 'HSwish', 2], # block5 layer13 os=32 + [5, 960, 160, True, 'HSwish', 1], + [5, 960, 160, True, 'HSwish', 1]] + } # yapf: disable + + def __init__(self, + arch='small', + conv_cfg=None, + norm_cfg=dict(type='BN'), + out_indices=(0, 1, 12), + frozen_stages=-1, + reduction_factor=1, + norm_eval=False, + with_cp=False): + super(MobileNetV3, self).__init__() + assert arch in self.arch_settings + assert isinstance(reduction_factor, int) and reduction_factor > 0 + assert mmcv.is_tuple_of(out_indices, int) + for index in out_indices: + if index not in range(0, len(self.arch_settings[arch]) + 2): + raise ValueError( + 'the item in out_indices must in ' + f'range(0, {len(self.arch_settings[arch])+2}). ' + f'But received {index}') + + if frozen_stages not in range(-1, len(self.arch_settings[arch]) + 2): + raise ValueError('frozen_stages must be in range(-1, ' + f'{len(self.arch_settings[arch])+2}). 
' + f'But received {frozen_stages}') + self.arch = arch + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.out_indices = out_indices + self.frozen_stages = frozen_stages + self.reduction_factor = reduction_factor + self.norm_eval = norm_eval + self.with_cp = with_cp + self.layers = self._make_layer() + + def _make_layer(self): + layers = [] + + # build the first layer (layer0) + in_channels = 16 + layer = ConvModule( + in_channels=3, + out_channels=in_channels, + kernel_size=3, + stride=2, + padding=1, + conv_cfg=dict(type='Conv2dAdaptivePadding'), + norm_cfg=self.norm_cfg, + act_cfg=dict(type='HSwish')) + self.add_module('layer0', layer) + layers.append('layer0') + + layer_setting = self.arch_settings[self.arch] + for i, params in enumerate(layer_setting): + (kernel_size, mid_channels, out_channels, with_se, act, + stride) = params + + if self.arch == 'large' and i >= 12 or self.arch == 'small' and \ + i >= 8: + mid_channels = mid_channels // self.reduction_factor + out_channels = out_channels // self.reduction_factor + + if with_se: + se_cfg = dict( + channels=mid_channels, + ratio=4, + act_cfg=(dict(type='ReLU'), + dict(type='HSigmoid', bias=3.0, divisor=6.0))) + else: + se_cfg = None + + layer = InvertedResidual( + in_channels=in_channels, + out_channels=out_channels, + mid_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + se_cfg=se_cfg, + with_expand_conv=(in_channels != mid_channels), + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type=act), + with_cp=self.with_cp) + in_channels = out_channels + layer_name = 'layer{}'.format(i + 1) + self.add_module(layer_name, layer) + layers.append(layer_name) + + # build the last layer + # block5 layer12 os=32 for small model + # block6 layer16 os=32 for large model + layer = ConvModule( + in_channels=in_channels, + out_channels=576 if self.arch == 'small' else 960, + kernel_size=1, + stride=1, + dilation=4, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=dict(type='HSwish')) + layer_name = 'layer{}'.format(len(layer_setting) + 1) + self.add_module(layer_name, layer) + layers.append(layer_name) + + # next, convert backbone MobileNetV3 to a semantic segmentation version + if self.arch == 'small': + self.layer4.depthwise_conv.conv.stride = (1, 1) + self.layer9.depthwise_conv.conv.stride = (1, 1) + for i in range(4, len(layers)): + layer = getattr(self, layers[i]) + if isinstance(layer, InvertedResidual): + modified_module = layer.depthwise_conv.conv + else: + modified_module = layer.conv + + if i < 9: + modified_module.dilation = (2, 2) + pad = 2 + else: + modified_module.dilation = (4, 4) + pad = 4 + + if not isinstance(modified_module, Conv2dAdaptivePadding): + # Adjust padding + pad *= (modified_module.kernel_size[0] - 1) // 2 + modified_module.padding = (pad, pad) + else: + self.layer7.depthwise_conv.conv.stride = (1, 1) + self.layer13.depthwise_conv.conv.stride = (1, 1) + for i in range(7, len(layers)): + layer = getattr(self, layers[i]) + if isinstance(layer, InvertedResidual): + modified_module = layer.depthwise_conv.conv + else: + modified_module = layer.conv + + if i < 13: + modified_module.dilation = (2, 2) + pad = 2 + else: + modified_module.dilation = (4, 4) + pad = 4 + + if not isinstance(modified_module, Conv2dAdaptivePadding): + # Adjust padding + pad *= (modified_module.kernel_size[0] - 1) // 2 + modified_module.padding = (pad, pad) + + return layers + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = logging.getLogger() 
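+            # strict=False tolerates checkpoints whose keys do not exactly
+            # match this segmentation-adapted backbone.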
+ load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, nn.BatchNorm2d): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + outs = [] + for i, layer_name in enumerate(self.layers): + layer = getattr(self, layer_name) + x = layer(x) + if i in self.out_indices: + outs.append(x) + return outs + + def _freeze_stages(self): + for i in range(self.frozen_stages + 1): + layer = getattr(self, f'layer{i}') + layer.eval() + for param in layer.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(MobileNetV3, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, _BatchNorm): + m.eval() diff --git a/custom_mmpkg/custom_mmseg/models/backbones/resnest.py b/custom_mmpkg/custom_mmseg/models/backbones/resnest.py new file mode 100644 index 0000000000000000000000000000000000000000..3ea8fbe3aa2149de6367abf11273ec845a17e013 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/resnest.py @@ -0,0 +1,314 @@ +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from custom_mmpkg.custom_mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from ..utils import ResLayer +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNetV1d + + +class RSoftmax(nn.Module): + """Radix Softmax module in ``SplitAttentionConv2d``. + + Args: + radix (int): Radix of input. + groups (int): Groups of input. + """ + + def __init__(self, radix, groups): + super().__init__() + self.radix = radix + self.groups = groups + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.groups, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x + + +class SplitAttentionConv2d(nn.Module): + """Split-Attention Conv2d in ResNeSt. + + Args: + in_channels (int): Same as nn.Conv2d. + out_channels (int): Same as nn.Conv2d. + kernel_size (int | tuple[int]): Same as nn.Conv2d. + stride (int | tuple[int]): Same as nn.Conv2d. + padding (int | tuple[int]): Same as nn.Conv2d. + dilation (int | tuple[int]): Same as nn.Conv2d. + groups (int): Same as nn.Conv2d. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels. Default: 4. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. Default: None. + dcn (dict): Config dict for DCN. Default: None. 
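+        Example (a minimal sketch with the default radix/groups settings):
+            >>> import torch
+            >>> conv = SplitAttentionConv2d(64, 64, kernel_size=3, padding=1)
+            >>> conv(torch.rand(2, 64, 56, 56)).shape
+            torch.Size([2, 64, 56, 56])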
+ """ + + def __init__(self, + in_channels, + channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + radix=2, + reduction_factor=4, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None): + super(SplitAttentionConv2d, self).__init__() + inter_channels = max(in_channels * radix // reduction_factor, 32) + self.radix = radix + self.groups = groups + self.channels = channels + self.with_dcn = dcn is not None + self.dcn = dcn + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if self.with_dcn and not fallback_on_stride: + assert conv_cfg is None, 'conv_cfg must be None for DCN' + conv_cfg = dcn + self.conv = build_conv_layer( + conv_cfg, + in_channels, + channels * radix, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups * radix, + bias=False) + self.norm0_name, norm0 = build_norm_layer( + norm_cfg, channels * radix, postfix=0) + self.add_module(self.norm0_name, norm0) + self.relu = nn.ReLU(inplace=True) + self.fc1 = build_conv_layer( + None, channels, inter_channels, 1, groups=self.groups) + self.norm1_name, norm1 = build_norm_layer( + norm_cfg, inter_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.fc2 = build_conv_layer( + None, inter_channels, channels * radix, 1, groups=self.groups) + self.rsoftmax = RSoftmax(radix, groups) + + @property + def norm0(self): + """nn.Module: the normalization layer named "norm0" """ + return getattr(self, self.norm0_name) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def forward(self, x): + x = self.conv(x) + x = self.norm0(x) + x = self.relu(x) + + batch, rchannel = x.shape[:2] + batch = x.size(0) + if self.radix > 1: + splits = x.view(batch, self.radix, -1, *x.shape[2:]) + gap = splits.sum(dim=1) + else: + gap = x + gap = F.adaptive_avg_pool2d(gap, 1) + gap = self.fc1(gap) + + gap = self.norm1(gap) + gap = self.relu(gap) + + atten = self.fc2(gap) + atten = self.rsoftmax(atten).view(batch, -1, 1, 1) + + if self.radix > 1: + attens = atten.view(batch, self.radix, -1, *atten.shape[2:]) + out = torch.sum(attens * splits, dim=1) + else: + out = atten * x + return out.contiguous() + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeSt. + + Args: + inplane (int): Input planes of this block. + planes (int): Middle planes of this block. + groups (int): Groups of conv2. + width_per_group (int): Width per group of conv2. 64x4d indicates + ``groups=64, width_per_group=4`` and 32x8d indicates + ``groups=32, width_per_group=8``. + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Key word arguments for base class. 
+ """ + expansion = 4 + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + """Bottleneck block for ResNeSt.""" + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.avg_down_stride = avg_down_stride and self.conv2_stride > 1 + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + self.with_modulated_dcn = False + self.conv2 = SplitAttentionConv2d( + width, + width, + kernel_size=3, + stride=1 if self.avg_down_stride else self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + radix=radix, + reduction_factor=reduction_factor, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + dcn=self.dcn) + delattr(self, self.norm2_name) + + if self.avg_down_stride: + self.avd_layer = nn.AvgPool2d(3, self.conv2_stride, padding=1) + + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + def forward(self, x): + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + + if self.avg_down_stride: + out = self.avd_layer(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class ResNeSt(ResNetV1d): + """ResNeSt backbone. + + Args: + groups (int): Number of groups of Bottleneck. Default: 1 + base_width (int): Base width of Bottleneck. Default: 4 + radix (int): Radix of SpltAtConv2d. Default: 2 + reduction_factor (int): Reduction factor of inter_channels in + SplitAttentionConv2d. Default: 4. + avg_down_stride (bool): Whether to use average pool for stride in + Bottleneck. Default: True. + kwargs (dict): Keyword arguments for ResNet. 
+ """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)), + 200: (Bottleneck, (3, 24, 36, 3)) + } + + def __init__(self, + groups=1, + base_width=4, + radix=2, + reduction_factor=4, + avg_down_stride=True, + **kwargs): + self.groups = groups + self.base_width = base_width + self.radix = radix + self.reduction_factor = reduction_factor + self.avg_down_stride = avg_down_stride + super(ResNeSt, self).__init__(**kwargs) + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + radix=self.radix, + reduction_factor=self.reduction_factor, + avg_down_stride=self.avg_down_stride, + **kwargs) diff --git a/custom_mmpkg/custom_mmseg/models/backbones/resnet.py b/custom_mmpkg/custom_mmseg/models/backbones/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..9585254cabbf84fd54cf1644b6bd7c8304f730b8 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/resnet.py @@ -0,0 +1,688 @@ +import torch.nn as nn +import torch.utils.checkpoint as cp +from custom_mmpkg.custom_mmcv.cnn import (build_conv_layer, build_norm_layer, build_plugin_layer, + constant_init, kaiming_init) +from custom_mmpkg.custom_mmcv.runner import load_checkpoint +from custom_mmpkg.custom_mmcv.utils.parrots_wrapper import _BatchNorm + +from custom_mmpkg.custom_mmseg.utils import get_root_logger +from ..builder import BACKBONES +from ..utils import ResLayer + + +class BasicBlock(nn.Module): + """Basic block for ResNet.""" + + expansion = 1 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None): + super(BasicBlock, self).__init__() + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + 3, + stride=stride, + padding=dilation, + dilation=dilation, + bias=False) + self.add_module(self.norm1_name, norm1) + self.conv2 = build_conv_layer( + conv_cfg, planes, planes, 3, padding=1, bias=False) + self.add_module(self.norm2_name, norm2) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.with_cp = with_cp + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.norm2(out) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + """Bottleneck block for ResNet. 
+ + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is + "caffe", the stride-two layer is the first 1x1 conv layer. + """ + + expansion = 4 + + def __init__(self, + inplanes, + planes, + stride=1, + dilation=1, + downsample=None, + style='pytorch', + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + dcn=None, + plugins=None): + super(Bottleneck, self).__init__() + assert style in ['pytorch', 'caffe'] + assert dcn is None or isinstance(dcn, dict) + assert plugins is None or isinstance(plugins, list) + if plugins is not None: + allowed_position = ['after_conv1', 'after_conv2', 'after_conv3'] + assert all(p['position'] in allowed_position for p in plugins) + + self.inplanes = inplanes + self.planes = planes + self.stride = stride + self.dilation = dilation + self.style = style + self.with_cp = with_cp + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.dcn = dcn + self.with_dcn = dcn is not None + self.plugins = plugins + self.with_plugins = plugins is not None + + if self.with_plugins: + # collect plugins for conv1/conv2/conv3 + self.after_conv1_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv1' + ] + self.after_conv2_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv2' + ] + self.after_conv3_plugins = [ + plugin['cfg'] for plugin in plugins + if plugin['position'] == 'after_conv3' + ] + + if self.style == 'pytorch': + self.conv1_stride = 1 + self.conv2_stride = stride + else: + self.conv1_stride = stride + self.conv2_stride = 1 + + self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1) + self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + norm_cfg, planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + conv_cfg, + inplanes, + planes, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + if self.with_dcn: + fallback_on_stride = dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + conv_cfg, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + dcn, + planes, + planes, + kernel_size=3, + stride=self.conv2_stride, + padding=dilation, + dilation=dilation, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + conv_cfg, + planes, + planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + + if self.with_plugins: + self.after_conv1_plugin_names = self.make_block_plugins( + planes, self.after_conv1_plugins) + self.after_conv2_plugin_names = self.make_block_plugins( + planes, self.after_conv2_plugins) + self.after_conv3_plugin_names = self.make_block_plugins( + planes * self.expansion, self.after_conv3_plugins) + + def make_block_plugins(self, in_channels, plugins): + """make plugins for block. + + Args: + in_channels (int): Input channels of plugin. + plugins (list[dict]): List of plugins cfg to build. + + Returns: + list[str]: List of the names of plugin. 
+ """ + assert isinstance(plugins, list) + plugin_names = [] + for plugin in plugins: + plugin = plugin.copy() + name, layer = build_plugin_layer( + plugin, + in_channels=in_channels, + postfix=plugin.pop('postfix', '')) + assert not hasattr(self, name), f'duplicate plugin {name}' + self.add_module(name, layer) + plugin_names.append(name) + return plugin_names + + def forward_plugin(self, x, plugin_names): + """Forward function for plugins.""" + out = x + for name in plugin_names: + out = getattr(self, name)(x) + return out + + @property + def norm1(self): + """nn.Module: normalization layer after the first convolution layer""" + return getattr(self, self.norm1_name) + + @property + def norm2(self): + """nn.Module: normalization layer after the second convolution layer""" + return getattr(self, self.norm2_name) + + @property + def norm3(self): + """nn.Module: normalization layer after the third convolution layer""" + return getattr(self, self.norm3_name) + + def forward(self, x): + """Forward function.""" + + def _inner_forward(x): + identity = x + + out = self.conv1(x) + out = self.norm1(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv1_plugin_names) + + out = self.conv2(out) + out = self.norm2(out) + out = self.relu(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv2_plugin_names) + + out = self.conv3(out) + out = self.norm3(out) + + if self.with_plugins: + out = self.forward_plugin(out, self.after_conv3_plugin_names) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + out = self.relu(out) + + return out + + +@BACKBONES.register_module() +class ResNet(nn.Module): + """ResNet backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Default" 3. + stem_channels (int): Number of stem channels. Default: 64. + base_channels (int): Number of base channels of res layer. Default: 64. + num_stages (int): Resnet stages, normally 4. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + deep_stem (bool): Replace 7x7 conv in input stem with 3 3x3 conv + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + norm_cfg (dict): Dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + plugins (list[dict]): List of plugins for stages, each dict contains: + + - cfg (dict, required): Cfg dict to build plugin. + + - position (str, required): Position inside block to insert plugin, + options: 'after_conv1', 'after_conv2', 'after_conv3'. + + - stages (tuple[bool], optional): Stages to apply plugin, length + should be same as 'num_stages' + multi_grid (Sequence[int]|None): Multi grid dilation rates of last + stage. 
Default: None + contract_dilation (bool): Whether contract first dilation of each layer + Default: False + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): Whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from custom_mmpkg.custom_mmseg.models import ResNet + >>> import torch + >>> self = ResNet(depth=18) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... print(tuple(level_out.shape)) + (1, 64, 8, 8) + (1, 128, 4, 4) + (1, 256, 2, 2) + (1, 512, 1, 1) + """ + + arch_settings = { + 18: (BasicBlock, (2, 2, 2, 2)), + 34: (BasicBlock, (3, 4, 6, 3)), + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, + depth, + in_channels=3, + stem_channels=64, + base_channels=64, + num_stages=4, + strides=(1, 2, 2, 2), + dilations=(1, 1, 1, 1), + out_indices=(0, 1, 2, 3), + style='pytorch', + deep_stem=False, + avg_down=False, + frozen_stages=-1, + conv_cfg=None, + norm_cfg=dict(type='BN', requires_grad=True), + norm_eval=False, + dcn=None, + stage_with_dcn=(False, False, False, False), + plugins=None, + multi_grid=None, + contract_dilation=False, + with_cp=False, + zero_init_residual=True): + super(ResNet, self).__init__() + if depth not in self.arch_settings: + raise KeyError(f'invalid depth {depth} for resnet') + self.depth = depth + self.stem_channels = stem_channels + self.base_channels = base_channels + self.num_stages = num_stages + assert num_stages >= 1 and num_stages <= 4 + self.strides = strides + self.dilations = dilations + assert len(strides) == len(dilations) == num_stages + self.out_indices = out_indices + assert max(out_indices) < num_stages + self.style = style + self.deep_stem = deep_stem + self.avg_down = avg_down + self.frozen_stages = frozen_stages + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.with_cp = with_cp + self.norm_eval = norm_eval + self.dcn = dcn + self.stage_with_dcn = stage_with_dcn + if dcn is not None: + assert len(stage_with_dcn) == num_stages + self.plugins = plugins + self.multi_grid = multi_grid + self.contract_dilation = contract_dilation + self.zero_init_residual = zero_init_residual + self.block, stage_blocks = self.arch_settings[depth] + self.stage_blocks = stage_blocks[:num_stages] + self.inplanes = stem_channels + + self._make_stem_layer(in_channels, stem_channels) + + self.res_layers = [] + for i, num_blocks in enumerate(self.stage_blocks): + stride = strides[i] + dilation = dilations[i] + dcn = self.dcn if self.stage_with_dcn[i] else None + if plugins is not None: + stage_plugins = self.make_stage_plugins(plugins, i) + else: + stage_plugins = None + # multi grid is applied to last layer only + stage_multi_grid = multi_grid if i == len( + self.stage_blocks) - 1 else None + planes = base_channels * 2**i + res_layer = self.make_res_layer( + block=self.block, + inplanes=self.inplanes, + planes=planes, + num_blocks=num_blocks, + stride=stride, + dilation=dilation, + style=self.style, + avg_down=self.avg_down, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + dcn=dcn, + plugins=stage_plugins, + multi_grid=stage_multi_grid, + contract_dilation=contract_dilation) + self.inplanes = planes * self.block.expansion + layer_name = f'layer{i+1}' + self.add_module(layer_name, res_layer) + self.res_layers.append(layer_name) 
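+
+        # Note (added): stage i outputs base_channels * 2**i * block.expansion
+        # channels and downsamples according to strides[i]; res_layers stores
+        # the attribute names so forward() can fetch the stages in order.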
+ + self._freeze_stages() + + self.feat_dim = self.block.expansion * base_channels * 2**( + len(self.stage_blocks) - 1) + + def make_stage_plugins(self, plugins, stage_idx): + """make plugins for ResNet 'stage_idx'th stage . + + Currently we support to insert 'context_block', + 'empirical_attention_block', 'nonlocal_block' into the backbone like + ResNet/ResNeXt. They could be inserted after conv1/conv2/conv3 of + Bottleneck. + + An example of plugins format could be : + >>> plugins=[ + ... dict(cfg=dict(type='xxx', arg1='xxx'), + ... stages=(False, True, True, True), + ... position='after_conv2'), + ... dict(cfg=dict(type='yyy'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='1'), + ... stages=(True, True, True, True), + ... position='after_conv3'), + ... dict(cfg=dict(type='zzz', postfix='2'), + ... stages=(True, True, True, True), + ... position='after_conv3') + ... ] + >>> self = ResNet(depth=18) + >>> stage_plugins = self.make_stage_plugins(plugins, 0) + >>> assert len(stage_plugins) == 3 + + Suppose 'stage_idx=0', the structure of blocks in the stage would be: + conv1-> conv2->conv3->yyy->zzz1->zzz2 + Suppose 'stage_idx=1', the structure of blocks in the stage would be: + conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2 + + If stages is missing, the plugin would be applied to all stages. + + Args: + plugins (list[dict]): List of plugins cfg to build. The postfix is + required if multiple same type plugins are inserted. + stage_idx (int): Index of stage to build + + Returns: + list[dict]: Plugins for current stage + """ + stage_plugins = [] + for plugin in plugins: + plugin = plugin.copy() + stages = plugin.pop('stages', None) + assert stages is None or len(stages) == self.num_stages + # whether to insert plugin into current stage + if stages is None or stages[stage_idx]: + stage_plugins.append(plugin) + + return stage_plugins + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``.""" + return ResLayer(**kwargs) + + @property + def norm1(self): + """nn.Module: the normalization layer named "norm1" """ + return getattr(self, self.norm1_name) + + def _make_stem_layer(self, in_channels, stem_channels): + """Make stem layer for ResNet.""" + if self.deep_stem: + self.stem = nn.Sequential( + build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels // 2, + kernel_size=3, + stride=2, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels // 2, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels // 2)[1], + nn.ReLU(inplace=True), + build_conv_layer( + self.conv_cfg, + stem_channels // 2, + stem_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False), + build_norm_layer(self.norm_cfg, stem_channels)[1], + nn.ReLU(inplace=True)) + else: + self.conv1 = build_conv_layer( + self.conv_cfg, + in_channels, + stem_channels, + kernel_size=7, + stride=2, + padding=3, + bias=False) + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, stem_channels, postfix=1) + self.add_module(self.norm1_name, norm1) + self.relu = nn.ReLU(inplace=True) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + def _freeze_stages(self): + """Freeze stages param and norm stats.""" + if self.frozen_stages >= 0: + if self.deep_stem: + self.stem.eval() + for param in self.stem.parameters(): + param.requires_grad = 
False + else: + self.norm1.eval() + for m in [self.conv1, self.norm1]: + for param in m.parameters(): + param.requires_grad = False + + for i in range(1, self.frozen_stages + 1): + m = getattr(self, f'layer{i}') + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + + if self.dcn is not None: + for m in self.modules(): + if isinstance(m, Bottleneck) and hasattr( + m, 'conv2_offset'): + constant_init(m.conv2_offset, 0) + + if self.zero_init_residual: + for m in self.modules(): + if isinstance(m, Bottleneck): + constant_init(m.norm3, 0) + elif isinstance(m, BasicBlock): + constant_init(m.norm2, 0) + else: + raise TypeError('pretrained must be a str or None') + + def forward(self, x): + """Forward function.""" + if self.deep_stem: + x = self.stem(x) + else: + x = self.conv1(x) + x = self.norm1(x) + x = self.relu(x) + x = self.maxpool(x) + outs = [] + for i, layer_name in enumerate(self.res_layers): + res_layer = getattr(self, layer_name) + x = res_layer(x) + if i in self.out_indices: + outs.append(x) + return tuple(outs) + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(ResNet, self).train(mode) + self._freeze_stages() + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + +@BACKBONES.register_module() +class ResNetV1c(ResNet): + """ResNetV1c variant described in [1]_. + + Compared with default ResNet(ResNetV1b), ResNetV1c replaces the 7x7 conv + in the input stem with three 3x3 convs. + + References: + .. [1] https://arxiv.org/pdf/1812.01187.pdf + """ + + def __init__(self, **kwargs): + super(ResNetV1c, self).__init__( + deep_stem=True, avg_down=False, **kwargs) + + +@BACKBONES.register_module() +class ResNetV1d(ResNet): + """ResNetV1d variant described in [1]_. + + Compared with default ResNet(ResNetV1b), ResNetV1d replaces the 7x7 conv in + the input stem with three 3x3 convs. And in the downsampling block, a 2x2 + avg_pool with stride 2 is added before conv, whose stride is changed to 1. + """ + + def __init__(self, **kwargs): + super(ResNetV1d, self).__init__( + deep_stem=True, avg_down=True, **kwargs) diff --git a/custom_mmpkg/custom_mmseg/models/backbones/resnext.py b/custom_mmpkg/custom_mmseg/models/backbones/resnext.py new file mode 100644 index 0000000000000000000000000000000000000000..d6a2910074e1671c2e7db2fd3e86f995c590d18b --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/resnext.py @@ -0,0 +1,145 @@ +import math + +from custom_mmpkg.custom_mmcv.cnn import build_conv_layer, build_norm_layer + +from ..builder import BACKBONES +from ..utils import ResLayer +from .resnet import Bottleneck as _Bottleneck +from .resnet import ResNet + + +class Bottleneck(_Bottleneck): + """Bottleneck block for ResNeXt. + + If style is "pytorch", the stride-two layer is the 3x3 conv layer, if it is + "caffe", the stride-two layer is the first 1x1 conv layer. 
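+
+    Note (added for clarity): with ``groups > 1`` the conv2 width becomes
+    ``floor(planes * base_width / base_channels) * groups``; e.g. for the
+    first stage of a 32x4d ResNeXt-50, ``floor(64 * 4 / 64) * 32 = 128``.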
+ """ + + def __init__(self, + inplanes, + planes, + groups=1, + base_width=4, + base_channels=64, + **kwargs): + super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + + if groups == 1: + width = self.planes + else: + width = math.floor(self.planes * + (base_width / base_channels)) * groups + + self.norm1_name, norm1 = build_norm_layer( + self.norm_cfg, width, postfix=1) + self.norm2_name, norm2 = build_norm_layer( + self.norm_cfg, width, postfix=2) + self.norm3_name, norm3 = build_norm_layer( + self.norm_cfg, self.planes * self.expansion, postfix=3) + + self.conv1 = build_conv_layer( + self.conv_cfg, + self.inplanes, + width, + kernel_size=1, + stride=self.conv1_stride, + bias=False) + self.add_module(self.norm1_name, norm1) + fallback_on_stride = False + self.with_modulated_dcn = False + if self.with_dcn: + fallback_on_stride = self.dcn.pop('fallback_on_stride', False) + if not self.with_dcn or fallback_on_stride: + self.conv2 = build_conv_layer( + self.conv_cfg, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + else: + assert self.conv_cfg is None, 'conv_cfg must be None for DCN' + self.conv2 = build_conv_layer( + self.dcn, + width, + width, + kernel_size=3, + stride=self.conv2_stride, + padding=self.dilation, + dilation=self.dilation, + groups=groups, + bias=False) + + self.add_module(self.norm2_name, norm2) + self.conv3 = build_conv_layer( + self.conv_cfg, + width, + self.planes * self.expansion, + kernel_size=1, + bias=False) + self.add_module(self.norm3_name, norm3) + + +@BACKBONES.register_module() +class ResNeXt(ResNet): + """ResNeXt backbone. + + Args: + depth (int): Depth of resnet, from {18, 34, 50, 101, 152}. + in_channels (int): Number of input image channels. Normally 3. + num_stages (int): Resnet stages, normally 4. + groups (int): Group of resnext. + base_width (int): Base width of resnext. + strides (Sequence[int]): Strides of the first block of each stage. + dilations (Sequence[int]): Dilation of each stage. + out_indices (Sequence[int]): Output from which stages. + style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two + layer is the 3x3 conv layer, otherwise the stride-two layer is + the first 1x1 conv layer. + frozen_stages (int): Stages to be frozen (all param fixed). -1 means + not freezing any parameters. + norm_cfg (dict): dictionary to construct and config norm layer. + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. + zero_init_residual (bool): whether to use zero init for last norm layer + in resblocks to let them behave as identity. + + Example: + >>> from custom_mmpkg.custom_mmseg.models import ResNeXt + >>> import torch + >>> self = ResNeXt(depth=50) + >>> self.eval() + >>> inputs = torch.rand(1, 3, 32, 32) + >>> level_outputs = self.forward(inputs) + >>> for level_out in level_outputs: + ... 
print(tuple(level_out.shape)) + (1, 256, 8, 8) + (1, 512, 4, 4) + (1, 1024, 2, 2) + (1, 2048, 1, 1) + """ + + arch_settings = { + 50: (Bottleneck, (3, 4, 6, 3)), + 101: (Bottleneck, (3, 4, 23, 3)), + 152: (Bottleneck, (3, 8, 36, 3)) + } + + def __init__(self, groups=1, base_width=4, **kwargs): + self.groups = groups + self.base_width = base_width + super(ResNeXt, self).__init__(**kwargs) + + def make_res_layer(self, **kwargs): + """Pack all blocks in a stage into a ``ResLayer``""" + return ResLayer( + groups=self.groups, + base_width=self.base_width, + base_channels=self.base_channels, + **kwargs) diff --git a/custom_mmpkg/custom_mmseg/models/backbones/unet.py b/custom_mmpkg/custom_mmseg/models/backbones/unet.py new file mode 100644 index 0000000000000000000000000000000000000000..694272114506e42ebc2531996432a567e1e588b6 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/unet.py @@ -0,0 +1,429 @@ +import torch.nn as nn +import torch.utils.checkpoint as cp +from custom_mmpkg.custom_mmcv.cnn import (UPSAMPLE_LAYERS, ConvModule, build_activation_layer, + build_norm_layer, constant_init, kaiming_init) +from custom_mmpkg.custom_mmcv.runner import load_checkpoint +from custom_mmpkg.custom_mmcv.utils.parrots_wrapper import _BatchNorm + +from custom_mmpkg.custom_mmseg.utils import get_root_logger +from ..builder import BACKBONES +from ..utils import UpConvBlock + + +class BasicConvBlock(nn.Module): + """Basic convolutional block for UNet. + + This module consists of several plain convolutional layers. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_convs (int): Number of convolutional layers. Default: 2. + stride (int): Whether use stride convolution to downsample + the input feature map. If stride=2, it only uses stride convolution + in the first convolutional layer to downsample the input feature + map. Options are 1 or 2. Default: 1. + dilation (int): Whether use dilated convolution to expand the + receptive field. Set dilation rate of each convolutional layer and + the dilation rate of the first convolutional layer is always 1. + Default: 1. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + conv_cfg (dict | None): Config dict for convolution layer. + Default: None. + norm_cfg (dict | None): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict | None): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU'). + dcn (bool): Use deformable convolution in convolutional layer or not. + Default: None. + plugins (dict): plugins for convolutional layers. Default: None. + """ + + def __init__(self, + in_channels, + out_channels, + num_convs=2, + stride=1, + dilation=1, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + dcn=None, + plugins=None): + super(BasicConvBlock, self).__init__() + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' 
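+
+        # Note (added): only the first conv in the stack may downsample
+        # (stride) and it always uses dilation 1; the remaining convs keep
+        # stride 1 and apply the requested dilation, as described above.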
+ + self.with_cp = with_cp + convs = [] + for i in range(num_convs): + convs.append( + ConvModule( + in_channels=in_channels if i == 0 else out_channels, + out_channels=out_channels, + kernel_size=3, + stride=stride if i == 0 else 1, + dilation=1 if i == 0 else dilation, + padding=1 if i == 0 else dilation, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + + self.convs = nn.Sequential(*convs) + + def forward(self, x): + """Forward function.""" + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(self.convs, x) + else: + out = self.convs(x) + return out + + +@UPSAMPLE_LAYERS.register_module() +class DeconvModule(nn.Module): + """Deconvolution upsample module in decoder for UNet (2X upsample). + + This module uses deconvolution to upsample feature map in the decoder + of UNet. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + norm_cfg (dict | None): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict | None): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU'). + kernel_size (int): Kernel size of the convolutional layer. Default: 4. + """ + + def __init__(self, + in_channels, + out_channels, + with_cp=False, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + *, + kernel_size=4, + scale_factor=2): + super(DeconvModule, self).__init__() + + assert (kernel_size - scale_factor >= 0) and\ + (kernel_size - scale_factor) % 2 == 0,\ + f'kernel_size should be greater than or equal to scale_factor '\ + f'and (kernel_size - scale_factor) should be even numbers, '\ + f'while the kernel size is {kernel_size} and scale_factor is '\ + f'{scale_factor}.' + + stride = scale_factor + padding = (kernel_size - scale_factor) // 2 + self.with_cp = with_cp + deconv = nn.ConvTranspose2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding) + + norm_name, norm = build_norm_layer(norm_cfg, out_channels) + activate = build_activation_layer(act_cfg) + self.deconv_upsamping = nn.Sequential(deconv, norm, activate) + + def forward(self, x): + """Forward function.""" + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(self.deconv_upsamping, x) + else: + out = self.deconv_upsamping(x) + return out + + +@UPSAMPLE_LAYERS.register_module() +class InterpConv(nn.Module): + """Interpolation upsample module in decoder for UNet. + + This module uses interpolation to upsample feature map in the decoder + of UNet. It consists of one interpolation upsample layer and one + convolutional layer. It can be one interpolation upsample layer followed + by one convolutional layer (conv_first=False) or one convolutional layer + followed by one interpolation upsample layer (conv_first=True). + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + norm_cfg (dict | None): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict | None): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU'). + conv_cfg (dict | None): Config dict for convolution layer. + Default: None. + conv_first (bool): Whether convolutional layer or interpolation + upsample layer first. Default: False. 
It means interpolation + upsample layer followed by one convolutional layer. + kernel_size (int): Kernel size of the convolutional layer. Default: 1. + stride (int): Stride of the convolutional layer. Default: 1. + padding (int): Padding of the convolutional layer. Default: 1. + upsample_cfg (dict): Interpolation config of the upsample layer. + Default: dict( + scale_factor=2, mode='bilinear', align_corners=False). + """ + + def __init__(self, + in_channels, + out_channels, + with_cp=False, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + *, + conv_cfg=None, + conv_first=False, + kernel_size=1, + stride=1, + padding=0, + upsample_cfg=dict( + scale_factor=2, mode='bilinear', align_corners=False)): + super(InterpConv, self).__init__() + + self.with_cp = with_cp + conv = ConvModule( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + upsample = nn.Upsample(**upsample_cfg) + if conv_first: + self.interp_upsample = nn.Sequential(conv, upsample) + else: + self.interp_upsample = nn.Sequential(upsample, conv) + + def forward(self, x): + """Forward function.""" + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(self.interp_upsample, x) + else: + out = self.interp_upsample(x) + return out + + +@BACKBONES.register_module() +class UNet(nn.Module): + """UNet backbone. + U-Net: Convolutional Networks for Biomedical Image Segmentation. + https://arxiv.org/pdf/1505.04597.pdf + + Args: + in_channels (int): Number of input image channels. Default" 3. + base_channels (int): Number of base channels of each stage. + The output channels of the first stage. Default: 64. + num_stages (int): Number of stages in encoder, normally 5. Default: 5. + strides (Sequence[int 1 | 2]): Strides of each stage in encoder. + len(strides) is equal to num_stages. Normally the stride of the + first stage in encoder is 1. If strides[i]=2, it uses stride + convolution to downsample in the correspondence encoder stage. + Default: (1, 1, 1, 1, 1). + enc_num_convs (Sequence[int]): Number of convolutional layers in the + convolution block of the correspondence encoder stage. + Default: (2, 2, 2, 2, 2). + dec_num_convs (Sequence[int]): Number of convolutional layers in the + convolution block of the correspondence decoder stage. + Default: (2, 2, 2, 2). + downsamples (Sequence[int]): Whether use MaxPool to downsample the + feature map after the first stage of encoder + (stages: [1, num_stages)). If the correspondence encoder stage use + stride convolution (strides[i]=2), it will never use MaxPool to + downsample, even downsamples[i-1]=True. + Default: (True, True, True, True). + enc_dilations (Sequence[int]): Dilation rate of each stage in encoder. + Default: (1, 1, 1, 1, 1). + dec_dilations (Sequence[int]): Dilation rate of each stage in decoder. + Default: (1, 1, 1, 1). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + conv_cfg (dict | None): Config dict for convolution layer. + Default: None. + norm_cfg (dict | None): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict | None): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU'). + upsample_cfg (dict): The upsample config of the upsample module in + decoder. Default: dict(type='InterpConv'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). 
Note: Effect on Batch Norm + and its variants only. Default: False. + dcn (bool): Use deformable convolution in convolutional layer or not. + Default: None. + plugins (dict): plugins for convolutional layers. Default: None. + + Notice: + The input image size should be divisible by the whole downsample rate + of the encoder. More detail of the whole downsample rate can be found + in UNet._check_input_divisible. + + """ + + def __init__(self, + in_channels=3, + base_channels=64, + num_stages=5, + strides=(1, 1, 1, 1, 1), + enc_num_convs=(2, 2, 2, 2, 2), + dec_num_convs=(2, 2, 2, 2), + downsamples=(True, True, True, True), + enc_dilations=(1, 1, 1, 1, 1), + dec_dilations=(1, 1, 1, 1), + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + upsample_cfg=dict(type='InterpConv'), + norm_eval=False, + dcn=None, + plugins=None): + super(UNet, self).__init__() + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' + assert len(strides) == num_stages, \ + 'The length of strides should be equal to num_stages, '\ + f'while the strides is {strides}, the length of '\ + f'strides is {len(strides)}, and the num_stages is '\ + f'{num_stages}.' + assert len(enc_num_convs) == num_stages, \ + 'The length of enc_num_convs should be equal to num_stages, '\ + f'while the enc_num_convs is {enc_num_convs}, the length of '\ + f'enc_num_convs is {len(enc_num_convs)}, and the num_stages is '\ + f'{num_stages}.' + assert len(dec_num_convs) == (num_stages-1), \ + 'The length of dec_num_convs should be equal to (num_stages-1), '\ + f'while the dec_num_convs is {dec_num_convs}, the length of '\ + f'dec_num_convs is {len(dec_num_convs)}, and the num_stages is '\ + f'{num_stages}.' + assert len(downsamples) == (num_stages-1), \ + 'The length of downsamples should be equal to (num_stages-1), '\ + f'while the downsamples is {downsamples}, the length of '\ + f'downsamples is {len(downsamples)}, and the num_stages is '\ + f'{num_stages}.' + assert len(enc_dilations) == num_stages, \ + 'The length of enc_dilations should be equal to num_stages, '\ + f'while the enc_dilations is {enc_dilations}, the length of '\ + f'enc_dilations is {len(enc_dilations)}, and the num_stages is '\ + f'{num_stages}.' + assert len(dec_dilations) == (num_stages-1), \ + 'The length of dec_dilations should be equal to (num_stages-1), '\ + f'while the dec_dilations is {dec_dilations}, the length of '\ + f'dec_dilations is {len(dec_dilations)}, and the num_stages is '\ + f'{num_stages}.' 
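+
+        # Note (added): the loop below builds one encoder stage per level; for
+        # every level i > 0 it also builds a decoder UpConvBlock that fuses the
+        # skip connection from encoder level i - 1 and upsamples only when that
+        # encoder level actually reduced the resolution.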
+ self.num_stages = num_stages + self.strides = strides + self.downsamples = downsamples + self.norm_eval = norm_eval + self.base_channels = base_channels + + self.encoder = nn.ModuleList() + self.decoder = nn.ModuleList() + + for i in range(num_stages): + enc_conv_block = [] + if i != 0: + if strides[i] == 1 and downsamples[i - 1]: + enc_conv_block.append(nn.MaxPool2d(kernel_size=2)) + upsample = (strides[i] != 1 or downsamples[i - 1]) + self.decoder.append( + UpConvBlock( + conv_block=BasicConvBlock, + in_channels=base_channels * 2**i, + skip_channels=base_channels * 2**(i - 1), + out_channels=base_channels * 2**(i - 1), + num_convs=dec_num_convs[i - 1], + stride=1, + dilation=dec_dilations[i - 1], + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + upsample_cfg=upsample_cfg if upsample else None, + dcn=None, + plugins=None)) + + enc_conv_block.append( + BasicConvBlock( + in_channels=in_channels, + out_channels=base_channels * 2**i, + num_convs=enc_num_convs[i], + stride=strides[i], + dilation=enc_dilations[i], + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dcn=None, + plugins=None)) + self.encoder.append((nn.Sequential(*enc_conv_block))) + in_channels = base_channels * 2**i + + def forward(self, x): + self._check_input_divisible(x) + enc_outs = [] + for enc in self.encoder: + x = enc(x) + enc_outs.append(x) + dec_outs = [x] + for i in reversed(range(len(self.decoder))): + x = self.decoder[i](enc_outs[i], x) + dec_outs.append(x) + + return dec_outs + + def train(self, mode=True): + """Convert the model into training mode while keep normalization layer + freezed.""" + super(UNet, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + # trick: eval have effect on BatchNorm only + if isinstance(m, _BatchNorm): + m.eval() + + def _check_input_divisible(self, x): + h, w = x.shape[-2:] + whole_downsample_rate = 1 + for i in range(1, self.num_stages): + if self.strides[i] == 2 or self.downsamples[i - 1]: + whole_downsample_rate *= 2 + assert (h % whole_downsample_rate == 0) \ + and (w % whole_downsample_rate == 0),\ + f'The input image size {(h, w)} should be divisible by the whole '\ + f'downsample rate {whole_downsample_rate}, when num_stages is '\ + f'{self.num_stages}, strides is {self.strides}, and downsamples '\ + f'is {self.downsamples}.' + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + if isinstance(pretrained, str): + logger = get_root_logger() + load_checkpoint(self, pretrained, strict=False, logger=logger) + elif pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + else: + raise TypeError('pretrained must be a str or None') diff --git a/custom_mmpkg/custom_mmseg/models/backbones/vit.py b/custom_mmpkg/custom_mmseg/models/backbones/vit.py new file mode 100644 index 0000000000000000000000000000000000000000..93bae38424b69dd6699089163db30fa787efb9ac --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/backbones/vit.py @@ -0,0 +1,459 @@ +"""Modified from https://github.com/rwightman/pytorch-image- +models/blob/master/timm/models/vision_transformer.py.""" + +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from custom_mmpkg.custom_mmcv.cnn import (Conv2d, Linear, build_activation_layer, build_norm_layer, + constant_init, kaiming_init, normal_init) +from custom_mmpkg.custom_mmcv.runner import _load_checkpoint +from custom_mmpkg.custom_mmcv.utils.parrots_wrapper import _BatchNorm + +from custom_mmpkg.custom_mmseg.utils import get_root_logger +from ..builder import BACKBONES +from ..utils import DropPath, trunc_normal_ + + +class Mlp(nn.Module): + """MLP layer for Encoder block. + + Args: + in_features(int): Input dimension for the first fully + connected layer. + hidden_features(int): Output dimension for the first fully + connected layer. + out_features(int): Output dementsion for the second fully + connected layer. + act_cfg(dict): Config dict for activation layer. + Default: dict(type='GELU'). + drop(float): Drop rate for the dropout layer. Dropout rate has + to be between 0 and 1. Default: 0. + """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_cfg=dict(type='GELU'), + drop=0.): + super(Mlp, self).__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = Linear(in_features, hidden_features) + self.act = build_activation_layer(act_cfg) + self.fc2 = Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + """Attention layer for Encoder block. + + Args: + dim (int): Dimension for the input vector. + num_heads (int): Number of parallel attention heads. + qkv_bias (bool): Enable bias for qkv if True. Default: False. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + attn_drop (float): Drop rate for attention output weights. + Default: 0. + proj_drop (float): Drop rate for output weights. Default: 0. 
+ """ + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super(Attention, self).__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + b, n, c = x.shape + qkv = self.qkv(x).reshape(b, n, 3, self.num_heads, + c // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(b, n, c) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + """Implements encoder block with residual connection. + + Args: + dim (int): The feature dimension. + num_heads (int): Number of parallel attention heads. + mlp_ratio (int): Ratio of mlp hidden dim to embedding dim. + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop (float): Drop rate for mlp output weights. Default: 0. + attn_drop (float): Drop rate for attention output weights. + Default: 0. + proj_drop (float): Drop rate for attn layer output weights. + Default: 0. + drop_path (float): Drop rate for paths of model. + Default: 0. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN', requires_grad=True). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + """ + + def __init__(self, + dim, + num_heads, + mlp_ratio=4, + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + proj_drop=0., + drop_path=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN', eps=1e-6), + with_cp=False): + super(Block, self).__init__() + self.with_cp = with_cp + _, self.norm1 = build_norm_layer(norm_cfg, dim) + self.attn = Attention(dim, num_heads, qkv_bias, qk_scale, attn_drop, + proj_drop) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + _, self.norm2 = build_norm_layer(norm_cfg, dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_cfg=act_cfg, + drop=drop) + + def forward(self, x): + + def _inner_forward(x): + out = x + self.drop_path(self.attn(self.norm1(x))) + out = out + self.drop_path(self.mlp(self.norm2(out))) + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding. + + Args: + img_size (int | tuple): Input image size. + default: 224. + patch_size (int): Width and height for a patch. + default: 16. + in_channels (int): Input channels for images. Default: 3. + embed_dim (int): The embedding dimension. Default: 768. 
+ """ + + def __init__(self, + img_size=224, + patch_size=16, + in_channels=3, + embed_dim=768): + super(PatchEmbed, self).__init__() + if isinstance(img_size, int): + self.img_size = (img_size, img_size) + elif isinstance(img_size, tuple): + self.img_size = img_size + else: + raise TypeError('img_size must be type of int or tuple') + h, w = self.img_size + self.patch_size = (patch_size, patch_size) + self.num_patches = (h // patch_size) * (w // patch_size) + self.proj = Conv2d( + in_channels, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + return self.proj(x).flatten(2).transpose(1, 2) + + +@BACKBONES.register_module() +class VisionTransformer(nn.Module): + """Vision transformer backbone. + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for + Image Recognition at Scale` - https://arxiv.org/abs/2010.11929 + + Args: + img_size (tuple): input image size. Default: (224, 224). + patch_size (int, tuple): patch size. Default: 16. + in_channels (int): number of input channels. Default: 3. + embed_dim (int): embedding dimension. Default: 768. + depth (int): depth of transformer. Default: 12. + num_heads (int): number of attention heads. Default: 12. + mlp_ratio (int): ratio of mlp hidden dim to embedding dim. + Default: 4. + out_indices (list | tuple | int): Output from which stages. + Default: -1. + qkv_bias (bool): enable bias for qkv if True. Default: True. + qk_scale (float): override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): dropout rate. Default: 0. + attn_drop_rate (float): attention dropout rate. Default: 0. + drop_path_rate (float): Rate of DropPath. Default: 0. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN', eps=1e-6, requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='GELU'). + norm_eval (bool): Whether to set norm layers to eval mode, namely, + freeze running stats (mean and var). Note: Effect on Batch Norm + and its variants only. Default: False. + final_norm (bool): Whether to add a additional layer to normalize + final feature map. Default: False. + interpolate_mode (str): Select the interpolate mode for position + embeding vector resize. Default: bicubic. + with_cls_token (bool): If concatenating class token into image tokens + as transformer input. Default: True. + with_cp (bool): Use checkpoint or not. Using checkpoint + will save some memory while slowing down the training speed. + Default: False. 
+ """ + + def __init__(self, + img_size=(224, 224), + patch_size=16, + in_channels=3, + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4, + out_indices=11, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_cfg=dict(type='LN', eps=1e-6, requires_grad=True), + act_cfg=dict(type='GELU'), + norm_eval=False, + final_norm=False, + with_cls_token=True, + interpolate_mode='bicubic', + with_cp=False): + super(VisionTransformer, self).__init__() + self.img_size = img_size + self.patch_size = patch_size + self.features = self.embed_dim = embed_dim + self.patch_embed = PatchEmbed( + img_size=img_size, + patch_size=patch_size, + in_channels=in_channels, + embed_dim=embed_dim) + + self.with_cls_token = with_cls_token + self.cls_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, self.patch_embed.num_patches + 1, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + if isinstance(out_indices, int): + self.out_indices = [out_indices] + elif isinstance(out_indices, list) or isinstance(out_indices, tuple): + self.out_indices = out_indices + else: + raise TypeError('out_indices must be type of int, list or tuple') + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=dpr[i], + attn_drop=attn_drop_rate, + act_cfg=act_cfg, + norm_cfg=norm_cfg, + with_cp=with_cp) for i in range(depth) + ]) + + self.interpolate_mode = interpolate_mode + self.final_norm = final_norm + if final_norm: + _, self.norm = build_norm_layer(norm_cfg, embed_dim) + + self.norm_eval = norm_eval + self.with_cp = with_cp + + def init_weights(self, pretrained=None): + if isinstance(pretrained, str): + logger = get_root_logger() + checkpoint = _load_checkpoint(pretrained, logger=logger) + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + if 'pos_embed' in state_dict.keys(): + if self.pos_embed.shape != state_dict['pos_embed'].shape: + logger.info(msg=f'Resize the pos_embed shape from \ +{state_dict["pos_embed"].shape} to {self.pos_embed.shape}') + h, w = self.img_size + pos_size = int( + math.sqrt(state_dict['pos_embed'].shape[1] - 1)) + state_dict['pos_embed'] = self.resize_pos_embed( + state_dict['pos_embed'], (h, w), (pos_size, pos_size), + self.patch_size, self.interpolate_mode) + + self.load_state_dict(state_dict, False) + + elif pretrained is None: + # We only implement the 'jax_impl' initialization implemented at + # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + for n, m in self.named_modules(): + if isinstance(m, Linear): + trunc_normal_(m.weight, std=.02) + if m.bias is not None: + if 'mlp' in n: + normal_init(m.bias, std=1e-6) + else: + constant_init(m.bias, 0) + elif isinstance(m, Conv2d): + kaiming_init(m.weight, mode='fan_in') + if m.bias is not None: + constant_init(m.bias, 0) + elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): + constant_init(m.bias, 0) + constant_init(m.weight, 1.0) + else: + raise TypeError('pretrained must be a str or None') + + def _pos_embeding(self, img, patched_img, pos_embed): + """Positiong embeding method. 
+ + Resize the pos_embed, if the input image size doesn't match + the training size. + Args: + img (torch.Tensor): The inference image tensor, the shape + must be [B, C, H, W]. + patched_img (torch.Tensor): The patched image, it should be + shape of [B, L1, C]. + pos_embed (torch.Tensor): The pos_embed weighs, it should be + shape of [B, L2, c]. + Return: + torch.Tensor: The pos encoded image feature. + """ + assert patched_img.ndim == 3 and pos_embed.ndim == 3, \ + 'the shapes of patched_img and pos_embed must be [B, L, C]' + x_len, pos_len = patched_img.shape[1], pos_embed.shape[1] + if x_len != pos_len: + if pos_len == (self.img_size[0] // self.patch_size) * ( + self.img_size[1] // self.patch_size) + 1: + pos_h = self.img_size[0] // self.patch_size + pos_w = self.img_size[1] // self.patch_size + else: + raise ValueError( + 'Unexpected shape of pos_embed, got {}.'.format( + pos_embed.shape)) + pos_embed = self.resize_pos_embed(pos_embed, img.shape[2:], + (pos_h, pos_w), self.patch_size, + self.interpolate_mode) + return self.pos_drop(patched_img + pos_embed) + + @staticmethod + def resize_pos_embed(pos_embed, input_shpae, pos_shape, patch_size, mode): + """Resize pos_embed weights. + + Resize pos_embed using bicubic interpolate method. + Args: + pos_embed (torch.Tensor): pos_embed weights. + input_shpae (tuple): Tuple for (input_h, intput_w). + pos_shape (tuple): Tuple for (pos_h, pos_w). + patch_size (int): Patch size. + Return: + torch.Tensor: The resized pos_embed of shape [B, L_new, C] + """ + assert pos_embed.ndim == 3, 'shape of pos_embed must be [B, L, C]' + input_h, input_w = input_shpae + pos_h, pos_w = pos_shape + cls_token_weight = pos_embed[:, 0] + pos_embed_weight = pos_embed[:, (-1 * pos_h * pos_w):] + pos_embed_weight = pos_embed_weight.reshape( + 1, pos_h, pos_w, pos_embed.shape[2]).permute(0, 3, 1, 2) + pos_embed_weight = F.interpolate( + pos_embed_weight, + size=[input_h // patch_size, input_w // patch_size], + align_corners=False, + mode=mode) + cls_token_weight = cls_token_weight.unsqueeze(1) + pos_embed_weight = torch.flatten(pos_embed_weight, 2).transpose(1, 2) + pos_embed = torch.cat((cls_token_weight, pos_embed_weight), dim=1) + return pos_embed + + def forward(self, inputs): + B = inputs.shape[0] + + x = self.patch_embed(inputs) + + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + x = self._pos_embeding(inputs, x, self.pos_embed) + + if not self.with_cls_token: + # Remove class token for transformer input + x = x[:, 1:] + + outs = [] + for i, blk in enumerate(self.blocks): + x = blk(x) + if i == len(self.blocks) - 1: + if self.final_norm: + x = self.norm(x) + if i in self.out_indices: + if self.with_cls_token: + # Remove class token and reshape token for decoder head + out = x[:, 1:] + else: + out = x + B, _, C = out.shape + out = out.reshape(B, inputs.shape[2] // self.patch_size, + inputs.shape[3] // self.patch_size, + C).permute(0, 3, 1, 2) + outs.append(out) + + return tuple(outs) + + def train(self, mode=True): + super(VisionTransformer, self).train(mode) + if mode and self.norm_eval: + for m in self.modules(): + if isinstance(m, nn.LayerNorm): + m.eval() diff --git a/custom_mmpkg/custom_mmseg/models/builder.py b/custom_mmpkg/custom_mmseg/models/builder.py new file mode 100644 index 0000000000000000000000000000000000000000..e6e5920e9dec9d62e5a62ed688cab7d3bfd1ac74 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/builder.py @@ -0,0 +1,46 @@ +import warnings + +from custom_mmpkg.custom_mmcv.cnn import MODELS as 
MMCV_MODELS +from custom_mmpkg.custom_mmcv.utils import Registry + +MODELS = Registry('models', parent=MMCV_MODELS) + +BACKBONES = MODELS +NECKS = MODELS +HEADS = MODELS +LOSSES = MODELS +SEGMENTORS = MODELS + + +def build_backbone(cfg): + """Build backbone.""" + return BACKBONES.build(cfg) + + +def build_neck(cfg): + """Build neck.""" + return NECKS.build(cfg) + + +def build_head(cfg): + """Build head.""" + return HEADS.build(cfg) + + +def build_loss(cfg): + """Build loss.""" + return LOSSES.build(cfg) + + +def build_segmentor(cfg, train_cfg=None, test_cfg=None): + """Build segmentor.""" + if train_cfg is not None or test_cfg is not None: + warnings.warn( + 'train_cfg and test_cfg is deprecated, ' + 'please specify them in model', UserWarning) + assert cfg.get('train_cfg') is None or train_cfg is None, \ + 'train_cfg specified in both outer field and model field ' + assert cfg.get('test_cfg') is None or test_cfg is None, \ + 'test_cfg specified in both outer field and model field ' + return SEGMENTORS.build( + cfg, default_args=dict(train_cfg=train_cfg, test_cfg=test_cfg)) diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/__init__.py b/custom_mmpkg/custom_mmseg/models/decode_heads/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ac66d3cfe0ea04af45c0f3594bf135841c3812e3 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/__init__.py @@ -0,0 +1,28 @@ +from .ann_head import ANNHead +from .apc_head import APCHead +from .aspp_head import ASPPHead +from .cc_head import CCHead +from .da_head import DAHead +from .dm_head import DMHead +from .dnl_head import DNLHead +from .ema_head import EMAHead +from .enc_head import EncHead +from .fcn_head import FCNHead +from .fpn_head import FPNHead +from .gc_head import GCHead +from .lraspp_head import LRASPPHead +from .nl_head import NLHead +from .ocr_head import OCRHead +# from .point_head import PointHead +from .psa_head import PSAHead +from .psp_head import PSPHead +from .sep_aspp_head import DepthwiseSeparableASPPHead +from .sep_fcn_head import DepthwiseSeparableFCNHead +from .uper_head import UPerHead + +__all__ = [ + 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', + 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', + 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', + 'APCHead', 'DMHead', 'LRASPPHead' +] diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/ann_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/ann_head.py new file mode 100644 index 0000000000000000000000000000000000000000..bebbc4f1ba6f76508a3f71265e519cbd24a509cc --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/ann_head.py @@ -0,0 +1,245 @@ +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from ..builder import HEADS +from ..utils import SelfAttentionBlock as _SelfAttentionBlock +from .decode_head import BaseDecodeHead + + +class PPMConcat(nn.ModuleList): + """Pyramid Pooling Module that only concat the features of each layer. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. 
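+# --- Editor's sketch (illustrative only, not part of the original patch) ---
+# PPMConcat flattens every adaptive-pooled map and concatenates the results
+# along the last dimension, so pool_scales=(1, 3, 6, 8) give
+# 1 + 9 + 36 + 64 = 110 pooled positions per channel. A minimal shape check,
+# assuming only torch and the PPMConcat defined here:
+import torch
+ppm = PPMConcat(pool_scales=(1, 3, 6, 8))
+feats = torch.randn(2, 512, 32, 32)
+assert ppm(feats).shape == (2, 512, 110)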
+ """ + + def __init__(self, pool_scales=(1, 3, 6, 8)): + super(PPMConcat, self).__init__( + [nn.AdaptiveAvgPool2d(pool_scale) for pool_scale in pool_scales]) + + def forward(self, feats): + """Forward function.""" + ppm_outs = [] + for ppm in self: + ppm_out = ppm(feats) + ppm_outs.append(ppm_out.view(*feats.shape[:2], -1)) + concat_outs = torch.cat(ppm_outs, dim=2) + return concat_outs + + +class SelfAttentionBlock(_SelfAttentionBlock): + """Make a ANN used SelfAttentionBlock. + + Args: + low_in_channels (int): Input channels of lower level feature, + which is the key feature for self-attention. + high_in_channels (int): Input channels of higher level feature, + which is the query feature for self-attention. + channels (int): Output channels of key/query transform. + out_channels (int): Output channels. + share_key_query (bool): Whether share projection weight between key + and query projection. + query_scale (int): The scale of query feature map. + key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module of key feature. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict|None): Config of activation layers. + """ + + def __init__(self, low_in_channels, high_in_channels, channels, + out_channels, share_key_query, query_scale, key_pool_scales, + conv_cfg, norm_cfg, act_cfg): + key_psp = PPMConcat(key_pool_scales) + if query_scale > 1: + query_downsample = nn.MaxPool2d(kernel_size=query_scale) + else: + query_downsample = None + super(SelfAttentionBlock, self).__init__( + key_in_channels=low_in_channels, + query_in_channels=high_in_channels, + channels=channels, + out_channels=out_channels, + share_key_query=share_key_query, + query_downsample=query_downsample, + key_downsample=key_psp, + key_query_num_convs=1, + key_query_norm=True, + value_out_num_convs=1, + value_out_norm=False, + matmul_norm=True, + with_out=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + +class AFNB(nn.Module): + """Asymmetric Fusion Non-local Block(AFNB) + + Args: + low_in_channels (int): Input channels of lower level feature, + which is the key feature for self-attention. + high_in_channels (int): Input channels of higher level feature, + which is the query feature for self-attention. + channels (int): Output channels of key/query transform. + out_channels (int): Output channels. + and query projection. + query_scales (tuple[int]): The scales of query feature map. + Default: (1,) + key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module of key feature. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict|None): Config of activation layers. 
+ """ + + def __init__(self, low_in_channels, high_in_channels, channels, + out_channels, query_scales, key_pool_scales, conv_cfg, + norm_cfg, act_cfg): + super(AFNB, self).__init__() + self.stages = nn.ModuleList() + for query_scale in query_scales: + self.stages.append( + SelfAttentionBlock( + low_in_channels=low_in_channels, + high_in_channels=high_in_channels, + channels=channels, + out_channels=out_channels, + share_key_query=False, + query_scale=query_scale, + key_pool_scales=key_pool_scales, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottleneck = ConvModule( + out_channels + high_in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, low_feats, high_feats): + """Forward function.""" + priors = [stage(high_feats, low_feats) for stage in self.stages] + context = torch.stack(priors, dim=0).sum(dim=0) + output = self.bottleneck(torch.cat([context, high_feats], 1)) + return output + + +class APNB(nn.Module): + """Asymmetric Pyramid Non-local Block (APNB) + + Args: + in_channels (int): Input channels of key/query feature, + which is the key feature for self-attention. + channels (int): Output channels of key/query transform. + out_channels (int): Output channels. + query_scales (tuple[int]): The scales of query feature map. + Default: (1,) + key_pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module of key feature. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict|None): Config of activation layers. + """ + + def __init__(self, in_channels, channels, out_channels, query_scales, + key_pool_scales, conv_cfg, norm_cfg, act_cfg): + super(APNB, self).__init__() + self.stages = nn.ModuleList() + for query_scale in query_scales: + self.stages.append( + SelfAttentionBlock( + low_in_channels=in_channels, + high_in_channels=in_channels, + channels=channels, + out_channels=out_channels, + share_key_query=True, + query_scale=query_scale, + key_pool_scales=key_pool_scales, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + self.bottleneck = ConvModule( + 2 * in_channels, + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, feats): + """Forward function.""" + priors = [stage(feats, feats) for stage in self.stages] + context = torch.stack(priors, dim=0).sum(dim=0) + output = self.bottleneck(torch.cat([context, feats], 1)) + return output + + +@HEADS.register_module() +class ANNHead(BaseDecodeHead): + """Asymmetric Non-local Neural Networks for Semantic Segmentation. + + This head is the implementation of `ANNNet + `_. + + Args: + project_channels (int): Projection channels for Nonlocal. + query_scales (tuple[int]): The scales of query feature map. + Default: (1,) + key_pool_scales (tuple[int]): The pooling scales of key feature map. + Default: (1, 3, 6, 8). 
+ """ + + def __init__(self, + project_channels, + query_scales=(1, ), + key_pool_scales=(1, 3, 6, 8), + **kwargs): + super(ANNHead, self).__init__( + input_transform='multiple_select', **kwargs) + assert len(self.in_channels) == 2 + low_in_channels, high_in_channels = self.in_channels + self.project_channels = project_channels + self.fusion = AFNB( + low_in_channels=low_in_channels, + high_in_channels=high_in_channels, + out_channels=high_in_channels, + channels=project_channels, + query_scales=query_scales, + key_pool_scales=key_pool_scales, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.bottleneck = ConvModule( + high_in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.context = APNB( + in_channels=self.channels, + out_channels=self.channels, + channels=project_channels, + query_scales=query_scales, + key_pool_scales=key_pool_scales, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + low_feats, high_feats = self._transform_inputs(inputs) + output = self.fusion(low_feats, high_feats) + output = self.dropout(output) + output = self.bottleneck(output) + output = self.context(output) + output = self.cls_seg(output) + + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/apc_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/apc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..119c083a3422b939615a2310d647993d31cb4dc0 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/apc_head.py @@ -0,0 +1,158 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +class ACM(nn.Module): + """Adaptive Context Module used in APCNet. + + Args: + pool_scale (int): Pooling scale used in Adaptive Context + Module to extract region features. + fusion (bool): Add one conv to fuse residual feature. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict | None): Config of conv layers. + norm_cfg (dict | None): Config of norm layers. + act_cfg (dict): Config of activation layers. 
+ """ + + def __init__(self, pool_scale, fusion, in_channels, channels, conv_cfg, + norm_cfg, act_cfg): + super(ACM, self).__init__() + self.pool_scale = pool_scale + self.fusion = fusion + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.pooled_redu_conv = ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.input_redu_conv = ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.global_info = ConvModule( + self.channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.gla = nn.Conv2d(self.channels, self.pool_scale**2, 1, 1, 0) + + self.residual_conv = ConvModule( + self.channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + if self.fusion: + self.fusion_conv = ConvModule( + self.channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, x): + """Forward function.""" + pooled_x = F.adaptive_avg_pool2d(x, self.pool_scale) + # [batch_size, channels, h, w] + x = self.input_redu_conv(x) + # [batch_size, channels, pool_scale, pool_scale] + pooled_x = self.pooled_redu_conv(pooled_x) + batch_size = x.size(0) + # [batch_size, pool_scale * pool_scale, channels] + pooled_x = pooled_x.view(batch_size, self.channels, + -1).permute(0, 2, 1).contiguous() + # [batch_size, h * w, pool_scale * pool_scale] + affinity_matrix = self.gla(x + resize( + self.global_info(F.adaptive_avg_pool2d(x, 1)), size=x.shape[2:]) + ).permute(0, 2, 3, 1).reshape( + batch_size, -1, self.pool_scale**2) + affinity_matrix = F.sigmoid(affinity_matrix) + # [batch_size, h * w, channels] + z_out = torch.matmul(affinity_matrix, pooled_x) + # [batch_size, channels, h * w] + z_out = z_out.permute(0, 2, 1).contiguous() + # [batch_size, channels, h, w] + z_out = z_out.view(batch_size, self.channels, x.size(2), x.size(3)) + z_out = self.residual_conv(z_out) + z_out = F.relu(z_out + x) + if self.fusion: + z_out = self.fusion_conv(z_out) + + return z_out + + +@HEADS.register_module() +class APCHead(BaseDecodeHead): + """Adaptive Pyramid Context Network for Semantic Segmentation. + + This head is the implementation of + `APCNet `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Adaptive Context + Module. Default: (1, 2, 3, 6). + fusion (bool): Add one conv to fuse residual feature. 
+ """ + + def __init__(self, pool_scales=(1, 2, 3, 6), fusion=True, **kwargs): + super(APCHead, self).__init__(**kwargs) + assert isinstance(pool_scales, (list, tuple)) + self.pool_scales = pool_scales + self.fusion = fusion + acm_modules = [] + for pool_scale in self.pool_scales: + acm_modules.append( + ACM(pool_scale, + self.fusion, + self.in_channels, + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.acm_modules = nn.ModuleList(acm_modules) + self.bottleneck = ConvModule( + self.in_channels + len(pool_scales) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + acm_outs = [x] + for acm_module in self.acm_modules: + acm_outs.append(acm_module(x)) + acm_outs = torch.cat(acm_outs, dim=1) + output = self.bottleneck(acm_outs) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/aspp_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/aspp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5b251f2659b9800df341d214610f3766ef81a835 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/aspp_head.py @@ -0,0 +1,107 @@ +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +class ASPPModule(nn.ModuleList): + """Atrous Spatial Pyramid Pooling (ASPP) Module. + + Args: + dilations (tuple[int]): Dilation rate of each layer. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. + """ + + def __init__(self, dilations, in_channels, channels, conv_cfg, norm_cfg, + act_cfg): + super(ASPPModule, self).__init__() + self.dilations = dilations + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + for dilation in dilations: + self.append( + ConvModule( + self.in_channels, + self.channels, + 1 if dilation == 1 else 3, + dilation=dilation, + padding=0 if dilation == 1 else dilation, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def forward(self, x): + """Forward function.""" + aspp_outs = [] + for aspp_module in self: + aspp_outs.append(aspp_module(x)) + + return aspp_outs + + +@HEADS.register_module() +class ASPPHead(BaseDecodeHead): + """Rethinking Atrous Convolution for Semantic Image Segmentation. + + This head is the implementation of `DeepLabV3 + `_. + + Args: + dilations (tuple[int]): Dilation rates for ASPP module. + Default: (1, 6, 12, 18). 
+ """ + + def __init__(self, dilations=(1, 6, 12, 18), **kwargs): + super(ASPPHead, self).__init__(**kwargs) + assert isinstance(dilations, (list, tuple)) + self.dilations = dilations + self.image_pool = nn.Sequential( + nn.AdaptiveAvgPool2d(1), + ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.aspp_modules = ASPPModule( + dilations, + self.in_channels, + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.bottleneck = ConvModule( + (len(dilations) + 1) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + aspp_outs = [ + resize( + self.image_pool(x), + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + ] + aspp_outs.extend(self.aspp_modules(x)) + aspp_outs = torch.cat(aspp_outs, dim=1) + output = self.bottleneck(aspp_outs) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/cascade_decode_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/cascade_decode_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d02122ca0e68743b1bf7a893afae96042f23838c --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/cascade_decode_head.py @@ -0,0 +1,57 @@ +from abc import ABCMeta, abstractmethod + +from .decode_head import BaseDecodeHead + + +class BaseCascadeDecodeHead(BaseDecodeHead, metaclass=ABCMeta): + """Base class for cascade decode head used in + :class:`CascadeEncoderDecoder.""" + + def __init__(self, *args, **kwargs): + super(BaseCascadeDecodeHead, self).__init__(*args, **kwargs) + + @abstractmethod + def forward(self, inputs, prev_output): + """Placeholder of forward function.""" + pass + + def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg, + train_cfg): + """Forward function for training. + Args: + inputs (list[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + train_cfg (dict): The training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + seg_logits = self.forward(inputs, prev_output) + losses = self.losses(seg_logits, gt_semantic_seg) + + return losses + + def forward_test(self, inputs, prev_output, img_metas, test_cfg): + """Forward function for testing. + + Args: + inputs (list[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Output segmentation map. 
+ """ + return self.forward(inputs, prev_output) diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/cc_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/cc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..a582718478dab5c55eec3de6bcf7ac842da25e8d --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/cc_head.py @@ -0,0 +1,42 @@ +import torch + +from ..builder import HEADS +from .fcn_head import FCNHead + +try: + from custom_mmpkg.custom_mmcv.ops import CrissCrossAttention +except ModuleNotFoundError: + CrissCrossAttention = None + + +@HEADS.register_module() +class CCHead(FCNHead): + """CCNet: Criss-Cross Attention for Semantic Segmentation. + + This head is the implementation of `CCNet + `_. + + Args: + recurrence (int): Number of recurrence of Criss Cross Attention + module. Default: 2. + """ + + def __init__(self, recurrence=2, **kwargs): + if CrissCrossAttention is None: + raise RuntimeError('Please install mmcv-full for ' + 'CrissCrossAttention ops') + super(CCHead, self).__init__(num_convs=2, **kwargs) + self.recurrence = recurrence + self.cca = CrissCrossAttention(self.channels) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + output = self.convs[0](x) + for _ in range(self.recurrence): + output = self.cca(output) + output = self.convs[1](output) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/da_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/da_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ce384a5f040e815c61ffd4a0e46d058fa874e11a --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/da_head.py @@ -0,0 +1,178 @@ +import torch +import torch.nn.functional as F +from custom_mmpkg.custom_mmcv.cnn import ConvModule, Scale +from torch import nn + +from custom_mmpkg.custom_mmseg.core import add_prefix +from ..builder import HEADS +from ..utils import SelfAttentionBlock as _SelfAttentionBlock +from .decode_head import BaseDecodeHead + + +class PAM(_SelfAttentionBlock): + """Position Attention Module (PAM) + + Args: + in_channels (int): Input channels of key/query feature. + channels (int): Output channels of key/query transform. 
+ """ + + def __init__(self, in_channels, channels): + super(PAM, self).__init__( + key_in_channels=in_channels, + query_in_channels=in_channels, + channels=channels, + out_channels=in_channels, + share_key_query=False, + query_downsample=None, + key_downsample=None, + key_query_num_convs=1, + key_query_norm=False, + value_out_num_convs=1, + value_out_norm=False, + matmul_norm=False, + with_out=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None) + + self.gamma = Scale(0) + + def forward(self, x): + """Forward function.""" + out = super(PAM, self).forward(x, x) + + out = self.gamma(out) + x + return out + + +class CAM(nn.Module): + """Channel Attention Module (CAM)""" + + def __init__(self): + super(CAM, self).__init__() + self.gamma = Scale(0) + + def forward(self, x): + """Forward function.""" + batch_size, channels, height, width = x.size() + proj_query = x.view(batch_size, channels, -1) + proj_key = x.view(batch_size, channels, -1).permute(0, 2, 1) + energy = torch.bmm(proj_query, proj_key) + energy_new = torch.max( + energy, -1, keepdim=True)[0].expand_as(energy) - energy + attention = F.softmax(energy_new, dim=-1) + proj_value = x.view(batch_size, channels, -1) + + out = torch.bmm(attention, proj_value) + out = out.view(batch_size, channels, height, width) + + out = self.gamma(out) + x + return out + + +@HEADS.register_module() +class DAHead(BaseDecodeHead): + """Dual Attention Network for Scene Segmentation. + + This head is the implementation of `DANet + `_. + + Args: + pam_channels (int): The channels of Position Attention Module(PAM). + """ + + def __init__(self, pam_channels, **kwargs): + super(DAHead, self).__init__(**kwargs) + self.pam_channels = pam_channels + self.pam_in_conv = ConvModule( + self.in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.pam = PAM(self.channels, pam_channels) + self.pam_out_conv = ConvModule( + self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.pam_conv_seg = nn.Conv2d( + self.channels, self.num_classes, kernel_size=1) + + self.cam_in_conv = ConvModule( + self.in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.cam = CAM() + self.cam_out_conv = ConvModule( + self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.cam_conv_seg = nn.Conv2d( + self.channels, self.num_classes, kernel_size=1) + + def pam_cls_seg(self, feat): + """PAM feature classification.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.pam_conv_seg(feat) + return output + + def cam_cls_seg(self, feat): + """CAM feature classification.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.cam_conv_seg(feat) + return output + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + pam_feat = self.pam_in_conv(x) + pam_feat = self.pam(pam_feat) + pam_feat = self.pam_out_conv(pam_feat) + pam_out = self.pam_cls_seg(pam_feat) + + cam_feat = self.cam_in_conv(x) + cam_feat = self.cam(cam_feat) + cam_feat = self.cam_out_conv(cam_feat) + cam_out = self.cam_cls_seg(cam_feat) + + feat_sum = pam_feat + cam_feat + pam_cam_out = self.cls_seg(feat_sum) + + return pam_cam_out, pam_out, cam_out + + def forward_test(self, inputs, img_metas, test_cfg): + """Forward function for testing, only 
``pam_cam`` is used.""" + return self.forward(inputs)[0] + + def losses(self, seg_logit, seg_label): + """Compute ``pam_cam``, ``pam``, ``cam`` loss.""" + pam_cam_seg_logit, pam_seg_logit, cam_seg_logit = seg_logit + loss = dict() + loss.update( + add_prefix( + super(DAHead, self).losses(pam_cam_seg_logit, seg_label), + 'pam_cam')) + loss.update( + add_prefix( + super(DAHead, self).losses(pam_seg_logit, seg_label), 'pam')) + loss.update( + add_prefix( + super(DAHead, self).losses(cam_seg_logit, seg_label), 'cam')) + return loss diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/decode_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/decode_head.py new file mode 100644 index 0000000000000000000000000000000000000000..2ed88037dd0e2200d359a2e3dd40dc24ba40feeb --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/decode_head.py @@ -0,0 +1,234 @@ +from abc import ABCMeta, abstractmethod + +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import normal_init +from custom_mmpkg.custom_mmcv.runner import auto_fp16, force_fp32 + +from custom_mmpkg.custom_mmseg.core import build_pixel_sampler +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import build_loss +from ..losses import accuracy + + +class BaseDecodeHead(nn.Module, metaclass=ABCMeta): + """Base class for BaseDecodeHead. + + Args: + in_channels (int|Sequence[int]): Input channels. + channels (int): Channels after modules, before conv_seg. + num_classes (int): Number of classes. + dropout_ratio (float): Ratio of dropout layer. Default: 0.1. + conv_cfg (dict|None): Config of conv layers. Default: None. + norm_cfg (dict|None): Config of norm layers. Default: None. + act_cfg (dict): Config of activation layers. + Default: dict(type='ReLU') + in_index (int|Sequence[int]): Input feature index. Default: -1 + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. + Default: None. + loss_decode (dict): Config of decode loss. + Default: dict(type='CrossEntropyLoss'). + ignore_index (int | None): The label index to be ignored. When using + masked BCE loss, ignore_index should be set to None. Default: 255 + sampler (dict|None): The config of segmentation map sampler. + Default: None. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. 
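+# --- Editor's sketch (hedged, not part of the original patch) ---
+# Contract implemented below: forward() maps backbone features to per-class
+# logits at feature resolution, cls_seg() applies dropout plus a 1x1 conv,
+# and losses() upsamples the logits to the label size before computing
+# 'loss_seg' and 'acc_seg'. Illustrative round trip with FCNHead (defined
+# later in this diff); all numbers are made up and the vendored losses are
+# assumed to mirror upstream mmseg:
+import torch
+head = FCNHead(in_channels=64, channels=32, num_classes=5, norm_cfg=None)
+feats = [torch.randn(2, 64, 16, 16)]              # in_index defaults to -1
+gt = torch.randint(0, 5, (2, 1, 64, 64))
+loss = head.forward_train(feats, img_metas=[{}, {}], gt_semantic_seg=gt,
+                          train_cfg=None)
+assert {'loss_seg', 'acc_seg'} <= set(loss)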
+ """ + + def __init__(self, + in_channels, + channels, + *, + num_classes, + dropout_ratio=0.1, + conv_cfg=None, + norm_cfg=None, + act_cfg=dict(type='ReLU'), + in_index=-1, + input_transform=None, + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0), + ignore_index=255, + sampler=None, + align_corners=False): + super(BaseDecodeHead, self).__init__() + self._init_inputs(in_channels, in_index, input_transform) + self.channels = channels + self.num_classes = num_classes + self.dropout_ratio = dropout_ratio + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.in_index = in_index + self.loss_decode = build_loss(loss_decode) + self.ignore_index = ignore_index + self.align_corners = align_corners + if sampler is not None: + self.sampler = build_pixel_sampler(sampler, context=self) + else: + self.sampler = None + + self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) + if dropout_ratio > 0: + self.dropout = nn.Dropout2d(dropout_ratio) + else: + self.dropout = None + self.fp16_enabled = False + + def extra_repr(self): + """Extra repr.""" + s = f'input_transform={self.input_transform}, ' \ + f'ignore_index={self.ignore_index}, ' \ + f'align_corners={self.align_corners}' + return s + + def _init_inputs(self, in_channels, in_index, input_transform): + """Check and initialize input transforms. + + The in_channels, in_index and input_transform must match. + Specifically, when input_transform is None, only single feature map + will be selected. So in_channels and in_index must be of type int. + When input_transform + + Args: + in_channels (int|Sequence[int]): Input channels. + in_index (int|Sequence[int]): Input feature index. + input_transform (str|None): Transformation type of input features. + Options: 'resize_concat', 'multiple_select', None. + 'resize_concat': Multiple feature maps will be resize to the + same size as first one and than concat together. + Usually used in FCN head of HRNet. + 'multiple_select': Multiple feature maps will be bundle into + a list and passed into decode head. + None: Only one select feature map is allowed. + """ + + if input_transform is not None: + assert input_transform in ['resize_concat', 'multiple_select'] + self.input_transform = input_transform + self.in_index = in_index + if input_transform is not None: + assert isinstance(in_channels, (list, tuple)) + assert isinstance(in_index, (list, tuple)) + assert len(in_channels) == len(in_index) + if input_transform == 'resize_concat': + self.in_channels = sum(in_channels) + else: + self.in_channels = in_channels + else: + assert isinstance(in_channels, int) + assert isinstance(in_index, int) + self.in_channels = in_channels + + def init_weights(self): + """Initialize weights of classification layer.""" + normal_init(self.conv_seg, mean=0, std=0.01) + + def _transform_inputs(self, inputs): + """Transform inputs for decoder. + + Args: + inputs (list[Tensor]): List of multi-level img features. 
+ + Returns: + Tensor: The transformed inputs + """ + + if self.input_transform == 'resize_concat': + inputs = [inputs[i] for i in self.in_index] + upsampled_inputs = [ + resize( + input=x, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for x in inputs + ] + inputs = torch.cat(upsampled_inputs, dim=1) + elif self.input_transform == 'multiple_select': + inputs = [inputs[i] for i in self.in_index] + else: + inputs = inputs[self.in_index] + + return inputs + + @auto_fp16() + @abstractmethod + def forward(self, inputs): + """Placeholder of forward function.""" + pass + + def forward_train(self, inputs, img_metas, gt_semantic_seg, train_cfg): + """Forward function for training. + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + train_cfg (dict): The training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + seg_logits = self.forward(inputs) + losses = self.losses(seg_logits, gt_semantic_seg) + return losses + + def forward_test(self, inputs, img_metas, test_cfg): + """Forward function for testing. + + Args: + inputs (list[Tensor]): List of multi-level img features. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Output segmentation map. + """ + return self.forward(inputs) + + def cls_seg(self, feat): + """Classify each pixel.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.conv_seg(feat) + return output + + @force_fp32(apply_to=('seg_logit', )) + def losses(self, seg_logit, seg_label): + """Compute segmentation loss.""" + loss = dict() + seg_logit = resize( + input=seg_logit, + size=seg_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + if self.sampler is not None: + seg_weight = self.sampler.sample(seg_logit, seg_label) + else: + seg_weight = None + seg_label = seg_label.squeeze(1) + loss['loss_seg'] = self.loss_decode( + seg_logit, + seg_label, + weight=seg_weight, + ignore_index=self.ignore_index) + loss['acc_seg'] = accuracy(seg_logit, seg_label) + return loss diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/dm_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/dm_head.py new file mode 100644 index 0000000000000000000000000000000000000000..607cd3dd2219a7971319c84ad2383bca25306b3d --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/dm_head.py @@ -0,0 +1,140 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from custom_mmpkg.custom_mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer + +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +class DCM(nn.Module): + """Dynamic Convolutional Module used in DMNet. + + Args: + filter_size (int): The filter size of generated convolution kernel + used in Dynamic Convolutional Module. 
+ fusion (bool): Add one conv to fuse DCM output feature. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict | None): Config of conv layers. + norm_cfg (dict | None): Config of norm layers. + act_cfg (dict): Config of activation layers. + """ + + def __init__(self, filter_size, fusion, in_channels, channels, conv_cfg, + norm_cfg, act_cfg): + super(DCM, self).__init__() + self.filter_size = filter_size + self.fusion = fusion + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.filter_gen_conv = nn.Conv2d(self.in_channels, self.channels, 1, 1, + 0) + + self.input_redu_conv = ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + if self.norm_cfg is not None: + self.norm = build_norm_layer(self.norm_cfg, self.channels)[1] + else: + self.norm = None + self.activate = build_activation_layer(self.act_cfg) + + if self.fusion: + self.fusion_conv = ConvModule( + self.channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, x): + """Forward function.""" + generated_filter = self.filter_gen_conv( + F.adaptive_avg_pool2d(x, self.filter_size)) + x = self.input_redu_conv(x) + b, c, h, w = x.shape + # [1, b * c, h, w], c = self.channels + x = x.view(1, b * c, h, w) + # [b * c, 1, filter_size, filter_size] + generated_filter = generated_filter.view(b * c, 1, self.filter_size, + self.filter_size) + pad = (self.filter_size - 1) // 2 + if (self.filter_size - 1) % 2 == 0: + p2d = (pad, pad, pad, pad) + else: + p2d = (pad + 1, pad, pad + 1, pad) + x = F.pad(input=x, pad=p2d, mode='constant', value=0) + # [1, b * c, h, w] + output = F.conv2d(input=x, weight=generated_filter, groups=b * c) + # [b, c, h, w] + output = output.view(b, c, h, w) + if self.norm is not None: + output = self.norm(output) + output = self.activate(output) + + if self.fusion: + output = self.fusion_conv(output) + + return output + + +@HEADS.register_module() +class DMHead(BaseDecodeHead): + """Dynamic Multi-scale Filters for Semantic Segmentation. + + This head is the implementation of + `DMNet `_. + + Args: + filter_sizes (tuple[int]): The size of generated convolutional filters + used in Dynamic Convolutional Module. Default: (1, 3, 5, 7). + fusion (bool): Add one conv to fuse DCM output feature. 
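+# --- Editor's note (illustrative sketch, not part of the original patch) ---
+# The DCM above realises per-sample dynamic filtering with one grouped
+# convolution: batch and channel axes are folded together
+# ([b, c, h, w] -> [1, b*c, h, w]) and F.conv2d(..., groups=b*c) convolves
+# every channel with its own generated [1, fs, fs] kernel. The reshaping
+# trick in plain torch:
+import torch
+import torch.nn.functional as F
+b, c, h, w, fs = 2, 3, 8, 8, 3
+x = torch.randn(b, c, h, w)
+filters = torch.randn(b * c, 1, fs, fs)        # one kernel per (sample, channel)
+out = F.conv2d(x.reshape(1, b * c, h, w), filters, padding=fs // 2,
+               groups=b * c).reshape(b, c, h, w)
+assert out.shape == (b, c, h, w)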
+ """ + + def __init__(self, filter_sizes=(1, 3, 5, 7), fusion=False, **kwargs): + super(DMHead, self).__init__(**kwargs) + assert isinstance(filter_sizes, (list, tuple)) + self.filter_sizes = filter_sizes + self.fusion = fusion + dcm_modules = [] + for filter_size in self.filter_sizes: + dcm_modules.append( + DCM(filter_size, + self.fusion, + self.in_channels, + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.dcm_modules = nn.ModuleList(dcm_modules) + self.bottleneck = ConvModule( + self.in_channels + len(filter_sizes) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + dcm_outs = [x] + for dcm_module in self.dcm_modules: + dcm_outs.append(dcm_module(x)) + dcm_outs = torch.cat(dcm_outs, dim=1) + output = self.bottleneck(dcm_outs) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/dnl_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/dnl_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ed5b7c1936aa6114d0370625482b677db58a43e8 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/dnl_head.py @@ -0,0 +1,131 @@ +import torch +from custom_mmpkg.custom_mmcv.cnn import NonLocal2d +from torch import nn + +from ..builder import HEADS +from .fcn_head import FCNHead + + +class DisentangledNonLocal2d(NonLocal2d): + """Disentangled Non-Local Blocks. + + Args: + temperature (float): Temperature to adjust attention. Default: 0.05 + """ + + def __init__(self, *arg, temperature, **kwargs): + super().__init__(*arg, **kwargs) + self.temperature = temperature + self.conv_mask = nn.Conv2d(self.in_channels, 1, kernel_size=1) + + def embedded_gaussian(self, theta_x, phi_x): + """Embedded gaussian with temperature.""" + + # NonLocal2d pairwise_weight: [N, HxW, HxW] + pairwise_weight = torch.matmul(theta_x, phi_x) + if self.use_scale: + # theta_x.shape[-1] is `self.inter_channels` + pairwise_weight /= theta_x.shape[-1]**0.5 + pairwise_weight /= self.temperature + pairwise_weight = pairwise_weight.softmax(dim=-1) + return pairwise_weight + + def forward(self, x): + # x: [N, C, H, W] + n = x.size(0) + + # g_x: [N, HxW, C] + g_x = self.g(x).view(n, self.inter_channels, -1) + g_x = g_x.permute(0, 2, 1) + + # theta_x: [N, HxW, C], phi_x: [N, C, HxW] + if self.mode == 'gaussian': + theta_x = x.view(n, self.in_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + if self.sub_sample: + phi_x = self.phi(x).view(n, self.in_channels, -1) + else: + phi_x = x.view(n, self.in_channels, -1) + elif self.mode == 'concatenation': + theta_x = self.theta(x).view(n, self.inter_channels, -1, 1) + phi_x = self.phi(x).view(n, self.inter_channels, 1, -1) + else: + theta_x = self.theta(x).view(n, self.inter_channels, -1) + theta_x = theta_x.permute(0, 2, 1) + phi_x = self.phi(x).view(n, self.inter_channels, -1) + + # subtract mean + theta_x -= theta_x.mean(dim=-2, keepdim=True) + phi_x -= phi_x.mean(dim=-1, keepdim=True) + + pairwise_func = getattr(self, self.mode) + # pairwise_weight: [N, HxW, HxW] + pairwise_weight = pairwise_func(theta_x, phi_x) + + # y: [N, HxW, C] + y = torch.matmul(pairwise_weight, g_x) + # y: [N, C, H, W] + y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels, + *x.size()[2:]) + + # unary_mask: [N, 1, HxW] + unary_mask = self.conv_mask(x) + unary_mask = unary_mask.view(n, 1, -1) + 
unary_mask = unary_mask.softmax(dim=-1) + # unary_x: [N, 1, C] + unary_x = torch.matmul(unary_mask, g_x) + # unary_x: [N, C, 1, 1] + unary_x = unary_x.permute(0, 2, 1).contiguous().reshape( + n, self.inter_channels, 1, 1) + + output = x + self.conv_out(y + unary_x) + + return output + + +@HEADS.register_module() +class DNLHead(FCNHead): + """Disentangled Non-Local Neural Networks. + + This head is the implementation of `DNLNet + `_. + + Args: + reduction (int): Reduction factor of projection transform. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + sqrt(1/inter_channels). Default: False. + mode (str): The nonlocal mode. Options are 'embedded_gaussian', + 'dot_product'. Default: 'embedded_gaussian.'. + temperature (float): Temperature to adjust attention. Default: 0.05 + """ + + def __init__(self, + reduction=2, + use_scale=True, + mode='embedded_gaussian', + temperature=0.05, + **kwargs): + super(DNLHead, self).__init__(num_convs=2, **kwargs) + self.reduction = reduction + self.use_scale = use_scale + self.mode = mode + self.temperature = temperature + self.dnl_block = DisentangledNonLocal2d( + in_channels=self.channels, + reduction=self.reduction, + use_scale=self.use_scale, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + mode=self.mode, + temperature=self.temperature) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + output = self.convs[0](x) + output = self.dnl_block(output) + output = self.convs[1](output) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/ema_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/ema_head.py new file mode 100644 index 0000000000000000000000000000000000000000..e2279d53be90e3aee8e109eae47277f7c3266cef --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/ema_head.py @@ -0,0 +1,168 @@ +import math + +import torch +import torch.distributed as dist +import torch.nn as nn +import torch.nn.functional as F +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +def reduce_mean(tensor): + """Reduce mean when distributed training.""" + if not (dist.is_available() and dist.is_initialized()): + return tensor + tensor = tensor.clone() + dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM) + return tensor + + +class EMAModule(nn.Module): + """Expectation Maximization Attention Module used in EMANet. + + Args: + channels (int): Channels of the whole module. + num_bases (int): Number of bases. + num_stages (int): Number of the EM iterations. + """ + + def __init__(self, channels, num_bases, num_stages, momentum): + super(EMAModule, self).__init__() + assert num_stages >= 1, 'num_stages must be at least 1!' + self.num_bases = num_bases + self.num_stages = num_stages + self.momentum = momentum + + bases = torch.zeros(1, channels, self.num_bases) + bases.normal_(0, math.sqrt(2. 
/ self.num_bases)) + # [1, channels, num_bases] + bases = F.normalize(bases, dim=1, p=2) + self.register_buffer('bases', bases) + + def forward(self, feats): + """Forward function.""" + batch_size, channels, height, width = feats.size() + # [batch_size, channels, height*width] + feats = feats.view(batch_size, channels, height * width) + # [batch_size, channels, num_bases] + bases = self.bases.repeat(batch_size, 1, 1) + + with torch.no_grad(): + for i in range(self.num_stages): + # [batch_size, height*width, num_bases] + attention = torch.einsum('bcn,bck->bnk', feats, bases) + attention = F.softmax(attention, dim=2) + # l1 norm + attention_normed = F.normalize(attention, dim=1, p=1) + # [batch_size, channels, num_bases] + bases = torch.einsum('bcn,bnk->bck', feats, attention_normed) + # l2 norm + bases = F.normalize(bases, dim=1, p=2) + + feats_recon = torch.einsum('bck,bnk->bcn', bases, attention) + feats_recon = feats_recon.view(batch_size, channels, height, width) + + if self.training: + bases = bases.mean(dim=0, keepdim=True) + bases = reduce_mean(bases) + # l2 norm + bases = F.normalize(bases, dim=1, p=2) + self.bases = (1 - + self.momentum) * self.bases + self.momentum * bases + + return feats_recon + + +@HEADS.register_module() +class EMAHead(BaseDecodeHead): + """Expectation Maximization Attention Networks for Semantic Segmentation. + + This head is the implementation of `EMANet + `_. + + Args: + ema_channels (int): EMA module channels + num_bases (int): Number of bases. + num_stages (int): Number of the EM iterations. + concat_input (bool): Whether concat the input and output of convs + before classification layer. Default: True + momentum (float): Momentum to update the base. Default: 0.1. + """ + + def __init__(self, + ema_channels, + num_bases, + num_stages, + concat_input=True, + momentum=0.1, + **kwargs): + super(EMAHead, self).__init__(**kwargs) + self.ema_channels = ema_channels + self.num_bases = num_bases + self.num_stages = num_stages + self.concat_input = concat_input + self.momentum = momentum + self.ema_module = EMAModule(self.ema_channels, self.num_bases, + self.num_stages, self.momentum) + + self.ema_in_conv = ConvModule( + self.in_channels, + self.ema_channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + # project (0, inf) -> (-inf, inf) + self.ema_mid_conv = ConvModule( + self.ema_channels, + self.ema_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=None, + act_cfg=None) + for param in self.ema_mid_conv.parameters(): + param.requires_grad = False + + self.ema_out_conv = ConvModule( + self.ema_channels, + self.ema_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=None) + self.bottleneck = ConvModule( + self.ema_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if self.concat_input: + self.conv_cat = ConvModule( + self.in_channels + self.channels, + self.channels, + kernel_size=3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + feats = self.ema_in_conv(x) + identity = feats + feats = self.ema_mid_conv(feats) + recon = self.ema_module(feats) + recon = F.relu(recon, inplace=True) + recon = self.ema_out_conv(recon) + output = F.relu(identity + recon, inplace=True) + output = self.bottleneck(output) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + 
output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/enc_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/enc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..ee8ecd7401ec9619eb2ac176d380e4e513294ea3 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/enc_head.py @@ -0,0 +1,187 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from custom_mmpkg.custom_mmcv.cnn import ConvModule, build_norm_layer + +from custom_mmpkg.custom_mmseg.ops import Encoding, resize +from ..builder import HEADS, build_loss +from .decode_head import BaseDecodeHead + + +class EncModule(nn.Module): + """Encoding Module used in EncNet. + + Args: + in_channels (int): Input channels. + num_codes (int): Number of code words. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. + """ + + def __init__(self, in_channels, num_codes, conv_cfg, norm_cfg, act_cfg): + super(EncModule, self).__init__() + self.encoding_project = ConvModule( + in_channels, + in_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + # TODO: resolve this hack + # change to 1d + if norm_cfg is not None: + encoding_norm_cfg = norm_cfg.copy() + if encoding_norm_cfg['type'] in ['BN', 'IN']: + encoding_norm_cfg['type'] += '1d' + else: + encoding_norm_cfg['type'] = encoding_norm_cfg['type'].replace( + '2d', '1d') + else: + # fallback to BN1d + encoding_norm_cfg = dict(type='BN1d') + self.encoding = nn.Sequential( + Encoding(channels=in_channels, num_codes=num_codes), + build_norm_layer(encoding_norm_cfg, num_codes)[1], + nn.ReLU(inplace=True)) + self.fc = nn.Sequential( + nn.Linear(in_channels, in_channels), nn.Sigmoid()) + + def forward(self, x): + """Forward function.""" + encoding_projection = self.encoding_project(x) + encoding_feat = self.encoding(encoding_projection).mean(dim=1) + batch_size, channels, _, _ = x.size() + gamma = self.fc(encoding_feat) + y = gamma.view(batch_size, channels, 1, 1) + output = F.relu_(x + x * y) + return encoding_feat, output + + +@HEADS.register_module() +class EncHead(BaseDecodeHead): + """Context Encoding for Semantic Segmentation. + + This head is the implementation of `EncNet + `_. + + Args: + num_codes (int): Number of code words. Default: 32. + use_se_loss (bool): Whether use Semantic Encoding Loss (SE-loss) to + regularize the training. Default: True. + add_lateral (bool): Whether use lateral connection to fuse features. + Default: False. + loss_se_decode (dict): Config of decode loss. + Default: dict(type='CrossEntropyLoss', use_sigmoid=True). 
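+# --- Editor's note (hedged example, not part of the original patch) ---
+# With use_se_loss=True, forward() returns a pair (seg_logits, se_logits):
+# the SE logits come from a Linear layer over the encoding statistics and are
+# supervised with per-image class-presence labels (see
+# _convert_to_onehot_labels below), while forward_test() keeps only the
+# segmentation logits. Illustrative config; the values are made up:
+from custom_mmpkg.custom_mmseg.models.builder import build_head
+enc_cfg = dict(
+    type='EncHead',
+    in_channels=[512, 1024, 2048],
+    in_index=(1, 2, 3),
+    channels=512,
+    num_codes=32,
+    use_se_loss=True,
+    num_classes=19,
+    norm_cfg=dict(type='BN', requires_grad=True))
+head = build_head(enc_cfg)    # head(feats) -> (seg_logits, se_logits)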
+ """ + + def __init__(self, + num_codes=32, + use_se_loss=True, + add_lateral=False, + loss_se_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_weight=0.2), + **kwargs): + super(EncHead, self).__init__( + input_transform='multiple_select', **kwargs) + self.use_se_loss = use_se_loss + self.add_lateral = add_lateral + self.num_codes = num_codes + self.bottleneck = ConvModule( + self.in_channels[-1], + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if add_lateral: + self.lateral_convs = nn.ModuleList() + for in_channels in self.in_channels[:-1]: # skip the last one + self.lateral_convs.append( + ConvModule( + in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + self.fusion = ConvModule( + len(self.in_channels) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.enc_module = EncModule( + self.channels, + num_codes=num_codes, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if self.use_se_loss: + self.loss_se_decode = build_loss(loss_se_decode) + self.se_layer = nn.Linear(self.channels, self.num_classes) + + def forward(self, inputs): + """Forward function.""" + inputs = self._transform_inputs(inputs) + feat = self.bottleneck(inputs[-1]) + if self.add_lateral: + laterals = [ + resize( + lateral_conv(inputs[i]), + size=feat.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + feat = self.fusion(torch.cat([feat, *laterals], 1)) + encode_feat, output = self.enc_module(feat) + output = self.cls_seg(output) + if self.use_se_loss: + se_output = self.se_layer(encode_feat) + return output, se_output + else: + return output + + def forward_test(self, inputs, img_metas, test_cfg): + """Forward function for testing, ignore se_loss.""" + if self.use_se_loss: + return self.forward(inputs)[0] + else: + return self.forward(inputs) + + @staticmethod + def _convert_to_onehot_labels(seg_label, num_classes): + """Convert segmentation label to onehot. + + Args: + seg_label (Tensor): Segmentation label of shape (N, H, W). + num_classes (int): Number of classes. + + Returns: + Tensor: Onehot labels of shape (N, num_classes). 
+ """ + + batch_size = seg_label.size(0) + onehot_labels = seg_label.new_zeros((batch_size, num_classes)) + for i in range(batch_size): + hist = seg_label[i].float().histc( + bins=num_classes, min=0, max=num_classes - 1) + onehot_labels[i] = hist > 0 + return onehot_labels + + def losses(self, seg_logit, seg_label): + """Compute segmentation and semantic encoding loss.""" + seg_logit, se_seg_logit = seg_logit + loss = dict() + loss.update(super(EncHead, self).losses(seg_logit, seg_label)) + se_loss = self.loss_se_decode( + se_seg_logit, + self._convert_to_onehot_labels(seg_label, self.num_classes)) + loss['loss_se'] = se_loss + return loss diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/fcn_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/fcn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..7f0c384381a1f1b26f795e2ed53c571823858317 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/fcn_head.py @@ -0,0 +1,81 @@ +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +@HEADS.register_module() +class FCNHead(BaseDecodeHead): + """Fully Convolution Networks for Semantic Segmentation. + + This head is implemented of `FCNNet `_. + + Args: + num_convs (int): Number of convs in the head. Default: 2. + kernel_size (int): The kernel size for convs in the head. Default: 3. + concat_input (bool): Whether concat the input and output of convs + before classification layer. + dilation (int): The dilation rate for convs in the head. Default: 1. + """ + + def __init__(self, + num_convs=2, + kernel_size=3, + concat_input=True, + dilation=1, + **kwargs): + assert num_convs >= 0 and dilation > 0 and isinstance(dilation, int) + self.num_convs = num_convs + self.concat_input = concat_input + self.kernel_size = kernel_size + super(FCNHead, self).__init__(**kwargs) + if num_convs == 0: + assert self.in_channels == self.channels + + conv_padding = (kernel_size // 2) * dilation + convs = [] + convs.append( + ConvModule( + self.in_channels, + self.channels, + kernel_size=kernel_size, + padding=conv_padding, + dilation=dilation, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + for i in range(num_convs - 1): + convs.append( + ConvModule( + self.channels, + self.channels, + kernel_size=kernel_size, + padding=conv_padding, + dilation=dilation, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + if num_convs == 0: + self.convs = nn.Identity() + else: + self.convs = nn.Sequential(*convs) + if self.concat_input: + self.conv_cat = ConvModule( + self.in_channels + self.channels, + self.channels, + kernel_size=kernel_size, + padding=kernel_size // 2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + output = self.convs(x) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/fpn_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/fpn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..26d0849b1ace5911974437be5ae328e6107b44bc --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/fpn_head.py @@ -0,0 +1,68 @@ +import numpy as np +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from 
custom_mmpkg.custom_mmseg.ops import resize +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +@HEADS.register_module() +class FPNHead(BaseDecodeHead): + """Panoptic Feature Pyramid Networks. + + This head is the implementation of `Semantic FPN + `_. + + Args: + feature_strides (tuple[int]): The strides for input feature maps. + stack_lateral. All strides suppose to be power of 2. The first + one is of largest resolution. + """ + + def __init__(self, feature_strides, **kwargs): + super(FPNHead, self).__init__( + input_transform='multiple_select', **kwargs) + assert len(feature_strides) == len(self.in_channels) + assert min(feature_strides) == feature_strides[0] + self.feature_strides = feature_strides + + self.scale_heads = nn.ModuleList() + for i in range(len(feature_strides)): + head_length = max( + 1, + int(np.log2(feature_strides[i]) - np.log2(feature_strides[0]))) + scale_head = [] + for k in range(head_length): + scale_head.append( + ConvModule( + self.in_channels[i] if k == 0 else self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + if feature_strides[i] != feature_strides[0]: + scale_head.append( + nn.Upsample( + scale_factor=2, + mode='bilinear', + align_corners=self.align_corners)) + self.scale_heads.append(nn.Sequential(*scale_head)) + + def forward(self, inputs): + + x = self._transform_inputs(inputs) + + output = self.scale_heads[0](x[0]) + for i in range(1, len(self.feature_strides)): + # non inplace + output = output + resize( + self.scale_heads[i](x[i]), + size=output.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/gc_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/gc_head.py new file mode 100644 index 0000000000000000000000000000000000000000..600049998d04fc5f469e8da41243bb2d51b64cc1 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/gc_head.py @@ -0,0 +1,47 @@ +import torch +from custom_mmpkg.custom_mmcv.cnn import ContextBlock + +from ..builder import HEADS +from .fcn_head import FCNHead + + +@HEADS.register_module() +class GCHead(FCNHead): + """GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond. + + This head is the implementation of `GCNet + `_. + + Args: + ratio (float): Multiplier of channels ratio. Default: 1/4. + pooling_type (str): The pooling type of context aggregation. + Options are 'att', 'avg'. Default: 'avg'. + fusion_types (tuple[str]): The fusion type for feature fusion. + Options are 'channel_add', 'channel_mul'. 
Default: ('channel_add',) + """ + + def __init__(self, + ratio=1 / 4., + pooling_type='att', + fusion_types=('channel_add', ), + **kwargs): + super(GCHead, self).__init__(num_convs=2, **kwargs) + self.ratio = ratio + self.pooling_type = pooling_type + self.fusion_types = fusion_types + self.gc_block = ContextBlock( + in_channels=self.channels, + ratio=self.ratio, + pooling_type=self.pooling_type, + fusion_types=self.fusion_types) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + output = self.convs[0](x) + output = self.gc_block(output) + output = self.convs[1](output) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/lraspp_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/lraspp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5395a8f57fdbdf6828842f2c4c9a291ecf0b2cdc --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/lraspp_head.py @@ -0,0 +1,90 @@ +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv import is_tuple_of +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +@HEADS.register_module() +class LRASPPHead(BaseDecodeHead): + """Lite R-ASPP (LRASPP) head is proposed in Searching for MobileNetV3. + + This head is the improved implementation of `Searching for MobileNetV3 + `_. + + Args: + branch_channels (tuple[int]): The number of output channels in every + each branch. Default: (32, 64). + """ + + def __init__(self, branch_channels=(32, 64), **kwargs): + super(LRASPPHead, self).__init__(**kwargs) + if self.input_transform != 'multiple_select': + raise ValueError('in Lite R-ASPP (LRASPP) head, input_transform ' + f'must be \'multiple_select\'. 
But received ' + f'\'{self.input_transform}\'') + assert is_tuple_of(branch_channels, int) + assert len(branch_channels) == len(self.in_channels) - 1 + self.branch_channels = branch_channels + + self.convs = nn.Sequential() + self.conv_ups = nn.Sequential() + for i in range(len(branch_channels)): + self.convs.add_module( + f'conv{i}', + nn.Conv2d( + self.in_channels[i], branch_channels[i], 1, bias=False)) + self.conv_ups.add_module( + f'conv_up{i}', + ConvModule( + self.channels + branch_channels[i], + self.channels, + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=False)) + + self.conv_up_input = nn.Conv2d(self.channels, self.channels, 1) + + self.aspp_conv = ConvModule( + self.in_channels[-1], + self.channels, + 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + bias=False) + self.image_pool = nn.Sequential( + nn.AvgPool2d(kernel_size=49, stride=(16, 20)), + ConvModule( + self.in_channels[2], + self.channels, + 1, + act_cfg=dict(type='Sigmoid'), + bias=False)) + + def forward(self, inputs): + """Forward function.""" + inputs = self._transform_inputs(inputs) + + x = inputs[-1] + + x = self.aspp_conv(x) * resize( + self.image_pool(x), + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + x = self.conv_up_input(x) + + for i in range(len(self.branch_channels) - 1, -1, -1): + x = resize( + x, + size=inputs[i].size()[2:], + mode='bilinear', + align_corners=self.align_corners) + x = torch.cat([x, self.convs[i](inputs[i])], 1) + x = self.conv_ups[i](x) + + return self.cls_seg(x) diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/nl_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/nl_head.py new file mode 100644 index 0000000000000000000000000000000000000000..9010d303cb3808de4893c3900ebbd9917a9cc57e --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/nl_head.py @@ -0,0 +1,49 @@ +import torch +from custom_mmpkg.custom_mmcv.cnn import NonLocal2d + +from ..builder import HEADS +from .fcn_head import FCNHead + + +@HEADS.register_module() +class NLHead(FCNHead): + """Non-local Neural Networks. + + This head is the implementation of `NLNet + `_. + + Args: + reduction (int): Reduction factor of projection transform. Default: 2. + use_scale (bool): Whether to scale pairwise_weight by + sqrt(1/inter_channels). Default: True. + mode (str): The nonlocal mode. Options are 'embedded_gaussian', + 'dot_product'. Default: 'embedded_gaussian.'. 
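+
+    Example (editor's sketch, not taken from the upstream docs; the channel
+    sizes and input resolution below are illustrative toy values, and the
+    usual ``BaseDecodeHead`` keyword arguments are assumed)::
+
+        >>> import torch
+        >>> head = NLHead(in_channels=32, channels=16, num_classes=19)
+        >>> inputs = [torch.randn(1, 32, 45, 45)]
+        >>> head(inputs).shape
+        torch.Size([1, 19, 45, 45])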
+ """ + + def __init__(self, + reduction=2, + use_scale=True, + mode='embedded_gaussian', + **kwargs): + super(NLHead, self).__init__(num_convs=2, **kwargs) + self.reduction = reduction + self.use_scale = use_scale + self.mode = mode + self.nl_block = NonLocal2d( + in_channels=self.channels, + reduction=self.reduction, + use_scale=self.use_scale, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + mode=self.mode) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + output = self.convs[0](x) + output = self.nl_block(output) + output = self.convs[1](output) + if self.concat_input: + output = self.conv_cat(torch.cat([x, output], dim=1)) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/ocr_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/ocr_head.py new file mode 100644 index 0000000000000000000000000000000000000000..b31f00233355ea610ea61a4f40cc3dfb6d84c8b7 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/ocr_head.py @@ -0,0 +1,127 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import HEADS +from ..utils import SelfAttentionBlock as _SelfAttentionBlock +from .cascade_decode_head import BaseCascadeDecodeHead + + +class SpatialGatherModule(nn.Module): + """Aggregate the context features according to the initial predicted + probability distribution. + + Employ the soft-weighted method to aggregate the context. + """ + + def __init__(self, scale): + super(SpatialGatherModule, self).__init__() + self.scale = scale + + def forward(self, feats, probs): + """Forward function.""" + batch_size, num_classes, height, width = probs.size() + channels = feats.size(1) + probs = probs.view(batch_size, num_classes, -1) + feats = feats.view(batch_size, channels, -1) + # [batch_size, height*width, num_classes] + feats = feats.permute(0, 2, 1) + # [batch_size, channels, height*width] + probs = F.softmax(self.scale * probs, dim=2) + # [batch_size, channels, num_classes] + ocr_context = torch.matmul(probs, feats) + ocr_context = ocr_context.permute(0, 2, 1).contiguous().unsqueeze(3) + return ocr_context + + +class ObjectAttentionBlock(_SelfAttentionBlock): + """Make a OCR used SelfAttentionBlock.""" + + def __init__(self, in_channels, channels, scale, conv_cfg, norm_cfg, + act_cfg): + if scale > 1: + query_downsample = nn.MaxPool2d(kernel_size=scale) + else: + query_downsample = None + super(ObjectAttentionBlock, self).__init__( + key_in_channels=in_channels, + query_in_channels=in_channels, + channels=channels, + out_channels=in_channels, + share_key_query=False, + query_downsample=query_downsample, + key_downsample=None, + key_query_num_convs=2, + key_query_norm=True, + value_out_num_convs=1, + value_out_norm=True, + matmul_norm=True, + with_out=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.bottleneck = ConvModule( + in_channels * 2, + in_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, query_feats, key_feats): + """Forward function.""" + context = super(ObjectAttentionBlock, + self).forward(query_feats, key_feats) + output = self.bottleneck(torch.cat([context, query_feats], dim=1)) + if self.query_downsample is not None: + output = resize(query_feats) + + return output + + +@HEADS.register_module() +class OCRHead(BaseCascadeDecodeHead): + 
"""Object-Contextual Representations for Semantic Segmentation. + + This head is the implementation of `OCRNet + `_. + + Args: + ocr_channels (int): The intermediate channels of OCR block. + scale (int): The scale of probability map in SpatialGatherModule in + Default: 1. + """ + + def __init__(self, ocr_channels, scale=1, **kwargs): + super(OCRHead, self).__init__(**kwargs) + self.ocr_channels = ocr_channels + self.scale = scale + self.object_context_block = ObjectAttentionBlock( + self.channels, + self.ocr_channels, + self.scale, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.spatial_gather_module = SpatialGatherModule(self.scale) + + self.bottleneck = ConvModule( + self.in_channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs, prev_output): + """Forward function.""" + x = self._transform_inputs(inputs) + feats = self.bottleneck(x) + context = self.spatial_gather_module(feats, prev_output) + object_context = self.object_context_block(feats, context) + output = self.cls_seg(object_context) + + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/point_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/point_head.py new file mode 100644 index 0000000000000000000000000000000000000000..71c9f8e078536a733616a9f33de2dd35a8adbc26 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/point_head.py @@ -0,0 +1,350 @@ +# Modified from https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend/point_head/point_head.py # noqa + +import torch +import torch.nn as nn + +from custom_mmpkg.custom_mmcv.cnn import ConvModule, normal_init +from custom_mmpkg.custom_mmcv.ops import point_sample + +from custom_mmpkg.custom_mmseg.models.builder import HEADS +from custom_mmpkg.custom_mmseg.ops import resize +from ..losses import accuracy +from .cascade_decode_head import BaseCascadeDecodeHead + + +def calculate_uncertainty(seg_logits): + """Estimate uncertainty based on seg logits. + + For each location of the prediction ``seg_logits`` we estimate + uncertainty as the difference between top first and top second + predicted logits. + + Args: + seg_logits (Tensor): Semantic segmentation logits, + shape (batch_size, num_classes, height, width). + + Returns: + scores (Tensor): T uncertainty scores with the most uncertain + locations having the highest uncertainty score, shape ( + batch_size, 1, height, width) + """ + top2_scores = torch.topk(seg_logits, k=2, dim=1)[0] + return (top2_scores[:, 1] - top2_scores[:, 0]).unsqueeze(1) + + +@HEADS.register_module() +class PointHead(BaseCascadeDecodeHead): + """A mask point head use in PointRend. + + ``PointHead`` use shared multi-layer perceptron (equivalent to + nn.Conv1d) to predict the logit of input points. The fine-grained feature + and coarse feature will be concatenate together for predication. + + Args: + num_fcs (int): Number of fc layers in the head. Default: 3. + in_channels (int): Number of input channels. Default: 256. + fc_channels (int): Number of fc channels. Default: 256. + num_classes (int): Number of classes for logits. Default: 80. + class_agnostic (bool): Whether use class agnostic classification. + If so, the output channels of logits will be 1. Default: False. + coarse_pred_each_layer (bool): Whether concatenate coarse feature with + the output of each fc layer. Default: True. + conv_cfg (dict|None): Dictionary to construct and config conv layer. 
+ Default: dict(type='Conv1d')) + norm_cfg (dict|None): Dictionary to construct and config norm layer. + Default: None. + loss_point (dict): Dictionary to construct and config loss layer of + point head. Default: dict(type='CrossEntropyLoss', use_mask=True, + loss_weight=1.0). + """ + + def __init__(self, + num_fcs=3, + coarse_pred_each_layer=True, + conv_cfg=dict(type='Conv1d'), + norm_cfg=None, + act_cfg=dict(type='ReLU', inplace=False), + **kwargs): + super(PointHead, self).__init__( + input_transform='multiple_select', + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **kwargs) + + self.num_fcs = num_fcs + self.coarse_pred_each_layer = coarse_pred_each_layer + + fc_in_channels = sum(self.in_channels) + self.num_classes + fc_channels = self.channels + self.fcs = nn.ModuleList() + for k in range(num_fcs): + fc = ConvModule( + fc_in_channels, + fc_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.fcs.append(fc) + fc_in_channels = fc_channels + fc_in_channels += self.num_classes if self.coarse_pred_each_layer \ + else 0 + self.fc_seg = nn.Conv1d( + fc_in_channels, + self.num_classes, + kernel_size=1, + stride=1, + padding=0) + if self.dropout_ratio > 0: + self.dropout = nn.Dropout(self.dropout_ratio) + delattr(self, 'conv_seg') + + def init_weights(self): + """Initialize weights of classification layer.""" + normal_init(self.fc_seg, std=0.001) + + def cls_seg(self, feat): + """Classify each pixel with fc.""" + if self.dropout is not None: + feat = self.dropout(feat) + output = self.fc_seg(feat) + return output + + def forward(self, fine_grained_point_feats, coarse_point_feats): + x = torch.cat([fine_grained_point_feats, coarse_point_feats], dim=1) + for fc in self.fcs: + x = fc(x) + if self.coarse_pred_each_layer: + x = torch.cat((x, coarse_point_feats), dim=1) + return self.cls_seg(x) + + def _get_fine_grained_point_feats(self, x, points): + """Sample from fine grained features. + + Args: + x (list[Tensor]): Feature pyramid from by neck or backbone. + points (Tensor): Point coordinates, shape (batch_size, + num_points, 2). + + Returns: + fine_grained_feats (Tensor): Sampled fine grained feature, + shape (batch_size, sum(channels of x), num_points). + """ + + fine_grained_feats_list = [ + point_sample(_, points, align_corners=self.align_corners) + for _ in x + ] + if len(fine_grained_feats_list) > 1: + fine_grained_feats = torch.cat(fine_grained_feats_list, dim=1) + else: + fine_grained_feats = fine_grained_feats_list[0] + + return fine_grained_feats + + def _get_coarse_point_feats(self, prev_output, points): + """Sample from fine grained features. + + Args: + prev_output (list[Tensor]): Prediction of previous decode head. + points (Tensor): Point coordinates, shape (batch_size, + num_points, 2). + + Returns: + coarse_feats (Tensor): Sampled coarse feature, shape (batch_size, + num_classes, num_points). + """ + + coarse_feats = point_sample( + prev_output, points, align_corners=self.align_corners) + + return coarse_feats + + def forward_train(self, inputs, prev_output, img_metas, gt_semantic_seg, + train_cfg): + """Forward function for training. + Args: + inputs (list[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. 
+ For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + train_cfg (dict): The training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + x = self._transform_inputs(inputs) + with torch.no_grad(): + points = self.get_points_train( + prev_output, calculate_uncertainty, cfg=train_cfg) + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, points) + coarse_point_feats = self._get_coarse_point_feats(prev_output, points) + point_logits = self.forward(fine_grained_point_feats, + coarse_point_feats) + point_label = point_sample( + gt_semantic_seg.float(), + points, + mode='nearest', + align_corners=self.align_corners) + point_label = point_label.squeeze(1).long() + + losses = self.losses(point_logits, point_label) + + return losses + + def forward_test(self, inputs, prev_output, img_metas, test_cfg): + """Forward function for testing. + + Args: + inputs (list[Tensor]): List of multi-level img features. + prev_output (Tensor): The output of previous decode head. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Output segmentation map. + """ + + x = self._transform_inputs(inputs) + refined_seg_logits = prev_output.clone() + for _ in range(test_cfg.subdivision_steps): + refined_seg_logits = resize( + refined_seg_logits, + scale_factor=test_cfg.scale_factor, + mode='bilinear', + align_corners=self.align_corners) + batch_size, channels, height, width = refined_seg_logits.shape + point_indices, points = self.get_points_test( + refined_seg_logits, calculate_uncertainty, cfg=test_cfg) + fine_grained_point_feats = self._get_fine_grained_point_feats( + x, points) + coarse_point_feats = self._get_coarse_point_feats( + prev_output, points) + point_logits = self.forward(fine_grained_point_feats, + coarse_point_feats) + + point_indices = point_indices.unsqueeze(1).expand(-1, channels, -1) + refined_seg_logits = refined_seg_logits.reshape( + batch_size, channels, height * width) + refined_seg_logits = refined_seg_logits.scatter_( + 2, point_indices, point_logits) + refined_seg_logits = refined_seg_logits.view( + batch_size, channels, height, width) + + return refined_seg_logits + + def losses(self, point_logits, point_label): + """Compute segmentation loss.""" + loss = dict() + loss['loss_point'] = self.loss_decode( + point_logits, point_label, ignore_index=self.ignore_index) + loss['acc_point'] = accuracy(point_logits, point_label) + return loss + + def get_points_train(self, seg_logits, uncertainty_func, cfg): + """Sample points for training. + + Sample points in [0, 1] x [0, 1] coordinate space based on their + uncertainty. The uncertainties are calculated for each point using + 'uncertainty_func' function that takes point's logit prediction as + input. + + Args: + seg_logits (Tensor): Semantic segmentation logits, shape ( + batch_size, num_classes, height, width). + uncertainty_func (func): uncertainty calculation function. + cfg (dict): Training config of point head. 
+ + Returns: + point_coords (Tensor): A tensor of shape (batch_size, num_points, + 2) that contains the coordinates of ``num_points`` sampled + points. + """ + num_points = cfg.num_points + oversample_ratio = cfg.oversample_ratio + importance_sample_ratio = cfg.importance_sample_ratio + assert oversample_ratio >= 1 + assert 0 <= importance_sample_ratio <= 1 + batch_size = seg_logits.shape[0] + num_sampled = int(num_points * oversample_ratio) + point_coords = torch.rand( + batch_size, num_sampled, 2, device=seg_logits.device) + point_logits = point_sample(seg_logits, point_coords) + # It is crucial to calculate uncertainty based on the sampled + # prediction value for the points. Calculating uncertainties of the + # coarse predictions first and sampling them for points leads to + # incorrect results. To illustrate this: assume uncertainty func( + # logits)=-abs(logits), a sampled point between two coarse + # predictions with -1 and 1 logits has 0 logits, and therefore 0 + # uncertainty value. However, if we calculate uncertainties for the + # coarse predictions first, both will have -1 uncertainty, + # and sampled point will get -1 uncertainty. + point_uncertainties = uncertainty_func(point_logits) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange( + batch_size, dtype=torch.long, device=seg_logits.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( + batch_size, num_uncertain_points, 2) + if num_random_points > 0: + rand_point_coords = torch.rand( + batch_size, num_random_points, 2, device=seg_logits.device) + point_coords = torch.cat((point_coords, rand_point_coords), dim=1) + return point_coords + + def get_points_test(self, seg_logits, uncertainty_func, cfg): + """Sample points for testing. + + Find ``num_points`` most uncertain points from ``uncertainty_map``. + + Args: + seg_logits (Tensor): A tensor of shape (batch_size, num_classes, + height, width) for class-specific or class-agnostic prediction. + uncertainty_func (func): uncertainty calculation function. + cfg (dict): Testing config of point head. + + Returns: + point_indices (Tensor): A tensor of shape (batch_size, num_points) + that contains indices from [0, height x width) of the most + uncertain points. + point_coords (Tensor): A tensor of shape (batch_size, num_points, + 2) that contains [0, 1] x [0, 1] normalized coordinates of the + most uncertain points from the ``height x width`` grid . 
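+
+        Example (editor's note with illustrative numbers): for a
+        ``height = width = 4`` uncertainty map, ``h_step = w_step = 0.25``;
+        the flat index ``5`` corresponds to row ``1``, column ``1``, so its
+        normalized coordinates are
+        ``(0.25 / 2 + 1 * 0.25, 0.25 / 2 + 1 * 0.25) = (0.375, 0.375)``.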
+ """ + + num_points = cfg.subdivision_num_points + uncertainty_map = uncertainty_func(seg_logits) + batch_size, _, height, width = uncertainty_map.shape + h_step = 1.0 / height + w_step = 1.0 / width + + uncertainty_map = uncertainty_map.view(batch_size, height * width) + num_points = min(height * width, num_points) + point_indices = uncertainty_map.topk(num_points, dim=1)[1] + point_coords = torch.zeros( + batch_size, + num_points, + 2, + dtype=torch.float, + device=seg_logits.device) + point_coords[:, :, 0] = w_step / 2.0 + (point_indices % + width).float() * w_step + point_coords[:, :, 1] = h_step / 2.0 + (point_indices // + width).float() * h_step + return point_indices, point_coords diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/psa_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/psa_head.py new file mode 100644 index 0000000000000000000000000000000000000000..c1424a85d7ca93eb299ad1e9116600f703940fc9 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/psa_head.py @@ -0,0 +1,196 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import HEADS +from .decode_head import BaseDecodeHead + +try: + from custom_mmpkg.custom_mmcv.ops import PSAMask +except ModuleNotFoundError: + PSAMask = None + + +@HEADS.register_module() +class PSAHead(BaseDecodeHead): + """Point-wise Spatial Attention Network for Scene Parsing. + + This head is the implementation of `PSANet + `_. + + Args: + mask_size (tuple[int]): The PSA mask size. It usually equals input + size. + psa_type (str): The type of psa module. Options are 'collect', + 'distribute', 'bi-direction'. Default: 'bi-direction' + compact (bool): Whether use compact map for 'collect' mode. + Default: True. + shrink_factor (int): The downsample factors of psa mask. Default: 2. + normalization_factor (float): The normalize factor of attention. + psa_softmax (bool): Whether use softmax for attention. 
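+
+    Example (editor's sketch; the values below are placeholders written in
+    the style of an mmseg decode-head config, not a configuration shipped
+    with this repository; ``PSAMask`` is a compiled op, so mmcv-full is
+    required)::
+
+        >>> head_cfg = dict(
+        ...     type='PSAHead',
+        ...     in_channels=2048,
+        ...     channels=512,
+        ...     mask_size=(97, 97),  # per the docstring, usually the input feature size
+        ...     psa_type='bi-direction',
+        ...     num_classes=19)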
+ """ + + def __init__(self, + mask_size, + psa_type='bi-direction', + compact=False, + shrink_factor=2, + normalization_factor=1.0, + psa_softmax=True, + **kwargs): + if PSAMask is None: + raise RuntimeError('Please install mmcv-full for PSAMask ops') + super(PSAHead, self).__init__(**kwargs) + assert psa_type in ['collect', 'distribute', 'bi-direction'] + self.psa_type = psa_type + self.compact = compact + self.shrink_factor = shrink_factor + self.mask_size = mask_size + mask_h, mask_w = mask_size + self.psa_softmax = psa_softmax + if normalization_factor is None: + normalization_factor = mask_h * mask_w + self.normalization_factor = normalization_factor + + self.reduce = ConvModule( + self.in_channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.attention = nn.Sequential( + ConvModule( + self.channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + self.channels, mask_h * mask_w, kernel_size=1, bias=False)) + if psa_type == 'bi-direction': + self.reduce_p = ConvModule( + self.in_channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.attention_p = nn.Sequential( + ConvModule( + self.channels, + self.channels, + kernel_size=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Conv2d( + self.channels, mask_h * mask_w, kernel_size=1, bias=False)) + self.psamask_collect = PSAMask('collect', mask_size) + self.psamask_distribute = PSAMask('distribute', mask_size) + else: + self.psamask = PSAMask(psa_type, mask_size) + self.proj = ConvModule( + self.channels * (2 if psa_type == 'bi-direction' else 1), + self.in_channels, + kernel_size=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.bottleneck = ConvModule( + self.in_channels * 2, + self.channels, + kernel_size=3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + identity = x + align_corners = self.align_corners + if self.psa_type in ['collect', 'distribute']: + out = self.reduce(x) + n, c, h, w = out.size() + if self.shrink_factor != 1: + if h % self.shrink_factor and w % self.shrink_factor: + h = (h - 1) // self.shrink_factor + 1 + w = (w - 1) // self.shrink_factor + 1 + align_corners = True + else: + h = h // self.shrink_factor + w = w // self.shrink_factor + align_corners = False + out = resize( + out, + size=(h, w), + mode='bilinear', + align_corners=align_corners) + y = self.attention(out) + if self.compact: + if self.psa_type == 'collect': + y = y.view(n, h * w, + h * w).transpose(1, 2).view(n, h * w, h, w) + else: + y = self.psamask(y) + if self.psa_softmax: + y = F.softmax(y, dim=1) + out = torch.bmm( + out.view(n, c, h * w), y.view(n, h * w, h * w)).view( + n, c, h, w) * (1.0 / self.normalization_factor) + else: + x_col = self.reduce(x) + x_dis = self.reduce_p(x) + n, c, h, w = x_col.size() + if self.shrink_factor != 1: + if h % self.shrink_factor and w % self.shrink_factor: + h = (h - 1) // self.shrink_factor + 1 + w = (w - 1) // self.shrink_factor + 1 + align_corners = True + else: + h = h // self.shrink_factor + w = w // self.shrink_factor + align_corners = False + x_col = resize( + x_col, + size=(h, w), + mode='bilinear', + align_corners=align_corners) + x_dis = resize( + x_dis, + size=(h, w), + 
mode='bilinear', + align_corners=align_corners) + y_col = self.attention(x_col) + y_dis = self.attention_p(x_dis) + if self.compact: + y_dis = y_dis.view(n, h * w, + h * w).transpose(1, 2).view(n, h * w, h, w) + else: + y_col = self.psamask_collect(y_col) + y_dis = self.psamask_distribute(y_dis) + if self.psa_softmax: + y_col = F.softmax(y_col, dim=1) + y_dis = F.softmax(y_dis, dim=1) + x_col = torch.bmm( + x_col.view(n, c, h * w), y_col.view(n, h * w, h * w)).view( + n, c, h, w) * (1.0 / self.normalization_factor) + x_dis = torch.bmm( + x_dis.view(n, c, h * w), y_dis.view(n, h * w, h * w)).view( + n, c, h, w) * (1.0 / self.normalization_factor) + out = torch.cat([x_col, x_dis], 1) + out = self.proj(out) + out = resize( + out, + size=identity.shape[2:], + mode='bilinear', + align_corners=align_corners) + out = self.bottleneck(torch.cat((identity, out), dim=1)) + out = self.cls_seg(out) + return out diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/psp_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/psp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..0f7880f21319ac6035f604b0fc54f2237f7ed988 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/psp_head.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import HEADS +from .decode_head import BaseDecodeHead + + +class PPM(nn.ModuleList): + """Pooling Pyramid Module used in PSPNet. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. + align_corners (bool): align_corners argument of F.interpolate. + """ + + def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, + act_cfg, align_corners): + super(PPM, self).__init__() + self.pool_scales = pool_scales + self.align_corners = align_corners + self.in_channels = in_channels + self.channels = channels + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + for pool_scale in pool_scales: + self.append( + nn.Sequential( + nn.AdaptiveAvgPool2d(pool_scale), + ConvModule( + self.in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg))) + + def forward(self, x): + """Forward function.""" + ppm_outs = [] + for ppm in self: + ppm_out = ppm(x) + upsampled_ppm_out = resize( + ppm_out, + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + ppm_outs.append(upsampled_ppm_out) + return ppm_outs + + +@HEADS.register_module() +class PSPHead(BaseDecodeHead): + """Pyramid Scene Parsing Network. + + This head is the implementation of + `PSPNet `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. Default: (1, 2, 3, 6). 
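+
+    Example (editor's sketch with toy shapes; the usual ``BaseDecodeHead``
+    keyword arguments are assumed, and the default ``in_index=-1`` selects
+    the last input feature map)::
+
+        >>> import torch
+        >>> head = PSPHead(in_channels=64, channels=16, num_classes=19,
+        ...                pool_scales=(1, 2, 3, 6))
+        >>> inputs = [torch.randn(1, 64, 45, 45)]
+        >>> head(inputs).shape
+        torch.Size([1, 19, 45, 45])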
+ """ + + def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): + super(PSPHead, self).__init__(**kwargs) + assert isinstance(pool_scales, (list, tuple)) + self.pool_scales = pool_scales + self.psp_modules = PPM( + self.pool_scales, + self.in_channels, + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + self.bottleneck = ConvModule( + self.in_channels + len(pool_scales) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = torch.cat(psp_outs, dim=1) + output = self.bottleneck(psp_outs) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/sep_aspp_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/sep_aspp_head.py new file mode 100644 index 0000000000000000000000000000000000000000..26942ae135d172ec2dbb3775c0dc548c6976e729 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/sep_aspp_head.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule, DepthwiseSeparableConvModule + +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import HEADS +from .aspp_head import ASPPHead, ASPPModule + + +class DepthwiseSeparableASPPModule(ASPPModule): + """Atrous Spatial Pyramid Pooling (ASPP) Module with depthwise separable + conv.""" + + def __init__(self, **kwargs): + super(DepthwiseSeparableASPPModule, self).__init__(**kwargs) + for i, dilation in enumerate(self.dilations): + if dilation > 1: + self[i] = DepthwiseSeparableConvModule( + self.in_channels, + self.channels, + 3, + dilation=dilation, + padding=dilation, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + +@HEADS.register_module() +class DepthwiseSeparableASPPHead(ASPPHead): + """Encoder-Decoder with Atrous Separable Convolution for Semantic Image + Segmentation. + + This head is the implementation of `DeepLabV3+ + `_. + + Args: + c1_in_channels (int): The input channels of c1 decoder. If is 0, + the no decoder will be used. + c1_channels (int): The intermediate channels of c1 decoder. 
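+
+    Example (editor's sketch with toy shapes; ``dilations`` comes from the
+    parent ``ASPPHead`` and the remaining keyword arguments are the usual
+    ``BaseDecodeHead`` ones; ``inputs[0]`` feeds the c1 decoder while the
+    last feature map feeds the ASPP branch)::
+
+        >>> import torch
+        >>> head = DepthwiseSeparableASPPHead(
+        ...     c1_in_channels=4, c1_channels=2, in_channels=32, channels=16,
+        ...     num_classes=19, dilations=(1, 12, 24))
+        >>> inputs = [torch.randn(1, 4, 45, 45), torch.randn(1, 32, 21, 21)]
+        >>> head(inputs).shape
+        torch.Size([1, 19, 45, 45])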
+ """ + + def __init__(self, c1_in_channels, c1_channels, **kwargs): + super(DepthwiseSeparableASPPHead, self).__init__(**kwargs) + assert c1_in_channels >= 0 + self.aspp_modules = DepthwiseSeparableASPPModule( + dilations=self.dilations, + in_channels=self.in_channels, + channels=self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + if c1_in_channels > 0: + self.c1_bottleneck = ConvModule( + c1_in_channels, + c1_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + else: + self.c1_bottleneck = None + self.sep_bottleneck = nn.Sequential( + DepthwiseSeparableConvModule( + self.channels + c1_channels, + self.channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + DepthwiseSeparableConvModule( + self.channels, + self.channels, + 3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg)) + + def forward(self, inputs): + """Forward function.""" + x = self._transform_inputs(inputs) + aspp_outs = [ + resize( + self.image_pool(x), + size=x.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + ] + aspp_outs.extend(self.aspp_modules(x)) + aspp_outs = torch.cat(aspp_outs, dim=1) + output = self.bottleneck(aspp_outs) + if self.c1_bottleneck is not None: + c1_output = self.c1_bottleneck(inputs[0]) + output = resize( + input=output, + size=c1_output.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + output = torch.cat([output, c1_output], dim=1) + output = self.sep_bottleneck(output) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/sep_fcn_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/sep_fcn_head.py new file mode 100644 index 0000000000000000000000000000000000000000..fabb624a530098e44ed1d9a9a7762addeaa126d6 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/sep_fcn_head.py @@ -0,0 +1,51 @@ +from custom_mmpkg.custom_mmcv.cnn import DepthwiseSeparableConvModule + +from ..builder import HEADS +from .fcn_head import FCNHead + + +@HEADS.register_module() +class DepthwiseSeparableFCNHead(FCNHead): + """Depthwise-Separable Fully Convolutional Network for Semantic + Segmentation. + + This head is implemented according to Fast-SCNN paper. + Args: + in_channels(int): Number of output channels of FFM. + channels(int): Number of middle-stage channels in the decode head. + concat_input(bool): Whether to concatenate original decode input into + the result of several consecutive convolution layers. + Default: True. + num_classes(int): Used to determine the dimension of + final prediction tensor. + in_index(int): Correspond with 'out_indices' in FastSCNN backbone. + norm_cfg (dict | None): Config of norm layers. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + loss_decode(dict): Config of loss type and some + relevant additional options. 
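+
+    Example (editor's sketch; the channel sizes mirror a typical Fast-SCNN
+    setting but are illustrative here, and the usual ``BaseDecodeHead``
+    keyword arguments are assumed)::
+
+        >>> import torch
+        >>> head = DepthwiseSeparableFCNHead(in_channels=128, channels=128,
+        ...                                  concat_input=False,
+        ...                                  num_classes=19)
+        >>> inputs = [torch.randn(1, 128, 32, 64)]
+        >>> head(inputs).shape
+        torch.Size([1, 19, 32, 64])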
+ """ + + def __init__(self, **kwargs): + super(DepthwiseSeparableFCNHead, self).__init__(**kwargs) + self.convs[0] = DepthwiseSeparableConvModule( + self.in_channels, + self.channels, + kernel_size=self.kernel_size, + padding=self.kernel_size // 2, + norm_cfg=self.norm_cfg) + for i in range(1, self.num_convs): + self.convs[i] = DepthwiseSeparableConvModule( + self.channels, + self.channels, + kernel_size=self.kernel_size, + padding=self.kernel_size // 2, + norm_cfg=self.norm_cfg) + + if self.concat_input: + self.conv_cat = DepthwiseSeparableConvModule( + self.in_channels + self.channels, + self.channels, + kernel_size=self.kernel_size, + padding=self.kernel_size // 2, + norm_cfg=self.norm_cfg) diff --git a/custom_mmpkg/custom_mmseg/models/decode_heads/uper_head.py b/custom_mmpkg/custom_mmseg/models/decode_heads/uper_head.py new file mode 100644 index 0000000000000000000000000000000000000000..d4990010074568484b8ea768bbca8e43d407659a --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/decode_heads/uper_head.py @@ -0,0 +1,126 @@ +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from custom_mmpkg.custom_mmseg.ops import resize +from ..builder import HEADS +from .decode_head import BaseDecodeHead +from .psp_head import PPM + + +@HEADS.register_module() +class UPerHead(BaseDecodeHead): + """Unified Perceptual Parsing for Scene Understanding. + + This head is the implementation of `UPerNet + `_. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module applied on the last feature. Default: (1, 2, 3, 6). + """ + + def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): + super(UPerHead, self).__init__( + input_transform='multiple_select', **kwargs) + # PSP Module + self.psp_modules = PPM( + pool_scales, + self.in_channels[-1], + self.channels, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + self.bottleneck = ConvModule( + self.in_channels[-1] + len(pool_scales) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + # FPN Module + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + for in_channels in self.in_channels[:-1]: # skip the top layer + l_conv = ConvModule( + in_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + inplace=False) + fpn_conv = ConvModule( + self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + inplace=False) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + self.fpn_bottleneck = ConvModule( + len(self.in_channels) * self.channels, + self.channels, + 3, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def psp_forward(self, inputs): + """Forward function of PSP module.""" + x = inputs[-1] + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = torch.cat(psp_outs, dim=1) + output = self.bottleneck(psp_outs) + + return output + + def forward(self, inputs): + """Forward function.""" + + inputs = self._transform_inputs(inputs) + + # build laterals + laterals = [ + lateral_conv(inputs[i]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + laterals.append(self.psp_forward(inputs)) + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + prev_shape = laterals[i - 
1].shape[2:] + laterals[i - 1] += resize( + laterals[i], + size=prev_shape, + mode='bilinear', + align_corners=self.align_corners) + + # build outputs + fpn_outs = [ + self.fpn_convs[i](laterals[i]) + for i in range(used_backbone_levels - 1) + ] + # append psp feature + fpn_outs.append(laterals[-1]) + + for i in range(used_backbone_levels - 1, 0, -1): + fpn_outs[i] = resize( + fpn_outs[i], + size=fpn_outs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) + fpn_outs = torch.cat(fpn_outs, dim=1) + output = self.fpn_bottleneck(fpn_outs) + output = self.cls_seg(output) + return output diff --git a/custom_mmpkg/custom_mmseg/models/losses/__init__.py b/custom_mmpkg/custom_mmseg/models/losses/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..beca72045694273d63465bac2f27dbc6672271db --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/losses/__init__.py @@ -0,0 +1,12 @@ +from .accuracy import Accuracy, accuracy +from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, + cross_entropy, mask_cross_entropy) +from .dice_loss import DiceLoss +from .lovasz_loss import LovaszLoss +from .utils import reduce_loss, weight_reduce_loss, weighted_loss + +__all__ = [ + 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', + 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss', + 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss' +] diff --git a/custom_mmpkg/custom_mmseg/models/losses/accuracy.py b/custom_mmpkg/custom_mmseg/models/losses/accuracy.py new file mode 100644 index 0000000000000000000000000000000000000000..c0fd2e7e74a0f721c4a814c09d6e453e5956bb38 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/losses/accuracy.py @@ -0,0 +1,78 @@ +import torch.nn as nn + + +def accuracy(pred, target, topk=1, thresh=None): + """Calculate accuracy according to the prediction and target. + + Args: + pred (torch.Tensor): The model prediction, shape (N, num_class, ...) + target (torch.Tensor): The target of each prediction, shape (N, , ...) + topk (int | tuple[int], optional): If the predictions in ``topk`` + matches the target, the predictions will be regarded as + correct ones. Defaults to 1. + thresh (float, optional): If not None, predictions with scores under + this threshold are considered incorrect. Default to None. + + Returns: + float | tuple[float]: If the input ``topk`` is a single integer, + the function will return a single float as accuracy. If + ``topk`` is a tuple containing multiple integers, the + function will return a tuple containing accuracies of + each ``topk`` number. + """ + assert isinstance(topk, (int, tuple)) + if isinstance(topk, int): + topk = (topk, ) + return_single = True + else: + return_single = False + + maxk = max(topk) + if pred.size(0) == 0: + accu = [pred.new_tensor(0.) for i in range(len(topk))] + return accu[0] if return_single else accu + assert pred.ndim == target.ndim + 1 + assert pred.size(0) == target.size(0) + assert maxk <= pred.size(1), \ + f'maxk {maxk} exceeds pred dimension {pred.size(1)}' + pred_value, pred_label = pred.topk(maxk, dim=1) + # transpose to shape (maxk, N, ...) 
+ pred_label = pred_label.transpose(0, 1) + correct = pred_label.eq(target.unsqueeze(0).expand_as(pred_label)) + if thresh is not None: + # Only prediction values larger than thresh are counted as correct + correct = correct & (pred_value > thresh).t() + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / target.numel())) + return res[0] if return_single else res + + +class Accuracy(nn.Module): + """Accuracy calculation module.""" + + def __init__(self, topk=(1, ), thresh=None): + """Module to calculate the accuracy. + + Args: + topk (tuple, optional): The criterion used to calculate the + accuracy. Defaults to (1,). + thresh (float, optional): If not None, predictions with scores + under this threshold are considered incorrect. Default to None. + """ + super().__init__() + self.topk = topk + self.thresh = thresh + + def forward(self, pred, target): + """Forward function to calculate accuracy. + + Args: + pred (torch.Tensor): Prediction of models. + target (torch.Tensor): Target for each prediction. + + Returns: + tuple[float]: The accuracies under different topk criterions. + """ + return accuracy(pred, target, self.topk, self.thresh) diff --git a/custom_mmpkg/custom_mmseg/models/losses/cross_entropy_loss.py b/custom_mmpkg/custom_mmseg/models/losses/cross_entropy_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..42c0790c98616bb69621deed55547fc04c7392ef --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/losses/cross_entropy_loss.py @@ -0,0 +1,198 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES +from .utils import get_class_weight, weight_reduce_loss + + +def cross_entropy(pred, + label, + weight=None, + class_weight=None, + reduction='mean', + avg_factor=None, + ignore_index=-100): + """The wrapper function for :func:`F.cross_entropy`""" + # class_weight is a manual rescaling weight given to each class. + # If given, has to be a Tensor of size C element-wise losses + loss = F.cross_entropy( + pred, + label, + weight=class_weight, + reduction='none', + ignore_index=ignore_index) + + # apply weights and do the reduction + if weight is not None: + weight = weight.float() + loss = weight_reduce_loss( + loss, weight=weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def _expand_onehot_labels(labels, label_weights, target_shape, ignore_index): + """Expand onehot labels to match the size of prediction.""" + bin_labels = labels.new_zeros(target_shape) + valid_mask = (labels >= 0) & (labels != ignore_index) + inds = torch.nonzero(valid_mask, as_tuple=True) + + if inds[0].numel() > 0: + if labels.dim() == 3: + bin_labels[inds[0], labels[valid_mask], inds[1], inds[2]] = 1 + else: + bin_labels[inds[0], labels[valid_mask]] = 1 + + valid_mask = valid_mask.unsqueeze(1).expand(target_shape).float() + if label_weights is None: + bin_label_weights = valid_mask + else: + bin_label_weights = label_weights.unsqueeze(1).expand(target_shape) + bin_label_weights *= valid_mask + + return bin_labels, bin_label_weights + + +def binary_cross_entropy(pred, + label, + weight=None, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=255): + """Calculate the binary CrossEntropy loss. + + Args: + pred (torch.Tensor): The prediction with shape (N, 1). + label (torch.Tensor): The learning label of the prediction. + weight (torch.Tensor, optional): Sample-wise loss weight. 
+ reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (int | None): The label index to be ignored. Default: 255 + + Returns: + torch.Tensor: The calculated loss + """ + if pred.dim() != label.dim(): + assert (pred.dim() == 2 and label.dim() == 1) or ( + pred.dim() == 4 and label.dim() == 3), \ + 'Only pred shape [N, C], label shape [N] or pred shape [N, C, ' \ + 'H, W], label shape [N, H, W] are supported' + label, weight = _expand_onehot_labels(label, weight, pred.shape, + ignore_index) + + # weighted element-wise losses + if weight is not None: + weight = weight.float() + loss = F.binary_cross_entropy_with_logits( + pred, label.float(), pos_weight=class_weight, reduction='none') + # do the reduction for the weighted loss + loss = weight_reduce_loss( + loss, weight, reduction=reduction, avg_factor=avg_factor) + + return loss + + +def mask_cross_entropy(pred, + target, + label, + reduction='mean', + avg_factor=None, + class_weight=None, + ignore_index=None): + """Calculate the CrossEntropy loss for masks. + + Args: + pred (torch.Tensor): The prediction with shape (N, C), C is the number + of classes. + target (torch.Tensor): The learning label of the prediction. + label (torch.Tensor): ``label`` indicates the class label of the mask' + corresponding object. This will be used to select the mask in the + of the class which the object belongs to when the mask prediction + if not class-agnostic. + reduction (str, optional): The method used to reduce the loss. + Options are "none", "mean" and "sum". + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + class_weight (list[float], optional): The weight for each class. + ignore_index (None): Placeholder, to be consistent with other loss. + Default: None. + + Returns: + torch.Tensor: The calculated loss + """ + assert ignore_index is None, 'BCE loss does not support ignore_index' + # TODO: handle these two reserved arguments + assert reduction == 'mean' and avg_factor is None + num_rois = pred.size()[0] + inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device) + pred_slice = pred[inds, label].squeeze(1) + return F.binary_cross_entropy_with_logits( + pred_slice, target, weight=class_weight, reduction='mean')[None] + + +@LOSSES.register_module() +class CrossEntropyLoss(nn.Module): + """CrossEntropyLoss. + + Args: + use_sigmoid (bool, optional): Whether the prediction uses sigmoid + of softmax. Defaults to False. + use_mask (bool, optional): Whether to use mask cross entropy loss. + Defaults to False. + reduction (str, optional): . Defaults to 'mean'. + Options are "none", "mean" and "sum". + class_weight (list[float] | str, optional): Weight of each class. If in + str format, read them from a file. Defaults to None. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. 
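+
+    Example (editor's sketch with toy shapes; the default arguments give
+    plain pixel-wise softmax cross-entropy with mean reduction)::
+
+        >>> import torch
+        >>> loss_fn = CrossEntropyLoss(use_sigmoid=False, loss_weight=1.0)
+        >>> seg_logit = torch.randn(2, 19, 8, 8)
+        >>> seg_label = torch.randint(0, 19, (2, 8, 8))
+        >>> loss_fn(seg_logit, seg_label).dim()  # scalar loss
+        0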
+ """ + + def __init__(self, + use_sigmoid=False, + use_mask=False, + reduction='mean', + class_weight=None, + loss_weight=1.0): + super(CrossEntropyLoss, self).__init__() + assert (use_sigmoid is False) or (use_mask is False) + self.use_sigmoid = use_sigmoid + self.use_mask = use_mask + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = get_class_weight(class_weight) + + if self.use_sigmoid: + self.cls_criterion = binary_cross_entropy + elif self.use_mask: + self.cls_criterion = mask_cross_entropy + else: + self.cls_criterion = cross_entropy + + def forward(self, + cls_score, + label, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function.""" + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.class_weight is not None: + class_weight = cls_score.new_tensor(self.class_weight) + else: + class_weight = None + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + weight, + class_weight=class_weight, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_cls diff --git a/custom_mmpkg/custom_mmseg/models/losses/dice_loss.py b/custom_mmpkg/custom_mmseg/models/losses/dice_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..27a77b962d7d8b3079c7d6cd9db52280c6fb4970 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/losses/dice_loss.py @@ -0,0 +1,119 @@ +"""Modified from https://github.com/LikeLy-Journey/SegmenTron/blob/master/ +segmentron/solver/loss.py (Apache-2.0 License)""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES +from .utils import get_class_weight, weighted_loss + + +@weighted_loss +def dice_loss(pred, + target, + valid_mask, + smooth=1, + exponent=2, + class_weight=None, + ignore_index=255): + assert pred.shape[0] == target.shape[0] + total_loss = 0 + num_classes = pred.shape[1] + for i in range(num_classes): + if i != ignore_index: + dice_loss = binary_dice_loss( + pred[:, i], + target[..., i], + valid_mask=valid_mask, + smooth=smooth, + exponent=exponent) + if class_weight is not None: + dice_loss *= class_weight[i] + total_loss += dice_loss + return total_loss / num_classes + + +@weighted_loss +def binary_dice_loss(pred, target, valid_mask, smooth=1, exponent=2, **kwards): + assert pred.shape[0] == target.shape[0] + pred = pred.reshape(pred.shape[0], -1) + target = target.reshape(target.shape[0], -1) + valid_mask = valid_mask.reshape(valid_mask.shape[0], -1) + + num = torch.sum(torch.mul(pred, target) * valid_mask, dim=1) * 2 + smooth + den = torch.sum(pred.pow(exponent) + target.pow(exponent), dim=1) + smooth + + return 1 - num / den + + +@LOSSES.register_module() +class DiceLoss(nn.Module): + """DiceLoss. + + This loss is proposed in `V-Net: Fully Convolutional Neural Networks for + Volumetric Medical Image Segmentation `_. + + Args: + loss_type (str, optional): Binary or multi-class loss. + Default: 'multi_class'. Options are "binary" and "multi_class". + smooth (float): A float number to smooth loss, and avoid NaN error. + Default: 1 + exponent (float): An float number to calculate denominator + value: \\sum{x^exponent} + \\sum{y^exponent}. Default: 2. + reduction (str, optional): The method used to reduce the loss. Options + are "none", "mean" and "sum". This parameter only works when + per_image is True. Default: 'mean'. + class_weight (list[float] | str, optional): Weight of each class. 
If in + str format, read them from a file. Defaults to None. + loss_weight (float, optional): Weight of the loss. Default to 1.0. + ignore_index (int | None): The label index to be ignored. Default: 255. + """ + + def __init__(self, + smooth=1, + exponent=2, + reduction='mean', + class_weight=None, + loss_weight=1.0, + ignore_index=255, + **kwards): + super(DiceLoss, self).__init__() + self.smooth = smooth + self.exponent = exponent + self.reduction = reduction + self.class_weight = get_class_weight(class_weight) + self.loss_weight = loss_weight + self.ignore_index = ignore_index + + def forward(self, + pred, + target, + avg_factor=None, + reduction_override=None, + **kwards): + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.class_weight is not None: + class_weight = pred.new_tensor(self.class_weight) + else: + class_weight = None + + pred = F.softmax(pred, dim=1) + num_classes = pred.shape[1] + one_hot_target = F.one_hot( + torch.clamp(target.long(), 0, num_classes - 1), + num_classes=num_classes) + valid_mask = (target != self.ignore_index).long() + + loss = self.loss_weight * dice_loss( + pred, + one_hot_target, + valid_mask=valid_mask, + reduction=reduction, + avg_factor=avg_factor, + smooth=self.smooth, + exponent=self.exponent, + class_weight=class_weight, + ignore_index=self.ignore_index) + return loss diff --git a/custom_mmpkg/custom_mmseg/models/losses/lovasz_loss.py b/custom_mmpkg/custom_mmseg/models/losses/lovasz_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..e381378522673d41a4ce2b9b9d6d70b9b3102bb0 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/losses/lovasz_loss.py @@ -0,0 +1,303 @@ +"""Modified from https://github.com/bermanmaxim/LovaszSoftmax/blob/master/pytor +ch/lovasz_losses.py Lovasz-Softmax and Jaccard hinge loss in PyTorch Maxim +Berman 2018 ESAT-PSI KU Leuven (MIT License)""" + +import custom_mmpkg.custom_mmcv as mmcv +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..builder import LOSSES +from .utils import get_class_weight, weight_reduce_loss + + +def lovasz_grad(gt_sorted): + """Computes gradient of the Lovasz extension w.r.t sorted errors. + + See Alg. 1 in paper. + """ + p = len(gt_sorted) + gts = gt_sorted.sum() + intersection = gts - gt_sorted.float().cumsum(0) + union = gts + (1 - gt_sorted).float().cumsum(0) + jaccard = 1. 
- intersection / union + if p > 1: # cover 1-pixel case + jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] + return jaccard + + +def flatten_binary_logits(logits, labels, ignore_index=None): + """Flattens predictions in the batch (binary case) Remove labels equal to + 'ignore_index'.""" + logits = logits.view(-1) + labels = labels.view(-1) + if ignore_index is None: + return logits, labels + valid = (labels != ignore_index) + vlogits = logits[valid] + vlabels = labels[valid] + return vlogits, vlabels + + +def flatten_probs(probs, labels, ignore_index=None): + """Flattens predictions in the batch.""" + if probs.dim() == 3: + # assumes output of a sigmoid layer + B, H, W = probs.size() + probs = probs.view(B, 1, H, W) + B, C, H, W = probs.size() + probs = probs.permute(0, 2, 3, 1).contiguous().view(-1, C) # B*H*W, C=P,C + labels = labels.view(-1) + if ignore_index is None: + return probs, labels + valid = (labels != ignore_index) + vprobs = probs[valid.nonzero().squeeze()] + vlabels = labels[valid] + return vprobs, vlabels + + +def lovasz_hinge_flat(logits, labels): + """Binary Lovasz hinge loss. + + Args: + logits (torch.Tensor): [P], logits at each prediction + (between -infty and +infty). + labels (torch.Tensor): [P], binary ground truth labels (0 or 1). + + Returns: + torch.Tensor: The calculated loss. + """ + if len(labels) == 0: + # only void pixels, the gradients should be 0 + return logits.sum() * 0. + signs = 2. * labels.float() - 1. + errors = (1. - logits * signs) + errors_sorted, perm = torch.sort(errors, dim=0, descending=True) + perm = perm.data + gt_sorted = labels[perm] + grad = lovasz_grad(gt_sorted) + loss = torch.dot(F.relu(errors_sorted), grad) + return loss + + +def lovasz_hinge(logits, + labels, + classes='present', + per_image=False, + class_weight=None, + reduction='mean', + avg_factor=None, + ignore_index=255): + """Binary Lovasz hinge loss. + + Args: + logits (torch.Tensor): [B, H, W], logits at each pixel + (between -infty and +infty). + labels (torch.Tensor): [B, H, W], binary ground truth masks (0 or 1). + classes (str | list[int], optional): Placeholder, to be consistent with + other loss. Default: None. + per_image (bool, optional): If per_image is True, compute the loss per + image instead of per batch. Default: False. + class_weight (list[float], optional): Placeholder, to be consistent + with other loss. Default: None. + reduction (str, optional): The method used to reduce the loss. Options + are "none", "mean" and "sum". This parameter only works when + per_image is True. Default: 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. This parameter only works when per_image is True. + Default: None. + ignore_index (int | None): The label index to be ignored. Default: 255. + + Returns: + torch.Tensor: The calculated loss. + """ + if per_image: + loss = [ + lovasz_hinge_flat(*flatten_binary_logits( + logit.unsqueeze(0), label.unsqueeze(0), ignore_index)) + for logit, label in zip(logits, labels) + ] + loss = weight_reduce_loss( + torch.stack(loss), None, reduction, avg_factor) + else: + loss = lovasz_hinge_flat( + *flatten_binary_logits(logits, labels, ignore_index)) + return loss + + +def lovasz_softmax_flat(probs, labels, classes='present', class_weight=None): + """Multi-class Lovasz-Softmax loss. + + Args: + probs (torch.Tensor): [P, C], class probabilities at each prediction + (between 0 and 1). + labels (torch.Tensor): [P], ground truth labels (between 0 and C - 1). 
+ classes (str | list[int], optional): Classes chosen to calculate loss. + 'all' for all classes, 'present' for classes present in labels, or + a list of classes to average. Default: 'present'. + class_weight (list[float], optional): The weight for each class. + Default: None. + + Returns: + torch.Tensor: The calculated loss. + """ + if probs.numel() == 0: + # only void pixels, the gradients should be 0 + return probs * 0. + C = probs.size(1) + losses = [] + class_to_sum = list(range(C)) if classes in ['all', 'present'] else classes + for c in class_to_sum: + fg = (labels == c).float() # foreground for class c + if (classes == 'present' and fg.sum() == 0): + continue + if C == 1: + if len(classes) > 1: + raise ValueError('Sigmoid output possible only with 1 class') + class_pred = probs[:, 0] + else: + class_pred = probs[:, c] + errors = (fg - class_pred).abs() + errors_sorted, perm = torch.sort(errors, 0, descending=True) + perm = perm.data + fg_sorted = fg[perm] + loss = torch.dot(errors_sorted, lovasz_grad(fg_sorted)) + if class_weight is not None: + loss *= class_weight[c] + losses.append(loss) + return torch.stack(losses).mean() + + +def lovasz_softmax(probs, + labels, + classes='present', + per_image=False, + class_weight=None, + reduction='mean', + avg_factor=None, + ignore_index=255): + """Multi-class Lovasz-Softmax loss. + + Args: + probs (torch.Tensor): [B, C, H, W], class probabilities at each + prediction (between 0 and 1). + labels (torch.Tensor): [B, H, W], ground truth labels (between 0 and + C - 1). + classes (str | list[int], optional): Classes chosen to calculate loss. + 'all' for all classes, 'present' for classes present in labels, or + a list of classes to average. Default: 'present'. + per_image (bool, optional): If per_image is True, compute the loss per + image instead of per batch. Default: False. + class_weight (list[float], optional): The weight for each class. + Default: None. + reduction (str, optional): The method used to reduce the loss. Options + are "none", "mean" and "sum". This parameter only works when + per_image is True. Default: 'mean'. + avg_factor (int, optional): Average factor that is used to average + the loss. This parameter only works when per_image is True. + Default: None. + ignore_index (int | None): The label index to be ignored. Default: 255. + + Returns: + torch.Tensor: The calculated loss. + """ + + if per_image: + loss = [ + lovasz_softmax_flat( + *flatten_probs( + prob.unsqueeze(0), label.unsqueeze(0), ignore_index), + classes=classes, + class_weight=class_weight) + for prob, label in zip(probs, labels) + ] + loss = weight_reduce_loss( + torch.stack(loss), None, reduction, avg_factor) + else: + loss = lovasz_softmax_flat( + *flatten_probs(probs, labels, ignore_index), + classes=classes, + class_weight=class_weight) + return loss + + +@LOSSES.register_module() +class LovaszLoss(nn.Module): + """LovaszLoss. + + This loss is proposed in `The Lovasz-Softmax loss: A tractable surrogate + for the optimization of the intersection-over-union measure in neural + networks `_. + + Args: + loss_type (str, optional): Binary or multi-class loss. + Default: 'multi_class'. Options are "binary" and "multi_class". + classes (str | list[int], optional): Classes chosen to calculate loss. + 'all' for all classes, 'present' for classes present in labels, or + a list of classes to average. Default: 'present'. + per_image (bool, optional): If per_image is True, compute the loss per + image instead of per batch. Default: False. 
+ reduction (str, optional): The method used to reduce the loss. Options + are "none", "mean" and "sum". This parameter only works when + per_image is True. Default: 'mean'. + class_weight (list[float] | str, optional): Weight of each class. If in + str format, read them from a file. Defaults to None. + loss_weight (float, optional): Weight of the loss. Defaults to 1.0. + """ + + def __init__(self, + loss_type='multi_class', + classes='present', + per_image=False, + reduction='mean', + class_weight=None, + loss_weight=1.0): + super(LovaszLoss, self).__init__() + assert loss_type in ('binary', 'multi_class'), "loss_type should be \ + 'binary' or 'multi_class'." + + if loss_type == 'binary': + self.cls_criterion = lovasz_hinge + else: + self.cls_criterion = lovasz_softmax + assert classes in ('all', 'present') or mmcv.is_list_of(classes, int) + if not per_image: + assert reduction == 'none', "reduction should be 'none' when \ + per_image is False." + + self.classes = classes + self.per_image = per_image + self.reduction = reduction + self.loss_weight = loss_weight + self.class_weight = get_class_weight(class_weight) + + def forward(self, + cls_score, + label, + weight=None, + avg_factor=None, + reduction_override=None, + **kwargs): + """Forward function.""" + assert reduction_override in (None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + if self.class_weight is not None: + class_weight = cls_score.new_tensor(self.class_weight) + else: + class_weight = None + + # if multi-class loss, transform logits to probs + if self.cls_criterion == lovasz_softmax: + cls_score = F.softmax(cls_score, dim=1) + + loss_cls = self.loss_weight * self.cls_criterion( + cls_score, + label, + self.classes, + self.per_image, + class_weight=class_weight, + reduction=reduction, + avg_factor=avg_factor, + **kwargs) + return loss_cls diff --git a/custom_mmpkg/custom_mmseg/models/losses/utils.py b/custom_mmpkg/custom_mmseg/models/losses/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..cdfbd436d2305f82af065a853e789d6fa37614cd --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/losses/utils.py @@ -0,0 +1,121 @@ +import functools + +import custom_mmpkg.custom_mmcv as mmcv +import numpy as np +import torch.nn.functional as F + + +def get_class_weight(class_weight): + """Get class weight for loss function. + + Args: + class_weight (list[float] | str | None): If class_weight is a str, + take it as a file name and read from it. + """ + if isinstance(class_weight, str): + # take it as a file path + if class_weight.endswith('.npy'): + class_weight = np.load(class_weight) + else: + # pkl, json or yaml + class_weight = mmcv.load(class_weight) + + return class_weight + + +def reduce_loss(loss, reduction): + """Reduce loss as specified. + + Args: + loss (Tensor): Elementwise loss tensor. + reduction (str): Options are "none", "mean" and "sum". + + Return: + Tensor: Reduced loss tensor. + """ + reduction_enum = F._Reduction.get_enum(reduction) + # none: 0, elementwise_mean:1, sum: 2 + if reduction_enum == 0: + return loss + elif reduction_enum == 1: + return loss.mean() + elif reduction_enum == 2: + return loss.sum() + + +def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None): + """Apply element-wise weight and reduce loss. + + Args: + loss (Tensor): Element-wise loss. + weight (Tensor): Element-wise weights. + reduction (str): Same as built-in losses of PyTorch. 
+ avg_factor (float): Avarage factor when computing the mean of losses. + + Returns: + Tensor: Processed loss values. + """ + # if weight is specified, apply element-wise weight + if weight is not None: + assert weight.dim() == loss.dim() + if weight.dim() > 1: + assert weight.size(1) == 1 or weight.size(1) == loss.size(1) + loss = loss * weight + + # if avg_factor is not specified, just reduce the loss + if avg_factor is None: + loss = reduce_loss(loss, reduction) + else: + # if reduction is mean, then average the loss by avg_factor + if reduction == 'mean': + loss = loss.sum() / avg_factor + # if reduction is 'none', then do nothing, otherwise raise an error + elif reduction != 'none': + raise ValueError('avg_factor can not be used with reduction="sum"') + return loss + + +def weighted_loss(loss_func): + """Create a weighted version of a given loss function. + + To use this decorator, the loss function must have the signature like + `loss_func(pred, target, **kwargs)`. The function only needs to compute + element-wise loss without any reduction. This decorator will add weight + and reduction arguments to the function. The decorated function will have + the signature like `loss_func(pred, target, weight=None, reduction='mean', + avg_factor=None, **kwargs)`. + + :Example: + + >>> import torch + >>> @weighted_loss + >>> def l1_loss(pred, target): + >>> return (pred - target).abs() + + >>> pred = torch.Tensor([0, 2, 3]) + >>> target = torch.Tensor([1, 1, 1]) + >>> weight = torch.Tensor([1, 0, 1]) + + >>> l1_loss(pred, target) + tensor(1.3333) + >>> l1_loss(pred, target, weight) + tensor(1.) + >>> l1_loss(pred, target, reduction='none') + tensor([1., 1., 2.]) + >>> l1_loss(pred, target, weight, avg_factor=2) + tensor(1.5000) + """ + + @functools.wraps(loss_func) + def wrapper(pred, + target, + weight=None, + reduction='mean', + avg_factor=None, + **kwargs): + # get element-wise loss + loss = loss_func(pred, target, **kwargs) + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + return wrapper diff --git a/custom_mmpkg/custom_mmseg/models/necks/__init__.py b/custom_mmpkg/custom_mmseg/models/necks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9b9d3d5b3fe80247642d962edd6fb787537d01d6 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/necks/__init__.py @@ -0,0 +1,4 @@ +from .fpn import FPN +from .multilevel_neck import MultiLevelNeck + +__all__ = ['FPN', 'MultiLevelNeck'] diff --git a/custom_mmpkg/custom_mmseg/models/necks/fpn.py b/custom_mmpkg/custom_mmseg/models/necks/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..3c32cc5e44f92e7779ba1ba913c2482107e5900d --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/necks/fpn.py @@ -0,0 +1,212 @@ +import torch.nn as nn +import torch.nn.functional as F +from custom_mmpkg.custom_mmcv.cnn import ConvModule, xavier_init + +from ..builder import NECKS + + +@NECKS.register_module() +class FPN(nn.Module): + """Feature Pyramid Network. + + This is an implementation of - Feature Pyramid Networks for Object + Detection (https://arxiv.org/abs/1612.03144) + + Args: + in_channels (List[int]): Number of input channels per scale. + out_channels (int): Number of output channels (used at each scale) + num_outs (int): Number of output scales. + start_level (int): Index of the start input backbone level used to + build the feature pyramid. Default: 0. + end_level (int): Index of the end input backbone level (exclusive) to + build the feature pyramid. 
Default: -1, which means the last level. + add_extra_convs (bool | str): If bool, it decides whether to add conv + layers on top of the original feature maps. Default to False. + If True, its actual mode is specified by `extra_convs_on_inputs`. + If str, it specifies the source feature map of the extra convs. + Only the following options are allowed + + - 'on_input': Last feat map of neck inputs (i.e. backbone feature). + - 'on_lateral': Last feature map after lateral convs. + - 'on_output': The last output feature map after fpn convs. + extra_convs_on_inputs (bool, deprecated): Whether to apply extra convs + on the original feature from the backbone. If True, + it is equivalent to `add_extra_convs='on_input'`. If False, it is + equivalent to set `add_extra_convs='on_output'`. Default to True. + relu_before_extra_convs (bool): Whether to apply relu before the extra + conv. Default: False. + no_norm_on_lateral (bool): Whether to apply norm on lateral. + Default: False. + conv_cfg (dict): Config dict for convolution layer. Default: None. + norm_cfg (dict): Config dict for normalization layer. Default: None. + act_cfg (str): Config dict for activation layer in ConvModule. + Default: None. + upsample_cfg (dict): Config dict for interpolate layer. + Default: `dict(mode='nearest')` + + Example: + >>> import torch + >>> in_channels = [2, 3, 5, 7] + >>> scales = [340, 170, 84, 43] + >>> inputs = [torch.rand(1, c, s, s) + ... for c, s in zip(in_channels, scales)] + >>> self = FPN(in_channels, 11, len(in_channels)).eval() + >>> outputs = self.forward(inputs) + >>> for i in range(len(outputs)): + ... print(f'outputs[{i}].shape = {outputs[i].shape}') + outputs[0].shape = torch.Size([1, 11, 340, 340]) + outputs[1].shape = torch.Size([1, 11, 170, 170]) + outputs[2].shape = torch.Size([1, 11, 84, 84]) + outputs[3].shape = torch.Size([1, 11, 43, 43]) + """ + + def __init__(self, + in_channels, + out_channels, + num_outs, + start_level=0, + end_level=-1, + add_extra_convs=False, + extra_convs_on_inputs=False, + relu_before_extra_convs=False, + no_norm_on_lateral=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None, + upsample_cfg=dict(mode='nearest')): + super(FPN, self).__init__() + assert isinstance(in_channels, list) + self.in_channels = in_channels + self.out_channels = out_channels + self.num_ins = len(in_channels) + self.num_outs = num_outs + self.relu_before_extra_convs = relu_before_extra_convs + self.no_norm_on_lateral = no_norm_on_lateral + self.fp16_enabled = False + self.upsample_cfg = upsample_cfg.copy() + + if end_level == -1: + self.backbone_end_level = self.num_ins + assert num_outs >= self.num_ins - start_level + else: + # if end_level < inputs, no extra level is allowed + self.backbone_end_level = end_level + assert end_level <= len(in_channels) + assert num_outs == end_level - start_level + self.start_level = start_level + self.end_level = end_level + self.add_extra_convs = add_extra_convs + assert isinstance(add_extra_convs, (str, bool)) + if isinstance(add_extra_convs, str): + # Extra_convs_source choices: 'on_input', 'on_lateral', 'on_output' + assert add_extra_convs in ('on_input', 'on_lateral', 'on_output') + elif add_extra_convs: # True + if extra_convs_on_inputs: + # For compatibility with previous release + # TODO: deprecate `extra_convs_on_inputs` + self.add_extra_convs = 'on_input' + else: + self.add_extra_convs = 'on_output' + + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + + for i in range(self.start_level, self.backbone_end_level): + l_conv = 
ConvModule( + in_channels[i], + out_channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg if not self.no_norm_on_lateral else None, + act_cfg=act_cfg, + inplace=False) + fpn_conv = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + # add extra conv layers (e.g., RetinaNet) + extra_levels = num_outs - self.backbone_end_level + self.start_level + if self.add_extra_convs and extra_levels >= 1: + for i in range(extra_levels): + if i == 0 and self.add_extra_convs == 'on_input': + in_channels = self.in_channels[self.backbone_end_level - 1] + else: + in_channels = out_channels + extra_fpn_conv = ConvModule( + in_channels, + out_channels, + 3, + stride=2, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + inplace=False) + self.fpn_convs.append(extra_fpn_conv) + + # default init_weights for conv(msra) and norm in ConvModule + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + xavier_init(m, distribution='uniform') + + def forward(self, inputs): + assert len(inputs) == len(self.in_channels) + + # build laterals + laterals = [ + lateral_conv(inputs[i + self.start_level]) + for i, lateral_conv in enumerate(self.lateral_convs) + ] + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + # In some cases, fixing `scale factor` (e.g. 2) is preferred, but + # it cannot co-exist with `size` in `F.interpolate`. + if 'scale_factor' in self.upsample_cfg: + laterals[i - 1] += F.interpolate(laterals[i], + **self.upsample_cfg) + else: + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] += F.interpolate( + laterals[i], size=prev_shape, **self.upsample_cfg) + + # build outputs + # part 1: from original levels + outs = [ + self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels) + ] + # part 2: add extra levels + if self.num_outs > len(outs): + # use max pool to get more levels on top of outputs + # (e.g., Faster R-CNN, Mask R-CNN) + if not self.add_extra_convs: + for i in range(self.num_outs - used_backbone_levels): + outs.append(F.max_pool2d(outs[-1], 1, stride=2)) + # add conv layers on top of original feature maps (RetinaNet) + else: + if self.add_extra_convs == 'on_input': + extra_source = inputs[self.backbone_end_level - 1] + elif self.add_extra_convs == 'on_lateral': + extra_source = laterals[-1] + elif self.add_extra_convs == 'on_output': + extra_source = outs[-1] + else: + raise NotImplementedError + outs.append(self.fpn_convs[used_backbone_levels](extra_source)) + for i in range(used_backbone_levels + 1, self.num_outs): + if self.relu_before_extra_convs: + outs.append(self.fpn_convs[i](F.relu(outs[-1]))) + else: + outs.append(self.fpn_convs[i](outs[-1])) + return tuple(outs) diff --git a/custom_mmpkg/custom_mmseg/models/necks/multilevel_neck.py b/custom_mmpkg/custom_mmseg/models/necks/multilevel_neck.py new file mode 100644 index 0000000000000000000000000000000000000000..ce5e8563f7d7b27944fbad2c247f789967433bba --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/necks/multilevel_neck.py @@ -0,0 +1,70 @@ +import torch.nn as nn +import torch.nn.functional as F +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from ..builder import NECKS + + +@NECKS.register_module() +class MultiLevelNeck(nn.Module): + """MultiLevelNeck. + + A neck structure connect vit backbone and decoder_heads. 
+    Args:
+        in_channels (List[int]): Number of input channels per scale.
+        out_channels (int): Number of output channels (used at each scale).
+        scales (List[int]): Scale factors for each input feature map.
+        norm_cfg (dict): Config dict for normalization layer. Default: None.
+        act_cfg (dict): Config dict for activation layer in ConvModule.
+            Default: None.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 out_channels,
+                 scales=[0.5, 1, 2, 4],
+                 norm_cfg=None,
+                 act_cfg=None):
+        super(MultiLevelNeck, self).__init__()
+        assert isinstance(in_channels, list)
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.scales = scales
+        self.num_outs = len(scales)
+        self.lateral_convs = nn.ModuleList()
+        self.convs = nn.ModuleList()
+        for in_channel in in_channels:
+            self.lateral_convs.append(
+                ConvModule(
+                    in_channel,
+                    out_channels,
+                    kernel_size=1,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+        for _ in range(self.num_outs):
+            self.convs.append(
+                ConvModule(
+                    out_channels,
+                    out_channels,
+                    kernel_size=3,
+                    padding=1,
+                    stride=1,
+                    norm_cfg=norm_cfg,
+                    act_cfg=act_cfg))
+
+    def forward(self, inputs):
+        assert len(inputs) == len(self.in_channels)
+        # project each backbone feature map with its lateral 1x1 conv
+        inputs = [
+            lateral_conv(inputs[i])
+            for i, lateral_conv in enumerate(self.lateral_convs)
+        ]
+        # for len(inputs) not equal to self.num_outs
+        if len(inputs) == 1:
+            inputs = [inputs[0] for _ in range(self.num_outs)]
+        outs = []
+        for i in range(self.num_outs):
+            x_resize = F.interpolate(
+                inputs[i], scale_factor=self.scales[i], mode='bilinear')
+            outs.append(self.convs[i](x_resize))
+        return tuple(outs)
diff --git a/custom_mmpkg/custom_mmseg/models/segmentors/__init__.py b/custom_mmpkg/custom_mmseg/models/segmentors/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..dca2f09405330743c476e190896bee39c45498ea
--- /dev/null
+++ b/custom_mmpkg/custom_mmseg/models/segmentors/__init__.py
@@ -0,0 +1,5 @@
+from .base import BaseSegmentor
+from .cascade_encoder_decoder import CascadeEncoderDecoder
+from .encoder_decoder import EncoderDecoder
+
+__all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder']
diff --git a/custom_mmpkg/custom_mmseg/models/segmentors/base.py b/custom_mmpkg/custom_mmseg/models/segmentors/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fd073f4a9d7713f02b107b7ad541384c3d27d6b
--- /dev/null
+++ b/custom_mmpkg/custom_mmseg/models/segmentors/base.py
@@ -0,0 +1,273 @@
+import logging
+import warnings
+from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
+
+import custom_mmpkg.custom_mmcv as mmcv
+import numpy as np
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from custom_mmpkg.custom_mmcv.runner import auto_fp16
+
+
+class BaseSegmentor(nn.Module):
+    """Base class for segmentors."""
+
+    __metaclass__ = ABCMeta
+
+    def __init__(self):
+        super(BaseSegmentor, self).__init__()
+        self.fp16_enabled = False
+
+    @property
+    def with_neck(self):
+        """bool: whether the segmentor has neck"""
+        return hasattr(self, 'neck') and self.neck is not None
+
+    @property
+    def with_auxiliary_head(self):
+        """bool: whether the segmentor has auxiliary head"""
+        return hasattr(self,
+                       'auxiliary_head') and self.auxiliary_head is not None
+
+    @property
+    def with_decode_head(self):
+        """bool: whether the segmentor has decode head"""
+        return hasattr(self, 'decode_head') and self.decode_head is not None
+
+    @abstractmethod
+    def extract_feat(self, imgs):
+        """Placeholder for extract features from
images.""" + pass + + @abstractmethod + def encode_decode(self, img, img_metas): + """Placeholder for encode images with backbone and decode into a + semantic segmentation map of the same size as input.""" + pass + + @abstractmethod + def forward_train(self, imgs, img_metas, **kwargs): + """Placeholder for Forward function for training.""" + pass + + @abstractmethod + def simple_test(self, img, img_meta, **kwargs): + """Placeholder for single image test.""" + pass + + @abstractmethod + def aug_test(self, imgs, img_metas, **kwargs): + """Placeholder for augmentation test.""" + pass + + def init_weights(self, pretrained=None): + """Initialize the weights in segmentor. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + if pretrained is not None: + logger = logging.getLogger() + logger.info(f'load model from: {pretrained}') + + def forward_test(self, imgs, img_metas, **kwargs): + """ + Args: + imgs (List[Tensor]): the outer list indicates test-time + augmentations and inner Tensor should have a shape NxCxHxW, + which contains all images in the batch. + img_metas (List[List[dict]]): the outer list indicates test-time + augs (multiscale, flip, etc.) and the inner list indicates + images in a batch. + """ + for var, name in [(imgs, 'imgs'), (img_metas, 'img_metas')]: + if not isinstance(var, list): + raise TypeError(f'{name} must be a list, but got ' + f'{type(var)}') + + num_augs = len(imgs) + if num_augs != len(img_metas): + raise ValueError(f'num of augmentations ({len(imgs)}) != ' + f'num of image meta ({len(img_metas)})') + # all images in the same aug batch all of the same ori_shape and pad + # shape + for img_meta in img_metas: + ori_shapes = [_['ori_shape'] for _ in img_meta] + assert all(shape == ori_shapes[0] for shape in ori_shapes) + img_shapes = [_['img_shape'] for _ in img_meta] + assert all(shape == img_shapes[0] for shape in img_shapes) + pad_shapes = [_['pad_shape'] for _ in img_meta] + assert all(shape == pad_shapes[0] for shape in pad_shapes) + + if num_augs == 1: + return self.simple_test(imgs[0], img_metas[0], **kwargs) + else: + return self.aug_test(imgs, img_metas, **kwargs) + + @auto_fp16(apply_to=('img', )) + def forward(self, img, img_metas, return_loss=True, **kwargs): + """Calls either :func:`forward_train` or :func:`forward_test` depending + on whether ``return_loss`` is ``True``. + + Note this setting will change the expected inputs. When + ``return_loss=True``, img and img_meta are single-nested (i.e. Tensor + and List[dict]), and when ``resturn_loss=False``, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + """ + if return_loss: + return self.forward_train(img, img_metas, **kwargs) + else: + return self.forward_test(img, img_metas, **kwargs) + + def train_step(self, data_batch, optimizer, **kwargs): + """The iteration step during training. + + This method defines an iteration step during training, except for the + back propagation and optimizer updating, which are done in an optimizer + hook. Note that in some complicated cases or models, the whole process + including back propagation and optimizer updating is also defined in + this method, such as GAN. + + Args: + data (dict): The output of dataloader. + optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of + runner is passed to ``train_step()``. This argument is unused + and reserved. 
+ + Returns: + dict: It should contain at least 3 keys: ``loss``, ``log_vars``, + ``num_samples``. + ``loss`` is a tensor for back propagation, which can be a + weighted sum of multiple losses. + ``log_vars`` contains all the variables to be sent to the + logger. + ``num_samples`` indicates the batch size (when the model is + DDP, it means the batch size on each GPU), which is used for + averaging the logs. + """ + losses = self(**data_batch) + loss, log_vars = self._parse_losses(losses) + + outputs = dict( + loss=loss, + log_vars=log_vars, + num_samples=len(data_batch['img_metas'])) + + return outputs + + def val_step(self, data_batch, **kwargs): + """The iteration step during validation. + + This method shares the same signature as :func:`train_step`, but used + during val epochs. Note that the evaluation after training epochs is + not implemented with this method, but an evaluation hook. + """ + output = self(**data_batch, **kwargs) + return output + + @staticmethod + def _parse_losses(losses): + """Parse the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor + which may be a weighted sum of all losses, log_vars contains + all the variables to be sent to the logger. + """ + log_vars = OrderedDict() + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars[loss_name] = loss_value.mean() + elif isinstance(loss_value, list): + log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(_value for _key, _value in log_vars.items() + if 'loss' in _key) + + log_vars['loss'] = loss + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars + + def show_result(self, + img, + result, + palette=None, + win_name='', + show=False, + wait_time=0, + out_file=None, + opacity=0.5): + """Draw `result` over `img`. + + Args: + img (str or Tensor): The image to be displayed. + result (Tensor): The semantic segmentation results to draw over + `img`. + palette (list[list[int]]] | np.ndarray | None): The palette of + segmentation map. If None is given, random palette will be + generated. Default: None + win_name (str): The window name. + wait_time (int): Value of waitKey param. + Default: 0. + show (bool): Whether to show the image. + Default: False. + out_file (str or None): The filename to write the image. + Default: None. + opacity(float): Opacity of painted segmentation map. + Default 0.5. + Must be in (0, 1] range. 
+ Returns: + img (Tensor): Only if not `show` or `out_file` + """ + img = mmcv.imread(img) + img = img.copy() + seg = result[0] + if palette is None: + if self.PALETTE is None: + palette = np.random.randint( + 0, 255, size=(len(self.CLASSES), 3)) + else: + palette = self.PALETTE + palette = np.array(palette) + assert palette.shape[0] == len(self.CLASSES) + assert palette.shape[1] == 3 + assert len(palette.shape) == 2 + assert 0 < opacity <= 1.0 + color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) + for label, color in enumerate(palette): + color_seg[seg == label, :] = color + # convert to BGR + color_seg = color_seg[..., ::-1] + + img = img * (1 - opacity) + color_seg * opacity + img = img.astype(np.uint8) + # if out_file specified, do not show image in window + if out_file is not None: + show = False + + if show: + mmcv.imshow(img, win_name, wait_time) + if out_file is not None: + mmcv.imwrite(img, out_file) + + if not (show or out_file): + warnings.warn('show==False and out_file is not specified, only ' + 'result image will be returned') + return img diff --git a/custom_mmpkg/custom_mmseg/models/segmentors/cascade_encoder_decoder.py b/custom_mmpkg/custom_mmseg/models/segmentors/cascade_encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..cdece0ac89713a0cead10d98959bad2e2d05c4e7 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/segmentors/cascade_encoder_decoder.py @@ -0,0 +1,98 @@ +from torch import nn + +from custom_mmpkg.custom_mmseg.core import add_prefix +from custom_mmpkg.custom_mmseg.ops import resize +from .. import builder +from ..builder import SEGMENTORS +from .encoder_decoder import EncoderDecoder + + +@SEGMENTORS.register_module() +class CascadeEncoderDecoder(EncoderDecoder): + """Cascade Encoder Decoder segmentors. + + CascadeEncoderDecoder almost the same as EncoderDecoder, while decoders of + CascadeEncoderDecoder are cascaded. The output of previous decoder_head + will be the input of next decoder_head. + """ + + def __init__(self, + num_stages, + backbone, + decode_head, + neck=None, + auxiliary_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + self.num_stages = num_stages + super(CascadeEncoderDecoder, self).__init__( + backbone=backbone, + decode_head=decode_head, + neck=neck, + auxiliary_head=auxiliary_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained) + + def _init_decode_head(self, decode_head): + """Initialize ``decode_head``""" + assert isinstance(decode_head, list) + assert len(decode_head) == self.num_stages + self.decode_head = nn.ModuleList() + for i in range(self.num_stages): + self.decode_head.append(builder.build_head(decode_head[i])) + self.align_corners = self.decode_head[-1].align_corners + self.num_classes = self.decode_head[-1].num_classes + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone and heads. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. 
+ """ + self.backbone.init_weights(pretrained=pretrained) + for i in range(self.num_stages): + self.decode_head[i].init_weights() + if self.with_auxiliary_head: + if isinstance(self.auxiliary_head, nn.ModuleList): + for aux_head in self.auxiliary_head: + aux_head.init_weights() + else: + self.auxiliary_head.init_weights() + + def encode_decode(self, img, img_metas): + """Encode images with backbone and decode into a semantic segmentation + map of the same size as input.""" + x = self.extract_feat(img) + out = self.decode_head[0].forward_test(x, img_metas, self.test_cfg) + for i in range(1, self.num_stages): + out = self.decode_head[i].forward_test(x, out, img_metas, + self.test_cfg) + out = resize( + input=out, + size=img.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + return out + + def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg): + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + + loss_decode = self.decode_head[0].forward_train( + x, img_metas, gt_semantic_seg, self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode_0')) + + for i in range(1, self.num_stages): + # forward test again, maybe unnecessary for most methods. + prev_outputs = self.decode_head[i - 1].forward_test( + x, img_metas, self.test_cfg) + loss_decode = self.decode_head[i].forward_train( + x, prev_outputs, img_metas, gt_semantic_seg, self.train_cfg) + losses.update(add_prefix(loss_decode, f'decode_{i}')) + + return losses diff --git a/custom_mmpkg/custom_mmseg/models/segmentors/encoder_decoder.py b/custom_mmpkg/custom_mmseg/models/segmentors/encoder_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..b8e42be53aba4b492efec32877bfe4c6a7a1e2aa --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/segmentors/encoder_decoder.py @@ -0,0 +1,298 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_mmpkg.custom_mmseg.core import add_prefix +from custom_mmpkg.custom_mmseg.ops import resize +from .. import builder +from ..builder import SEGMENTORS +from .base import BaseSegmentor + + +@SEGMENTORS.register_module() +class EncoderDecoder(BaseSegmentor): + """Encoder Decoder segmentors. + + EncoderDecoder typically consists of backbone, decode_head, auxiliary_head. + Note that auxiliary_head is only used for deep supervision during training, + which could be dumped during inference. 
+ """ + + def __init__(self, + backbone, + decode_head, + neck=None, + auxiliary_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None): + super(EncoderDecoder, self).__init__() + self.backbone = builder.build_backbone(backbone) + if neck is not None: + self.neck = builder.build_neck(neck) + self._init_decode_head(decode_head) + self._init_auxiliary_head(auxiliary_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + self.init_weights(pretrained=pretrained) + + assert self.with_decode_head + + def _init_decode_head(self, decode_head): + """Initialize ``decode_head``""" + self.decode_head = builder.build_head(decode_head) + self.align_corners = self.decode_head.align_corners + self.num_classes = self.decode_head.num_classes + + def _init_auxiliary_head(self, auxiliary_head): + """Initialize ``auxiliary_head``""" + if auxiliary_head is not None: + if isinstance(auxiliary_head, list): + self.auxiliary_head = nn.ModuleList() + for head_cfg in auxiliary_head: + self.auxiliary_head.append(builder.build_head(head_cfg)) + else: + self.auxiliary_head = builder.build_head(auxiliary_head) + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone and heads. + + Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + + super(EncoderDecoder, self).init_weights(pretrained) + self.backbone.init_weights(pretrained=pretrained) + self.decode_head.init_weights() + if self.with_auxiliary_head: + if isinstance(self.auxiliary_head, nn.ModuleList): + for aux_head in self.auxiliary_head: + aux_head.init_weights() + else: + self.auxiliary_head.init_weights() + + def extract_feat(self, img): + """Extract features from images.""" + x = self.backbone(img) + if self.with_neck: + x = self.neck(x) + return x + + def encode_decode(self, img, img_metas): + """Encode images with backbone and decode into a semantic segmentation + map of the same size as input.""" + x = self.extract_feat(img) + out = self._decode_head_forward_test(x, img_metas) + out = resize( + input=out, + size=img.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + return out + + def _decode_head_forward_train(self, x, img_metas, gt_semantic_seg): + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.forward_train(x, img_metas, + gt_semantic_seg, + self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode')) + return losses + + def _decode_head_forward_test(self, x, img_metas): + """Run forward function and calculate loss for decode head in + inference.""" + seg_logits = self.decode_head.forward_test(x, img_metas, self.test_cfg) + return seg_logits + + def _auxiliary_head_forward_train(self, x, img_metas, gt_semantic_seg): + """Run forward function and calculate loss for auxiliary head in + training.""" + losses = dict() + if isinstance(self.auxiliary_head, nn.ModuleList): + for idx, aux_head in enumerate(self.auxiliary_head): + loss_aux = aux_head.forward_train(x, img_metas, + gt_semantic_seg, + self.train_cfg) + losses.update(add_prefix(loss_aux, f'aux_{idx}')) + else: + loss_aux = self.auxiliary_head.forward_train( + x, img_metas, gt_semantic_seg, self.train_cfg) + losses.update(add_prefix(loss_aux, 'aux')) + + return losses + + def forward_dummy(self, img): + """Dummy forward function.""" + seg_logit = self.encode_decode(img, None) + + return seg_logit + + def forward_train(self, img, img_metas, gt_semantic_seg): + """Forward function for training. 
+ + Args: + img (Tensor): Input images. + img_metas (list[dict]): List of image info dict where each dict + has: 'img_shape', 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + gt_semantic_seg (Tensor): Semantic segmentation masks + used if the architecture supports semantic segmentation task. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + x = self.extract_feat(img) + + losses = dict() + + loss_decode = self._decode_head_forward_train(x, img_metas, + gt_semantic_seg) + losses.update(loss_decode) + + if self.with_auxiliary_head: + loss_aux = self._auxiliary_head_forward_train( + x, img_metas, gt_semantic_seg) + losses.update(loss_aux) + + return losses + + # TODO refactor + def slide_inference(self, img, img_meta, rescale): + """Inference by sliding-window with overlap. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. + """ + + h_stride, w_stride = self.test_cfg.stride + h_crop, w_crop = self.test_cfg.crop_size + batch_size, _, h_img, w_img = img.size() + num_classes = self.num_classes + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = img.new_zeros((batch_size, num_classes, h_img, w_img)) + count_mat = img.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = img[:, :, y1:y2, x1:x2] + crop_seg_logit = self.encode_decode(crop_img, img_meta) + preds += F.pad(crop_seg_logit, + (int(x1), int(preds.shape[3] - x2), int(y1), + int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + if torch.onnx.is_in_onnx_export(): + # cast count_mat to constant while exporting to ONNX + count_mat = torch.from_numpy( + count_mat.cpu().detach().numpy()).to(device=img.device) + preds = preds / count_mat + if rescale: + preds = resize( + preds, + size=img_meta[0]['ori_shape'][:2], + mode='bilinear', + align_corners=self.align_corners, + warning=False) + return preds + + def whole_inference(self, img, img_meta, rescale): + """Inference with full image.""" + + seg_logit = self.encode_decode(img, img_meta) + if rescale: + # support dynamic shape for onnx + if torch.onnx.is_in_onnx_export(): + size = img.shape[2:] + else: + size = img_meta[0]['ori_shape'][:2] + seg_logit = resize( + seg_logit, + size=size, + mode='bilinear', + align_corners=self.align_corners, + warning=False) + + return seg_logit + + def inference(self, img, img_meta, rescale): + """Inference with slide/whole style. + + Args: + img (Tensor): The input image of shape (N, 3, H, W). + img_meta (dict): Image info dict where each dict has: 'img_shape', + 'scale_factor', 'flip', and may also contain + 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:Collect`. + rescale (bool): Whether rescale back to original shape. + + Returns: + Tensor: The output segmentation map. 
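+
+        A worked sketch of the sliding-window grid (the crop size, stride
+        and image size here are illustrative, not values fixed by this
+        method): with ``crop_size=(512, 512)``, ``stride=(341, 341)`` and a
+        1024x2048 input, ``slide_inference`` visits
+        ``h_grids = max(1024 - 512 + 341 - 1, 0) // 341 + 1 = 3`` and
+        ``w_grids = max(2048 - 512 + 341 - 1, 0) // 341 + 1 = 6`` crop
+        positions, sums the zero-padded crop logits into a full-resolution
+        map and divides by the per-pixel visit count before the optional
+        rescale to ``ori_shape``.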
+ """ + + assert self.test_cfg.mode in ['slide', 'whole'] + ori_shape = img_meta[0]['ori_shape'] + assert all(_['ori_shape'] == ori_shape for _ in img_meta) + if self.test_cfg.mode == 'slide': + seg_logit = self.slide_inference(img, img_meta, rescale) + else: + seg_logit = self.whole_inference(img, img_meta, rescale) + output = F.softmax(seg_logit, dim=1) + flip = img_meta[0]['flip'] + if flip: + flip_direction = img_meta[0]['flip_direction'] + assert flip_direction in ['horizontal', 'vertical'] + if flip_direction == 'horizontal': + output = output.flip(dims=(3, )) + elif flip_direction == 'vertical': + output = output.flip(dims=(2, )) + + return output + + def simple_test(self, img, img_meta, rescale=True): + """Simple test with single image.""" + seg_logit = self.inference(img, img_meta, rescale) + seg_pred = seg_logit.argmax(dim=1) + if torch.onnx.is_in_onnx_export(): + # our inference backend only support 4D output + seg_pred = seg_pred.unsqueeze(0) + return seg_pred + seg_pred = seg_pred.cpu().numpy() + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred + + def aug_test(self, imgs, img_metas, rescale=True): + """Test with augmentations. + + Only rescale=True is supported. + """ + # aug_test rescale all imgs back to ori_shape for now + assert rescale + # to save memory, we get augmented seg logit inplace + seg_logit = self.inference(imgs[0], img_metas[0], rescale) + for i in range(1, len(imgs)): + cur_seg_logit = self.inference(imgs[i], img_metas[i], rescale) + seg_logit += cur_seg_logit + seg_logit /= len(imgs) + seg_pred = seg_logit.argmax(dim=1) + seg_pred = seg_pred.cpu().numpy() + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred diff --git a/custom_mmpkg/custom_mmseg/models/utils/__init__.py b/custom_mmpkg/custom_mmseg/models/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3d3bdd349b9f2ae499a2fcb2ac1d2e3c77befebe --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/utils/__init__.py @@ -0,0 +1,13 @@ +from .drop import DropPath +from .inverted_residual import InvertedResidual, InvertedResidualV3 +from .make_divisible import make_divisible +from .res_layer import ResLayer +from .se_layer import SELayer +from .self_attention_block import SelfAttentionBlock +from .up_conv_block import UpConvBlock +from .weight_init import trunc_normal_ + +__all__ = [ + 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', + 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_' +] diff --git a/custom_mmpkg/custom_mmseg/models/utils/drop.py b/custom_mmpkg/custom_mmseg/models/utils/drop.py new file mode 100644 index 0000000000000000000000000000000000000000..4520b0ff407d2a95a864086bdbca0065f222aa63 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/utils/drop.py @@ -0,0 +1,31 @@ +"""Modified from https://github.com/rwightman/pytorch-image- +models/blob/master/timm/models/layers/drop.py.""" + +import torch +from torch import nn + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of + residual blocks). + + Args: + drop_prob (float): Drop rate for paths of model. Dropout rate has + to be between 0 and 1. Default: 0. + """ + + def __init__(self, drop_prob=0.): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.keep_prob = 1 - drop_prob + + def forward(self, x): + if self.drop_prob == 0. 
or not self.training: + return x + shape = (x.shape[0], ) + (1, ) * ( + x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = self.keep_prob + torch.rand( + shape, dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(self.keep_prob) * random_tensor + return output diff --git a/custom_mmpkg/custom_mmseg/models/utils/inverted_residual.py b/custom_mmpkg/custom_mmseg/models/utils/inverted_residual.py new file mode 100644 index 0000000000000000000000000000000000000000..0fb93391f83c15c91cca833a296b922380607e66 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/utils/inverted_residual.py @@ -0,0 +1,208 @@ +from custom_mmpkg.custom_mmcv.cnn import ConvModule +from torch import nn +from torch.utils import checkpoint as cp + +from .se_layer import SELayer + + +class InvertedResidual(nn.Module): + """InvertedResidual block for MobileNetV2. + + Args: + in_channels (int): The input channels of the InvertedResidual block. + out_channels (int): The output channels of the InvertedResidual block. + stride (int): Stride of the middle (first) 3x3 convolution. + expand_ratio (int): Adjusts number of channels of the hidden layer + in InvertedResidual by this amount. + dilation (int): Dilation rate of depthwise conv. Default: 1 + conv_cfg (dict): Config dict for convolution layer. + Default: None, which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU6'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + stride, + expand_ratio, + dilation=1, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU6'), + with_cp=False): + super(InvertedResidual, self).__init__() + self.stride = stride + assert stride in [1, 2], f'stride must in [1, 2]. ' \ + f'But received {stride}.' + self.with_cp = with_cp + self.use_res_connect = self.stride == 1 and in_channels == out_channels + hidden_dim = int(round(in_channels * expand_ratio)) + + layers = [] + if expand_ratio != 1: + layers.append( + ConvModule( + in_channels=in_channels, + out_channels=hidden_dim, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + layers.extend([ + ConvModule( + in_channels=hidden_dim, + out_channels=hidden_dim, + kernel_size=3, + stride=stride, + padding=dilation, + dilation=dilation, + groups=hidden_dim, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg), + ConvModule( + in_channels=hidden_dim, + out_channels=out_channels, + kernel_size=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + ]) + self.conv = nn.Sequential(*layers) + + def forward(self, x): + + def _inner_forward(x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out + + +class InvertedResidualV3(nn.Module): + """Inverted Residual Block for MobileNetV3. + + Args: + in_channels (int): The input channels of this Module. + out_channels (int): The output channels of this Module. + mid_channels (int): The input channels of the depthwise convolution. + kernel_size (int): The kernel size of the depthwise convolution. + Default: 3. + stride (int): The stride of the depthwise convolution. 
Default: 1. + se_cfg (dict): Config dict for se layer. Default: None, which means no + se layer. + with_expand_conv (bool): Use expand conv or not. If set False, + mid_channels must be the same with in_channels. Default: True. + conv_cfg (dict): Config dict for convolution layer. Default: None, + which means using conv2d. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + + Returns: + Tensor: The output tensor. + """ + + def __init__(self, + in_channels, + out_channels, + mid_channels, + kernel_size=3, + stride=1, + se_cfg=None, + with_expand_conv=True, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + with_cp=False): + super(InvertedResidualV3, self).__init__() + self.with_res_shortcut = (stride == 1 and in_channels == out_channels) + assert stride in [1, 2] + self.with_cp = with_cp + self.with_se = se_cfg is not None + self.with_expand_conv = with_expand_conv + + if self.with_se: + assert isinstance(se_cfg, dict) + if not self.with_expand_conv: + assert mid_channels == in_channels + + if self.with_expand_conv: + self.expand_conv = ConvModule( + in_channels=in_channels, + out_channels=mid_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.depthwise_conv = ConvModule( + in_channels=mid_channels, + out_channels=mid_channels, + kernel_size=kernel_size, + stride=stride, + padding=kernel_size // 2, + groups=mid_channels, + conv_cfg=dict( + type='Conv2dAdaptivePadding') if stride == 2 else conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + if self.with_se: + self.se = SELayer(**se_cfg) + + self.linear_conv = ConvModule( + in_channels=mid_channels, + out_channels=out_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=None) + + def forward(self, x): + + def _inner_forward(x): + out = x + + if self.with_expand_conv: + out = self.expand_conv(out) + + out = self.depthwise_conv(out) + + if self.with_se: + out = self.se(out) + + out = self.linear_conv(out) + + if self.with_res_shortcut: + return x + out + else: + return out + + if self.with_cp and x.requires_grad: + out = cp.checkpoint(_inner_forward, x) + else: + out = _inner_forward(x) + + return out diff --git a/custom_mmpkg/custom_mmseg/models/utils/make_divisible.py b/custom_mmpkg/custom_mmseg/models/utils/make_divisible.py new file mode 100644 index 0000000000000000000000000000000000000000..75ad756052529f52fe83bb95dd1f0ecfc9a13078 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/utils/make_divisible.py @@ -0,0 +1,27 @@ +def make_divisible(value, divisor, min_value=None, min_ratio=0.9): + """Make divisible function. + + This function rounds the channel number to the nearest value that can be + divisible by the divisor. It is taken from the original tf repo. It ensures + that all layers have a channel number that is divisible by divisor. It can + be seen here: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py # noqa + + Args: + value (int): The original channel number. + divisor (int): The divisor to fully divide the channel number. + min_value (int): The minimum value of the output channel. + Default: None, means that the minimum value equal to the divisor. 
+ min_ratio (float): The minimum ratio of the rounded channel number to + the original channel number. Default: 0.9. + + Returns: + int: The modified output channel number. + """ + + if min_value is None: + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than (1-min_ratio). + if new_value < min_ratio * value: + new_value += divisor + return new_value diff --git a/custom_mmpkg/custom_mmseg/models/utils/res_layer.py b/custom_mmpkg/custom_mmseg/models/utils/res_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..1379e62e7c6e591fae25e57d548c5f735e3ad33a --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/utils/res_layer.py @@ -0,0 +1,94 @@ +from custom_mmpkg.custom_mmcv.cnn import build_conv_layer, build_norm_layer +from torch import nn as nn + + +class ResLayer(nn.Sequential): + """ResLayer to build ResNet style backbone. + + Args: + block (nn.Module): block used to build ResLayer. + inplanes (int): inplanes of block. + planes (int): planes of block. + num_blocks (int): number of blocks. + stride (int): stride of the first block. Default: 1 + avg_down (bool): Use AvgPool instead of stride conv when + downsampling in the bottleneck. Default: False + conv_cfg (dict): dictionary to construct and config conv layer. + Default: None + norm_cfg (dict): dictionary to construct and config norm layer. + Default: dict(type='BN') + multi_grid (int | None): Multi grid dilation rates of last + stage. Default: None + contract_dilation (bool): Whether contract first dilation of each layer + Default: False + """ + + def __init__(self, + block, + inplanes, + planes, + num_blocks, + stride=1, + dilation=1, + avg_down=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + multi_grid=None, + contract_dilation=False, + **kwargs): + self.block = block + + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = [] + conv_stride = stride + if avg_down: + conv_stride = 1 + downsample.append( + nn.AvgPool2d( + kernel_size=stride, + stride=stride, + ceil_mode=True, + count_include_pad=False)) + downsample.extend([ + build_conv_layer( + conv_cfg, + inplanes, + planes * block.expansion, + kernel_size=1, + stride=conv_stride, + bias=False), + build_norm_layer(norm_cfg, planes * block.expansion)[1] + ]) + downsample = nn.Sequential(*downsample) + + layers = [] + if multi_grid is None: + if dilation > 1 and contract_dilation: + first_dilation = dilation // 2 + else: + first_dilation = dilation + else: + first_dilation = multi_grid[0] + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=stride, + dilation=first_dilation, + downsample=downsample, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + inplanes = planes * block.expansion + for i in range(1, num_blocks): + layers.append( + block( + inplanes=inplanes, + planes=planes, + stride=1, + dilation=dilation if multi_grid is None else multi_grid[i], + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + **kwargs)) + super(ResLayer, self).__init__(*layers) diff --git a/custom_mmpkg/custom_mmseg/models/utils/se_layer.py b/custom_mmpkg/custom_mmseg/models/utils/se_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec17f8d6713a441de2400186ded50ff651821ca --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/utils/se_layer.py @@ -0,0 +1,57 @@ +import custom_mmpkg.custom_mmcv as mmcv +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule + +from 
.make_divisible import make_divisible + + +class SELayer(nn.Module): + """Squeeze-and-Excitation Module. + + Args: + channels (int): The input (and output) channels of the SE layer. + ratio (int): Squeeze ratio in SELayer, the intermediate channel will be + ``int(channels/ratio)``. Default: 16. + conv_cfg (None or dict): Config dict for convolution layer. + Default: None, which means using conv2d. + act_cfg (dict or Sequence[dict]): Config dict for activation layer. + If act_cfg is a dict, two activation layers will be configured + by this dict. If act_cfg is a sequence of dicts, the first + activation layer will be configured by the first dict and the + second activation layer will be configured by the second dict. + Default: (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, + divisor=6.0)). + """ + + def __init__(self, + channels, + ratio=16, + conv_cfg=None, + act_cfg=(dict(type='ReLU'), + dict(type='HSigmoid', bias=3.0, divisor=6.0))): + super(SELayer, self).__init__() + if isinstance(act_cfg, dict): + act_cfg = (act_cfg, act_cfg) + assert len(act_cfg) == 2 + assert mmcv.is_tuple_of(act_cfg, dict) + self.global_avgpool = nn.AdaptiveAvgPool2d(1) + self.conv1 = ConvModule( + in_channels=channels, + out_channels=make_divisible(channels // ratio, 8), + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[0]) + self.conv2 = ConvModule( + in_channels=make_divisible(channels // ratio, 8), + out_channels=channels, + kernel_size=1, + stride=1, + conv_cfg=conv_cfg, + act_cfg=act_cfg[1]) + + def forward(self, x): + out = self.global_avgpool(x) + out = self.conv1(out) + out = self.conv2(out) + return x * out diff --git a/custom_mmpkg/custom_mmseg/models/utils/self_attention_block.py b/custom_mmpkg/custom_mmseg/models/utils/self_attention_block.py new file mode 100644 index 0000000000000000000000000000000000000000..ad20ca6cf14b4dce040f350dbdd0fee6ce5ed9cf --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/utils/self_attention_block.py @@ -0,0 +1,159 @@ +import torch +from custom_mmpkg.custom_mmcv.cnn import ConvModule, constant_init +from torch import nn as nn +from torch.nn import functional as F + + +class SelfAttentionBlock(nn.Module): + """General self-attention block/non-local block. + + Please refer to https://arxiv.org/abs/1706.03762 for details about key, + query and value. + + Args: + key_in_channels (int): Input channels of key feature. + query_in_channels (int): Input channels of query feature. + channels (int): Output channels of key/query transform. + out_channels (int): Output channels. + share_key_query (bool): Whether share projection weight between key + and query projection. + query_downsample (nn.Module): Query downsample module. + key_downsample (nn.Module): Key downsample module. + key_query_num_convs (int): Number of convs for key/query projection. + value_num_convs (int): Number of convs for value projection. + matmul_norm (bool): Whether normalize attention map with sqrt of + channels + with_out (bool): Whether use out projection. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict|None): Config of activation layers. 
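+
+    Example (channel sizes and feature shapes below are illustrative only):
+        >>> import torch
+        >>> self_attn = SelfAttentionBlock(
+        ...     key_in_channels=64, query_in_channels=64, channels=32,
+        ...     out_channels=64, share_key_query=False, query_downsample=None,
+        ...     key_downsample=None, key_query_num_convs=2,
+        ...     value_out_num_convs=1, key_query_norm=True,
+        ...     value_out_norm=True, matmul_norm=True, with_out=True,
+        ...     conv_cfg=None, norm_cfg=dict(type='BN'),
+        ...     act_cfg=dict(type='ReLU'))
+        >>> query_feats = torch.rand(1, 64, 32, 32)
+        >>> key_feats = torch.rand(1, 64, 16, 16)
+        >>> # the context is attended from key_feats but keeps the query's
+        >>> # spatial resolution and is projected to ``out_channels``
+        >>> context = self_attn(query_feats, key_feats)
+        >>> context.shape
+        torch.Size([1, 64, 32, 32])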
+ """ + + def __init__(self, key_in_channels, query_in_channels, channels, + out_channels, share_key_query, query_downsample, + key_downsample, key_query_num_convs, value_out_num_convs, + key_query_norm, value_out_norm, matmul_norm, with_out, + conv_cfg, norm_cfg, act_cfg): + super(SelfAttentionBlock, self).__init__() + if share_key_query: + assert key_in_channels == query_in_channels + self.key_in_channels = key_in_channels + self.query_in_channels = query_in_channels + self.out_channels = out_channels + self.channels = channels + self.share_key_query = share_key_query + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.key_project = self.build_project( + key_in_channels, + channels, + num_convs=key_query_num_convs, + use_conv_module=key_query_norm, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + if share_key_query: + self.query_project = self.key_project + else: + self.query_project = self.build_project( + query_in_channels, + channels, + num_convs=key_query_num_convs, + use_conv_module=key_query_norm, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.value_project = self.build_project( + key_in_channels, + channels if with_out else out_channels, + num_convs=value_out_num_convs, + use_conv_module=value_out_norm, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + if with_out: + self.out_project = self.build_project( + channels, + out_channels, + num_convs=value_out_num_convs, + use_conv_module=value_out_norm, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.out_project = None + + self.query_downsample = query_downsample + self.key_downsample = key_downsample + self.matmul_norm = matmul_norm + + self.init_weights() + + def init_weights(self): + """Initialize weight of later layer.""" + if self.out_project is not None: + if not isinstance(self.out_project, ConvModule): + constant_init(self.out_project, 0) + + def build_project(self, in_channels, channels, num_convs, use_conv_module, + conv_cfg, norm_cfg, act_cfg): + """Build projection layer for key/query/value/out.""" + if use_conv_module: + convs = [ + ConvModule( + in_channels, + channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + ] + for _ in range(num_convs - 1): + convs.append( + ConvModule( + channels, + channels, + 1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg)) + else: + convs = [nn.Conv2d(in_channels, channels, 1)] + for _ in range(num_convs - 1): + convs.append(nn.Conv2d(channels, channels, 1)) + if len(convs) > 1: + convs = nn.Sequential(*convs) + else: + convs = convs[0] + return convs + + def forward(self, query_feats, key_feats): + """Forward function.""" + batch_size = query_feats.size(0) + query = self.query_project(query_feats) + if self.query_downsample is not None: + query = self.query_downsample(query) + query = query.reshape(*query.shape[:2], -1) + query = query.permute(0, 2, 1).contiguous() + + key = self.key_project(key_feats) + value = self.value_project(key_feats) + if self.key_downsample is not None: + key = self.key_downsample(key) + value = self.key_downsample(value) + key = key.reshape(*key.shape[:2], -1) + value = value.reshape(*value.shape[:2], -1) + value = value.permute(0, 2, 1).contiguous() + + sim_map = torch.matmul(query, key) + if self.matmul_norm: + sim_map = (self.channels**-.5) * sim_map + sim_map = F.softmax(sim_map, dim=-1) + + context = torch.matmul(sim_map, value) + context = context.permute(0, 2, 1).contiguous() + context = 
context.reshape(batch_size, -1, *query_feats.shape[2:]) + if self.out_project is not None: + context = self.out_project(context) + return context diff --git a/custom_mmpkg/custom_mmseg/models/utils/up_conv_block.py b/custom_mmpkg/custom_mmseg/models/utils/up_conv_block.py new file mode 100644 index 0000000000000000000000000000000000000000..a4320e261db00a8bd0ba2578bcf2fdde952d0270 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/utils/up_conv_block.py @@ -0,0 +1,101 @@ +import torch +import torch.nn as nn +from custom_mmpkg.custom_mmcv.cnn import ConvModule, build_upsample_layer + + +class UpConvBlock(nn.Module): + """Upsample convolution block in decoder for UNet. + + This upsample convolution block consists of one upsample module + followed by one convolution block. The upsample module expands the + high-level low-resolution feature map and the convolution block fuses + the upsampled high-level low-resolution feature map and the low-level + high-resolution feature map from encoder. + + Args: + conv_block (nn.Sequential): Sequential of convolutional layers. + in_channels (int): Number of input channels of the high-level + skip_channels (int): Number of input channels of the low-level + high-resolution feature map from encoder. + out_channels (int): Number of output channels. + num_convs (int): Number of convolutional layers in the conv_block. + Default: 2. + stride (int): Stride of convolutional layer in conv_block. Default: 1. + dilation (int): Dilation rate of convolutional layer in conv_block. + Default: 1. + with_cp (bool): Use checkpoint or not. Using checkpoint will save some + memory while slowing down the training speed. Default: False. + conv_cfg (dict | None): Config dict for convolution layer. + Default: None. + norm_cfg (dict | None): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict | None): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU'). + upsample_cfg (dict): The upsample config of the upsample module in + decoder. Default: dict(type='InterpConv'). If the size of + high-level feature map is the same as that of skip feature map + (low-level feature map from encoder), it does not need upsample the + high-level feature map and the upsample_cfg is None. + dcn (bool): Use deformable convolution in convolutional layer or not. + Default: None. + plugins (dict): plugins for convolutional layers. Default: None. + """ + + def __init__(self, + conv_block, + in_channels, + skip_channels, + out_channels, + num_convs=2, + stride=1, + dilation=1, + with_cp=False, + conv_cfg=None, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + upsample_cfg=dict(type='InterpConv'), + dcn=None, + plugins=None): + super(UpConvBlock, self).__init__() + assert dcn is None, 'Not implemented yet.' + assert plugins is None, 'Not implemented yet.' 
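+        # The upsample module below maps the high-level feature map from
+        # `in_channels` to `skip_channels`; after concatenation with the
+        # encoder skip feature in `forward`, the conv block therefore
+        # receives 2 * skip_channels input channels.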
+ + self.conv_block = conv_block( + in_channels=2 * skip_channels, + out_channels=out_channels, + num_convs=num_convs, + stride=stride, + dilation=dilation, + with_cp=with_cp, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + dcn=None, + plugins=None) + if upsample_cfg is not None: + self.upsample = build_upsample_layer( + cfg=upsample_cfg, + in_channels=in_channels, + out_channels=skip_channels, + with_cp=with_cp, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + else: + self.upsample = ConvModule( + in_channels, + skip_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + + def forward(self, skip, x): + """Forward function.""" + + x = self.upsample(x) + out = torch.cat([skip, x], dim=1) + out = self.conv_block(out) + + return out diff --git a/custom_mmpkg/custom_mmseg/models/utils/weight_init.py b/custom_mmpkg/custom_mmseg/models/utils/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..38141ba3d61f64ddfc0a31574b4648cbad96d7dd --- /dev/null +++ b/custom_mmpkg/custom_mmseg/models/utils/weight_init.py @@ -0,0 +1,62 @@ +"""Modified from https://github.com/rwightman/pytorch-image- +models/blob/master/timm/models/layers/drop.py.""" + +import math +import warnings + +import torch + + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + """Reference: https://people.sc.fsu.edu/~jburkardt/presentations + /truncated_normal.pdf""" + + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. ' + 'The distribution of values may be incorrect.', + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + lower_bound = norm_cdf((a - mean) / std) + upper_bound = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * lower_bound - 1, 2 * upper_bound - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
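+
+    For example (a minimal sketch; the tensor shape here is arbitrary)::
+
+        >>> import torch
+        >>> w = torch.empty(768, 768)
+        >>> _ = trunc_normal_(w, std=0.02)  # ~ N(0, 0.02^2), clamped to the default bounds [-2., 2.]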
+ Args: + tensor (``torch.Tensor``): an n-dimensional `torch.Tensor` + mean (float): the mean of the normal distribution + std (float): the standard deviation of the normal distribution + a (float): the minimum cutoff value + b (float): the maximum cutoff value + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) diff --git a/custom_mmpkg/custom_mmseg/ops/__init__.py b/custom_mmpkg/custom_mmseg/ops/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bec51c75b9363a9a19e9fb5c35f4e7dbd6f7751c --- /dev/null +++ b/custom_mmpkg/custom_mmseg/ops/__init__.py @@ -0,0 +1,4 @@ +from .encoding import Encoding +from .wrappers import Upsample, resize + +__all__ = ['Upsample', 'resize', 'Encoding'] diff --git a/custom_mmpkg/custom_mmseg/ops/encoding.py b/custom_mmpkg/custom_mmseg/ops/encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..7eb3629a6426550b8e4c537ee1ff4341893e489e --- /dev/null +++ b/custom_mmpkg/custom_mmseg/ops/encoding.py @@ -0,0 +1,74 @@ +import torch +from torch import nn +from torch.nn import functional as F + + +class Encoding(nn.Module): + """Encoding Layer: a learnable residual encoder. + + Input is of shape (batch_size, channels, height, width). + Output is of shape (batch_size, num_codes, channels). + + Args: + channels: dimension of the features or feature channels + num_codes: number of code words + """ + + def __init__(self, channels, num_codes): + super(Encoding, self).__init__() + # init codewords and smoothing factor + self.channels, self.num_codes = channels, num_codes + std = 1. / ((num_codes * channels)**0.5) + # [num_codes, channels] + self.codewords = nn.Parameter( + torch.empty(num_codes, channels, + dtype=torch.float).uniform_(-std, std), + requires_grad=True) + # [num_codes] + self.scale = nn.Parameter( + torch.empty(num_codes, dtype=torch.float).uniform_(-1, 0), + requires_grad=True) + + @staticmethod + def scaled_l2(x, codewords, scale): + num_codes, channels = codewords.size() + batch_size = x.size(0) + reshaped_scale = scale.view((1, 1, num_codes)) + expanded_x = x.unsqueeze(2).expand( + (batch_size, x.size(1), num_codes, channels)) + reshaped_codewords = codewords.view((1, 1, num_codes, channels)) + + scaled_l2_norm = reshaped_scale * ( + expanded_x - reshaped_codewords).pow(2).sum(dim=3) + return scaled_l2_norm + + @staticmethod + def aggregate(assignment_weights, x, codewords): + num_codes, channels = codewords.size() + reshaped_codewords = codewords.view((1, 1, num_codes, channels)) + batch_size = x.size(0) + + expanded_x = x.unsqueeze(2).expand( + (batch_size, x.size(1), num_codes, channels)) + encoded_feat = (assignment_weights.unsqueeze(3) * + (expanded_x - reshaped_codewords)).sum(dim=1) + return encoded_feat + + def forward(self, x): + assert x.dim() == 4 and x.size(1) == self.channels + # [batch_size, channels, height, width] + batch_size = x.size(0) + # [batch_size, height x width, channels] + x = x.view(batch_size, self.channels, -1).transpose(1, 2).contiguous() + # assignment_weights: [batch_size, channels, num_codes] + assignment_weights = F.softmax( + self.scaled_l2(x, self.codewords, self.scale), dim=2) + # aggregate + encoded_feat = self.aggregate(assignment_weights, x, self.codewords) + return encoded_feat + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(Nx{self.channels}xHxW =>Nx{self.num_codes}' \ + f'x{self.channels})' + return repr_str diff --git a/custom_mmpkg/custom_mmseg/ops/wrappers.py b/custom_mmpkg/custom_mmseg/ops/wrappers.py new file mode 
100644 index 0000000000000000000000000000000000000000..0ed9a0cb8d7c0e0ec2748dd89c652756653cac78 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/ops/wrappers.py @@ -0,0 +1,50 @@ +import warnings + +import torch.nn as nn +import torch.nn.functional as F + + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + return F.interpolate(input, size, scale_factor, mode, align_corners) + + +class Upsample(nn.Module): + + def __init__(self, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None): + super(Upsample, self).__init__() + self.size = size + if isinstance(scale_factor, tuple): + self.scale_factor = tuple(float(factor) for factor in scale_factor) + else: + self.scale_factor = float(scale_factor) if scale_factor else None + self.mode = mode + self.align_corners = align_corners + + def forward(self, x): + if not self.size: + size = [int(t * self.scale_factor) for t in x.shape[-2:]] + else: + size = self.size + return resize(x, size, None, self.mode, self.align_corners) diff --git a/custom_mmpkg/custom_mmseg/utils/__init__.py b/custom_mmpkg/custom_mmseg/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ac489e2dbbc0e6fa87f5088b4edcc20f8cadc1a6 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/utils/__init__.py @@ -0,0 +1,4 @@ +from .collect_env import collect_env +from .logger import get_root_logger + +__all__ = ['get_root_logger', 'collect_env'] diff --git a/custom_mmpkg/custom_mmseg/utils/collect_env.py b/custom_mmpkg/custom_mmseg/utils/collect_env.py new file mode 100644 index 0000000000000000000000000000000000000000..ce5cd6ffee77a234e7c54d6990d273bbd872b8f5 --- /dev/null +++ b/custom_mmpkg/custom_mmseg/utils/collect_env.py @@ -0,0 +1,17 @@ +from custom_mmpkg.custom_mmcv.utils import collect_env as collect_base_env +from custom_mmpkg.custom_mmcv.utils import get_git_hash + +import custom_mmpkg.custom_mmseg as mmseg + + +def collect_env(): + """Collect the information of the running environments.""" + env_info = collect_base_env() + env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}' + + return env_info + + +if __name__ == '__main__': + for name, val in collect_env().items(): + print('{}: {}'.format(name, val)) diff --git a/custom_mmpkg/custom_mmseg/utils/logger.py b/custom_mmpkg/custom_mmseg/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..bc2ac05b20a1f7f24a3e7876757d87b00972a69d --- /dev/null +++ b/custom_mmpkg/custom_mmseg/utils/logger.py @@ -0,0 +1,27 @@ +import logging + +from custom_mmpkg.custom_mmcv.utils import get_logger + + +def get_root_logger(log_file=None, log_level=logging.INFO): + """Get the root logger. + + The logger will be initialized if it has not been initialized. By default a + StreamHandler will be added. If `log_file` is specified, a FileHandler will + also be added. The name of the root logger is the top-level package name, + e.g., "mmseg". 
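+
+    Example (illustrative path only)::
+
+        >>> logger = get_root_logger(log_file='work_dirs/run.log')
+        >>> logger.info('training started')  # printed and, here, also written to the log file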
+ + Args: + log_file (str | None): The log filename. If specified, a FileHandler + will be added to the root logger. + log_level (int): The root logger level. Note that only the process of + rank 0 is affected, while other processes will set the level to + "Error" and be silent most of the time. + + Returns: + logging.Logger: The root logger. + """ + + logger = get_logger(name='mmseg', log_file=log_file, log_level=log_level) + + return logger diff --git a/custom_oneformer/__init__.py b/custom_oneformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..39ebcd384f616ae2ba170407cee3267d461a5914 --- /dev/null +++ b/custom_oneformer/__init__.py @@ -0,0 +1,9 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from . import data # register all new datasets +from . import modeling + +# config +from .config import * + +# models +from .oneformer_model import OneFormer \ No newline at end of file diff --git a/custom_oneformer/config.py b/custom_oneformer/config.py new file mode 100644 index 0000000000000000000000000000000000000000..fb803f468277ba0c1fdbb8635910e75c80cd90c3 --- /dev/null +++ b/custom_oneformer/config.py @@ -0,0 +1,239 @@ +# -*- coding: utf-8 -*- +# Copyright (c) Facebook, Inc. and its affiliates. +from custom_detectron2.config import CfgNode as CN + +__all__ = ["add_common_config", "add_oneformer_config", "add_swin_config", + "add_dinat_config", "add_beit_adapter_config", "add_convnext_config"] + +def add_common_config(cfg): + """ + Add config for common configuration + """ + # data config + # select the dataset mapper + cfg.INPUT.DATASET_MAPPER_NAME = "oneformer_unified" + # Color augmentation + cfg.INPUT.COLOR_AUG_SSD = False + # We retry random cropping until no single category in semantic segmentation GT occupies more + # than `SINGLE_CATEGORY_MAX_AREA` part of the crop. + cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0 + # Pad image and segmentation GT in dataset mapper. 
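+    # A value <= 0 disables this padding in the dataset mappers (padding is then
+    # typically handled when batching inside the model); a positive value is used by
+    # OneFormerUnifiedDatasetMapper.__call__ below to pad images and labels up to that size.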
+ cfg.INPUT.SIZE_DIVISIBILITY = -1 + + cfg.INPUT.TASK_SEQ_LEN = 77 + cfg.INPUT.MAX_SEQ_LEN = 77 + + cfg.INPUT.TASK_PROB = CN() + cfg.INPUT.TASK_PROB.SEMANTIC = 0.33 + cfg.INPUT.TASK_PROB.INSTANCE = 0.66 + + # test dataset + cfg.DATASETS.TEST_PANOPTIC = ("",) + cfg.DATASETS.TEST_INSTANCE = ("",) + cfg.DATASETS.TEST_SEMANTIC = ("",) + + # solver config + # weight decay on embedding + cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0 + # optimizer + cfg.SOLVER.OPTIMIZER = "ADAMW" + cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1 + + # wandb + cfg.WANDB = CN() + cfg.WANDB.PROJECT = "unified_dense_recognition" + cfg.WANDB.NAME = None + + cfg.MODEL.IS_TRAIN = False + cfg.MODEL.IS_DEMO = True + + # text encoder config + cfg.MODEL.TEXT_ENCODER = CN() + + cfg.MODEL.TEXT_ENCODER.WIDTH = 256 + cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH = 77 + cfg.MODEL.TEXT_ENCODER.NUM_LAYERS = 12 + cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE = 49408 + cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS = 2 + cfg.MODEL.TEXT_ENCODER.N_CTX = 16 + + # mask_former inference config + cfg.MODEL.TEST = CN() + cfg.MODEL.TEST.SEMANTIC_ON = True + cfg.MODEL.TEST.INSTANCE_ON = False + cfg.MODEL.TEST.PANOPTIC_ON = False + cfg.MODEL.TEST.DETECTION_ON = False + cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD = 0.0 + cfg.MODEL.TEST.OVERLAP_THRESHOLD = 0.0 + cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False + cfg.MODEL.TEST.TASK = "panoptic" + + # TEST AUG Slide + cfg.TEST.AUG.IS_SLIDE = False + cfg.TEST.AUG.CROP_SIZE = (640, 640) + cfg.TEST.AUG.STRIDE = (426, 426) + cfg.TEST.AUG.SCALE = (2048, 640) + cfg.TEST.AUG.SETR_MULTI_SCALE = True + cfg.TEST.AUG.KEEP_RATIO = True + cfg.TEST.AUG.SIZE_DIVISOR = 32 + + # pixel decoder config + cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256 + # adding transformer in pixel decoder + cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0 + # pixel decoder + cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder" + cfg.MODEL.SEM_SEG_HEAD.SEM_EMBED_DIM = 256 + cfg.MODEL.SEM_SEG_HEAD.INST_EMBED_DIM = 256 + + # LSJ aug + cfg.INPUT.IMAGE_SIZE = 1024 + cfg.INPUT.MIN_SCALE = 0.1 + cfg.INPUT.MAX_SCALE = 2.0 + + # MSDeformAttn encoder configs + cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"] + cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4 + cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8 + +def add_oneformer_config(cfg): + """ + Add config for ONE_FORMER. + """ + + # mask_former model config + cfg.MODEL.ONE_FORMER = CN() + + # loss + cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION = True + cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT = 0.1 + cfg.MODEL.ONE_FORMER.CLASS_WEIGHT = 1.0 + cfg.MODEL.ONE_FORMER.DICE_WEIGHT = 1.0 + cfg.MODEL.ONE_FORMER.MASK_WEIGHT = 20.0 + cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT = 0.5 + cfg.MODEL.ONE_FORMER.CONTRASTIVE_TEMPERATURE = 0.07 + + # transformer config + cfg.MODEL.ONE_FORMER.NHEADS = 8 + cfg.MODEL.ONE_FORMER.DROPOUT = 0.1 + cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD = 2048 + cfg.MODEL.ONE_FORMER.ENC_LAYERS = 0 + cfg.MODEL.ONE_FORMER.CLASS_DEC_LAYERS = 2 + cfg.MODEL.ONE_FORMER.DEC_LAYERS = 6 + cfg.MODEL.ONE_FORMER.PRE_NORM = False + + cfg.MODEL.ONE_FORMER.HIDDEN_DIM = 256 + cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES = 120 + cfg.MODEL.ONE_FORMER.NUM_OBJECT_CTX = 16 + cfg.MODEL.ONE_FORMER.USE_TASK_NORM = True + + cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE = "res5" + cfg.MODEL.ONE_FORMER.ENFORCE_INPUT_PROJ = False + + # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. 
ResNet) + # you can use this config to override + cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY = 32 + + # transformer module + cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME = "ContrastiveMultiScaleMaskedTransformerDecoder" + + # point loss configs + # Number of points sampled during training for a mask point head. + cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS = 112 * 112 + # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the + # original paper. + cfg.MODEL.ONE_FORMER.OVERSAMPLE_RATIO = 3.0 + # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in + # the original paper. + cfg.MODEL.ONE_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75 + +def add_swin_config(cfg): + """ + Add config forSWIN Backbone. + """ + + # swin transformer backbone + cfg.MODEL.SWIN = CN() + cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224 + cfg.MODEL.SWIN.PATCH_SIZE = 4 + cfg.MODEL.SWIN.EMBED_DIM = 96 + cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2] + cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24] + cfg.MODEL.SWIN.WINDOW_SIZE = 7 + cfg.MODEL.SWIN.MLP_RATIO = 4.0 + cfg.MODEL.SWIN.QKV_BIAS = True + cfg.MODEL.SWIN.QK_SCALE = None + cfg.MODEL.SWIN.DROP_RATE = 0.0 + cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0 + cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3 + cfg.MODEL.SWIN.APE = False + cfg.MODEL.SWIN.PATCH_NORM = True + cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"] + cfg.MODEL.SWIN.USE_CHECKPOINT = False + ## Semask additions + cfg.MODEL.SWIN.SEM_WINDOW_SIZE = 7 + cfg.MODEL.SWIN.NUM_SEM_BLOCKS = 1 + +def add_dinat_config(cfg): + """ + Add config for NAT Backbone. + """ + + # DINAT transformer backbone + cfg.MODEL.DiNAT = CN() + cfg.MODEL.DiNAT.DEPTHS = [3, 4, 18, 5] + cfg.MODEL.DiNAT.OUT_FEATURES = ["res2", "res3", "res4", "res5"] + cfg.MODEL.DiNAT.EMBED_DIM = 64 + cfg.MODEL.DiNAT.MLP_RATIO = 3.0 + cfg.MODEL.DiNAT.NUM_HEADS = [2, 4, 8, 16] + cfg.MODEL.DiNAT.DROP_PATH_RATE = 0.2 + cfg.MODEL.DiNAT.KERNEL_SIZE = 7 + cfg.MODEL.DiNAT.DILATIONS = [[1, 16, 1], [1, 4, 1, 8], [1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]] + cfg.MODEL.DiNAT.OUT_INDICES = (0, 1, 2, 3) + cfg.MODEL.DiNAT.QKV_BIAS = True + cfg.MODEL.DiNAT.QK_SCALE = None + cfg.MODEL.DiNAT.DROP_RATE = 0 + cfg.MODEL.DiNAT.ATTN_DROP_RATE = 0. + cfg.MODEL.DiNAT.IN_PATCH_SIZE = 4 + +def add_convnext_config(cfg): + """ + Add config for ConvNeXt Backbone. + """ + + # swin transformer backbone + cfg.MODEL.CONVNEXT = CN() + cfg.MODEL.CONVNEXT.IN_CHANNELS = 3 + cfg.MODEL.CONVNEXT.DEPTHS = [3, 3, 27, 3] + cfg.MODEL.CONVNEXT.DIMS = [192, 384, 768, 1536] + cfg.MODEL.CONVNEXT.DROP_PATH_RATE = 0.4 + cfg.MODEL.CONVNEXT.LSIT = 1.0 + cfg.MODEL.CONVNEXT.OUT_INDICES = [0, 1, 2, 3] + cfg.MODEL.CONVNEXT.OUT_FEATURES = ["res2", "res3", "res4", "res5"] + +def add_beit_adapter_config(cfg): + """ + Add config for BEiT Adapter Backbone. 
+ """ + + # beit adapter backbone + cfg.MODEL.BEiTAdapter = CN() + cfg.MODEL.BEiTAdapter.IMG_SIZE = 640 + cfg.MODEL.BEiTAdapter.PATCH_SIZE = 16 + cfg.MODEL.BEiTAdapter.EMBED_DIM = 1024 + cfg.MODEL.BEiTAdapter.DEPTH = 24 + cfg.MODEL.BEiTAdapter.NUM_HEADS = 16 + cfg.MODEL.BEiTAdapter.MLP_RATIO = 4 + cfg.MODEL.BEiTAdapter.QKV_BIAS = True + cfg.MODEL.BEiTAdapter.USE_ABS_POS_EMB = False + cfg.MODEL.BEiTAdapter.USE_REL_POS_BIAS = True + cfg.MODEL.BEiTAdapter.INIT_VALUES = 1e-6 + cfg.MODEL.BEiTAdapter.DROP_PATH_RATE = 0.3 + cfg.MODEL.BEiTAdapter.CONV_INPLANE = 64 + cfg.MODEL.BEiTAdapter.N_POINTS = 4 + cfg.MODEL.BEiTAdapter.DEFORM_NUM_HEADS = 16 + cfg.MODEL.BEiTAdapter.CFFN_RATIO = 0.25 + cfg.MODEL.BEiTAdapter.DEFORM_RATIO = 0.5 + cfg.MODEL.BEiTAdapter.WITH_CP = True + cfg.MODEL.BEiTAdapter.INTERACTION_INDEXES=[[0, 5], [6, 11], [12, 17], [18, 23]] + cfg.MODEL.BEiTAdapter.OUT_FEATURES = ["res2", "res3", "res4", "res5"] \ No newline at end of file diff --git a/custom_oneformer/data/__init__.py b/custom_oneformer/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..63ba265b1effc69f1eef16e57a04db8902ee347e --- /dev/null +++ b/custom_oneformer/data/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from . import datasets diff --git a/custom_oneformer/data/bpe_simple_vocab_16e6.txt.gz b/custom_oneformer/data/bpe_simple_vocab_16e6.txt.gz new file mode 100644 index 0000000000000000000000000000000000000000..36a15856e00a06a9fbed8cdd34d2393fea4a3113 --- /dev/null +++ b/custom_oneformer/data/bpe_simple_vocab_16e6.txt.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a +size 1356917 diff --git a/custom_oneformer/data/build.py b/custom_oneformer/data/build.py new file mode 100644 index 0000000000000000000000000000000000000000..7668f3451de4d5e338e6987a9da97d5812e838ca --- /dev/null +++ b/custom_oneformer/data/build.py @@ -0,0 +1,117 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from typing import Any, Callable, Dict, List, Optional, Union +import torch.utils.data as torchdata + +from custom_detectron2.config import configurable + + +from custom_detectron2.data.common import DatasetFromList, MapDataset +from custom_detectron2.data.dataset_mapper import DatasetMapper +from custom_detectron2.data.samplers import ( + InferenceSampler, +) +from custom_detectron2.data.build import ( + get_detection_dataset_dicts, + trivial_batch_collator +) +""" +This file contains the default logic to build a dataloader for training or testing. +""" + +__all__ = [ + "build_detection_test_loader", +] + + +def _test_loader_from_config(cfg, dataset_name, mapper=None): + """ + Uses the given `dataset_name` argument (instead of the names in cfg), because the + standard practice is to evaluate each test set individually (not combining them). 
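+
+    The returned dict supplies the keyword arguments of
+    :func:`build_detection_test_loader` below, wired up through
+    ``@configurable(from_config=_test_loader_from_config)``.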
+ """ + if isinstance(dataset_name, str): + dataset_name = [dataset_name] + + dataset = get_detection_dataset_dicts( + dataset_name, + filter_empty=False, + proposal_files=[ + cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name + ] + if cfg.MODEL.LOAD_PROPOSALS + else None, + ) + if mapper is None: + mapper = DatasetMapper(cfg, False) + return { + "dataset": dataset, + "mapper": mapper, + "num_workers": cfg.DATALOADER.NUM_WORKERS, + "sampler": InferenceSampler(len(dataset)) + if not isinstance(dataset, torchdata.IterableDataset) + else None, + } + + +@configurable(from_config=_test_loader_from_config) +def build_detection_test_loader( + dataset: Union[List[Any], torchdata.Dataset], + *, + mapper: Callable[[Dict[str, Any]], Any], + sampler: Optional[torchdata.Sampler] = None, + batch_size: int = 1, + num_workers: int = 0, + collate_fn: Optional[Callable[[List[Any]], Any]] = None, +) -> torchdata.DataLoader: + """ + Similar to `build_detection_train_loader`, with default batch size = 1, + and sampler = :class:`InferenceSampler`. This sampler coordinates all workers + to produce the exact set of all samples. + + Args: + dataset: a list of dataset dicts, + or a pytorch dataset (either map-style or iterable). They can be obtained + by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`. + mapper: a callable which takes a sample (dict) from dataset + and returns the format to be consumed by the model. + When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``. + sampler: a sampler that produces + indices to be applied on ``dataset``. Default to :class:`InferenceSampler`, + which splits the dataset across all workers. Sampler must be None + if `dataset` is iterable. + batch_size: the batch size of the data loader to be created. + Default to 1 image per worker since this is the standard when reporting + inference time in papers. + num_workers: number of parallel data loading workers + collate_fn: same as the argument of `torch.utils.data.DataLoader`. + Defaults to do no collation and return a list of data. + + Returns: + DataLoader: a torch DataLoader, that loads the given detection + dataset, with test-time transformation and batching. 
+ + Examples: + :: + data_loader = build_detection_test_loader( + DatasetRegistry.get("my_test"), + mapper=DatasetMapper(...)) + + # or, instantiate with a CfgNode: + data_loader = build_detection_test_loader(cfg, "my_test") + """ + if isinstance(dataset, list): + dataset = DatasetFromList(dataset, copy=False) + if mapper is not None: + dataset = MapDataset(dataset, mapper) + if isinstance(dataset, torchdata.IterableDataset): + assert sampler is None, "sampler must be None if dataset is IterableDataset" + else: + if sampler is None: + sampler = InferenceSampler(len(dataset)) + return torchdata.DataLoader( + dataset, + batch_size=batch_size, + sampler=sampler, + drop_last=False, + num_workers=num_workers, + collate_fn=trivial_batch_collator if collate_fn is None else collate_fn, + ) \ No newline at end of file diff --git a/custom_oneformer/data/dataset_mappers/__init__.py b/custom_oneformer/data/dataset_mappers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/custom_oneformer/data/dataset_mappers/__init__.py @@ -0,0 +1 @@ + diff --git a/custom_oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py b/custom_oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..44f3598e4b2c220356d55f3a415253f4b4a1fd79 --- /dev/null +++ b/custom_oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py @@ -0,0 +1,341 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import copy +import logging + +import numpy as np +import torch + +from custom_detectron2.data import MetadataCatalog +from custom_detectron2.config import configurable +from custom_detectron2.data import detection_utils as utils +from custom_detectron2.data import transforms as T +from custom_detectron2.structures import BitMasks, Instances +from custom_oneformer.utils.box_ops import masks_to_boxes +from custom_oneformer.data.tokenizer import SimpleTokenizer, Tokenize + +__all__ = ["COCOUnifiedNewBaselineDatasetMapper"] + + +def build_transform_gen(cfg, is_train): + """ + Create a list of default :class:`Augmentation` from config. + Now it includes resizing and flipping. + Returns: + list[Augmentation] + """ + assert is_train, "Only support training augmentation" + image_size = cfg.INPUT.IMAGE_SIZE + min_scale = cfg.INPUT.MIN_SCALE + max_scale = cfg.INPUT.MAX_SCALE + + augmentation = [] + + if cfg.INPUT.RANDOM_FLIP != "none": + augmentation.append( + T.RandomFlip( + horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal", + vertical=cfg.INPUT.RANDOM_FLIP == "vertical", + ) + ) + + augmentation.extend([ + T.ResizeScale( + min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size + ), + T.FixedSizeCrop(crop_size=(image_size, image_size)), + ]) + + return augmentation + + +# This is specifically designed for the COCO dataset. +class COCOUnifiedNewBaselineDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by OneFormer. + + This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation. 
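+    Concretely, the augmentations come from :func:`build_transform_gen` above: an
+    optional random flip, ``ResizeScale`` between ``INPUT.MIN_SCALE`` and
+    ``INPUT.MAX_SCALE`` of ``INPUT.IMAGE_SIZE``, followed by a fixed-size crop
+    (the large-scale jitter recipe).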
+ + The callable currently does the following: + + 1. Read the image from "file_name" + 2. Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + + @configurable + def __init__( + self, + is_train=True, + *, + num_queries, + tfm_gens, + meta, + image_format, + max_seq_len, + task_seq_len, + semantic_prob, + instance_prob, + ): + """ + NOTE: this interface is experimental. + Args: + is_train: for training or inference + augmentations: a list of augmentations or deterministic transforms to apply + crop_gen: crop augmentation + tfm_gens: data augmentation + image_format: an image format supported by :func:`detection_utils.read_image`. + """ + self.tfm_gens = tfm_gens + logging.getLogger(__name__).info( + "[COCOUnifiedNewBaselineDatasetMapper] Full TransformGens used in training: {}".format( + str(self.tfm_gens) + ) + ) + + self.img_format = image_format + self.is_train = is_train + self.meta = meta + self.ignore_label = self.meta.ignore_label + self.num_queries = num_queries + + self.things = [] + for k,v in self.meta.thing_dataset_id_to_contiguous_id.items(): + self.things.append(v) + self.class_names = self.meta.stuff_classes + self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len) + self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len) + self.semantic_prob = semantic_prob + self.instance_prob = instance_prob + + @classmethod + def from_config(cls, cfg, is_train=True): + # Build augmentation + tfm_gens = build_transform_gen(cfg, is_train) + dataset_names = cfg.DATASETS.TRAIN + meta = MetadataCatalog.get(dataset_names[0]) + + ret = { + "is_train": is_train, + "meta": meta, + "tfm_gens": tfm_gens, + "image_format": cfg.INPUT.FORMAT, + "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX, + "task_seq_len": cfg.INPUT.TASK_SEQ_LEN, + "max_seq_len": cfg.INPUT.MAX_SEQ_LEN, + "semantic_prob": cfg.INPUT.TASK_PROB.SEMANTIC, + "instance_prob": cfg.INPUT.TASK_PROB.INSTANCE, + } + return ret + + def _get_semantic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): + instances = Instances(image_shape) + + classes = [] + texts = ["a semantic photo"] * self.num_queries + masks = [] + label = np.ones_like(pan_seg_gt) * self.ignore_label + + for segment_info in segments_info: + class_id = segment_info["category_id"] + if not segment_info["iscrowd"]: + mask = pan_seg_gt == segment_info["id"] + if not np.all(mask == False): + if class_id not in classes: + cls_name = self.class_names[class_id] + classes.append(class_id) + masks.append(mask) + num_class_obj[cls_name] += 1 + else: + idx = classes.index(class_id) + masks[idx] += mask + masks[idx] = np.clip(masks[idx], 0, 1).astype(np.bool) + label[mask] = class_id + + num = 0 + for i, cls_name in enumerate(self.class_names): + if num_class_obj[cls_name] > 0: + for _ in range(num_class_obj[cls_name]): + if num >= len(texts): + break + texts[num] = f"a photo with a {cls_name}" + num += 1 + + classes = np.array(classes) + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) + instances.gt_bboxes = torch.zeros((0, 4)) + else: + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) + ) + instances.gt_masks = masks.tensor + # Placeholder bounding 
boxes for stuff regions. Note that these are not used during training. + instances.gt_bboxes = torch.stack([torch.tensor([0., 0., 1., 1.])] * instances.gt_masks.shape[0]) + return instances, texts, label + + def _get_instance_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): + instances = Instances(image_shape) + + classes = [] + texts = ["an instance photo"] * self.num_queries + masks = [] + label = np.ones_like(pan_seg_gt) * self.ignore_label + + for segment_info in segments_info: + class_id = segment_info["category_id"] + if class_id in self.things: + if not segment_info["iscrowd"]: + mask = pan_seg_gt == segment_info["id"] + if not np.all(mask == False): + cls_name = self.class_names[class_id] + classes.append(class_id) + masks.append(mask) + num_class_obj[cls_name] += 1 + label[mask] = class_id + + num = 0 + for i, cls_name in enumerate(self.class_names): + if num_class_obj[cls_name] > 0: + for _ in range(num_class_obj[cls_name]): + if num >= len(texts): + break + texts[num] = f"a photo with a {cls_name}" + num += 1 + + classes = np.array(classes) + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) + instances.gt_bboxes = torch.zeros((0, 4)) + else: + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) + ) + instances.gt_masks = masks.tensor + instances.gt_bboxes = masks_to_boxes(instances.gt_masks) + return instances, texts, label + + def _get_panoptic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): + instances = Instances(image_shape) + + classes = [] + texts = ["a panoptic photo"] * self.num_queries + masks = [] + label = np.ones_like(pan_seg_gt) * self.ignore_label + + for segment_info in segments_info: + class_id = segment_info["category_id"] + if not segment_info["iscrowd"]: + mask = pan_seg_gt == segment_info["id"] + if not np.all(mask == False): + cls_name = self.class_names[class_id] + classes.append(class_id) + masks.append(mask) + num_class_obj[cls_name] += 1 + label[mask] = class_id + + num = 0 + for i, cls_name in enumerate(self.class_names): + if num_class_obj[cls_name] > 0: + for _ in range(num_class_obj[cls_name]): + if num >= len(texts): + break + texts[num] = f"a photo with a {cls_name}" + num += 1 + + classes = np.array(classes) + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) + instances.gt_bboxes = torch.zeros((0, 4)) + else: + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) + ) + instances.gt_masks = masks.tensor + instances.gt_bboxes = masks_to_boxes(instances.gt_masks) + for i in range(instances.gt_classes.shape[0]): + # Placeholder bounding boxes for stuff regions. Note that these are not used during training. + if instances.gt_classes[i].item() not in self.things: + instances.gt_bboxes[i] = torch.tensor([0., 0., 1., 1.]) + return instances, texts, label + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
+ + Returns: + dict: a format that builtin models in detectron2 accept + """ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + image, transforms = T.apply_transform_gens(self.tfm_gens, image) + image_shape = image.shape[:2] # h, w + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + + if not self.is_train: + # USER: Modify this if you want to keep them for some reason. + dataset_dict.pop("annotations", None) + return dataset_dict + + # semantic segmentation + if "sem_seg_file_name" in dataset_dict: + # PyTorch transformation not implemented for uint16, so converting it to double first + sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") + sem_seg_gt = transforms.apply_segmentation(sem_seg_gt) + else: + sem_seg_gt = None + + if "pan_seg_file_name" in dataset_dict: + pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") + segments_info = dataset_dict["segments_info"] + + # apply the same transformation to panoptic segmentation + pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) + + from panopticapi.utils import rgb2id + pan_seg_gt = rgb2id(pan_seg_gt) + + prob_task = np.random.uniform(0,1.) + + num_class_obj = {} + + for name in self.class_names: + num_class_obj[name] = 0 + + if prob_task < self.semantic_prob: + task = "The task is semantic" + instances, text, sem_seg = self._get_semantic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) + elif prob_task < self.instance_prob: + task = "The task is instance" + instances, text, sem_seg = self._get_instance_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) + else: + task = "The task is panoptic" + instances, text, sem_seg = self._get_panoptic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) + + + dataset_dict["sem_seg"] = torch.from_numpy(sem_seg).long() + dataset_dict["instances"] = instances + dataset_dict["orig_shape"] = image_shape + dataset_dict["task"] = task + dataset_dict["text"] = text + dataset_dict["thing_ids"] = self.things + + return dataset_dict diff --git a/custom_oneformer/data/dataset_mappers/dataset_mapper.py b/custom_oneformer/data/dataset_mappers/dataset_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..004947e5044ac18eaa199986f4ede3e11eb78145 --- /dev/null +++ b/custom_oneformer/data/dataset_mappers/dataset_mapper.py @@ -0,0 +1,203 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/dataset_mapper.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import copy +import logging +import numpy as np +from typing import List, Optional, Union +import torch + +from custom_detectron2.config import configurable + +from custom_detectron2.data import detection_utils as utils +from custom_detectron2.data import transforms as T +from custom_oneformer.data.tokenizer import SimpleTokenizer, Tokenize + +__all__ = ["DatasetMapper"] + + +class DatasetMapper: + """ + A callable which takes a dataset 
dict in Detectron2 Dataset format, + and map it into a format used by the model. + + This is the default callable to be used to map your dataset dict into training data. + You may need to follow it to implement your own one for customized logic, + such as a different way to read or transform images. + See :doc:`/tutorials/data_loading` for details. + + The callable currently does the following: + + 1. Read the image from "file_name" + 2. Applies cropping/geometric transforms to the image and annotations + 3. Prepare data and annotations to Tensor and :class:`Instances` + """ + + @configurable + def __init__( + self, + is_train: bool, + *, + augmentations: List[Union[T.Augmentation, T.Transform]], + image_format: str, + task_seq_len: int, + task: str = "panoptic", + use_instance_mask: bool = False, + use_keypoint: bool = False, + instance_mask_format: str = "polygon", + keypoint_hflip_indices: Optional[np.ndarray] = None, + precomputed_proposal_topk: Optional[int] = None, + recompute_boxes: bool = False, + ): + """ + NOTE: this interface is experimental. + + Args: + is_train: whether it's used in training or inference + augmentations: a list of augmentations or deterministic transforms to apply + image_format: an image format supported by :func:`detection_utils.read_image`. + use_instance_mask: whether to process instance segmentation annotations, if available + use_keypoint: whether to process keypoint annotations if available + instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation + masks into this format. + keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices` + precomputed_proposal_topk: if given, will load pre-computed + proposals from dataset_dict and keep the top k proposals for each image. + recompute_boxes: whether to overwrite bounding box annotations + by computing tight bounding boxes from instance mask annotations. 
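+
+        Example (a sketch; ``cfg`` is assumed to be an already-built detectron2 config)::
+
+            >>> mapper = DatasetMapper(cfg, is_train=False)  # via the ``from_config`` hook
+            >>> data = mapper(dataset_dict)  # dataset_dict in Detectron2 Dataset format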
+ """ + if recompute_boxes: + assert use_instance_mask, "recompute_boxes requires instance masks" + # fmt: off + self.is_train = is_train + self.augmentations = T.AugmentationList(augmentations) + self.image_format = image_format + self.use_instance_mask = use_instance_mask + self.instance_mask_format = instance_mask_format + self.use_keypoint = use_keypoint + self.keypoint_hflip_indices = keypoint_hflip_indices + self.proposal_topk = precomputed_proposal_topk + self.recompute_boxes = recompute_boxes + self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len) + self.task = task + assert self.task in ["panoptic", "semantic", "instance"] + + # fmt: on + logger = logging.getLogger(__name__) + mode = "training" if is_train else "inference" + logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}") + + @classmethod + def from_config(cls, cfg, is_train: bool = True): + augs = utils.build_augmentation(cfg, is_train) + if cfg.INPUT.CROP.ENABLED and is_train: + augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE)) + recompute_boxes = cfg.MODEL.MASK_ON + else: + recompute_boxes = False + + ret = { + "is_train": is_train, + "augmentations": augs, + "image_format": cfg.INPUT.FORMAT, + "use_instance_mask": cfg.MODEL.MASK_ON, + "instance_mask_format": cfg.INPUT.MASK_FORMAT, + "use_keypoint": cfg.MODEL.KEYPOINT_ON, + "task_seq_len": cfg.INPUT.TASK_SEQ_LEN, + "recompute_boxes": recompute_boxes, + "task": cfg.MODEL.TEST.TASK, + } + + if cfg.MODEL.KEYPOINT_ON: + ret["keypoint_hflip_indices"] = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN) + + if cfg.MODEL.LOAD_PROPOSALS: + ret["precomputed_proposal_topk"] = ( + cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN + if is_train + else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST + ) + return ret + + def _transform_annotations(self, dataset_dict, transforms, image_shape): + # USER: Modify this if you want to keep them for some reason. + for anno in dataset_dict["annotations"]: + if not self.use_instance_mask: + anno.pop("segmentation", None) + if not self.use_keypoint: + anno.pop("keypoints", None) + + # USER: Implement additional transformations if you have other types of data + annos = [ + utils.transform_instance_annotations( + obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices + ) + for obj in dataset_dict.pop("annotations") + if obj.get("iscrowd", 0) == 0 + ] + instances = utils.annotations_to_instances( + annos, image_shape, mask_format=self.instance_mask_format + ) + + # After transforms such as cropping are applied, the bounding box may no longer + # tightly bound the object. As an example, imagine a triangle object + # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight + # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to + # the intersection of original bounding box and the cropping box. + if self.recompute_boxes: + instances.gt_boxes = instances.gt_masks.get_bounding_boxes() + dataset_dict["instances"] = utils.filter_empty_instances(instances) + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. 
+ + Returns: + dict: a format that builtin models in detectron2 accept + """ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + # USER: Write your own image loading if it's not from a file + image = utils.read_image(dataset_dict["file_name"], format=self.image_format) + utils.check_image_size(dataset_dict, image) + + task = f"The task is {self.task}" + dataset_dict["task"] = task + + # USER: Remove if you don't do semantic/panoptic segmentation. + if "sem_seg_file_name" in dataset_dict: + sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2) + else: + sem_seg_gt = None + + aug_input = T.AugInput(image, sem_seg=sem_seg_gt) + transforms = self.augmentations(aug_input) + image, sem_seg_gt = aug_input.image, aug_input.sem_seg + + image_shape = image.shape[:2] # h, w + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + if sem_seg_gt is not None: + dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long")) + + # USER: Remove if you don't use pre-computed proposals. + # Most users would not need this feature. + if self.proposal_topk is not None: + utils.transform_proposals( + dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk + ) + + if not self.is_train: + # USER: Modify this if you want to keep them for some reason. + dataset_dict.pop("annotations", None) + dataset_dict.pop("sem_seg_file_name", None) + return dataset_dict + + if "annotations" in dataset_dict: + self._transform_annotations(dataset_dict, transforms, image_shape) + + return dataset_dict \ No newline at end of file diff --git a/custom_oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py b/custom_oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..5c067c3015cdf361ef72549d2c6ca05cd9ea035f --- /dev/null +++ b/custom_oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py @@ -0,0 +1,375 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import copy +import logging +import os + +import numpy as np +import torch +from torch.nn import functional as F + +from custom_detectron2.config import configurable +from custom_detectron2.data import detection_utils as utils +from custom_detectron2.data import transforms as T +from custom_detectron2.structures import BitMasks, Instances +from custom_detectron2.data import MetadataCatalog +from custom_detectron2.projects.point_rend import ColorAugSSDTransform +from custom_oneformer.utils.box_ops import masks_to_boxes +from custom_oneformer.data.tokenizer import SimpleTokenizer, Tokenize + +__all__ = ["OneFormerUnifiedDatasetMapper"] + + +class OneFormerUnifiedDatasetMapper: + """ + A callable which takes a dataset dict in Detectron2 Dataset format, + and map it into a format used by OneFormer for universal segmentation. + + The callable currently does the following: + + 1. Read the image from "file_name" + 2. 
Applies geometric transforms to the image and annotation + 3. Find and applies suitable cropping to the image and annotation + 4. Prepare image and annotation to Tensors + """ + + @configurable + def __init__( + self, + is_train=True, + *, + name, + num_queries, + meta, + augmentations, + image_format, + ignore_label, + size_divisibility, + task_seq_len, + max_seq_len, + semantic_prob, + instance_prob, + ): + """ + NOTE: this interface is experimental. + Args: + is_train: for training or inference + augmentations: a list of augmentations or deterministic transforms to apply + image_format: an image format supported by :func:`detection_utils.read_image`. + ignore_label: the label that is ignored to evaluation + size_divisibility: pad image size to be divisible by this value + """ + self.is_train = is_train + self.meta = meta + self.name = name + self.tfm_gens = augmentations + self.img_format = image_format + self.ignore_label = ignore_label + self.size_divisibility = size_divisibility + self.num_queries = num_queries + + logger = logging.getLogger(__name__) + mode = "training" if is_train else "inference" + logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}") + + self.things = [] + for k,v in self.meta.thing_dataset_id_to_contiguous_id.items(): + self.things.append(v) + self.class_names = self.meta.stuff_classes + self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len) + self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len) + self.semantic_prob = semantic_prob + self.instance_prob = instance_prob + + @classmethod + def from_config(cls, cfg, is_train=True): + # Build augmentation + augs = [ + T.ResizeShortestEdge( + cfg.INPUT.MIN_SIZE_TRAIN, + cfg.INPUT.MAX_SIZE_TRAIN, + cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING, + ) + ] + if cfg.INPUT.CROP.ENABLED: + augs.append( + T.RandomCrop_CategoryAreaConstraint( + cfg.INPUT.CROP.TYPE, + cfg.INPUT.CROP.SIZE, + cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA, + cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + ) + ) + if cfg.INPUT.COLOR_AUG_SSD: + augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT)) + augs.append(T.RandomFlip()) + + # Assume always applies to the training set. 
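+        # Metadata (ignore_label, thing/stuff class lists) is read from the first
+        # dataset in cfg.DATASETS.TRAIN.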
+ dataset_names = cfg.DATASETS.TRAIN + meta = MetadataCatalog.get(dataset_names[0]) + ignore_label = meta.ignore_label + + ret = { + "is_train": is_train, + "meta": meta, + "name": dataset_names[0], + "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX, + "task_seq_len": cfg.INPUT.TASK_SEQ_LEN, + "max_seq_len": cfg.INPUT.MAX_SEQ_LEN, + "augmentations": augs, + "image_format": cfg.INPUT.FORMAT, + "ignore_label": ignore_label, + "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY, + "semantic_prob": cfg.INPUT.TASK_PROB.SEMANTIC, + "instance_prob": cfg.INPUT.TASK_PROB.INSTANCE, + } + return ret + + def _get_semantic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): + pan_seg_gt = pan_seg_gt.numpy() + instances = Instances(image_shape) + + classes = [] + texts = ["a semantic photo"] * self.num_queries + masks = [] + label = np.ones_like(pan_seg_gt) * self.ignore_label + + for segment_info in segments_info: + class_id = segment_info["category_id"] + if not segment_info["iscrowd"]: + mask = pan_seg_gt == segment_info["id"] + if not np.all(mask == False): + if class_id not in classes: + cls_name = self.class_names[class_id] + classes.append(class_id) + masks.append(mask) + num_class_obj[cls_name] += 1 + else: + idx = classes.index(class_id) + masks[idx] += mask + masks[idx] = np.clip(masks[idx], 0, 1).astype(np.bool) + label[mask] = class_id + + num = 0 + for i, cls_name in enumerate(self.class_names): + if num_class_obj[cls_name] > 0: + for _ in range(num_class_obj[cls_name]): + if num >= len(texts): + break + texts[num] = f"a photo with a {cls_name}" + num += 1 + + classes = np.array(classes) + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) + instances.gt_bboxes = torch.zeros((0, 4)) + else: + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) + ) + instances.gt_masks = masks.tensor + # Placeholder bounding boxes for stuff regions. Note that these are not used during training. 
+ instances.gt_bboxes = torch.stack([torch.tensor([0., 0., 1., 1.])] * instances.gt_masks.shape[0]) + return instances, texts, label + + def _get_instance_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): + pan_seg_gt = pan_seg_gt.numpy() + instances = Instances(image_shape) + + classes = [] + texts = ["an instance photo"] * self.num_queries + masks = [] + label = np.ones_like(pan_seg_gt) * self.ignore_label + + for segment_info in segments_info: + class_id = segment_info["category_id"] + if class_id in self.things: + if not segment_info["iscrowd"]: + mask = pan_seg_gt == segment_info["id"] + if not np.all(mask == False): + cls_name = self.class_names[class_id] + classes.append(class_id) + masks.append(mask) + num_class_obj[cls_name] += 1 + label[mask] = class_id + + num = 0 + for i, cls_name in enumerate(self.class_names): + if num_class_obj[cls_name] > 0: + for _ in range(num_class_obj[cls_name]): + if num >= len(texts): + break + texts[num] = f"a photo with a {cls_name}" + num += 1 + + classes = np.array(classes) + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) + instances.gt_bboxes = torch.zeros((0, 4)) + else: + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) + ) + instances.gt_masks = masks.tensor + instances.gt_bboxes = masks_to_boxes(instances.gt_masks) + return instances, texts, label + + def _get_panoptic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj): + pan_seg_gt = pan_seg_gt.numpy() + instances = Instances(image_shape) + + classes = [] + texts = ["a panoptic photo"] * self.num_queries + masks = [] + label = np.ones_like(pan_seg_gt) * self.ignore_label + + for segment_info in segments_info: + class_id = segment_info["category_id"] + if not segment_info["iscrowd"]: + mask = pan_seg_gt == segment_info["id"] + if not np.all(mask == False): + cls_name = self.class_names[class_id] + classes.append(class_id) + masks.append(mask) + num_class_obj[cls_name] += 1 + label[mask] = class_id + + num = 0 + for i, cls_name in enumerate(self.class_names): + if num_class_obj[cls_name] > 0: + for _ in range(num_class_obj[cls_name]): + if num >= len(texts): + break + texts[num] = f"a photo with a {cls_name}" + num += 1 + + classes = np.array(classes) + instances.gt_classes = torch.tensor(classes, dtype=torch.int64) + if len(masks) == 0: + # Some image does not have annotation (all ignored) + instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1])) + instances.gt_bboxes = torch.zeros((0, 4)) + else: + masks = BitMasks( + torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks]) + ) + instances.gt_masks = masks.tensor + instances.gt_bboxes = masks_to_boxes(instances.gt_masks) + for i in range(instances.gt_classes.shape[0]): + # Placeholder bounding boxes for stuff regions. Note that these are not used during training. + if instances.gt_classes[i].item() not in self.things: + instances.gt_bboxes[i] = torch.tensor([0., 0., 1., 1.]) + return instances, texts, label + + def __call__(self, dataset_dict): + """ + Args: + dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format. + + Returns: + dict: a format that builtin models in detectron2 accept + """ + assert self.is_train, "OneFormerUnifiedDatasetMapper should only be used for training!" 
+ + dataset_dict = copy.deepcopy(dataset_dict) # it will be modified by code below + image = utils.read_image(dataset_dict["file_name"], format=self.img_format) + utils.check_image_size(dataset_dict, image) + + # semantic segmentation + if "sem_seg_file_name" in dataset_dict: + # PyTorch transformation not implemented for uint16, so converting it to double first + sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double") + else: + sem_seg_gt = None + + # panoptic segmentation + if "pan_seg_file_name" in dataset_dict: + pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB") + segments_info = dataset_dict["segments_info"] + else: + pan_seg_gt = None + segments_info = None + + if pan_seg_gt is None: + raise ValueError( + "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format( + dataset_dict["file_name"] + ) + ) + + aug_input = T.AugInput(image, sem_seg=sem_seg_gt) + aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input) + image = aug_input.image + if sem_seg_gt is not None: + sem_seg_gt = aug_input.sem_seg + + # apply the same transformation to panoptic segmentation + pan_seg_gt = transforms.apply_segmentation(pan_seg_gt) + + from panopticapi.utils import rgb2id + + pan_seg_gt = rgb2id(pan_seg_gt) + + # Pad image and segmentation label here! + image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1))) + if sem_seg_gt is not None: + sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long")) + pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long")) + + if self.size_divisibility > 0: + image_size = (image.shape[-2], image.shape[-1]) + padding_size = [ + 0, + self.size_divisibility - image_size[1], + 0, + self.size_divisibility - image_size[0], + ] + image = F.pad(image, padding_size, value=128).contiguous() + if sem_seg_gt is not None: + sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous() + pan_seg_gt = F.pad( + pan_seg_gt, padding_size, value=0 + ).contiguous() # 0 is the VOID panoptic label + + image_shape = (image.shape[-2], image.shape[-1]) # h, w + + # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory, + # but not efficient on large generic data structures due to the use of pickle & mp.Queue. + # Therefore it's important to use torch.Tensor. + dataset_dict["image"] = image + + if "annotations" in dataset_dict: + raise ValueError("Pemantic segmentation dataset should not have 'annotations'.") + + prob_task = np.random.uniform(0,1.) 
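The `prob_task` draw above is compared against the configured task probabilities a few lines below. A minimal sketch of that selection logic, assuming (as the `if`/`elif` ordering implies) that `TASK_PROB.SEMANTIC` and `TASK_PROB.INSTANCE` act as cumulative thresholds; the 0.33/0.66 defaults here are illustrative, not taken from the config:

```python
import numpy as np

def sample_task(semantic_prob: float = 0.33, instance_prob: float = 0.66) -> str:
    # cumulative thresholds: [0, semantic_prob) -> semantic,
    # [semantic_prob, instance_prob) -> instance, the rest -> panoptic
    p = np.random.uniform(0.0, 1.0)
    if p < semantic_prob:
        return "The task is semantic"
    elif p < instance_prob:
        return "The task is instance"
    return "The task is panoptic"

print(sample_task())  # e.g. "The task is panoptic"
```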
+ + num_class_obj = {} + + for name in self.class_names: + num_class_obj[name] = 0 + + if prob_task < self.semantic_prob: + task = "The task is semantic" + instances, text, sem_seg = self._get_semantic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) + elif prob_task < self.instance_prob: + task = "The task is instance" + instances, text, sem_seg = self._get_instance_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) + else: + task = "The task is panoptic" + instances, text, sem_seg = self._get_panoptic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj) + + dataset_dict["sem_seg"] = torch.from_numpy(sem_seg).long() + dataset_dict["instances"] = instances + dataset_dict["orig_shape"] = image_shape + dataset_dict["task"] = task + dataset_dict["text"] = text + dataset_dict["thing_ids"] = self.things + + return dataset_dict diff --git a/custom_oneformer/data/datasets/__init__.py b/custom_oneformer/data/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..59ce30713f63d056107b2a06ecd434eb27a30b7d --- /dev/null +++ b/custom_oneformer/data/datasets/__init__.py @@ -0,0 +1,7 @@ +from . import ( + register_ade20k_panoptic, + register_cityscapes_panoptic, + register_coco_panoptic_annos_semseg, + register_ade20k_instance, + register_coco_panoptic2instance, +) diff --git a/custom_oneformer/data/datasets/register_ade20k_instance.py b/custom_oneformer/data/datasets/register_ade20k_instance.py new file mode 100644 index 0000000000000000000000000000000000000000..2cfa20c26ab8ceb54d122dd187096be14d0f5015 --- /dev/null +++ b/custom_oneformer/data/datasets/register_ade20k_instance.py @@ -0,0 +1,56 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_instance.py +# ------------------------------------------------------------------------------ + +import json +import logging +import numpy as np +import os +from PIL import Image + +from custom_detectron2.data import DatasetCatalog, MetadataCatalog +from custom_detectron2.data.datasets.coco import load_coco_json, register_coco_instances +from custom_detectron2.utils.file_io import PathManager + +ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 
'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}] + + +_PREDEFINED_SPLITS = { + # point annotations without masks + "ade20k_instance_train": ( + "ADEChallengeData2016/images/training", + "ADEChallengeData2016/ade20k_instance_train.json", + ), + "ade20k_instance_val": ( + "ADEChallengeData2016/images/validation", + "ADEChallengeData2016/ade20k_instance_val.json", + ), +} + + +def _get_ade_instances_meta(): + thing_ids = [k["id"] for k in ADE_CATEGORIES] + assert len(thing_ids) == 100, len(thing_ids) + # Mapping from the incontiguous ADE category id to an id in [0, 99] + thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)} + thing_classes = [k["name"] for k in ADE_CATEGORIES] + ret = { + "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id, + "thing_classes": thing_classes, + } + return ret + + +def register_all_ade20k_instance(root): + for key, (image_root, json_file) in _PREDEFINED_SPLITS.items(): + # Assume pre-defined datasets live in `./datasets`. 
+ register_coco_instances( + key, + _get_ade_instances_meta(), + os.path.join(root, json_file) if "://" not in json_file else json_file, + os.path.join(root, image_root), + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_ade20k_instance(_root) diff --git a/custom_oneformer/data/datasets/register_ade20k_panoptic.py b/custom_oneformer/data/datasets/register_ade20k_panoptic.py new file mode 100644 index 0000000000000000000000000000000000000000..1300c30f03617ecf2f6614eee29b39320edfa079 --- /dev/null +++ b/custom_oneformer/data/datasets/register_ade20k_panoptic.py @@ -0,0 +1,394 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_panoptic.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import json +import os + +from custom_detectron2.data import DatasetCatalog, MetadataCatalog +from custom_detectron2.utils.file_io import PathManager + +ADE20K_150_CATEGORIES = [ + {"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"}, + {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"}, + {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"}, + {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"}, + {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"}, + {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"}, + {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"}, + {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"}, + {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "}, + {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"}, + {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"}, + {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"}, + {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"}, + {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"}, + {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"}, + {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"}, + {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"}, + {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"}, + {"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"}, + {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"}, + {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"}, + {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"}, + {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"}, + {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"}, + {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"}, + {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"}, + {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"}, + {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"}, + {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"}, + {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"}, + {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"}, + {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"}, + {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"}, + {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"}, + {"color": [255, 41, 
10], "id": 34, "isthing": 0, "name": "rock, stone"}, + {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"}, + {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"}, + {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"}, + {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"}, + {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"}, + {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"}, + {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"}, + {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"}, + {"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"}, + { + "color": [6, 51, 255], + "id": 44, + "isthing": 1, + "name": "chest of drawers, chest, bureau, dresser", + }, + {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"}, + {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"}, + {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"}, + {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"}, + {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"}, + {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"}, + {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"}, + {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"}, + {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"}, + {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"}, + {"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"}, + { + "color": [255, 71, 0], + "id": 56, + "isthing": 1, + "name": "pool table, billiard table, snooker table", + }, + {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"}, + {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"}, + {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"}, + {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"}, + {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"}, + {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"}, + {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"}, + {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"}, + { + "color": [0, 255, 133], + "id": 65, + "isthing": 1, + "name": "toilet, can, commode, crapper, pot, potty, stool, throne", + }, + {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"}, + {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"}, + {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"}, + {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"}, + {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"}, + {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"}, + {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"}, + {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"}, + {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"}, + {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"}, + {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"}, + {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"}, + {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"}, + {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"}, + 
{"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"}, + {"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"}, + {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"}, + {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"}, + {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"}, + {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"}, + {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"}, + {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"}, + {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"}, + {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"}, + {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"}, + {"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"}, + {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"}, + {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"}, + {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"}, + { + "color": [0, 122, 255], + "id": 95, + "isthing": 1, + "name": "bannister, banister, balustrade, balusters, handrail", + }, + { + "color": [0, 255, 163], + "id": 96, + "isthing": 0, + "name": "escalator, moving staircase, moving stairway", + }, + { + "color": [255, 153, 0], + "id": 97, + "isthing": 1, + "name": "ottoman, pouf, pouffe, puff, hassock", + }, + {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"}, + {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"}, + { + "color": [143, 255, 0], + "id": 100, + "isthing": 0, + "name": "poster, posting, placard, notice, bill, card", + }, + {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"}, + {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"}, + {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"}, + {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"}, + { + "color": [133, 0, 255], + "id": 105, + "isthing": 0, + "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter", + }, + {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"}, + { + "color": [184, 0, 255], + "id": 107, + "isthing": 1, + "name": "washer, automatic washer, washing machine", + }, + {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"}, + {"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"}, + {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"}, + {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"}, + {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"}, + {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"}, + {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"}, + {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"}, + {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"}, + {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"}, + {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"}, + {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"}, + {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"}, + {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"}, + {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"}, + {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"}, + {"color": [255, 0, 235], "id": 
124, "isthing": 1, "name": "microwave"}, + {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"}, + {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"}, + {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"}, + {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"}, + {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"}, + {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"}, + {"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"}, + {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"}, + {"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"}, + {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"}, + {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"}, + {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"}, + {"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"}, + {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"}, + {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"}, + {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"}, + {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"}, + {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"}, + {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"}, + {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"}, + {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"}, + {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"}, + {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"}, + {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"}, + {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"}, +] + +ADE20k_COLORS = [k["color"] for k in ADE20K_150_CATEGORIES] + +MetadataCatalog.get("ade20k_sem_seg_train").set( + stuff_colors=ADE20k_COLORS[:], +) + +MetadataCatalog.get("ade20k_sem_seg_val").set( + stuff_colors=ADE20k_COLORS[:], +) + + +def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta): + """ + Args: + image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". + gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". + json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". + Returns: + list[dict]: a list of dicts in Detectron2 standard format. (See + `Using Custom Datasets `_ ) + """ + + def _convert_category_id(segment_info, meta): + if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: + segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = True + else: + segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = False + return segment_info + + with PathManager.open(json_file) as f: + json_info = json.load(f) + + ret = [] + for ann in json_info["annotations"]: + image_id = ann["image_id"] + # TODO: currently we assume image and label has the same filename but + # different extension, and images have extension ".jpg" for COCO. Need + # to make image extension a user-provided argument if we extend this + # function to support other COCO-like datasets. 
+ image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") + label_file = os.path.join(gt_dir, ann["file_name"]) + sem_label_file = os.path.join(semseg_dir, ann["file_name"]) + segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] + ret.append( + { + "file_name": image_file, + "image_id": image_id, + "pan_seg_file_name": label_file, + "sem_seg_file_name": sem_label_file, + "segments_info": segments_info, + } + ) + assert len(ret), f"No images found in {image_dir}!" + assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] + assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] + assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] + return ret + + +def register_ade20k_panoptic( + name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None, +): + """ + Register a "standard" version of ADE20k panoptic segmentation dataset named `name`. + The dictionaries in this registered dataset follows detectron2's standard format. + Hence it's called "standard". + Args: + name (str): the name that identifies a dataset, + e.g. "ade20k_panoptic_train" + metadata (dict): extra metadata associated with this dataset. + image_root (str): directory which contains all the images + panoptic_root (str): directory which contains panoptic annotation images in COCO format + panoptic_json (str): path to the json panoptic annotation file in COCO format + sem_seg_root (none): not used, to be consistent with + `register_coco_panoptic_separated`. + instances_json (str): path to the json instance annotation file + """ + panoptic_name = name + DatasetCatalog.register( + panoptic_name, + lambda: load_ade20k_panoptic_json( + panoptic_json, image_root, panoptic_root, semantic_root, metadata + ), + ) + MetadataCatalog.get(panoptic_name).set( + panoptic_root=panoptic_root, + image_root=image_root, + panoptic_json=panoptic_json, + json_file=instances_json, + evaluator_type="ade20k_panoptic_seg", + ignore_label=255, + label_divisor=1000, + **metadata, + ) + + +_PREDEFINED_SPLITS_ADE20K_PANOPTIC = { + "ade20k_panoptic_train": ( + "ADEChallengeData2016/images/training", + "ADEChallengeData2016/ade20k_panoptic_train", + "ADEChallengeData2016/ade20k_panoptic_train.json", + "ADEChallengeData2016/annotations_detectron2/training", + "ADEChallengeData2016/ade20k_instance_train.json", + ), + "ade20k_panoptic_val": ( + "ADEChallengeData2016/images/validation", + "ADEChallengeData2016/ade20k_panoptic_val", + "ADEChallengeData2016/ade20k_panoptic_val.json", + "ADEChallengeData2016/annotations_detectron2/validation", + "ADEChallengeData2016/ade20k_instance_val.json", + ), +} + + +def get_metadata(): + meta = {} + # The following metadata maps contiguous id from [0, #thing categories + + # #stuff categories) to their names and colors. We have to replica of the + # same name and color under "thing_*" and "stuff_*" because the current + # visualization function in D2 handles thing and class classes differently + # due to some heuristic used in Panoptic FPN. We keep the same naming to + # enable reusing existing visualization functions. 
+ thing_classes = [k["name"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] + thing_colors = [k["color"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1] + stuff_classes = [k["name"] for k in ADE20K_150_CATEGORIES] + stuff_colors = [k["color"] for k in ADE20K_150_CATEGORIES] + + meta["thing_classes"] = thing_classes + meta["thing_colors"] = thing_colors + meta["stuff_classes"] = stuff_classes + meta["stuff_colors"] = stuff_colors + + # Convert category id for training: + # category id: like semantic segmentation, it is the class id for each + # pixel. Since there are some classes not used in evaluation, the category + # id is not always contiguous and thus we have two set of category ids: + # - original category id: category id in the original dataset, mainly + # used for evaluation. + # - contiguous category id: [0, #classes), in order to train the linear + # softmax classifier. + thing_dataset_id_to_contiguous_id = {} + stuff_dataset_id_to_contiguous_id = {} + + for i, cat in enumerate(ADE20K_150_CATEGORIES): + if cat["isthing"]: + thing_dataset_id_to_contiguous_id[cat["id"]] = i + # else: + # stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + # in order to use sem_seg evaluator + stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id + meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id + + return meta + + +def register_all_ade20k_panoptic(root): + metadata = get_metadata() + for ( + prefix, + (image_root, panoptic_root, panoptic_json, semantic_root, instance_json), + ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items(): + # The "standard" version of COCO panoptic segmentation dataset, + # e.g. used by Panoptic-DeepLab + register_ade20k_panoptic( + prefix, + metadata, + os.path.join(root, image_root), + os.path.join(root, panoptic_root), + os.path.join(root, semantic_root), + os.path.join(root, panoptic_json), + os.path.join(root, instance_json), + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_ade20k_panoptic(_root) diff --git a/custom_oneformer/data/datasets/register_cityscapes_panoptic.py b/custom_oneformer/data/datasets/register_cityscapes_panoptic.py new file mode 100644 index 0000000000000000000000000000000000000000..bf3341607a258baa57379120908b30104f205b80 --- /dev/null +++ b/custom_oneformer/data/datasets/register_cityscapes_panoptic.py @@ -0,0 +1,199 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/cityscapes_panoptic.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import json +import logging +import os + +from custom_detectron2.data import DatasetCatalog, MetadataCatalog +from custom_detectron2.data.datasets.builtin_meta import CITYSCAPES_CATEGORIES +from custom_detectron2.utils.file_io import PathManager + +""" +This file contains functions to register the Cityscapes panoptic dataset to the DatasetCatalog. 
+""" + + +logger = logging.getLogger(__name__) + + +def get_cityscapes_panoptic_files(image_dir, gt_dir, json_info): + files = [] + # scan through the directory + cities = PathManager.ls(image_dir) + logger.info(f"{len(cities)} cities found in '{image_dir}'.") + image_dict = {} + for city in cities: + city_img_dir = os.path.join(image_dir, city) + for basename in PathManager.ls(city_img_dir): + image_file = os.path.join(city_img_dir, basename) + + suffix = "_leftImg8bit.png" + assert basename.endswith(suffix), basename + basename = os.path.basename(basename)[: -len(suffix)] + + image_dict[basename] = image_file + + for ann in json_info["annotations"]: + image_file = image_dict.get(ann["image_id"], None) + assert image_file is not None, "No image {} found for annotation {}".format( + ann["image_id"], ann["file_name"] + ) + label_file = os.path.join(gt_dir, ann["file_name"]) + segments_info = ann["segments_info"] + files.append((image_file, label_file, segments_info)) + + assert len(files), "No images found in {}".format(image_dir) + assert PathManager.isfile(files[0][0]), files[0][0] + assert PathManager.isfile(files[0][1]), files[0][1] + return files + + +def load_cityscapes_panoptic(image_dir, gt_dir, gt_json, meta): + """ + Args: + image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train". + gt_dir (str): path to the raw annotations. e.g., + "~/cityscapes/gtFine/cityscapes_panoptic_train". + gt_json (str): path to the json file. e.g., + "~/cityscapes/gtFine/cityscapes_panoptic_train.json". + meta (dict): dictionary containing "thing_dataset_id_to_contiguous_id" + and "stuff_dataset_id_to_contiguous_id" to map category ids to + contiguous ids for training. + + Returns: + list[dict]: a list of dicts in Detectron2 standard format. (See + `Using Custom Datasets `_ ) + """ + + def _convert_category_id(segment_info, meta): + if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: + segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + else: + segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + return segment_info + + assert os.path.exists( + gt_json + ), "Please run `python cityscapesscripts/preparation/createPanopticImgs.py` to generate label files." # noqa + + + with open(gt_json) as f: + json_info = json.load(f) + + files = get_cityscapes_panoptic_files(image_dir, gt_dir, json_info) + ret = [] + for image_file, label_file, segments_info in files: + sem_label_file = ( + image_file.replace("leftImg8bit", "gtFine").split(".")[0] + "_labelTrainIds.png" + ) + segments_info = [_convert_category_id(x, meta) for x in segments_info] + ret.append( + { + "file_name": image_file, + "image_id": "_".join( + os.path.splitext(os.path.basename(image_file))[0].split("_")[:3] + ), + "sem_seg_file_name": sem_label_file, + "pan_seg_file_name": label_file, + "segments_info": segments_info, + } + ) + assert len(ret), f"No images found in {image_dir}!" 
+ assert PathManager.isfile( + ret[0]["sem_seg_file_name"] + ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py" # noqa + assert PathManager.isfile( + ret[0]["pan_seg_file_name"] + ), "Please generate panoptic annotation with python cityscapesscripts/preparation/createPanopticImgs.py" # noqa + return ret + + +_RAW_CITYSCAPES_PANOPTIC_SPLITS = { + "cityscapes_fine_panoptic_train": ( + "cityscapes/leftImg8bit/train", + "cityscapes/gtFine/cityscapes_panoptic_train", + "cityscapes/gtFine/cityscapes_panoptic_train.json", + ), + "cityscapes_fine_panoptic_val": ( + "cityscapes/leftImg8bit/val", + "cityscapes/gtFine/cityscapes_panoptic_val", + "cityscapes/gtFine/cityscapes_panoptic_val.json", + ), + # "cityscapes_fine_panoptic_test": not supported yet +} + + +def register_all_cityscapes_panoptic(root): + meta = {} + # The following metadata maps contiguous id from [0, #thing categories + + # #stuff categories) to their names and colors. We have to replica of the + # same name and color under "thing_*" and "stuff_*" because the current + # visualization function in D2 handles thing and class classes differently + # due to some heuristic used in Panoptic FPN. We keep the same naming to + # enable reusing existing visualization functions. + thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES] + thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES] + stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES] + stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES] + + meta["thing_classes"] = thing_classes + meta["thing_colors"] = thing_colors + meta["stuff_classes"] = stuff_classes + meta["stuff_colors"] = stuff_colors + + # There are three types of ids in cityscapes panoptic segmentation: + # (1) category id: like semantic segmentation, it is the class id for each + # pixel. Since there are some classes not used in evaluation, the category + # id is not always contiguous and thus we have two set of category ids: + # - original category id: category id in the original dataset, mainly + # used for evaluation. + # - contiguous category id: [0, #classes), in order to train the classifier + # (2) instance id: this id is used to differentiate different instances from + # the same category. For "stuff" classes, the instance id is always 0; for + # "thing" classes, the instance id starts from 1 and 0 is reserved for + # ignored instances (e.g. crowd annotation). + # (3) panoptic id: this is the compact id that encode both category and + # instance id by: category_id * 1000 + instance_id. 
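A worked example of the panoptic id encoding described in point (3) above, with illustrative category and instance ids (the divisor of 1000 matches the `label_divisor=1000` registered in the metadata further below):

```python
label_divisor = 1000

category_id, instance_id = 26, 4
panoptic_id = category_id * label_divisor + instance_id
print(panoptic_id)                   # 26004

# decoding recovers both components
print(panoptic_id // label_divisor)  # 26 -> category id
print(panoptic_id % label_divisor)   # 4  -> instance id
```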
+ thing_dataset_id_to_contiguous_id = {} + stuff_dataset_id_to_contiguous_id = {} + + for k in CITYSCAPES_CATEGORIES: + if k["isthing"] == 1: + thing_dataset_id_to_contiguous_id[k["id"]] = k["trainId"] + else: + stuff_dataset_id_to_contiguous_id[k["id"]] = k["trainId"] + + meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id + meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id + + for key, (image_dir, gt_dir, gt_json) in _RAW_CITYSCAPES_PANOPTIC_SPLITS.items(): + image_dir = os.path.join(root, image_dir) + gt_dir = os.path.join(root, gt_dir) + gt_json = os.path.join(root, gt_json) + + if key in DatasetCatalog.list(): + DatasetCatalog.remove(key) + + DatasetCatalog.register( + key, lambda x=image_dir, y=gt_dir, z=gt_json: load_cityscapes_panoptic(x, y, z, meta) + ) + MetadataCatalog.get(key).set( + panoptic_root=gt_dir, + image_root=image_dir, + panoptic_json=gt_json, + gt_dir=gt_dir.replace("cityscapes_panoptic_", ""), + evaluator_type="cityscapes_panoptic_seg", + ignore_label=255, + label_divisor=1000, + **meta, + ) + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_cityscapes_panoptic(_root) \ No newline at end of file diff --git a/custom_oneformer/data/datasets/register_coco_panoptic2instance.py b/custom_oneformer/data/datasets/register_coco_panoptic2instance.py new file mode 100644 index 0000000000000000000000000000000000000000..fb0d101135f703e4688acee3df85ea0d7ee12f69 --- /dev/null +++ b/custom_oneformer/data/datasets/register_coco_panoptic2instance.py @@ -0,0 +1,44 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/builtin.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + + +""" +This file registers pre-defined datasets at hard-coded paths, and their metadata. + +We hard-code metadata for common datasets. This will enable: +1. Consistency check when loading the datasets +2. Use models on these standard datasets directly and run demos, + without having to download the dataset annotations + +We hard-code some paths to the dataset that's assumed to +exist in "./datasets/". + +Users SHOULD NOT use this file to create new dataset / metadata for new dataset. +To add new dataset, refer to the tutorial "docs/DATASETS.md". +""" + +import os +from custom_detectron2.data.datasets.builtin_meta import _get_builtin_metadata +from custom_detectron2.data.datasets.coco import register_coco_instances + + +_PREDEFINED_SPLITS_COCO = { + "coco_2017_val_panoptic2instance": ("coco/val2017", "coco/annotations/panoptic2instances_val2017.json"), +} + + +def register_panoptic2instances_coco(root): + for key, (image_root, json_file) in _PREDEFINED_SPLITS_COCO.items(): + # Assume pre-defined datasets live in `./datasets`. 
+ register_coco_instances( + key, + _get_builtin_metadata("coco"), + os.path.join(root, json_file) if "://" not in json_file else json_file, + os.path.join(root, image_root), + ) + + +_root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets")) +register_panoptic2instances_coco(_root) \ No newline at end of file diff --git a/custom_oneformer/data/datasets/register_coco_panoptic_annos_semseg.py b/custom_oneformer/data/datasets/register_coco_panoptic_annos_semseg.py new file mode 100644 index 0000000000000000000000000000000000000000..a3d60d75a11aa8faee4ae8961cfcc24bdacbdac5 --- /dev/null +++ b/custom_oneformer/data/datasets/register_coco_panoptic_annos_semseg.py @@ -0,0 +1,367 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import json +import os + +from custom_detectron2.data import DatasetCatalog, MetadataCatalog +from custom_detectron2.data.datasets import load_sem_seg +from custom_detectron2.data.datasets.builtin_meta import COCO_CATEGORIES +from custom_detectron2.utils.file_io import PathManager +import contextlib +import logging +import io +from fvcore.common.timer import Timer +import custom_pycocotools.mask as mask_util +from custom_detectron2.structures import BoxMode + + +logger = logging.getLogger(__name__) + + +_PREDEFINED_SPLITS_COCO_PANOPTIC = { + "coco_2017_train_panoptic": ( + # This is the original panoptic annotation directory + "coco/panoptic_train2017", + "coco/annotations/panoptic_train2017.json", + # This directory contains semantic annotations that are + # converted from panoptic annotations. + # It is used by PanopticFPN. + # You can use the script at detectron2/datasets/prepare_panoptic_fpn.py + # to create these directories. + "coco/panoptic_semseg_train2017", + ), + "coco_2017_val_panoptic": ( + "coco/panoptic_val2017", + "coco/annotations/panoptic_val2017.json", + "coco/panoptic_semseg_val2017", + ), +} + +def load_coco_instance_json(json_file, image_root, dataset_name=None): + from custom_pycocotools.coco import COCO + + timer = Timer() + json_file = PathManager.get_local_path(json_file) + with contextlib.redirect_stdout(io.StringIO()): + coco_api = COCO(json_file) + if timer.seconds() > 1: + logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds())) + + id_map = None + if dataset_name is not None: + meta = MetadataCatalog.get(dataset_name) + cat_ids = sorted(coco_api.getCatIds()) + cats = coco_api.loadCats(cat_ids) + # The categories in a custom json file may not be sorted. + thing_classes = [c["name"] for c in sorted(cats, key=lambda x: x["id"])] + meta.thing_classes = thing_classes + + # In COCO, certain category ids are artificially removed, + # and by convention they are always ignored. + # We deal with COCO's id issue and translate + # the category ids to contiguous ids in [0, 80). + + # It works by looking at the "categories" field in the json, therefore + # if users' own json also have incontiguous ids, we'll + # apply this mapping as well but print a warning. + if not (min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)): + if "coco" not in dataset_name: + logger.warning( + """ +Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you. 
+""" + ) + id_map = {v: i for i, v in enumerate(cat_ids)} + meta.thing_dataset_id_to_contiguous_id = id_map + + # sort indices for reproducible results + img_ids = sorted(coco_api.imgs.keys()) + # imgs is a list of dicts, each looks something like: + # {'license': 4, + # 'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg', + # 'file_name': 'COCO_val2014_000000001268.jpg', + # 'height': 427, + # 'width': 640, + # 'date_captured': '2013-11-17 05:57:24', + # 'id': 1268} + imgs = coco_api.loadImgs(img_ids) + # anns is a list[list[dict]], where each dict is an annotation + # record for an object. The inner list enumerates the objects in an image + # and the outer list enumerates over images. Example of anns[0]: + # [{'segmentation': [[192.81, + # 247.09, + # ... + # 219.03, + # 249.06]], + # 'area': 1035.749, + # 'iscrowd': 0, + # 'image_id': 1268, + # 'bbox': [192.81, 224.8, 74.73, 33.43], + # 'category_id': 16, + # 'id': 42986}, + # ...] + anns = [coco_api.imgToAnns[img_id] for img_id in img_ids] + total_num_valid_anns = sum([len(x) for x in anns]) + total_num_anns = len(coco_api.anns) + if total_num_valid_anns < total_num_anns: + logger.warning( + f"{json_file} contains {total_num_anns} annotations, but only " + f"{total_num_valid_anns} of them match to images in the file." + ) + + if "minival" not in json_file: + # The popular valminusminival & minival annotations for COCO2014 contain this bug. + # However the ratio of buggy annotations there is tiny and does not affect accuracy. + # Therefore we explicitly white-list them. + ann_ids = [ann["id"] for anns_per_image in anns for ann in anns_per_image] + assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format( + json_file + ) + + imgs_anns = list(zip(imgs, anns)) + logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file)) + + dataset_dicts = {} + + ann_keys = ["iscrowd", "bbox", "keypoints", "category_id"] + + num_instances_without_valid_segmentation = 0 + + for (img_dict, anno_dict_list) in imgs_anns: + record = {} + record["file_name"] = os.path.join(image_root, img_dict["file_name"]) + record["height"] = img_dict["height"] + record["width"] = img_dict["width"] + image_id = record["image_id"] = img_dict["id"] + + objs = [] + for anno in anno_dict_list: + # Check that the image_id in this annotation is the same as + # the image_id we're looking at. + # This fails only when the data parsing logic or the annotation file is buggy. + + # The original COCO valminusminival2014 & minival2014 annotation files + # actually contains bugs that, together with certain ways of using COCO API, + # can trigger this assertion. + assert anno["image_id"] == image_id + + assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.' + + obj = {key: anno[key] for key in ann_keys if key in anno} + if "bbox" in obj and len(obj["bbox"]) == 0: + raise ValueError( + f"One annotation of image {image_id} contains empty 'bbox' value! " + "This json does not have valid COCO format." 
+ ) + + segm = anno.get("segmentation", None) + if segm: # either list[list[float]] or dict(RLE) + if isinstance(segm, dict): + if isinstance(segm["counts"], list): + # convert to compressed RLE + segm = mask_util.frPyObjects(segm, *segm["size"]) + else: + # filter out invalid polygons (< 3 points) + segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6] + if len(segm) == 0: + num_instances_without_valid_segmentation += 1 + continue # ignore this instance + obj["segmentation"] = segm + + keypts = anno.get("keypoints", None) + if keypts: # list[int] + for idx, v in enumerate(keypts): + if idx % 3 != 2: + # COCO's segmentation coordinates are floating points in [0, H or W], + # but keypoint coordinates are integers in [0, H-1 or W-1] + # Therefore we assume the coordinates are "pixel indices" and + # add 0.5 to convert to floating point coordinates. + keypts[idx] = v + 0.5 + obj["keypoints"] = keypts + + obj["bbox_mode"] = BoxMode.XYWH_ABS + if id_map: + annotation_category_id = obj["category_id"] + try: + obj["category_id"] = id_map[annotation_category_id] + except KeyError as e: + raise KeyError( + f"Encountered category_id={annotation_category_id} " + "but this id does not exist in 'categories' of the json file." + ) from e + objs.append(obj) + record["annotations"] = objs + dataset_dicts[image_id] = record + + if num_instances_without_valid_segmentation > 0: + logger.warning( + "Filtered out {} instances without valid segmentation. ".format( + num_instances_without_valid_segmentation + ) + + "There might be issues in your dataset generation process. Please " + "check https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html carefully" + ) + return dataset_dicts + +def get_metadata(): + meta = {} + # The following metadata maps contiguous id from [0, #thing categories + + # #stuff categories) to their names and colors. We have to replica of the + # same name and color under "thing_*" and "stuff_*" because the current + # visualization function in D2 handles thing and class classes differently + # due to some heuristic used in Panoptic FPN. We keep the same naming to + # enable reusing existing visualization functions. + thing_classes = [k["name"] for k in COCO_CATEGORIES if k["isthing"] == 1] + thing_colors = [k["color"] for k in COCO_CATEGORIES if k["isthing"] == 1] + stuff_classes = [k["name"] for k in COCO_CATEGORIES] + stuff_colors = [k["color"] for k in COCO_CATEGORIES] + + meta["thing_classes"] = thing_classes + meta["thing_colors"] = thing_colors + meta["stuff_classes"] = stuff_classes + meta["stuff_colors"] = stuff_colors + + # Convert category id for training: + # category id: like semantic segmentation, it is the class id for each + # pixel. Since there are some classes not used in evaluation, the category + # id is not always contiguous and thus we have two set of category ids: + # - original category id: category id in the original dataset, mainly + # used for evaluation. + # - contiguous category id: [0, #classes), in order to train the linear + # softmax classifier. 
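The loop just below builds the thing/stuff mappings exactly as this comment describes. The same logic run on three made-up categories (ids and names are illustrative, not COCO's real ones) shows that every category receives a stuff id while only thing categories receive a thing id:

```python
toy_categories = [
    {"id": 1, "isthing": 1, "name": "person"},
    {"id": 7, "isthing": 1, "name": "train"},
    {"id": 118, "isthing": 0, "name": "sky"},
]

thing_map, stuff_map = {}, {}
for i, cat in enumerate(toy_categories):
    if cat["isthing"]:
        thing_map[cat["id"]] = i
    # every category gets a "stuff" id so the sem_seg evaluator sees all classes
    stuff_map[cat["id"]] = i

print(thing_map)  # {1: 0, 7: 1}
print(stuff_map)  # {1: 0, 7: 1, 118: 2}
```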
+ thing_dataset_id_to_contiguous_id = {} + stuff_dataset_id_to_contiguous_id = {} + + for i, cat in enumerate(COCO_CATEGORIES): + if cat["isthing"]: + thing_dataset_id_to_contiguous_id[cat["id"]] = i + # else: + # stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + # in order to use sem_seg evaluator + stuff_dataset_id_to_contiguous_id[cat["id"]] = i + + meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id + meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id + + return meta + + +def load_coco_panoptic_json(json_file, instances_json, instances_name, image_dir, gt_dir, semseg_dir, meta): + """ + Args: + image_dir (str): path to the raw dataset. e.g., "~/coco/train2017". + gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017". + json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json". + Returns: + list[dict]: a list of dicts in Detectron2 standard format. (See + `Using Custom Datasets `_ ) + """ + + def _convert_category_id(segment_info, meta): + if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]: + segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = True + else: + segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][ + segment_info["category_id"] + ] + segment_info["isthing"] = False + return segment_info + + with PathManager.open(json_file) as f: + json_info = json.load(f) + + instance_data_dicts = load_coco_instance_json(instances_json, image_dir.replace("panoptic_", ""), instances_name) + + ret = [] + for ann in json_info["annotations"]: + image_id = int(ann["image_id"]) + # TODO: currently we assume image and label has the same filename but + # different extension, and images have extension ".jpg" for COCO. Need + # to make image extension a user-provided argument if we extend this + # function to support other COCO-like datasets. + image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg") + label_file = os.path.join(gt_dir, ann["file_name"]) + sem_label_file = os.path.join(semseg_dir, ann["file_name"]) + segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]] + ret.append( + { + "file_name": image_file, + "image_id": image_id, + "pan_seg_file_name": label_file, + "sem_seg_file_name": sem_label_file, + "segments_info": segments_info, + "annotations": instance_data_dicts[image_id]["annotations"], + } + ) + assert len(ret), f"No images found in {image_dir}!" 
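For reference, a single entry of the list returned above has roughly this shape; the paths and ids below are illustrative placeholders rather than values from a real annotation file, and the converted `segments_info` entries also keep whatever fields the panoptic json provided:

```python
example_record = {
    "file_name": "coco/train2017/000000000009.jpg",
    "image_id": 9,
    "pan_seg_file_name": "coco/panoptic_train2017/000000000009.png",
    "sem_seg_file_name": "coco/panoptic_semseg_train2017/000000000009.png",
    "segments_info": [
        # after _convert_category_id: contiguous category_id plus an "isthing" flag
        {"id": 3226956, "category_id": 50, "isthing": True, "iscrowd": 0, "area": 11},
    ],
    # instance annotations for the same image, merged in from the instances json
    "annotations": [],
}
```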
+ assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"] + assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"] + assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"] + return ret + + +def register_coco_panoptic_annos_sem_seg( + name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json, instances_name, +): + panoptic_name = name + delattr(MetadataCatalog.get(panoptic_name), "thing_classes") + delattr(MetadataCatalog.get(panoptic_name), "thing_colors") + MetadataCatalog.get(panoptic_name).set( + thing_classes=metadata["thing_classes"], + thing_colors=metadata["thing_colors"], + # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"], + ) + + # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg" + semantic_name = name + "_with_sem_seg" + DatasetCatalog.register( + semantic_name, + lambda: load_coco_panoptic_json(panoptic_json, instances_json, instances_name, image_root, panoptic_root, sem_seg_root, metadata), + ) + MetadataCatalog.get(semantic_name).set( + sem_seg_root=sem_seg_root, + panoptic_root=panoptic_root, + image_root=image_root, + panoptic_json=panoptic_json, + json_file=instances_json, + evaluator_type="coco_panoptic_seg", + ignore_label=255, + label_divisor=1000, + **metadata, + ) + + +def register_all_coco_panoptic_annos_sem_seg(root): + for ( + prefix, + (panoptic_root, panoptic_json, semantic_root), + ) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items(): + + prefix_instances = prefix[: -len("_panoptic")] + instances_meta = MetadataCatalog.get(prefix_instances) + image_root, instances_json = instances_meta.image_root, instances_meta.json_file + + if 'val' in instances_json: + instances_json = instances_json.replace('instances_', 'panoptic2instances_') + + register_coco_panoptic_annos_sem_seg( + prefix, + get_metadata(), + image_root, + os.path.join(root, panoptic_root), + os.path.join(root, panoptic_json), + os.path.join(root, semantic_root), + instances_json, + prefix_instances, + ) + + +_root = os.getenv("DETECTRON2_DATASETS", "datasets") +register_all_coco_panoptic_annos_sem_seg(_root) diff --git a/custom_oneformer/data/tokenizer.py b/custom_oneformer/data/tokenizer.py new file mode 100644 index 0000000000000000000000000000000000000000..05d4c29c2d1ed03e5748e7346eeea494a2cd9144 --- /dev/null +++ b/custom_oneformer/data/tokenizer.py @@ -0,0 +1,192 @@ +# ------------------------------------------------------------------------- +# MIT License +# +# Copyright (c) 2021 OpenAI +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# Modified by Jiarui Xu +# ------------------------------------------------------------------------- + +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re +import torch + + +@lru_cache() +def default_bpe(): + return os.path.join(os.path.dirname(os.path.abspath(__file__)), 'bpe_simple_vocab_16e6.txt.gz') + +@lru_cache() +def bytes_to_unicode(): + """Returns list of utf-8 byte and a corresponding list of unicode strings. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for decent + coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup tables + between utf-8 bytes and unicode strings. And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + bs = list(range(ord('!'), ord('~') + 1)) + list(range(ord('¡'), ord('¬') + 1)) + list(range(ord('®'), ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + +class Tokenize: + + def __init__(self, tokenizer, max_seq_len=77, truncate=True): + self.tokenizer = tokenizer + self.max_seq_len = max_seq_len + self.truncate = truncate + + def __call__(self, texts): + expanded_dim = False + if isinstance(texts, str): + texts = [texts] + expanded_dim = True + + sot_token = self.tokenizer.encoder['<|startoftext|>'] + eot_token = self.tokenizer.encoder['<|endoftext|>'] + all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] for text in texts] + result = torch.zeros(len(all_tokens), self.max_seq_len, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > self.max_seq_len: + if self.truncate: + tokens = tokens[:self.max_seq_len] + tokens[-1] = eot_token + else: + raise RuntimeError(f'Input {texts[i]} is too long for context length {self.max_seq_len}') + result[i, :len(tokens)] = torch.tensor(tokens) + + if expanded_dim: + return result[0] + + return result + + +class SimpleTokenizer(object): + + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '</w>' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = 
dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa: E722 + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors='replace').replace('', ' ') + return text \ No newline at end of file diff --git a/custom_oneformer/demo/colormap.py b/custom_oneformer/demo/colormap.py new file mode 100644 index 0000000000000000000000000000000000000000..3eff9a46d37a1926c48ef0ad6e3308128438140f --- /dev/null +++ b/custom_oneformer/demo/colormap.py @@ -0,0 +1,170 @@ +# Copyright (c) Facebook, Inc. and its affiliates. + +""" +An awesome colormap for really neat visualizations. +Copied from Detectron, and removed gray colors. 
+""" + +import numpy as np +import random +random.seed(0) + +__all__ = ["colormap", "random_color", "random_colors"] + +# fmt: off +# RGB: +# _COLORS = np.array( +# [ +# 0.000, 0.447, 0.741, +# 0.850, 0.325, 0.098, +# 0.929, 0.694, 0.125, +# 0.494, 0.184, 0.556, +# 0.466, 0.674, 0.188, +# 0.301, 0.745, 0.933, +# 0.635, 0.078, 0.184, +# 0.300, 0.300, 0.300, +# 0.600, 0.600, 0.600, +# 1.000, 0.000, 0.000, +# 1.000, 0.500, 0.000, +# 0.749, 0.749, 0.000, +# 0.000, 1.000, 0.000, +# 0.000, 0.000, 1.000, +# 0.667, 0.000, 1.000, +# 0.333, 0.333, 0.000, +# 0.333, 0.667, 0.000, +# 0.333, 1.000, 0.000, +# 0.667, 0.333, 0.000, +# 0.667, 0.667, 0.000, +# 0.667, 1.000, 0.000, +# 1.000, 0.333, 0.000, +# 1.000, 0.667, 0.000, +# 1.000, 1.000, 0.000, +# 0.000, 0.333, 0.500, +# 0.000, 0.667, 0.500, +# 0.000, 1.000, 0.500, +# 0.333, 0.000, 0.500, +# 0.333, 0.333, 0.500, +# 0.333, 0.667, 0.500, +# 0.333, 1.000, 0.500, +# 0.667, 0.000, 0.500, +# 0.667, 0.333, 0.500, +# 0.667, 0.667, 0.500, +# 0.667, 1.000, 0.500, +# 1.000, 0.000, 0.500, +# 1.000, 0.333, 0.500, +# 1.000, 0.667, 0.500, +# 1.000, 1.000, 0.500, +# 0.000, 0.333, 1.000, +# 0.000, 0.667, 1.000, +# 0.000, 1.000, 1.000, +# 0.333, 0.000, 1.000, +# 0.333, 0.333, 1.000, +# 0.333, 0.667, 1.000, +# 0.333, 1.000, 1.000, +# 0.667, 0.000, 1.000, +# 0.667, 0.333, 1.000, +# 0.667, 0.667, 1.000, +# 0.667, 1.000, 1.000, +# 1.000, 0.000, 1.000, +# 1.000, 0.333, 1.000, +# 1.000, 0.667, 1.000, +# 0.333, 0.000, 0.000, +# 0.500, 0.000, 0.000, +# 0.667, 0.000, 0.000, +# 0.833, 0.000, 0.000, +# 1.000, 0.000, 0.000, +# 0.000, 0.167, 0.000, +# 0.000, 0.333, 0.000, +# 0.000, 0.500, 0.000, +# 0.000, 0.667, 0.000, +# 0.000, 0.833, 0.000, +# 0.000, 1.000, 0.000, +# 0.000, 0.000, 0.167, +# 0.000, 0.000, 0.333, +# 0.000, 0.000, 0.500, +# 0.000, 0.000, 0.667, +# 0.000, 0.000, 0.833, +# 0.000, 0.000, 1.000, +# 0.000, 0.000, 0.000, +# 0.143, 0.143, 0.143, +# 0.857, 0.857, 0.857, +# 1.000, 1.000, 1.000 +# ] +# ).astype(np.float32).reshape(-1, 3) +# fmt: on + +_COLORS = [] + + +def gen_color(): + color = tuple(np.round(np.random.choice(range(256), size=3)/255, 3)) + if color not in _COLORS and np.mean(color) != 0.0: + _COLORS.append(color) + else: + gen_color() + + +for _ in range(300): + gen_color() + + +def colormap(rgb=False, maximum=255): + """ + Args: + rgb (bool): whether to return RGB colors or BGR colors. + maximum (int): either 255 or 1 + Returns: + ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1] + """ + assert maximum in [255, 1], maximum + c = _COLORS * maximum + if not rgb: + c = c[:, ::-1] + return c + + +def random_color(rgb=False, maximum=255): + """ + Args: + rgb (bool): whether to return RGB colors or BGR colors. + maximum (int): either 255 or 1 + Returns: + ndarray: a vector of 3 numbers + """ + idx = np.random.randint(0, len(_COLORS)) + ret = _COLORS[idx] * maximum + if not rgb: + ret = ret[::-1] + return ret + + +def random_colors(N, rgb=False, maximum=255): + """ + Args: + N (int): number of unique colors needed + rgb (bool): whether to return RGB colors or BGR colors. 
+ maximum (int): either 255 or 1 + Returns: + ndarray: a list of random_color + """ + indices = random.sample(range(len(_COLORS)), N) + ret = [_COLORS[i] * maximum for i in indices] + if not rgb: + ret = [x[::-1] for x in ret] + return ret + + +if __name__ == "__main__": + import cv2 + + size = 100 + H, W = 10, 10 + canvas = np.random.rand(H * size, W * size, 3).astype("float32") + for h in range(H): + for w in range(W): + idx = h * W + w + if idx >= len(_COLORS): + break + canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx] + cv2.imshow("a", canvas) + cv2.waitKey(0) \ No newline at end of file diff --git a/custom_oneformer/demo/defaults.py b/custom_oneformer/demo/defaults.py new file mode 100644 index 0000000000000000000000000000000000000000..9895d61a32fd02857f7313e5e739ce6918604463 --- /dev/null +++ b/custom_oneformer/demo/defaults.py @@ -0,0 +1,77 @@ +import torch +import custom_detectron2.data.transforms as T +from custom_detectron2.checkpoint import DetectionCheckpointer +from custom_detectron2.data import ( + MetadataCatalog, +) +from custom_detectron2.modeling import build_model + + +__all__ = [ + "DefaultPredictor", +] + + +class DefaultPredictor: + """ + Create a simple end-to-end predictor with the given config that runs on + single device for a single input image. + Compared to using the model directly, this class does the following additions: + 1. Load checkpoint from `cfg.MODEL.WEIGHTS`. + 2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`. + 3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`. + 4. Take one input image and produce a single output, instead of a batch. + This is meant for simple demo purposes, so it does the above steps automatically. + This is not meant for benchmarks or running complicated inference logic. + If you'd like to do anything more complicated, please refer to its source code as + examples to build and use the model manually. + Attributes: + metadata (Metadata): the metadata of the underlying dataset, obtained from + cfg.DATASETS.TEST. + Examples: + :: + pred = DefaultPredictor(cfg) + inputs = cv2.imread("input.jpg") + outputs = pred(inputs) + """ + + def __init__(self, cfg): + self.cfg = cfg.clone() # cfg can be modified by model + self.model = build_model(self.cfg) + self.model.eval() + if len(cfg.DATASETS.TEST): + self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0]) + + checkpointer = DetectionCheckpointer(self.model) + checkpointer.load(cfg.MODEL.WEIGHTS) + + self.aug = T.ResizeShortestEdge( + [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST + ) + + self.input_format = cfg.INPUT.FORMAT + assert self.input_format in ["RGB", "BGR"], self.input_format + + def __call__(self, original_image, task): + """ + Args: + original_image (np.ndarray): an image of shape (H, W, C) (in BGR order). + Returns: + predictions (dict): + the output of the model for one image only. + See :doc:`/tutorials/models` for details about the format. + """ + with torch.no_grad(): # https://github.com/sphinx-doc/sphinx/issues/4258 + # Apply pre-processing to image. 
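+            # Pre-processing mirrors the config: flip BGR->RGB when cfg.INPUT.FORMAT is
+            # "RGB", resize the shorter edge with self.aug, convert the HWC uint8 array
+            # to a CHW float32 tensor, and bundle it with the original size and the text
+            # task prompt into the single-image input dict consumed by the model.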
+ if self.input_format == "RGB": + # whether the model expects BGR inputs or RGB + original_image = original_image[:, :, ::-1] + height, width = original_image.shape[:2] + image = self.aug.get_transform(original_image).apply_image(original_image) + image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1)) + + task = f"The task is {task}" + + inputs = {"image": image, "height": height, "width": width, "task": task} + predictions = self.model([inputs])[0] + return predictions \ No newline at end of file diff --git a/custom_oneformer/demo/predictor.py b/custom_oneformer/demo/predictor.py new file mode 100644 index 0000000000000000000000000000000000000000..f6805a3aa4cc9f5c356b8155fda297feed463d33 --- /dev/null +++ b/custom_oneformer/demo/predictor.py @@ -0,0 +1,190 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py +import atexit +import bisect +import multiprocessing as mp +from collections import deque + +import cv2 +import torch + +from custom_detectron2.data import MetadataCatalog +from defaults import DefaultPredictor +from custom_detectron2.utils.video_visualizer import VideoVisualizer +from visualizer import ColorMode, Visualizer + + +class VisualizationDemo(object): + def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False): + """ + Args: + cfg (CfgNode): + instance_mode (ColorMode): + parallel (bool): whether to run the model in different processes from visualization. + Useful since the visualization logic can be slow. + """ + self.metadata = MetadataCatalog.get( + cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused" + ) + if 'cityscapes_fine_sem_seg_val' in cfg.DATASETS.TEST[0]: + from cityscapesscripts.helpers.labels import labels + stuff_colors = [k.color for k in labels if k.trainId != 255] + self.metadata = self.metadata.set(stuff_colors=stuff_colors) + self.cpu_device = torch.device("cpu") + self.instance_mode = instance_mode + + self.parallel = parallel + if parallel: + num_gpu = torch.cuda.device_count() + self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu) + else: + self.predictor = DefaultPredictor(cfg) + + def run_on_image(self, image, task, sem_gt, pan_gt, ins_gt, box_gt): + """ + Args: + image (np.ndarray): an image of shape (H, W, C) (in BGR order). + This is the format used by OpenCV. + Returns: + predictions (dict): the output of the model. + vis_output (VisImage): the visualized image output. + """ + vis_output = None + # Convert image from OpenCV BGR format to Matplotlib RGB format. 
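+        # The reversed-channel copy below is reused both for the Visualizer overlays and
+        # as the input to self.predictor, which applies its own cfg.INPUT.FORMAT handling.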
+ image = image[:, :, ::-1] + vis_output = {} + + if task == 'panoptic': + visualizer = Visualizer(image, metadata=self.metadata, instance_mode=0) + predictions = self.predictor(image, "panoptic") + panoptic_seg, segments_info = predictions["panoptic_seg"] + vis_output['panoptic'] = visualizer.draw_panoptic_seg_predictions( + panoptic_seg.to(self.cpu_device), segments_info, alpha=1 + ) + + # visualizer = Visualizer(image, metadata=self.metadata, instance_mode=0) + # vis_output['pan_gt'] = visualizer.draw_panoptic_seg( + # pan_gt[0].to(self.cpu_device), pan_gt[1], alpha=1 + # ) + + if task == 'panoptic' or task == 'semantic': + visualizer = Visualizer(image, metadata=self.metadata, instance_mode=1) + predictions = self.predictor(image, "semantic") + vis_output['semantic'] = visualizer.draw_sem_seg( + predictions["sem_seg"].argmax(dim=0).to(self.cpu_device), alpha=1 + ) + + # visualizer = Visualizer(image, metadata=self.metadata, instance_mode=1) + # vis_output['gt_sem'] = visualizer.draw_sem_seg( + # sem_gt.to(self.cpu_device), alpha=1 + # ) + + if task == 'panoptic' or task == 'instance': + visualizer = Visualizer(image, metadata=self.metadata, instance_mode=2) + predictions = self.predictor(image, "instance") + instances = predictions["instances"].to(self.cpu_device) + vis_output['instance'] = visualizer.draw_instance_predictions(predictions=instances, alpha=1) + + if 'boxes' in predictions: + boxes, labels, scores = predictions["boxes"] + visualizer = Visualizer(image, False, metadata=self.metadata, instance_mode=0) + vis_output['boxes'] = visualizer.draw_box_predictions( + boxes.to(self.cpu_device), labels.to(self.cpu_device), scores.to(self.cpu_device)) + + + # visualizer = Visualizer(image, metadata=self.metadata, instance_mode=2) + # vis_output['ins_gt'] = visualizer.draw_instance_predictions(predictions=ins_gt.to(self.cpu_device), alpha=1) + # vis_output['input'] = visualizer.get_image(image) + + return predictions, vis_output + + +class AsyncPredictor: + """ + A predictor that runs the model asynchronously, possibly on >1 GPUs. + Because rendering the visualization takes considerably amount of time, + this helps improve throughput a little bit when rendering videos. 
+ """ + + class _StopToken: + pass + + class _PredictWorker(mp.Process): + def __init__(self, cfg, task_queue, result_queue): + self.cfg = cfg + self.task_queue = task_queue + self.result_queue = result_queue + super().__init__() + + def run(self): + predictor = DefaultPredictor(self.cfg) + + while True: + task = self.task_queue.get() + if isinstance(task, AsyncPredictor._StopToken): + break + idx, data = task + result = predictor(data) + self.result_queue.put((idx, result)) + + def __init__(self, cfg, num_gpus: int = 1): + """ + Args: + cfg (CfgNode): + num_gpus (int): if 0, will run on CPU + """ + num_workers = max(num_gpus, 1) + self.task_queue = mp.Queue(maxsize=num_workers * 3) + self.result_queue = mp.Queue(maxsize=num_workers * 3) + self.procs = [] + for gpuid in range(max(num_gpus, 1)): + cfg = cfg.clone() + cfg.defrost() + cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu" + self.procs.append( + AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue) + ) + + self.put_idx = 0 + self.get_idx = 0 + self.result_rank = [] + self.result_data = [] + + for p in self.procs: + p.start() + atexit.register(self.shutdown) + + def put(self, image): + self.put_idx += 1 + self.task_queue.put((self.put_idx, image)) + + def get(self): + self.get_idx += 1 # the index needed for this request + if len(self.result_rank) and self.result_rank[0] == self.get_idx: + res = self.result_data[0] + del self.result_data[0], self.result_rank[0] + return res + + while True: + # make sure the results are returned in the correct order + idx, res = self.result_queue.get() + if idx == self.get_idx: + return res + insert = bisect.bisect(self.result_rank, idx) + self.result_rank.insert(insert, idx) + self.result_data.insert(insert, res) + + def __len__(self): + return self.put_idx - self.get_idx + + def __call__(self, image): + self.put(image) + return self.get() + + def shutdown(self): + for _ in self.procs: + self.task_queue.put(AsyncPredictor._StopToken()) + + @property + def default_buffer_size(self): + return len(self.procs) * 5 diff --git a/custom_oneformer/demo/visualizer.py b/custom_oneformer/demo/visualizer.py new file mode 100644 index 0000000000000000000000000000000000000000..4f03034510b5b98141dbe9522a08313d5413e555 --- /dev/null +++ b/custom_oneformer/demo/visualizer.py @@ -0,0 +1,1350 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import colorsys +import logging +import math +import numpy as np +from enum import Enum, unique +import cv2 +import matplotlib as mpl +import matplotlib.colors as mplc +import matplotlib.figure as mplfigure +import custom_pycocotools.mask as mask_util +import torch +from matplotlib.backends.backend_agg import FigureCanvasAgg +from PIL import Image + +from custom_detectron2.data import MetadataCatalog +from custom_detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes +from custom_detectron2.utils.file_io import PathManager +import random +random.seed(0) +from .colormap import random_color, _COLORS +logger = logging.getLogger(__name__) + +__all__ = ["ColorMode", "VisImage", "Visualizer"] + + +_SMALL_OBJECT_AREA_THRESH = 1000 +_LARGE_MASK_AREA_THRESH = 120000 +_OFF_WHITE = (1.0, 1.0, 1.0) +_BLACK = (0, 0, 0) +_RED = (1.0, 0, 0) + +_KEYPOINT_THRESHOLD = 0.05 + + +def instance_color(rgb=False, idx=1, maximum=255): + """ + Args: + rgb (bool): whether to return RGB colors or BGR colors. 
+ maximum (int): either 255 or 1 + Returns: + ndarray: a vector of 3 numbers + """ + ret = _COLORS[idx] * maximum + if not rgb: + ret = ret[::-1] + return ret + +@unique +class ColorMode(Enum): + """ + Enum of different color modes to use for instance visualizations. + """ + + IMAGE = 0 + """ + Picks a random color for every instance and overlay segmentations with low opacity. + """ + SEGMENTATION = 1 + """ + Let instances of the same category have similar colors + (from metadata.thing_colors), and overlay them with + high opacity. This provides more attention on the quality of segmentation. + """ + IMAGE_BW = 2 + """ + Same as IMAGE, but convert all areas without masks to gray-scale. + Only available for drawing per-instance mask predictions. + """ + + +class GenericMask: + """ + Attribute: + polygons (list[ndarray]): list[ndarray]: polygons for this mask. + Each ndarray has format [x, y, x, y, ...] + mask (ndarray): a binary mask + """ + + def __init__(self, mask_or_polygons, height, width): + self._mask = self._polygons = self._has_holes = None + self.height = height + self.width = width + + m = mask_or_polygons + if isinstance(m, dict): + # RLEs + assert "counts" in m and "size" in m + if isinstance(m["counts"], list): # uncompressed RLEs + h, w = m["size"] + assert h == height and w == width + m = mask_util.frPyObjects(m, h, w) + self._mask = mask_util.decode(m)[:, :] + return + + if isinstance(m, list): # list[ndarray] + self._polygons = [np.asarray(x).reshape(-1) for x in m] + return + + if isinstance(m, np.ndarray): # assumed to be a binary mask + assert m.shape[1] != 2, m.shape + assert m.shape == ( + height, + width, + ), f"mask shape: {m.shape}, target dims: {height}, {width}" + self._mask = m.astype("uint8") + return + + raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m))) + + @property + def mask(self): + if self._mask is None: + self._mask = self.polygons_to_mask(self._polygons) + return self._mask + + @property + def polygons(self): + if self._polygons is None: + self._polygons, self._has_holes = self.mask_to_polygons(self._mask) + return self._polygons + + @property + def has_holes(self): + if self._has_holes is None: + if self._mask is not None: + self._polygons, self._has_holes = self.mask_to_polygons(self._mask) + else: + self._has_holes = False # if original format is polygon, does not have holes + return self._has_holes + + def mask_to_polygons(self, mask): + # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level + # hierarchy. External contours (boundary) of the object are placed in hierarchy-1. + # Internal contours (holes) are placed in hierarchy-2. + # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours. + mask = np.ascontiguousarray(mask) # some versions of cv2 does not support incontiguous arr + res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) + hierarchy = res[-1] + if hierarchy is None: # empty mask + return [], False + has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0 + res = res[-2] + res = [x.flatten() for x in res] + # These coordinates from OpenCV are integers in range [0, W-1 or H-1]. + # We add 0.5 to turn them into real-value coordinate space. A better solution + # would be to first +0.5 and then dilate the returned polygon by 0.5. 
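+        # Keep only contours with at least 3 points (6 flattened x, y values); shorter
+        # contours cannot form a valid polygon.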
+ res = [x + 0.5 for x in res if len(x) >= 6] + return res, has_holes + + def polygons_to_mask(self, polygons): + rle = mask_util.frPyObjects(polygons, self.height, self.width) + rle = mask_util.merge(rle) + return mask_util.decode(rle)[:, :] + + def area(self): + return self.mask.sum() + + def bbox(self): + p = mask_util.frPyObjects(self.polygons, self.height, self.width) + p = mask_util.merge(p) + bbox = mask_util.toBbox(p) + bbox[2] += bbox[0] + bbox[3] += bbox[1] + return bbox + + +class _PanopticPrediction: + """ + Unify different panoptic annotation/prediction formats + """ + + def __init__(self, panoptic_seg, segments_info, metadata=None): + if segments_info is None: + assert metadata is not None + # If "segments_info" is None, we assume "panoptic_img" is a + # H*W int32 image storing the panoptic_id in the format of + # category_id * label_divisor + instance_id. We reserve -1 for + # VOID label. + label_divisor = metadata.label_divisor + segments_info = [] + for panoptic_label in np.unique(panoptic_seg.numpy()): + if panoptic_label == -1: + # VOID region. + continue + pred_class = panoptic_label // label_divisor + isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values() + segments_info.append( + { + "id": int(panoptic_label), + "category_id": int(pred_class), + "isthing": bool(isthing), + } + ) + del metadata + + self._seg = panoptic_seg + + self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info + segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True) + areas = areas.numpy() + sorted_idxs = np.argsort(-areas) + self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs] + self._seg_ids = self._seg_ids.tolist() + for sid, area in zip(self._seg_ids, self._seg_areas): + if sid in self._sinfo: + self._sinfo[sid]["area"] = float(area) + + def non_empty_mask(self): + """ + Returns: + (H, W) array, a mask for all pixels that have a prediction + """ + empty_ids = [] + for id in self._seg_ids: + if id not in self._sinfo: + empty_ids.append(id) + if len(empty_ids) == 0: + return np.zeros(self._seg.shape, dtype=np.uint8) + assert ( + len(empty_ids) == 1 + ), ">1 ids corresponds to no labels. This is currently not supported" + return (self._seg != empty_ids[0]).numpy().astype(np.bool) + + def semantic_masks(self): + for sid in self._seg_ids: + sinfo = self._sinfo.get(sid) + if sinfo is None or sinfo["isthing"]: + # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions. 
+ continue + yield (self._seg == sid).numpy().astype(np.bool), sinfo + + def instance_masks(self): + for sid in self._seg_ids: + sinfo = self._sinfo.get(sid) + if sinfo is None or not sinfo["isthing"]: + continue + mask = (self._seg == sid).numpy().astype(np.bool) + if mask.sum() > 0: + yield mask, sinfo + + +def _create_text_labels(classes, scores, class_names, is_crowd=None): + """ + Args: + classes (list[int] or None): + scores (list[float] or None): + class_names (list[str] or None): + is_crowd (list[bool] or None): + Returns: + list[str] or None + """ + labels = None + if classes is not None: + if class_names is not None and len(class_names) > 0: + labels = [class_names[i] for i in classes] + else: + labels = [str(i) for i in classes] + if scores is not None: + if labels is None: + labels = ["{:.0f}%".format(s * 100) for s in scores] + else: + labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)] + if labels is not None and is_crowd is not None: + labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)] + return labels + + +class VisImage: + def __init__(self, img, scale=1.0): + """ + Args: + img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255]. + scale (float): scale the input image + """ + self.img = img + self.scale = scale + self.width, self.height = img.shape[1], img.shape[0] + self._setup_figure(img) + + def _setup_figure(self, img): + """ + Args: + Same as in :meth:`__init__()`. + Returns: + fig (matplotlib.pyplot.figure): top level container for all the image plot elements. + ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system. + """ + fig = mplfigure.Figure(frameon=False) + self.dpi = fig.get_dpi() + # add a small 1e-2 to avoid precision lost due to matplotlib's truncation + # (https://github.com/matplotlib/matplotlib/issues/15363) + fig.set_size_inches( + (self.width * self.scale + 1e-2) / self.dpi, + (self.height * self.scale + 1e-2) / self.dpi, + ) + self.canvas = FigureCanvasAgg(fig) + # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig) + ax = fig.add_axes([0.0, 0.0, 1.0, 1.0]) + ax.axis("off") + self.fig = fig + self.ax = ax + self.reset_image(img) + + def reset_image(self, img): + """ + Args: + img: same as in __init__ + """ + img = img.astype("uint8") + self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest") + + def save(self, filepath): + """ + Args: + filepath (str): a string that contains the absolute path, including the file name, where + the visualized image will be saved. + """ + self.fig.savefig(filepath) + + def get_image(self): + """ + Returns: + ndarray: + the visualized image of shape (H, W, 3) (RGB) in uint8 type. + The shape is scaled w.r.t the input image using the given `scale` argument. + """ + canvas = self.canvas + s, (width, height) = canvas.print_to_buffer() + # buf = io.BytesIO() # works for cairo backend + # canvas.print_rgba(buf) + # width, height = self.width, self.height + # s = buf.getvalue() + + buffer = np.frombuffer(s, dtype="uint8") + + img_rgba = buffer.reshape(height, width, 4) + rgb, alpha = np.split(img_rgba, [3], axis=2) + return rgb.astype("uint8") + + +class Visualizer: + """ + Visualizer that draws data about detection/segmentation on images. 
+ It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}` + that draw primitive objects to images, as well as high-level wrappers like + `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}` + that draw composite data in some pre-defined style. + Note that the exact visualization style for the high-level wrappers are subject to change. + Style such as color, opacity, label contents, visibility of labels, or even the visibility + of objects themselves (e.g. when the object is too small) may change according + to different heuristics, as long as the results still look visually reasonable. + To obtain a consistent style, you can implement custom drawing functions with the + abovementioned primitive methods instead. If you need more customized visualization + styles, you can process the data yourself following their format documented in + tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not + intend to satisfy everyone's preference on drawing styles. + This visualizer focuses on high rendering quality rather than performance. It is not + designed to be used for real-time applications. + """ + + # TODO implement a fast, rasterized version using OpenCV + + def __init__(self, img_rgb, is_img=True, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE): + """ + Args: + img_rgb: a numpy array of shape (H, W, C), where H and W correspond to + the height and width of the image respectively. C is the number of + color channels. The image is required to be in RGB format since that + is a requirement of the Matplotlib library. The image is also expected + to be in the range [0, 255]. + metadata (Metadata): dataset metadata (e.g. class names and colors) + instance_mode (ColorMode): defines one of the pre-defined style for drawing + instances on an image. + """ + if is_img: + self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) + else: + self.img = np.zeros_like(img_rgb).clip(0, 255).astype(np.uint8) + 255 + if metadata is None: + metadata = MetadataCatalog.get("__nonexist__") + self.metadata = metadata + self.output = VisImage(self.img, scale=scale) + self.cpu_device = torch.device("cpu") + + # too small texts are useless, therefore clamp to 9 + self._default_font_size = max( + np.sqrt(self.output.height * self.output.width) // 90, 10 // scale + ) + self._instance_mode = instance_mode + self.keypoint_threshold = _KEYPOINT_THRESHOLD + + def get_image(self, img): + img = np.asarray(img).clip(0, 255).astype(np.uint8) + return VisImage(img, scale=1.0) + + def draw_box_predictions( + self, + boxes=None, + labels=None, + scores=None, + assigned_colors=None + ): + """ + Args: + boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`, + or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, + or a :class:`RotatedBoxes`, + or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format + for the N objects in a single image, + labels (list[str]): the text to be displayed for each instance. + assigned_colors (list[matplotlib.colors]): a list of colors, where each color + corresponds to each mask or box in the image. Refer to 'matplotlib.colors' + for full list of formats that the colors are accepted in. + Returns: + output (VisImage): image object with visualizations. 
+ """ + num_instances = 0 + boxes = self._convert_boxes(boxes) + classes = labels.tolist() + scores = scores.tolist() + labels = _create_text_labels(classes, scores, self.metadata.get("stuff_classes", None)) + num_instances = len(boxes) + assert len(labels) == num_instances + if assigned_colors is None: + # assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] + assigned_colors = [instance_color(rgb=True, idx=i, maximum=1) for i in range(num_instances)] + if num_instances == 0: + return self.output + + # Display in largest to smallest order to reduce occlusion. + areas = None + areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) + + if areas is not None: + sorted_idxs = np.argsort(-areas).tolist() + # Re-order overlapped instances in descending order. + boxes = boxes[sorted_idxs] if boxes is not None else None + labels = [labels[k] for k in sorted_idxs] if labels is not None else None + assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] + + for i in range(num_instances): + color = assigned_colors[i] + if boxes is not None: + self.draw_box(boxes[i], edge_color=color) + + if labels is not None: + # first get a box + if boxes is not None: + x0, y0, x1, y1 = boxes[i] + text_pos = (x0, y0) # if drawing boxes, put text on the box corner. + horiz_align = "left" + else: + continue # drawing the box confidence for keypoints isn't very useful. + # for small objects, draw text at the side to avoid occlusion + instance_area = (y1 - y0) * (x1 - x0) + if ( + instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale + or y1 - y0 < 40 * self.output.scale + ): + if y1 >= self.output.height - 5: + text_pos = (x1, y0) + else: + text_pos = (x0, y1) + + height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + font_size = ( + np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) + * 0.5 + * self._default_font_size + ) + self.draw_text( + labels[i], + text_pos, + color=lighter_color, + horizontal_alignment=horiz_align, + font_size=font_size, + ) + + return self.output + + + def draw_instance_predictions(self, predictions, alpha=0.8, is_text=True): + """ + Draw instance-level prediction results on an image. + Args: + predictions (Instances): the output of an instance detection/segmentation + model. Following fields will be used to draw: + "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). + Returns: + output (VisImage): image object with visualizations. 
+ """ + boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None + scores = predictions.scores if predictions.has("scores") else None + classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None + labels = _create_text_labels(classes, scores, self.metadata.get("stuff_classes", None)) + keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None + + if predictions.has("pred_masks"): + masks = np.asarray(predictions.pred_masks) + masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] + else: + masks = None + + if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("stuff_colors"): + # colors = [ + # self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes + # ] + colors = [ + instance_color(rgb=True, idx=c, maximum=1) for c in classes + ] + else: + colors = None + + if self._instance_mode == ColorMode.IMAGE_BW: + self.output.reset_image( + self._create_grayscale_image( + (predictions.pred_masks.any(dim=0) > 0).numpy() + if predictions.has("pred_masks") + else None + ) + ) + + self.overlay_instances( + masks=masks, + boxes=boxes, + labels=labels, + keypoints=keypoints, + assigned_colors=colors, + alpha=alpha, + is_text=is_text, + ) + return self.output + + def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8, is_text=True, edge_color=_OFF_WHITE): + """ + Draw semantic segmentation predictions/labels. + Args: + sem_seg (Tensor or ndarray): the segmentation of shape (H, W). + Each value is the integer label of the pixel. + area_threshold (int): segments with less than `area_threshold` are not drawn. + alpha (float): the larger it is, the more opaque the segmentations are. + Returns: + output (VisImage): image object with visualizations. + """ + if isinstance(sem_seg, torch.Tensor): + sem_seg = sem_seg.numpy() + labels, areas = np.unique(sem_seg, return_counts=True) + sorted_idxs = np.argsort(-areas).tolist() + labels = labels[sorted_idxs] + for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels): + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] + except (AttributeError, IndexError): + mask_color = None + + binary_mask = (sem_seg == label).astype(np.uint8) + text = self.metadata.stuff_classes[label] + self.draw_binary_mask( + binary_mask, + color=mask_color, + edge_color=edge_color, + text=text, + alpha=alpha, + area_threshold=area_threshold, + is_text=is_text, + ) + return self.output + + def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7, is_text=True,): + """ + Draw panoptic prediction annotations or results. + Args: + panoptic_seg (Tensor): of shape (height, width) where the values are ids for each + segment. + segments_info (list[dict] or None): Describe each segment in `panoptic_seg`. + If it is a ``list[dict]``, each dict contains keys "id", "category_id". + If None, category id of each pixel is computed by + ``pixel // metadata.label_divisor``. + area_threshold (int): stuff segments with less than `area_threshold` are not drawn. + Returns: + output (VisImage): image object with visualizations. + """ + pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata) + + if self._instance_mode == ColorMode.IMAGE_BW: + self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask())) + + # draw mask for all semantic segments first i.e. 
"stuff" + for mask, sinfo in pred.semantic_masks(): + category_idx = sinfo["category_id"] + try: + mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] + except AttributeError: + mask_color = None + + text = self.metadata.stuff_classes[category_idx] + self.draw_binary_mask( + mask, + color=mask_color, + edge_color=_OFF_WHITE, + text=text, + alpha=alpha, + area_threshold=area_threshold, + is_text=is_text, + ) + + # draw mask for all instances second + all_instances = list(pred.instance_masks()) + if len(all_instances) == 0: + return self.output + masks, sinfo = list(zip(*all_instances)) + category_ids = [x["category_id"] for x in sinfo] + + try: + scores = [x["score"] for x in sinfo] + except KeyError: + scores = None + labels = _create_text_labels( + category_ids, scores, self.metadata.stuff_classes, [x.get("iscrowd", 0) for x in sinfo] + ) + + try: + colors = [ + self._jitter([x / 255 for x in self.metadata.stuff_colors[c]]) for c in category_ids + ] + except AttributeError: + colors = None + self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha, is_text=is_text) + + return self.output + + draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility + + def draw_dataset_dict(self, dic): + """ + Draw annotations/segmentaions in Detectron2 Dataset format. + Args: + dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. + Returns: + output (VisImage): image object with visualizations. + """ + annos = dic.get("annotations", None) + if annos: + if "segmentation" in annos[0]: + masks = [x["segmentation"] for x in annos] + else: + masks = None + if "keypoints" in annos[0]: + keypts = [x["keypoints"] for x in annos] + keypts = np.array(keypts).reshape(len(annos), -1, 3) + else: + keypts = None + + boxes = [ + BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) + if len(x["bbox"]) == 4 + else x["bbox"] + for x in annos + ] + + colors = None + category_ids = [x["category_id"] for x in annos] + if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("stuff_colors"): + colors = [ + self._jitter([x / 255 for x in self.metadata.stuff_colors[c]]) + for c in category_ids + ] + names = self.metadata.get("stuff_classes", None) + labels = _create_text_labels( + category_ids, + scores=None, + class_names=names, + is_crowd=[x.get("iscrowd", 0) for x in annos], + ) + self.overlay_instances( + labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors + ) + + sem_seg = dic.get("sem_seg", None) + if sem_seg is None and "sem_seg_file_name" in dic: + with PathManager.open(dic["sem_seg_file_name"], "rb") as f: + sem_seg = Image.open(f) + sem_seg = np.asarray(sem_seg, dtype="uint8") + if sem_seg is not None: + self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5) + + pan_seg = dic.get("pan_seg", None) + # if pan_seg is None and "pan_seg_file_name" in dic: + # with PathManager.open(dic["pan_seg_file_name"], "rb") as f: + # pan_seg = Image.open(f) + # pan_seg = np.asarray(pan_seg) + # from panopticapi.utils import rgb2id + # + # pan_seg = rgb2id(pan_seg) + if pan_seg is not None: + segments_info = dic["segments_info"] + pan_seg = torch.tensor(pan_seg) + self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5) + return self.output + + def overlay_instances( + self, + *, + boxes=None, + labels=None, + masks=None, + keypoints=None, + assigned_colors=None, + alpha=0.5, + is_text=True, + ): + """ + Args: + boxes (Boxes, RotatedBoxes or ndarray): either a 
:class:`Boxes`, + or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, + or a :class:`RotatedBoxes`, + or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format + for the N objects in a single image, + labels (list[str]): the text to be displayed for each instance. + masks (masks-like object): Supported types are: + * :class:`detectron2.structures.PolygonMasks`, + :class:`detectron2.structures.BitMasks`. + * list[list[ndarray]]: contains the segmentation masks for all objects in one image. + The first level of the list corresponds to individual instances. The second + level to all the polygon that compose the instance, and the third level + to the polygon coordinates. The third level should have the format of + [x0, y0, x1, y1, ..., xn, yn] (n >= 3). + * list[ndarray]: each ndarray is a binary mask of shape (H, W). + * list[dict]: each dict is a COCO-style RLE. + keypoints (Keypoint or array like): an array-like object of shape (N, K, 3), + where the N is the number of instances and K is the number of keypoints. + The last dimension corresponds to (x, y, visibility or score). + assigned_colors (list[matplotlib.colors]): a list of colors, where each color + corresponds to each mask or box in the image. Refer to 'matplotlib.colors' + for full list of formats that the colors are accepted in. + Returns: + output (VisImage): image object with visualizations. + """ + num_instances = 0 + if boxes is not None: + boxes = self._convert_boxes(boxes) + num_instances = len(boxes) + if masks is not None: + masks = self._convert_masks(masks) + if num_instances: + assert len(masks) == num_instances + else: + num_instances = len(masks) + if keypoints is not None: + if num_instances: + assert len(keypoints) == num_instances + else: + num_instances = len(keypoints) + keypoints = self._convert_keypoints(keypoints) + if labels is not None: + assert len(labels) == num_instances + if assigned_colors is None: + # assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] + assigned_colors = [instance_color(rgb=True, idx=i, maximum=1) for i in range(num_instances)] + if num_instances == 0: + return self.output + if boxes is not None and boxes.shape[1] == 5: + return self.overlay_rotated_instances( + boxes=boxes, labels=labels, assigned_colors=assigned_colors + ) + + # Display in largest to smallest order to reduce occlusion. + areas = None + if boxes is not None: + areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) + elif masks is not None: + areas = np.asarray([x.area() for x in masks]) + + if areas is not None: + sorted_idxs = np.argsort(-areas).tolist() + # Re-order overlapped instances in descending order. + boxes = boxes[sorted_idxs] if boxes is not None else None + labels = [labels[k] for k in sorted_idxs] if labels is not None else None + masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None + assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] + keypoints = keypoints[sorted_idxs] if keypoints is not None else None + + for i in range(num_instances): + color = assigned_colors[i] + if boxes is not None: + self.draw_box(boxes[i], edge_color=color) + + if masks is not None: + for segment in masks[i].polygons: + self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha) + + if labels is not None: + # first get a box + if boxes is not None: + x0, y0, x1, y1 = boxes[i] + text_pos = (x0, y0) # if drawing boxes, put text on the box corner. 
+ horiz_align = "left" + elif masks is not None: + # skip small mask without polygon + if len(masks[i].polygons) == 0: + continue + + x0, y0, x1, y1 = masks[i].bbox() + + # draw text in the center (defined by median) when box is not drawn + # median is less sensitive to outliers. + text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1] + horiz_align = "center" + else: + continue # drawing the box confidence for keypoints isn't very useful. + # for small objects, draw text at the side to avoid occlusion + instance_area = (y1 - y0) * (x1 - x0) + if ( + instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale + or y1 - y0 < 40 * self.output.scale + ): + if y1 >= self.output.height - 5: + text_pos = (x1, y0) + else: + text_pos = (x0, y1) + + height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + font_size = ( + np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) + * 0.5 + * self._default_font_size + ) + if is_text: + self.draw_text( + labels[i], + text_pos, + color=lighter_color, + horizontal_alignment=horiz_align, + font_size=font_size, + ) + + # draw keypoints + if keypoints is not None: + for keypoints_per_instance in keypoints: + self.draw_and_connect_keypoints(keypoints_per_instance) + + return self.output + + def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None): + """ + Args: + boxes (ndarray): an Nx5 numpy array of + (x_center, y_center, width, height, angle_degrees) format + for the N objects in a single image. + labels (list[str]): the text to be displayed for each instance. + assigned_colors (list[matplotlib.colors]): a list of colors, where each color + corresponds to each mask or box in the image. Refer to 'matplotlib.colors' + for full list of formats that the colors are accepted in. + Returns: + output (VisImage): image object with visualizations. + """ + num_instances = len(boxes) + + if assigned_colors is None: + # assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] + assigned_colors = [instance_color(rgb=True, idx=i, maximum=1) for i in range(num_instances)] + if num_instances == 0: + return self.output + + # Display in largest to smallest order to reduce occlusion. + if boxes is not None: + areas = boxes[:, 2] * boxes[:, 3] + + sorted_idxs = np.argsort(-areas).tolist() + # Re-order overlapped instances in descending order. + boxes = boxes[sorted_idxs] + labels = [labels[k] for k in sorted_idxs] if labels is not None else None + colors = [assigned_colors[idx] for idx in sorted_idxs] + + for i in range(num_instances): + self.draw_rotated_box_with_label( + boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None + ) + + return self.output + + def draw_and_connect_keypoints(self, keypoints): + """ + Draws keypoints of an instance and follows the rules for keypoint connections + to draw lines between appropriate keypoints. This follows color heuristics for + line color. + Args: + keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints + and the last dimension corresponds to (x, y, probability). + Returns: + output (VisImage): image object with visualizations. 
+ """ + visible = {} + keypoint_names = self.metadata.get("keypoint_names") + for idx, keypoint in enumerate(keypoints): + + # draw keypoint + x, y, prob = keypoint + if prob > self.keypoint_threshold: + self.draw_circle((x, y), color=_RED) + if keypoint_names: + keypoint_name = keypoint_names[idx] + visible[keypoint_name] = (x, y) + + if self.metadata.get("keypoint_connection_rules"): + for kp0, kp1, color in self.metadata.keypoint_connection_rules: + if kp0 in visible and kp1 in visible: + x0, y0 = visible[kp0] + x1, y1 = visible[kp1] + color = tuple(x / 255.0 for x in color) + self.draw_line([x0, x1], [y0, y1], color=color) + + # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip + # Note that this strategy is specific to person keypoints. + # For other keypoints, it should just do nothing + try: + ls_x, ls_y = visible["left_shoulder"] + rs_x, rs_y = visible["right_shoulder"] + mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2 + except KeyError: + pass + else: + # draw line from nose to mid-shoulder + nose_x, nose_y = visible.get("nose", (None, None)) + if nose_x is not None: + self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED) + + try: + # draw line from mid-shoulder to mid-hip + lh_x, lh_y = visible["left_hip"] + rh_x, rh_y = visible["right_hip"] + except KeyError: + pass + else: + mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2 + self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED) + return self.output + + """ + Primitive drawing functions: + """ + + def draw_text( + self, + text, + position, + *, + font_size=None, + color="g", + horizontal_alignment="center", + rotation=0, + ): + """ + Args: + text (str): class label + position (tuple): a tuple of the x and y coordinates to place text on image. + font_size (int, optional): font of the text. If not provided, a font size + proportional to the image width is calculated and used. + color: color of the text. Refer to `matplotlib.colors` for full list + of formats that are accepted. + horizontal_alignment (str): see `matplotlib.text.Text` + rotation: rotation angle in degrees CCW + Returns: + output (VisImage): image object with text drawn. + """ + if not font_size: + font_size = self._default_font_size + + # since the text background is dark, we don't want the text to be dark + color = np.maximum(list(mplc.to_rgb(color)), 0.2) + color[np.argmax(color)] = max(0.8, np.max(color)) + + x, y = position + self.output.ax.text( + x, + y, + text, + size=font_size * self.output.scale, + family="sans-serif", + bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"}, + verticalalignment="top", + horizontalalignment=horizontal_alignment, + color=color, + zorder=10, + rotation=rotation, + ) + return self.output + + def draw_box(self, box_coord, alpha=1.0, edge_color="g", line_style="-"): + """ + Args: + box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0 + are the coordinates of the image's top left corner. x1 and y1 are the + coordinates of the image's bottom right corner. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + edge_color: color of the outline of the box. Refer to `matplotlib.colors` + for full list of formats that are accepted. + line_style (string): the string to use to create the outline of the boxes. + Returns: + output (VisImage): image object with box drawn. 
+ """ + x0, y0, x1, y1 = box_coord + width = x1 - x0 + height = y1 - y0 + + linewidth = 2 + + self.output.ax.add_patch( + mpl.patches.Rectangle( + (x0, y0), + width, + height, + fill=False, + edgecolor=edge_color, + linewidth=linewidth * self.output.scale, + alpha=alpha, + linestyle=line_style, + ) + ) + return self.output + + def draw_rotated_box_with_label( + self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None + ): + """ + Draw a rotated box with label on its top-left corner. + Args: + rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle), + where cnt_x and cnt_y are the center coordinates of the box. + w and h are the width and height of the box. angle represents how + many degrees the box is rotated CCW with regard to the 0-degree box. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + edge_color: color of the outline of the box. Refer to `matplotlib.colors` + for full list of formats that are accepted. + line_style (string): the string to use to create the outline of the boxes. + label (string): label for rotated box. It will not be rendered when set to None. + Returns: + output (VisImage): image object with box drawn. + """ + cnt_x, cnt_y, w, h, angle = rotated_box + area = w * h + # use thinner lines when the box is small + linewidth = self._default_font_size / ( + 6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3 + ) + + theta = angle * math.pi / 180.0 + c = math.cos(theta) + s = math.sin(theta) + rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)] + # x: left->right ; y: top->down + rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect] + for k in range(4): + j = (k + 1) % 4 + self.draw_line( + [rotated_rect[k][0], rotated_rect[j][0]], + [rotated_rect[k][1], rotated_rect[j][1]], + color=edge_color, + linestyle="--" if k == 1 else line_style, + linewidth=linewidth, + ) + + if label is not None: + text_pos = rotated_rect[1] # topleft corner + + height_ratio = h / np.sqrt(self.output.height * self.output.width) + label_color = self._change_color_brightness(edge_color, brightness_factor=0.7) + font_size = ( + np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size + ) + self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle) + + return self.output + + def draw_circle(self, circle_coord, color, radius=3): + """ + Args: + circle_coord (list(int) or tuple(int)): contains the x and y coordinates + of the center of the circle. + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + radius (int): radius of the circle. + Returns: + output (VisImage): image object with box drawn. + """ + x, y = circle_coord + self.output.ax.add_patch( + mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color) + ) + return self.output + + def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None): + """ + Args: + x_data (list[int]): a list containing x values of all the points being drawn. + Length of list should match the length of y_data. + y_data (list[int]): a list containing y values of all the points being drawn. + Length of list should match the length of x_data. + color: color of the line. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + linestyle: style of the line. Refer to `matplotlib.lines.Line2D` + for a full list of formats that are accepted. 
+ linewidth (float or None): width of the line. When it's None, + a default value will be computed and used. + Returns: + output (VisImage): image object with line drawn. + """ + if linewidth is None: + linewidth = self._default_font_size / 3 + linewidth = max(linewidth, 1) + self.output.ax.add_line( + mpl.lines.Line2D( + x_data, + y_data, + linewidth=linewidth * self.output.scale, + color=color, + linestyle=linestyle, + ) + ) + return self.output + + def draw_binary_mask( + self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=10, is_text=True, + ): + """ + Args: + binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and + W is the image width. Each value in the array is either a 0 or 1 value of uint8 + type. + color: color of the mask. Refer to `matplotlib.colors` for a full list of + formats that are accepted. If None, will pick a random color. + edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a + full list of formats that are accepted. + text (str): if None, will be drawn on the object + alpha (float): blending efficient. Smaller values lead to more transparent masks. + area_threshold (float): a connected component smaller than this area will not be shown. + Returns: + output (VisImage): image object with mask drawn. + """ + if color is None: + color = random_color(rgb=True, maximum=1) + color = mplc.to_rgb(color) + + has_valid_segment = False + binary_mask = binary_mask.astype("uint8") # opencv needs uint8 + mask = GenericMask(binary_mask, self.output.height, self.output.width) + shape2d = (binary_mask.shape[0], binary_mask.shape[1]) + + if not mask.has_holes: + # draw polygons for regular masks + for segment in mask.polygons: + # area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1])) + # if area < (area_threshold or 0): + # continue + has_valid_segment = True + segment = segment.reshape(-1, 2) + self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha) + else: + # TODO: Use Path/PathPatch to draw vector graphics: + # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon + rgba = np.zeros(shape2d + (4,), dtype="float32") + rgba[:, :, :3] = color + rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha + has_valid_segment = True + self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) + + if is_text: + if text is not None and has_valid_segment: + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + self._draw_text_in_mask(binary_mask, text, lighter_color) + return self.output + + def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5): + """ + Args: + soft_mask (ndarray): float array of shape (H, W), each value in [0, 1]. + color: color of the mask. Refer to `matplotlib.colors` for a full list of + formats that are accepted. If None, will pick a random color. + text (str): if None, will be drawn on the object + alpha (float): blending efficient. Smaller values lead to more transparent masks. + Returns: + output (VisImage): image object with mask drawn. 
+ """ + if color is None: + color = random_color(rgb=True, maximum=1) + color = mplc.to_rgb(color) + + shape2d = (soft_mask.shape[0], soft_mask.shape[1]) + rgba = np.zeros(shape2d + (4,), dtype="float32") + rgba[:, :, :3] = color + rgba[:, :, 3] = soft_mask * alpha + self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) + + if text is not None: + lighter_color = self._change_color_brightness(color, brightness_factor=0.7) + binary_mask = (soft_mask > 0.5).astype("uint8") + # self._draw_text_in_mask(binary_mask, text, lighter_color) + return self.output + + def draw_polygon(self, segment, color, edge_color=None, alpha=0.5): + """ + Args: + segment: numpy array of shape Nx2, containing all the points in the polygon. + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a + full list of formats that are accepted. If not provided, a darker shade + of the polygon color will be used instead. + alpha (float): blending efficient. Smaller values lead to more transparent masks. + Returns: + output (VisImage): image object with polygon drawn. + """ + if edge_color is None: + # make edge color darker than the polygon color + if alpha > 0.8: + edge_color = self._change_color_brightness(color, brightness_factor=-0.7) + else: + edge_color = color + edge_color = mplc.to_rgb(edge_color) + (1,) + + polygon = mpl.patches.Polygon( + segment, + fill=True, + facecolor=mplc.to_rgb(color) + (alpha,), + edgecolor=edge_color, + linewidth=max(self._default_font_size // 15 * self.output.scale, 1), + ) + self.output.ax.add_patch(polygon) + return self.output + + """ + Internal methods: + """ + + def _jitter(self, color): + """ + Randomly modifies given color to produce a slightly different color than the color given. + Args: + color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color + picked. The values in the list are in the [0.0, 1.0] range. + Returns: + jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the + color after being jittered. The values in the list are in the [0.0, 1.0] range. + """ + color = mplc.to_rgb(color) + vec = np.random.rand(3) + # better to do it in another color space + vec = vec / np.linalg.norm(vec) * 0.5 + res = np.clip(vec + color, 0, 1) + return tuple(res) + + def _create_grayscale_image(self, mask=None): + """ + Create a grayscale version of the original image. + The colors in masked area, if given, will be kept. + """ + img_bw = self.img.astype("f4").mean(axis=2) + img_bw = np.stack([img_bw] * 3, axis=2) + if mask is not None: + img_bw[mask] = self.img[mask] + return img_bw + + def _change_color_brightness(self, color, brightness_factor): + """ + Depending on the brightness_factor, gives a lighter or darker color i.e. a color with + less or more saturation than the original color. + Args: + color: color of the polygon. Refer to `matplotlib.colors` for a full list of + formats that are accepted. + brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of + 0 will correspond to no change, a factor in [-1.0, 0) range will result in + a darker color and a factor in (0, 1.0] range will result in a lighter color. + Returns: + modified_color (tuple[double]): a tuple containing the RGB values of the + modified color. Each value in the tuple is in the [0.0, 1.0] range. 
+ """ + assert brightness_factor >= -1.0 and brightness_factor <= 1.0 + color = mplc.to_rgb(color) + polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) + modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) + modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness + modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness + modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) + return modified_color + + def _convert_boxes(self, boxes): + """ + Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension. + """ + if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): + return boxes.tensor.detach().numpy() + else: + return np.asarray(boxes) + + def _convert_masks(self, masks_or_polygons): + """ + Convert different format of masks or polygons to a tuple of masks and polygons. + Returns: + list[GenericMask]: + """ + + m = masks_or_polygons + if isinstance(m, PolygonMasks): + m = m.polygons + if isinstance(m, BitMasks): + m = m.tensor.numpy() + if isinstance(m, torch.Tensor): + m = m.numpy() + ret = [] + for x in m: + if isinstance(x, GenericMask): + ret.append(x) + else: + ret.append(GenericMask(x, self.output.height, self.output.width)) + return ret + + def _draw_text_in_mask(self, binary_mask, text, color): + """ + Find proper places to draw text given a binary mask. + """ + # TODO sometimes drawn on wrong objects. the heuristics here can improve. + _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8) + if stats[1:, -1].size == 0: + return + largest_component_id = np.argmax(stats[1:, -1]) + 1 + + # draw text on the largest component, as well as other very large components. + for cid in range(1, _num_cc): + if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH: + # median is more stable than centroid + # center = centroids[largest_component_id] + center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] + self.draw_text(text, center, color=color) + + def _convert_keypoints(self, keypoints): + if isinstance(keypoints, Keypoints): + keypoints = keypoints.tensor + keypoints = np.asarray(keypoints) + return keypoints + + def get_output(self): + """ + Returns: + output (VisImage): the image output containing the visualizations added + to the image. 
+ """ + return self.output \ No newline at end of file diff --git a/custom_oneformer/evaluation/__init__.py b/custom_oneformer/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..49f62369cca38a3c85884f8dea6baea674cb9060 --- /dev/null +++ b/custom_oneformer/evaluation/__init__.py @@ -0,0 +1,3 @@ +from .detection_coco_evaluator import * +from .coco_evaluator import * +from .cityscapes_evaluation import CityscapesInstanceEvaluator \ No newline at end of file diff --git a/custom_oneformer/evaluation/cityscapes_evaluation.py b/custom_oneformer/evaluation/cityscapes_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..2019165f605bfb60bdbaeef365b1f8fb937747e1 --- /dev/null +++ b/custom_oneformer/evaluation/cityscapes_evaluation.py @@ -0,0 +1,201 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/cityscapes_evaluation.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import glob +import logging +import numpy as np +import os +import tempfile +from collections import OrderedDict +import torch +from PIL import Image + +from custom_detectron2.data import MetadataCatalog +from custom_detectron2.utils import comm +from custom_detectron2.utils.file_io import PathManager + +from .evaluator import DatasetEvaluator + + +class CityscapesEvaluator(DatasetEvaluator): + """ + Base class for evaluation using cityscapes API. + """ + + def __init__(self, dataset_name): + """ + Args: + dataset_name (str): the name of the dataset. + It must have the following metadata associated with it: + "thing_classes", "gt_dir". + """ + self._metadata = MetadataCatalog.get(dataset_name) + self._cpu_device = torch.device("cpu") + self._logger = logging.getLogger(__name__) + + def reset(self): + self._working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_") + self._temp_dir = self._working_dir.name + # All workers will write to the same results directory + # TODO this does not work in distributed training + assert ( + comm.get_local_size() == comm.get_world_size() + ), "CityscapesEvaluator currently do not work with multiple machines." + self._temp_dir = comm.all_gather(self._temp_dir)[0] + if self._temp_dir != self._working_dir.name: + self._working_dir.cleanup() + self._logger.info( + "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir) + ) + + +class CityscapesInstanceEvaluator(CityscapesEvaluator): + """ + Evaluate instance segmentation results on cityscapes dataset using cityscapes API. + + Note: + * It does not work in multi-machine distributed training. + * It contains a synchronization, therefore has to be used on all ranks. + * Only the main process runs evaluation. 
+ """ + + def process(self, inputs, outputs): + from cityscapesscripts.helpers.labels import name2label + + for input, output in zip(inputs, outputs): + file_name = input["file_name"] + basename = os.path.splitext(os.path.basename(file_name))[0] + pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt") + + if "instances" in output: + output = output["instances"].to(self._cpu_device) + num_instances = len(output) + with open(pred_txt, "w") as fout: + for i in range(num_instances): + pred_class = output.pred_classes[i] + classes = self._metadata.stuff_classes[pred_class] + class_id = name2label[classes].id + score = output.scores[i] + mask = output.pred_masks[i].numpy().astype("uint8") + png_filename = os.path.join( + self._temp_dir, basename + "_{}_{}.png".format(i, classes) + ) + + Image.fromarray(mask * 255).save(png_filename) + fout.write( + "{} {} {}\n".format(os.path.basename(png_filename), class_id, score) + ) + else: + # Cityscapes requires a prediction file for every ground truth image. + with open(pred_txt, "w") as fout: + pass + + def evaluate(self): + """ + Returns: + dict: has a key "segm", whose value is a dict of "AP" and "AP50". + """ + comm.synchronize() + if comm.get_rank() > 0: + return + import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval + + self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) + + # set some global states in cityscapes evaluation API, before evaluating + cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) + cityscapes_eval.args.predictionWalk = None + cityscapes_eval.args.JSONOutput = False + cityscapes_eval.args.colorized = False + cityscapes_eval.args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json") + + # These lines are adopted from + # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa + gt_dir = PathManager.get_local_path(self._metadata.gt_dir) + groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png")) + assert len( + groundTruthImgList + ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( + cityscapes_eval.args.groundTruthSearch + ) + predictionImgList = [] + for gt in groundTruthImgList: + predictionImgList.append(cityscapes_eval.getPrediction(gt, cityscapes_eval.args)) + results = cityscapes_eval.evaluateImgLists( + predictionImgList, groundTruthImgList, cityscapes_eval.args + )["averages"] + + ret = OrderedDict() + ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100} + self._working_dir.cleanup() + return ret + + +class CityscapesSemSegEvaluator(CityscapesEvaluator): + """ + Evaluate semantic segmentation results on cityscapes dataset using cityscapes API. + + Note: + * It does not work in multi-machine distributed training. + * It contains a synchronization, therefore has to be used on all ranks. + * Only the main process runs evaluation. 
+ """ + + def process(self, inputs, outputs): + from cityscapesscripts.helpers.labels import trainId2label + + for input, output in zip(inputs, outputs): + file_name = input["file_name"] + basename = os.path.splitext(os.path.basename(file_name))[0] + pred_filename = os.path.join(self._temp_dir, basename + "_pred.png") + + output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy() + pred = 255 * np.ones(output.shape, dtype=np.uint8) + for train_id, label in trainId2label.items(): + if label.ignoreInEval: + continue + pred[output == train_id] = label.id + Image.fromarray(pred).save(pred_filename) + + def evaluate(self): + comm.synchronize() + if comm.get_rank() > 0: + return + # Load the Cityscapes eval script *after* setting the required env var, + # since the script reads CITYSCAPES_DATASET into global variables at load time. + import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval + + self._logger.info("Evaluating results under {} ...".format(self._temp_dir)) + + # set some global states in cityscapes evaluation API, before evaluating + cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir) + cityscapes_eval.args.predictionWalk = None + cityscapes_eval.args.JSONOutput = False + cityscapes_eval.args.colorized = False + + # These lines are adopted from + # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa + gt_dir = PathManager.get_local_path(self._metadata.gt_dir) + groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png")) + assert len( + groundTruthImgList + ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format( + cityscapes_eval.args.groundTruthSearch + ) + predictionImgList = [] + for gt in groundTruthImgList: + predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt)) + results = cityscapes_eval.evaluateImgLists( + predictionImgList, groundTruthImgList, cityscapes_eval.args + ) + ret = OrderedDict() + ret["sem_seg"] = { + "IoU": 100.0 * results["averageScoreClasses"], + "iIoU": 100.0 * results["averageScoreInstClasses"], + "IoU_sup": 100.0 * results["averageScoreCategories"], + "iIoU_sup": 100.0 * results["averageScoreInstCategories"], + } + self._working_dir.cleanup() + return ret diff --git a/custom_oneformer/evaluation/coco_evaluator.py b/custom_oneformer/evaluation/coco_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..fc8b93447cd58a721a20eedfce822b144aaac9f0 --- /dev/null +++ b/custom_oneformer/evaluation/coco_evaluator.py @@ -0,0 +1,563 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/coco_evaluation.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import contextlib +import copy +import io +import itertools +import json +import logging +import numpy as np +import os +import pickle +from collections import OrderedDict +import custom_pycocotools.mask as mask_util +import torch +from custom_pycocotools.coco import COCO +from custom_pycocotools.cocoeval import COCOeval +from tabulate import tabulate + +import custom_detectron2.utils.comm as comm +from custom_detectron2.config import CfgNode +from custom_detectron2.data import MetadataCatalog +from custom_detectron2.data.datasets.coco import convert_to_coco_json 
+from custom_detectron2.structures import Boxes, BoxMode, pairwise_iou +from custom_detectron2.utils.file_io import PathManager +from custom_detectron2.utils.logger import create_small_table + +from .evaluator import DatasetEvaluator + +try: + from custom_detectron2.evaluation.fast_eval_api import COCOeval_opt +except ImportError: + COCOeval_opt = COCOeval + + +class COCOEvaluator(DatasetEvaluator): + """ + Evaluate AP for instance detection/segmentation, AP + for keypoint detection outputs using COCO's metrics. + See http://cocodataset.org/#detection-eval and + http://cocodataset.org/#keypoints-eval to understand its metrics. + The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means + the metric cannot be computed (e.g. due to no predictions made). + + In addition to COCO, this evaluator is able to support any bounding box detection, + instance segmentation, or keypoint detection dataset. + """ + + def __init__( + self, + dataset_name, + tasks=None, + distributed=True, + output_dir=None, + *, + max_dets_per_image=None, + use_fast_impl=True, + kpt_oks_sigmas=(), + allow_cached_coco=True, + ): + """ + Args: + dataset_name (str): name of the dataset to be evaluated. + It must have either the following corresponding metadata: + + "json_file": the path to the COCO format annotation + + Or it must be in detectron2's standard dataset format + so it can be converted to COCO format automatically. + tasks (tuple[str]): tasks that can be evaluated under the given + configuration. A task is one of "bbox", "segm", "keypoints". + By default, will infer this automatically from predictions. + distributed (True): if True, will collect results from all ranks and run evaluation + in the main process. + Otherwise, will only evaluate the results in the current process. + output_dir (str): optional, an output directory to dump all + results predicted on the dataset. The dump contains two files: + + 1. "instances_predictions.pth" a file that can be loaded with `torch.load` and + contains all the results in the format they are produced by the model. + 2. "coco_instances_results.json" a json file in COCO's result format. + max_dets_per_image (int): limit on the maximum number of detections per image. + By default in COCO, this limit is to 100, but this can be customized + to be greater, as is needed in evaluation metrics AP fixed and AP pool + (see https://arxiv.org/pdf/2102.01066.pdf) + This doesn't affect keypoint evaluation. + use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP. + Although the results should be very close to the official implementation in COCO + API, it is still recommended to compute results with the official API for use in + papers. The faster implementation also uses more RAM. + kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS. + See http://cocodataset.org/#keypoints-eval + When empty, it will use the defaults in COCO. + Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS. + allow_cached_coco (bool): Whether to use cached coco json from previous validation + runs. You should set this to False if you need to use different validation data. + Defaults to True. + """ + self._logger = logging.getLogger(__name__) + self._distributed = distributed + self._output_dir = output_dir + + if use_fast_impl and (COCOeval_opt is COCOeval): + self._logger.info("Fast COCO eval is not built. 
Falling back to official COCO eval.") + use_fast_impl = False + self._use_fast_impl = use_fast_impl + + # COCOeval requires the limit on the number of detections per image (maxDets) to be a list + # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the + # 3rd element (100) is used as the limit on the number of detections per image when + # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval, + # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults. + if max_dets_per_image is None: + max_dets_per_image = [1, 10, 100] + else: + max_dets_per_image = [1, 10, max_dets_per_image] + self._max_dets_per_image = max_dets_per_image + + if tasks is not None and isinstance(tasks, CfgNode): + kpt_oks_sigmas = ( + tasks.TEST.KEYPOINT_OKS_SIGMAS if not kpt_oks_sigmas else kpt_oks_sigmas + ) + self._logger.warn( + "COCO Evaluator instantiated using config, this is deprecated behavior." + " Please pass in explicit arguments instead." + ) + self._tasks = None # Infering it from predictions should be better + else: + self._tasks = tasks + + self._cpu_device = torch.device("cpu") + + self._metadata = MetadataCatalog.get(dataset_name) + if not hasattr(self._metadata, "json_file"): + if output_dir is None: + raise ValueError( + "output_dir must be provided to COCOEvaluator " + "for datasets not in COCO format." + ) + self._logger.info(f"Trying to convert '{dataset_name}' to COCO format ...") + + cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json") + self._metadata.json_file = cache_path + convert_to_coco_json(dataset_name, cache_path, allow_cached=allow_cached_coco) + + json_file = PathManager.get_local_path(self._metadata.json_file) + with contextlib.redirect_stdout(io.StringIO()): + self._coco_api = COCO(json_file) + + # Test set json files do not contain annotations (evaluation must be + # performed using the COCO evaluation server). + self._do_evaluation = "annotations" in self._coco_api.dataset + if self._do_evaluation: + self._kpt_oks_sigmas = kpt_oks_sigmas + + def reset(self): + self._predictions = [] + + def process(self, inputs, outputs): + """ + Args: + inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). + It is a list of dict. Each dict corresponds to an image and + contains keys like "height", "width", "file_name", "image_id". + outputs: the outputs of a COCO model. It is a list of dicts with key + "instances" that contains :class:`Instances`. + """ + for input, output in zip(inputs, outputs): + prediction = {"image_id": input["image_id"]} + + if "instances" in output: + instances = output["instances"].to(self._cpu_device) + prediction["instances"] = instances_to_coco_json(instances, input["image_id"]) + if len(prediction) > 1: + self._predictions.append(prediction) + + def evaluate(self, img_ids=None): + """ + Args: + img_ids: a list of image IDs to evaluate on. 
Default to None for the whole dataset + """ + if self._distributed: + comm.synchronize() + predictions = comm.gather(self._predictions, dst=0) + predictions = list(itertools.chain(*predictions)) + + if not comm.is_main_process(): + return {} + else: + predictions = self._predictions + + if len(predictions) == 0: + self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") + return {} + + if self._output_dir: + PathManager.mkdirs(self._output_dir) + file_path = os.path.join(self._output_dir, "instances_predictions.pth") + with PathManager.open(file_path, "wb") as f: + torch.save(predictions, f) + + self._results = OrderedDict() + if "instances" in predictions[0]: + self._eval_predictions(predictions, img_ids=img_ids) + # Copy so the caller can do whatever with results + return copy.deepcopy(self._results) + + def _tasks_from_predictions(self, predictions): + """ + Get COCO API "tasks" (i.e. iou_type) from COCO-format predictions. + """ + for pred in predictions: + if "segmentation" in pred: + tasks = {"segm"} + if "keypoints" in pred: + tasks.add("keypoints") + return sorted(tasks) + + def _eval_predictions(self, predictions, img_ids=None): + """ + Evaluate predictions. Fill self._results with the metrics of the tasks. + """ + self._logger.info("Preparing results for COCO format ...") + coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) + tasks = self._tasks or self._tasks_from_predictions(coco_results) + + # unmap the category ids for COCO + if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): + dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id + all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) + num_classes = len(all_contiguous_ids) + assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 + + reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} + for result in coco_results: + category_id = result["category_id"] + assert category_id < num_classes, ( + f"A prediction has class={category_id}, " + f"but the dataset only has {num_classes} classes and " + f"predicted class id should be in [0, {num_classes - 1}]." + ) + result["category_id"] = reverse_id_mapping[category_id] + + if self._output_dir: + file_path = os.path.join(self._output_dir, "coco_instances_results.json") + self._logger.info("Saving results to {}".format(file_path)) + with PathManager.open(file_path, "w") as f: + f.write(json.dumps(coco_results)) + f.flush() + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info( + "Evaluating predictions with {} COCO API...".format( + "unofficial" if self._use_fast_impl else "official" + ) + ) + for task in sorted(tasks): + assert task in {"segm", "keypoints"}, f"Got unknown task: {task}!" + coco_eval = ( + _evaluate_predictions_on_coco( + self._coco_api, + coco_results, + task, + kpt_oks_sigmas=self._kpt_oks_sigmas, + use_fast_impl=self._use_fast_impl, + img_ids=img_ids, + max_dets_per_image=self._max_dets_per_image, + ) + if len(coco_results) > 0 + else None # cocoapi does not handle empty results very well + ) + + res = self._derive_coco_results( + coco_eval, task, class_names=self._metadata.get("thing_classes") + ) + self._results[task] = res + + def _derive_coco_results(self, coco_eval, iou_type, class_names=None): + """ + Derive the desired score numbers from summarized COCOeval. + + Args: + coco_eval (None or COCOEval): None represents no predictions from model. 
+ iou_type (str): + class_names (None or list[str]): if provided, will use it to predict + per-category AP. + + Returns: + a dict of {metric name: score} + """ + + metrics = { + "segm": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], + }[iou_type] + + if coco_eval is None: + self._logger.warn("No predictions from the model!") + return {metric: float("nan") for metric in metrics} + + # the standard metrics + results = { + metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") + for idx, metric in enumerate(metrics) + } + self._logger.info( + "Evaluation results for {}: \n".format(iou_type) + create_small_table(results) + ) + if not np.isfinite(sum(results.values())): + self._logger.info("Some metrics cannot be computed and is shown as NaN.") + + if class_names is None or len(class_names) <= 1: + return results + # Compute per-category AP + # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa + precisions = coco_eval.eval["precision"] + # precision has dims (iou, recall, cls, area range, max dets) + assert len(class_names) == precisions.shape[2] + + results_per_category = [] + for idx, name in enumerate(class_names): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float("nan") + results_per_category.append(("{}".format(name), float(ap * 100))) + + # tabulate it + N_COLS = min(6, len(results_per_category) * 2) + results_flatten = list(itertools.chain(*results_per_category)) + results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) + table = tabulate( + results_2d, + tablefmt="pipe", + floatfmt=".3f", + headers=["category", "AP"] * (N_COLS // 2), + numalign="left", + ) + self._logger.info("Per-category {} AP: \n".format(iou_type) + table) + + results.update({"AP-" + name: ap for name, ap in results_per_category}) + return results + + +def instances_to_coco_json(instances, img_id): + """ + Dump an "Instances" object to a COCO-format json that's used for evaluation. + + Args: + instances (Instances): + img_id (int): the image id + + Returns: + list[dict]: list of json annotations in COCO format. + """ + num_instance = len(instances) + if num_instance == 0: + return [] + + scores = instances.scores.tolist() + classes = instances.pred_classes.tolist() + + has_mask = instances.has("pred_masks") + if has_mask: + # use RLE to encode the masks, because they are too large and takes memory + # since this evaluator stores outputs of the entire dataset + rles = [ + mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] + for mask in instances.pred_masks + ] + for rle in rles: + # "counts" is an array encoded by mask_util as a byte-stream. Python3's + # json writer which always produces strings cannot serialize a bytestream + # unless you decode it. Thankfully, utf-8 works out (which is also what + # the custom_pycocotools/_mask.pyx does). 
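+            # Decoding in place keeps each RLE dict json-serializable; the mask itself is unchanged.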
+ rle["counts"] = rle["counts"].decode("utf-8") + + has_keypoints = instances.has("pred_keypoints") + if has_keypoints: + keypoints = instances.pred_keypoints + + results = [] + for k in range(num_instance): + result = { + "image_id": img_id, + "category_id": classes[k], + "score": scores[k], + } + if has_mask: + result["segmentation"] = rles[k] + if has_keypoints: + # In COCO annotations, + # keypoints coordinates are pixel indices. + # However our predictions are floating point coordinates. + # Therefore we subtract 0.5 to be consistent with the annotation format. + # This is the inverse of data loading logic in `datasets/coco.py`. + keypoints[k][:, :2] -= 0.5 + result["keypoints"] = keypoints[k].flatten().tolist() + results.append(result) + return results + +def _evaluate_predictions_on_coco( + coco_gt, + coco_results, + iou_type, + kpt_oks_sigmas=None, + use_fast_impl=True, + img_ids=None, + max_dets_per_image=None, +): + """ + Evaluate the coco results using COCOEval API. + """ + assert len(coco_results) > 0 + + if iou_type == "segm": + coco_results = copy.deepcopy(coco_results) + # When evaluating mask AP, if the results contain bbox, cocoapi will + # use the box area as the area of the instance, instead of the mask area. + # This leads to a different definition of small/medium/large. + # We remove the bbox field to let mask AP use mask area. + for c in coco_results: + c.pop("bbox", None) + + coco_dt = coco_gt.loadRes(coco_results) + coco_eval = (COCOeval_opt if use_fast_impl else COCOeval)(coco_gt, coco_dt, iou_type) + # For COCO, the default max_dets_per_image is [1, 10, 100]. + if max_dets_per_image is None: + max_dets_per_image = [1, 10, 100] # Default from COCOEval + else: + assert ( + len(max_dets_per_image) >= 3 + ), "COCOeval requires maxDets (and max_dets_per_image) to have length at least 3" + # In the case that user supplies a custom input for max_dets_per_image, + # apply COCOevalMaxDets to evaluate AP with the custom input. + if max_dets_per_image[2] != 100: + coco_eval = COCOevalMaxDets(coco_gt, coco_dt, iou_type) + if iou_type != "keypoints": + coco_eval.params.maxDets = max_dets_per_image + + if img_ids is not None: + coco_eval.params.imgIds = img_ids + + if iou_type == "keypoints": + # Use the COCO default keypoint OKS sigmas unless overrides are specified + if kpt_oks_sigmas: + assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "custom_pycocotools is too old!" + coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas) + # COCOAPI requires every detection and every gt to have keypoints, so + # we just take the first entry from both + num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3 + num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3 + num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas) + assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, ( + f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. " + f"Ground truth contains {num_keypoints_gt} keypoints. " + f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. " + "They have to agree with each other. For meaning of OKS, please refer to " + "http://cocodataset.org/#keypoints-eval." 
+ ) + + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + return coco_eval + + +class COCOevalMaxDets(COCOeval): + """ + Modified version of COCOeval for evaluating AP with a custom + maxDets (by default for COCO, maxDets is 100) + """ + + def summarize(self): + """ + Compute and display summary metrics for evaluation results given + a custom value for max_dets_per_image + """ + + def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): + p = self.params + iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}" + titleStr = "Average Precision" if ap == 1 else "Average Recall" + typeStr = "(AP)" if ap == 1 else "(AR)" + iouStr = ( + "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1]) + if iouThr is None + else "{:0.2f}".format(iouThr) + ) + + aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + if ap == 1: + # dimension of precision: [TxRxKxAxM] + s = self.eval["precision"] + # IoU + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, :, aind, mind] + else: + # dimension of recall: [TxKxAxM] + s = self.eval["recall"] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, aind, mind] + if len(s[s > -1]) == 0: + mean_s = -1 + else: + mean_s = np.mean(s[s > -1]) + print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) + return mean_s + + def _summarizeDets(): + stats = np.zeros((12,)) + # Evaluate AP using the custom limit on maximum detections per image + stats[0] = _summarize(1, maxDets=self.params.maxDets[2]) + stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2]) + stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2]) + stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2]) + stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2]) + stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2]) + stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) + stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) + stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) + stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2]) + stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2]) + stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2]) + return stats + + def _summarizeKps(): + stats = np.zeros((10,)) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=0.5) + stats[2] = _summarize(1, maxDets=20, iouThr=0.75) + stats[3] = _summarize(1, maxDets=20, areaRng="medium") + stats[4] = _summarize(1, maxDets=20, areaRng="large") + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=0.5) + stats[7] = _summarize(0, maxDets=20, iouThr=0.75) + stats[8] = _summarize(0, maxDets=20, areaRng="medium") + stats[9] = _summarize(0, maxDets=20, areaRng="large") + return stats + + if not self.eval: + raise Exception("Please run accumulate() first") + iouType = self.params.iouType + if iouType == "segm": + summarize = _summarizeDets + elif iouType == "keypoints": + summarize = _summarizeKps + self.stats = summarize() + + def __str__(self): + self.summarize() \ No newline at end of file diff --git a/custom_oneformer/evaluation/detection_coco_evaluator.py b/custom_oneformer/evaluation/detection_coco_evaluator.py new file mode 100644 index 
0000000000000000000000000000000000000000..9eb543344e2aafb0b663b32fa66e80a15f537c37 --- /dev/null +++ b/custom_oneformer/evaluation/detection_coco_evaluator.py @@ -0,0 +1,723 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/coco_evaluation.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import contextlib +import copy +import io +import itertools +import json +import logging +import numpy as np +import os +import pickle +from collections import OrderedDict +import custom_pycocotools.mask as mask_util +import torch +from custom_pycocotools.coco import COCO +from custom_pycocotools.cocoeval import COCOeval +from tabulate import tabulate + +import custom_detectron2.utils.comm as comm +from custom_detectron2.config import CfgNode +from custom_detectron2.data import MetadataCatalog +from custom_detectron2.data.datasets.coco import convert_to_coco_json +from custom_detectron2.structures import Boxes, BoxMode, pairwise_iou +from custom_detectron2.utils.file_io import PathManager +from custom_detectron2.utils.logger import create_small_table + +from .evaluator import DatasetEvaluator + +try: + from custom_detectron2.evaluation.fast_eval_api import COCOeval_opt +except ImportError: + COCOeval_opt = COCOeval + + +class DetectionCOCOEvaluator(DatasetEvaluator): + """ + Evaluate AR for object proposals, AP for instance detection/segmentation, AP + for keypoint detection outputs using COCO's metrics. + See http://cocodataset.org/#detection-eval and + http://cocodataset.org/#keypoints-eval to understand its metrics. + The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means + the metric cannot be computed (e.g. due to no predictions made). + + In addition to COCO, this evaluator is able to support any bounding box detection, + instance segmentation, or keypoint detection dataset. + """ + + def __init__( + self, + dataset_name, + tasks=None, + distributed=True, + output_dir=None, + *, + max_dets_per_image=None, + use_fast_impl=True, + kpt_oks_sigmas=(), + allow_cached_coco=True, + ): + """ + Args: + dataset_name (str): name of the dataset to be evaluated. + It must have either the following corresponding metadata: + + "json_file": the path to the COCO format annotation + + Or it must be in detectron2's standard dataset format + so it can be converted to COCO format automatically. + tasks (tuple[str]): tasks that can be evaluated under the given + configuration. A task is one of "bbox", "segm", "keypoints". + By default, will infer this automatically from predictions. + distributed (True): if True, will collect results from all ranks and run evaluation + in the main process. + Otherwise, will only evaluate the results in the current process. + output_dir (str): optional, an output directory to dump all + results predicted on the dataset. The dump contains two files: + + 1. "instances_predictions.pth" a file that can be loaded with `torch.load` and + contains all the results in the format they are produced by the model. + 2. "coco_instances_results.json" a json file in COCO's result format. + max_dets_per_image (int): limit on the maximum number of detections per image. 
+ By default in COCO, this limit is to 100, but this can be customized + to be greater, as is needed in evaluation metrics AP fixed and AP pool + (see https://arxiv.org/pdf/2102.01066.pdf) + This doesn't affect keypoint evaluation. + use_fast_impl (bool): use a fast but **unofficial** implementation to compute AP. + Although the results should be very close to the official implementation in COCO + API, it is still recommended to compute results with the official API for use in + papers. The faster implementation also uses more RAM. + kpt_oks_sigmas (list[float]): The sigmas used to calculate keypoint OKS. + See http://cocodataset.org/#keypoints-eval + When empty, it will use the defaults in COCO. + Otherwise it should be the same length as ROI_KEYPOINT_HEAD.NUM_KEYPOINTS. + allow_cached_coco (bool): Whether to use cached coco json from previous validation + runs. You should set this to False if you need to use different validation data. + Defaults to True. + """ + self._logger = logging.getLogger(__name__) + self._distributed = distributed + self._output_dir = output_dir + + if use_fast_impl and (COCOeval_opt is COCOeval): + self._logger.info("Fast COCO eval is not built. Falling back to official COCO eval.") + use_fast_impl = False + self._use_fast_impl = use_fast_impl + + # COCOeval requires the limit on the number of detections per image (maxDets) to be a list + # with at least 3 elements. The default maxDets in COCOeval is [1, 10, 100], in which the + # 3rd element (100) is used as the limit on the number of detections per image when + # evaluating AP. COCOEvaluator expects an integer for max_dets_per_image, so for COCOeval, + # we reformat max_dets_per_image into [1, 10, max_dets_per_image], based on the defaults. + if max_dets_per_image is None: + max_dets_per_image = [1, 10, 100] + else: + max_dets_per_image = [1, 10, max_dets_per_image] + self._max_dets_per_image = max_dets_per_image + + if tasks is not None and isinstance(tasks, CfgNode): + kpt_oks_sigmas = ( + tasks.TEST.KEYPOINT_OKS_SIGMAS if not kpt_oks_sigmas else kpt_oks_sigmas + ) + self._logger.warn( + "COCO Evaluator instantiated using config, this is deprecated behavior." + " Please pass in explicit arguments instead." + ) + self._tasks = None # Infering it from predictions should be better + else: + self._tasks = tasks + + self._cpu_device = torch.device("cpu") + + self._metadata = MetadataCatalog.get(dataset_name) + if not hasattr(self._metadata, "json_file"): + if output_dir is None: + raise ValueError( + "output_dir must be provided to COCOEvaluator " + "for datasets not in COCO format." + ) + self._logger.info(f"Trying to convert '{dataset_name}' to COCO format ...") + + cache_path = os.path.join(output_dir, f"{dataset_name}_coco_format.json") + self._metadata.json_file = cache_path + convert_to_coco_json(dataset_name, cache_path, allow_cached=allow_cached_coco) + + json_file = PathManager.get_local_path(self._metadata.json_file) + with contextlib.redirect_stdout(io.StringIO()): + self._coco_api = COCO(json_file) + + # Test set json files do not contain annotations (evaluation must be + # performed using the COCO evaluation server). + self._do_evaluation = "annotations" in self._coco_api.dataset + if self._do_evaluation: + self._kpt_oks_sigmas = kpt_oks_sigmas + + def reset(self): + self._predictions = [] + + def process(self, inputs, outputs): + """ + Args: + inputs: the inputs to a COCO model (e.g., GeneralizedRCNN). + It is a list of dict. 
Each dict corresponds to an image and + contains keys like "height", "width", "file_name", "image_id". + outputs: the outputs of a COCO model. It is a list of dicts with key + "box_instances" that contains :class:`Instances`. + """ + for input, output in zip(inputs, outputs): + prediction = {"image_id": input["image_id"]} + + if "box_instances" in output: + instances = output["box_instances"].to(self._cpu_device) + prediction["box_instances"] = instances_to_coco_json(instances, input["image_id"]) + if "proposals" in output: + prediction["proposals"] = output["proposals"].to(self._cpu_device) + if len(prediction) > 1: + self._predictions.append(prediction) + + def evaluate(self, img_ids=None): + """ + Args: + img_ids: a list of image IDs to evaluate on. Default to None for the whole dataset + """ + if self._distributed: + comm.synchronize() + predictions = comm.gather(self._predictions, dst=0) + predictions = list(itertools.chain(*predictions)) + + if not comm.is_main_process(): + return {} + else: + predictions = self._predictions + + if len(predictions) == 0: + self._logger.warning("[COCOEvaluator] Did not receive valid predictions.") + return {} + + if self._output_dir: + PathManager.mkdirs(self._output_dir) + file_path = os.path.join(self._output_dir, "instances_predictions.pth") + with PathManager.open(file_path, "wb") as f: + torch.save(predictions, f) + + self._results = OrderedDict() + if "proposals" in predictions[0]: + self._eval_box_proposals(predictions) + if "box_instances" in predictions[0]: + self._eval_predictions(predictions, img_ids=img_ids) + # Copy so the caller can do whatever with results + return copy.deepcopy(self._results) + + def _tasks_from_predictions(self, predictions): + """ + Get COCO API "tasks" (i.e. iou_type) from COCO-format predictions. + """ + tasks = {"bbox"} + for pred in predictions: + if "keypoints" in pred: + tasks.add("keypoints") + return sorted(tasks) + + def _eval_predictions(self, predictions, img_ids=None): + """ + Evaluate predictions. Fill self._results with the metrics of the tasks. + """ + self._logger.info("Preparing results for COCO format ...") + coco_results = list(itertools.chain(*[x["box_instances"] for x in predictions])) + tasks = self._tasks or self._tasks_from_predictions(coco_results) + + # unmap the category ids for COCO + if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): + dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id + all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) + num_classes = len(all_contiguous_ids) + assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 + + reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} + for result in coco_results: + category_id = result["category_id"] + assert category_id < num_classes, ( + f"A prediction has class={category_id}, " + f"but the dataset only has {num_classes} classes and " + f"predicted class id should be in [0, {num_classes - 1}]." 
+ ) + result["category_id"] = reverse_id_mapping[category_id] + + if self._output_dir: + file_path = os.path.join(self._output_dir, "coco_instances_results.json") + self._logger.info("Saving results to {}".format(file_path)) + with PathManager.open(file_path, "w") as f: + f.write(json.dumps(coco_results)) + f.flush() + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info( + "Evaluating predictions with {} COCO API...".format( + "unofficial" if self._use_fast_impl else "official" + ) + ) + for task in sorted(tasks): + assert task in {"bbox", "keypoints"}, f"Got unknown task: {task}!" + coco_eval = ( + _evaluate_predictions_on_coco( + self._coco_api, + coco_results, + task, + kpt_oks_sigmas=self._kpt_oks_sigmas, + use_fast_impl=self._use_fast_impl, + img_ids=img_ids, + max_dets_per_image=self._max_dets_per_image, + ) + if len(coco_results) > 0 + else None # cocoapi does not handle empty results very well + ) + + res = self._derive_coco_results( + coco_eval, task, class_names=self._metadata.get("thing_classes") + ) + self._results[task] = res + + def _eval_box_proposals(self, predictions): + """ + Evaluate the box proposals in predictions. + Fill self._results with the metrics for "box_proposals" task. + """ + if self._output_dir: + # Saving generated box proposals to file. + # Predicted box_proposals are in XYXY_ABS mode. + bbox_mode = BoxMode.XYXY_ABS.value + ids, boxes, objectness_logits = [], [], [] + for prediction in predictions: + ids.append(prediction["image_id"]) + boxes.append(prediction["proposals"].proposal_boxes.tensor.numpy()) + objectness_logits.append(prediction["proposals"].objectness_logits.numpy()) + + proposal_data = { + "boxes": boxes, + "objectness_logits": objectness_logits, + "ids": ids, + "bbox_mode": bbox_mode, + } + with PathManager.open(os.path.join(self._output_dir, "box_proposals.pkl"), "wb") as f: + pickle.dump(proposal_data, f) + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info("Evaluating bbox proposals ...") + res = {} + areas = {"all": "", "small": "s", "medium": "m", "large": "l"} + for limit in [100, 1000]: + for area, suffix in areas.items(): + stats = _evaluate_box_proposals(predictions, self._coco_api, area=area, limit=limit) + key = "AR{}@{:d}".format(suffix, limit) + res[key] = float(stats["ar"].item() * 100) + self._logger.info("Proposal metrics: \n" + create_small_table(res)) + self._results["box_proposals"] = res + + def _derive_coco_results(self, coco_eval, iou_type, class_names=None): + """ + Derive the desired score numbers from summarized COCOeval. + + Args: + coco_eval (None or COCOEval): None represents no predictions from model. + iou_type (str): + class_names (None or list[str]): if provided, will use it to predict + per-category AP. 
+ + Returns: + a dict of {metric name: score} + """ + + metrics = { + "bbox": ["AP", "AP50", "AP75", "APs", "APm", "APl"], + "keypoints": ["AP", "AP50", "AP75", "APm", "APl"], + }[iou_type] + + if coco_eval is None: + self._logger.warn("No predictions from the model!") + return {metric: float("nan") for metric in metrics} + + # the standard metrics + results = { + metric: float(coco_eval.stats[idx] * 100 if coco_eval.stats[idx] >= 0 else "nan") + for idx, metric in enumerate(metrics) + } + self._logger.info( + "Evaluation results for {}: \n".format(iou_type) + create_small_table(results) + ) + if not np.isfinite(sum(results.values())): + self._logger.info("Some metrics cannot be computed and is shown as NaN.") + + if class_names is None or len(class_names) <= 1: + return results + # Compute per-category AP + # from https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L222-L252 # noqa + precisions = coco_eval.eval["precision"] + # precision has dims (iou, recall, cls, area range, max dets) + assert len(class_names) == precisions.shape[2] + + results_per_category = [] + for idx, name in enumerate(class_names): + # area range index 0: all area ranges + # max dets index -1: typically 100 per image + precision = precisions[:, :, idx, 0, -1] + precision = precision[precision > -1] + ap = np.mean(precision) if precision.size else float("nan") + results_per_category.append(("{}".format(name), float(ap * 100))) + + # tabulate it + N_COLS = min(6, len(results_per_category) * 2) + results_flatten = list(itertools.chain(*results_per_category)) + results_2d = itertools.zip_longest(*[results_flatten[i::N_COLS] for i in range(N_COLS)]) + table = tabulate( + results_2d, + tablefmt="pipe", + floatfmt=".3f", + headers=["category", "AP"] * (N_COLS // 2), + numalign="left", + ) + self._logger.info("Per-category {} AP: \n".format(iou_type) + table) + + results.update({"AP-" + name: ap for name, ap in results_per_category}) + return results + + +def instances_to_coco_json(instances, img_id): + """ + Dump an "Instances" object to a COCO-format json that's used for evaluation. + + Args: + instances (Instances): + img_id (int): the image id + + Returns: + list[dict]: list of json annotations in COCO format. + """ + num_instance = len(instances) + if num_instance == 0: + return [] + + boxes = instances.pred_boxes.tensor.numpy() + boxes = BoxMode.convert(boxes, BoxMode.XYXY_ABS, BoxMode.XYWH_ABS) + boxes = boxes.tolist() + scores = instances.scores.tolist() + classes = instances.pred_classes.tolist() + + has_mask = instances.has("pred_masks") + if has_mask: + # use RLE to encode the masks, because they are too large and takes memory + # since this evaluator stores outputs of the entire dataset + rles = [ + mask_util.encode(np.array(mask[:, :, None], order="F", dtype="uint8"))[0] + for mask in instances.pred_masks + ] + for rle in rles: + # "counts" is an array encoded by mask_util as a byte-stream. Python3's + # json writer which always produces strings cannot serialize a bytestream + # unless you decode it. Thankfully, utf-8 works out (which is also what + # the custom_pycocotools/_mask.pyx does). 
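+            # After this decode, "counts" is a plain str, so json.dumps on the results succeeds.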
+ rle["counts"] = rle["counts"].decode("utf-8") + + has_keypoints = instances.has("pred_keypoints") + if has_keypoints: + keypoints = instances.pred_keypoints + + results = [] + for k in range(num_instance): + result = { + "image_id": img_id, + "category_id": classes[k], + "bbox": boxes[k], + "score": scores[k], + } + if has_mask: + result["segmentation"] = rles[k] + if has_keypoints: + # In COCO annotations, + # keypoints coordinates are pixel indices. + # However our predictions are floating point coordinates. + # Therefore we subtract 0.5 to be consistent with the annotation format. + # This is the inverse of data loading logic in `datasets/coco.py`. + keypoints[k][:, :2] -= 0.5 + result["keypoints"] = keypoints[k].flatten().tolist() + results.append(result) + return results + + +# inspired from Detectron: +# https://github.com/facebookresearch/Detectron/blob/a6a835f5b8208c45d0dce217ce9bbda915f44df7/detectron/datasets/json_dataset_evaluator.py#L255 # noqa +def _evaluate_box_proposals(dataset_predictions, coco_api, thresholds=None, area="all", limit=None): + """ + Evaluate detection proposal recall metrics. This function is a much + faster alternative to the official COCO API recall evaluation code. However, + it produces slightly different results. + """ + # Record max overlap value for each gt box + # Return vector of overlap values + areas = { + "all": 0, + "small": 1, + "medium": 2, + "large": 3, + "96-128": 4, + "128-256": 5, + "256-512": 6, + "512-inf": 7, + } + area_ranges = [ + [0**2, 1e5**2], # all + [0**2, 32**2], # small + [32**2, 96**2], # medium + [96**2, 1e5**2], # large + [96**2, 128**2], # 96-128 + [128**2, 256**2], # 128-256 + [256**2, 512**2], # 256-512 + [512**2, 1e5**2], + ] # 512-inf + assert area in areas, "Unknown area range: {}".format(area) + area_range = area_ranges[areas[area]] + gt_overlaps = [] + num_pos = 0 + + for prediction_dict in dataset_predictions: + predictions = prediction_dict["proposals"] + + # sort predictions in descending order + # TODO maybe remove this and make it explicit in the documentation + inds = predictions.objectness_logits.sort(descending=True)[1] + predictions = predictions[inds] + + ann_ids = coco_api.getAnnIds(imgIds=prediction_dict["image_id"]) + anno = coco_api.loadAnns(ann_ids) + gt_boxes = [ + BoxMode.convert(obj["bbox"], BoxMode.XYWH_ABS, BoxMode.XYXY_ABS) + for obj in anno + if obj["iscrowd"] == 0 + ] + gt_boxes = torch.as_tensor(gt_boxes).reshape(-1, 4) # guard against no boxes + gt_boxes = Boxes(gt_boxes) + gt_areas = torch.as_tensor([obj["area"] for obj in anno if obj["iscrowd"] == 0]) + + if len(gt_boxes) == 0 or len(predictions) == 0: + continue + + valid_gt_inds = (gt_areas >= area_range[0]) & (gt_areas <= area_range[1]) + gt_boxes = gt_boxes[valid_gt_inds] + + num_pos += len(gt_boxes) + + if len(gt_boxes) == 0: + continue + + if limit is not None and len(predictions) > limit: + predictions = predictions[:limit] + + overlaps = pairwise_iou(predictions.proposal_boxes, gt_boxes) + + _gt_overlaps = torch.zeros(len(gt_boxes)) + for j in range(min(len(predictions), len(gt_boxes))): + # find which proposal box maximally covers each gt box + # and get the iou amount of coverage for each gt box + max_overlaps, argmax_overlaps = overlaps.max(dim=0) + + # find which gt box is 'best' covered (i.e. 
'best' = most iou) + gt_ovr, gt_ind = max_overlaps.max(dim=0) + assert gt_ovr >= 0 + # find the proposal box that covers the best covered gt box + box_ind = argmax_overlaps[gt_ind] + # record the iou coverage of this gt box + _gt_overlaps[j] = overlaps[box_ind, gt_ind] + assert _gt_overlaps[j] == gt_ovr + # mark the proposal box and the gt box as used + overlaps[box_ind, :] = -1 + overlaps[:, gt_ind] = -1 + + # append recorded iou coverage level + gt_overlaps.append(_gt_overlaps) + gt_overlaps = ( + torch.cat(gt_overlaps, dim=0) if len(gt_overlaps) else torch.zeros(0, dtype=torch.float32) + ) + gt_overlaps, _ = torch.sort(gt_overlaps) + + if thresholds is None: + step = 0.05 + thresholds = torch.arange(0.5, 0.95 + 1e-5, step, dtype=torch.float32) + recalls = torch.zeros_like(thresholds) + # compute recall for each iou threshold + for i, t in enumerate(thresholds): + recalls[i] = (gt_overlaps >= t).float().sum() / float(num_pos) + # ar = 2 * np.trapz(recalls, thresholds) + ar = recalls.mean() + return { + "ar": ar, + "recalls": recalls, + "thresholds": thresholds, + "gt_overlaps": gt_overlaps, + "num_pos": num_pos, + } + + +def _evaluate_predictions_on_coco( + coco_gt, + coco_results, + iou_type, + kpt_oks_sigmas=None, + use_fast_impl=True, + img_ids=None, + max_dets_per_image=None, +): + """ + Evaluate the coco results using COCOEval API. + """ + assert len(coco_results) > 0 + + if iou_type == "segm": + coco_results = copy.deepcopy(coco_results) + # When evaluating mask AP, if the results contain bbox, cocoapi will + # use the box area as the area of the instance, instead of the mask area. + # This leads to a different definition of small/medium/large. + # We remove the bbox field to let mask AP use mask area. + for c in coco_results: + c.pop("bbox", None) + + coco_dt = coco_gt.loadRes(coco_results) + coco_eval = (COCOeval_opt if use_fast_impl else COCOeval)(coco_gt, coco_dt, iou_type) + # For COCO, the default max_dets_per_image is [1, 10, 100]. + if max_dets_per_image is None: + max_dets_per_image = [1, 10, 100] # Default from COCOEval + else: + assert ( + len(max_dets_per_image) >= 3 + ), "COCOeval requires maxDets (and max_dets_per_image) to have length at least 3" + # In the case that user supplies a custom input for max_dets_per_image, + # apply COCOevalMaxDets to evaluate AP with the custom input. + if max_dets_per_image[2] != 100: + coco_eval = COCOevalMaxDets(coco_gt, coco_dt, iou_type) + if iou_type != "keypoints": + coco_eval.params.maxDets = max_dets_per_image + + if img_ids is not None: + coco_eval.params.imgIds = img_ids + + if iou_type == "keypoints": + # Use the COCO default keypoint OKS sigmas unless overrides are specified + if kpt_oks_sigmas: + assert hasattr(coco_eval.params, "kpt_oks_sigmas"), "custom_pycocotools is too old!" + coco_eval.params.kpt_oks_sigmas = np.array(kpt_oks_sigmas) + # COCOAPI requires every detection and every gt to have keypoints, so + # we just take the first entry from both + num_keypoints_dt = len(coco_results[0]["keypoints"]) // 3 + num_keypoints_gt = len(next(iter(coco_gt.anns.values()))["keypoints"]) // 3 + num_keypoints_oks = len(coco_eval.params.kpt_oks_sigmas) + assert num_keypoints_oks == num_keypoints_dt == num_keypoints_gt, ( + f"[COCOEvaluator] Prediction contain {num_keypoints_dt} keypoints. " + f"Ground truth contains {num_keypoints_gt} keypoints. " + f"The length of cfg.TEST.KEYPOINT_OKS_SIGMAS is {num_keypoints_oks}. " + "They have to agree with each other. 
For meaning of OKS, please refer to " + "http://cocodataset.org/#keypoints-eval." + ) + + coco_eval.evaluate() + coco_eval.accumulate() + coco_eval.summarize() + + return coco_eval + + +class COCOevalMaxDets(COCOeval): + """ + Modified version of COCOeval for evaluating AP with a custom + maxDets (by default for COCO, maxDets is 100) + """ + + def summarize(self): + """ + Compute and display summary metrics for evaluation results given + a custom value for max_dets_per_image + """ + + def _summarize(ap=1, iouThr=None, areaRng="all", maxDets=100): + p = self.params + iStr = " {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}" + titleStr = "Average Precision" if ap == 1 else "Average Recall" + typeStr = "(AP)" if ap == 1 else "(AR)" + iouStr = ( + "{:0.2f}:{:0.2f}".format(p.iouThrs[0], p.iouThrs[-1]) + if iouThr is None + else "{:0.2f}".format(iouThr) + ) + + aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + if ap == 1: + # dimension of precision: [TxRxKxAxM] + s = self.eval["precision"] + # IoU + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, :, aind, mind] + else: + # dimension of recall: [TxKxAxM] + s = self.eval["recall"] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:, :, aind, mind] + if len(s[s > -1]) == 0: + mean_s = -1 + else: + mean_s = np.mean(s[s > -1]) + print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) + return mean_s + + def _summarizeDets(): + stats = np.zeros((12,)) + # Evaluate AP using the custom limit on maximum detections per image + stats[0] = _summarize(1, maxDets=self.params.maxDets[2]) + stats[1] = _summarize(1, iouThr=0.5, maxDets=self.params.maxDets[2]) + stats[2] = _summarize(1, iouThr=0.75, maxDets=self.params.maxDets[2]) + stats[3] = _summarize(1, areaRng="small", maxDets=self.params.maxDets[2]) + stats[4] = _summarize(1, areaRng="medium", maxDets=self.params.maxDets[2]) + stats[5] = _summarize(1, areaRng="large", maxDets=self.params.maxDets[2]) + stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) + stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) + stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) + stats[9] = _summarize(0, areaRng="small", maxDets=self.params.maxDets[2]) + stats[10] = _summarize(0, areaRng="medium", maxDets=self.params.maxDets[2]) + stats[11] = _summarize(0, areaRng="large", maxDets=self.params.maxDets[2]) + return stats + + def _summarizeKps(): + stats = np.zeros((10,)) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=0.5) + stats[2] = _summarize(1, maxDets=20, iouThr=0.75) + stats[3] = _summarize(1, maxDets=20, areaRng="medium") + stats[4] = _summarize(1, maxDets=20, areaRng="large") + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=0.5) + stats[7] = _summarize(0, maxDets=20, iouThr=0.75) + stats[8] = _summarize(0, maxDets=20, areaRng="medium") + stats[9] = _summarize(0, maxDets=20, areaRng="large") + return stats + + if not self.eval: + raise Exception("Please run accumulate() first") + iouType = self.params.iouType + if iouType == "segm" or iouType == "bbox": + summarize = _summarizeDets + elif iouType == "keypoints": + summarize = _summarizeKps + self.stats = summarize() + + def __str__(self): + self.summarize() \ No newline at end of file diff --git a/custom_oneformer/evaluation/evaluator.py b/custom_oneformer/evaluation/evaluator.py new file mode 
100644 index 0000000000000000000000000000000000000000..a15a1d365349c13e400840e3c4a0753646459fff --- /dev/null +++ b/custom_oneformer/evaluation/evaluator.py @@ -0,0 +1,228 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/evaluator.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import datetime +import logging +import time +from collections import OrderedDict, abc +from contextlib import ExitStack, contextmanager +from typing import List, Union +import torch +from torch import nn + +from custom_detectron2.utils.comm import get_world_size, is_main_process +from custom_detectron2.utils.logger import log_every_n_seconds + + +class DatasetEvaluator: + """ + Base class for a dataset evaluator. + + The function :func:`inference_on_dataset` runs the model over + all samples in the dataset, and have a DatasetEvaluator to process the inputs/outputs. + + This class will accumulate information of the inputs/outputs (by :meth:`process`), + and produce evaluation results in the end (by :meth:`evaluate`). + """ + + def reset(self): + """ + Preparation for a new round of evaluation. + Should be called before starting a round of evaluation. + """ + pass + + def process(self, inputs, outputs): + """ + Process the pair of inputs and outputs. + If they contain batches, the pairs can be consumed one-by-one using `zip`: + + .. code-block:: python + + for input_, output in zip(inputs, outputs): + # do evaluation on single input/output pair + ... + + Args: + inputs (list): the inputs that's used to call the model. + outputs (list): the return value of `model(inputs)` + """ + pass + + def evaluate(self): + """ + Evaluate/summarize the performance, after processing all input/output pairs. + + Returns: + dict: + A new evaluator class can return a dict of arbitrary format + as long as the user can process the results. + In our train_net.py, we expect the following format: + + * key: the name of the task (e.g., bbox) + * value: a dict of {metric name: score}, e.g.: {"AP50": 80} + """ + pass + + +class DatasetEvaluators(DatasetEvaluator): + """ + Wrapper class to combine multiple :class:`DatasetEvaluator` instances. + + This class dispatches every evaluation call to + all of its :class:`DatasetEvaluator`. + """ + + def __init__(self, evaluators): + """ + Args: + evaluators (list): the evaluators to combine. + """ + super().__init__() + self._evaluators = evaluators + + def reset(self): + for evaluator in self._evaluators: + evaluator.reset() + + def process(self, inputs, outputs): + for evaluator in self._evaluators: + evaluator.process(inputs, outputs) + + def evaluate(self): + results = OrderedDict() + for evaluator in self._evaluators: + result = evaluator.evaluate() + if is_main_process() and result is not None: + for k, v in result.items(): + assert ( + k not in results + ), "Different evaluators produce results with the same key {}".format(k) + results[k] = v + return results + + +def inference_on_dataset( + model, data_loader, evaluator: Union[DatasetEvaluator, List[DatasetEvaluator], None] +): + """ + Run model on the data_loader and evaluate the metrics with evaluator. + Also benchmark the inference speed of `model.__call__` accurately. + The model will be used in eval mode. + + Args: + model (callable): a callable which takes an object from + `data_loader` and returns some outputs. 
+ + If it's an nn.Module, it will be temporarily set to `eval` mode. + If you wish to evaluate a model in `training` mode instead, you can + wrap the given model and override its behavior of `.eval()` and `.train()`. + data_loader: an iterable object with a length. + The elements it generates will be the inputs to the model. + evaluator: the evaluator(s) to run. Use `None` if you only want to benchmark, + but don't want to do any evaluation. + + Returns: + The return value of `evaluator.evaluate()` + """ + num_devices = get_world_size() + logger = logging.getLogger(__name__) + logger.info("Start inference on {} batches".format(len(data_loader))) + + total = len(data_loader) # inference data loader must have a fixed length + if evaluator is None: + # create a no-op evaluator + evaluator = DatasetEvaluators([]) + if isinstance(evaluator, abc.MutableSequence): + evaluator = DatasetEvaluators(evaluator) + evaluator.reset() + + num_warmup = min(5, total - 1) + start_time = time.perf_counter() + total_data_time = 0 + total_compute_time = 0 + total_eval_time = 0 + with ExitStack() as stack: + if isinstance(model, nn.Module): + stack.enter_context(inference_context(model)) + stack.enter_context(torch.no_grad()) + + start_data_time = time.perf_counter() + for idx, inputs in enumerate(data_loader): + total_data_time += time.perf_counter() - start_data_time + if idx == num_warmup: + start_time = time.perf_counter() + total_data_time = 0 + total_compute_time = 0 + total_eval_time = 0 + + start_compute_time = time.perf_counter() + outputs = model(inputs) + if torch.cuda.is_available(): + torch.cuda.synchronize() + total_compute_time += time.perf_counter() - start_compute_time + + start_eval_time = time.perf_counter() + evaluator.process(inputs, outputs) + total_eval_time += time.perf_counter() - start_eval_time + + iters_after_start = idx + 1 - num_warmup * int(idx >= num_warmup) + data_seconds_per_iter = total_data_time / iters_after_start + compute_seconds_per_iter = total_compute_time / iters_after_start + eval_seconds_per_iter = total_eval_time / iters_after_start + total_seconds_per_iter = (time.perf_counter() - start_time) / iters_after_start + if idx >= num_warmup * 2 or compute_seconds_per_iter > 5: + eta = datetime.timedelta(seconds=int(total_seconds_per_iter * (total - idx - 1))) + log_every_n_seconds( + logging.INFO, + ( + f"Inference done {idx + 1}/{total}. " + f"Dataloading: {data_seconds_per_iter:.4f} s/iter. " + f"Inference: {compute_seconds_per_iter:.4f} s/iter. " + f"Eval: {eval_seconds_per_iter:.4f} s/iter. " + f"Total: {total_seconds_per_iter:.4f} s/iter. " + f"ETA={eta}" + ), + n=5, + ) + start_data_time = time.perf_counter() + + # Measure the time only for this worker (before the synchronization barrier) + total_time = time.perf_counter() - start_time + total_time_str = str(datetime.timedelta(seconds=total_time)) + # NOTE this format is parsed by grep + logger.info( + "Total inference time: {} ({:.6f} s / iter per device, on {} devices)".format( + total_time_str, total_time / (total - num_warmup), num_devices + ) + ) + total_compute_time_str = str(datetime.timedelta(seconds=int(total_compute_time))) + logger.info( + "Total inference pure compute time: {} ({:.6f} s / iter per device, on {} devices)".format( + total_compute_time_str, total_compute_time / (total - num_warmup), num_devices + ) + ) + + results = evaluator.evaluate() + # An evaluator may return None when not in main process. 
+ # Replace it by an empty dict instead to make it easier for downstream code to handle + if results is None: + results = {} + return results + + +@contextmanager +def inference_context(model): + """ + A context where the model is temporarily changed to eval mode, + and restored to previous mode afterwards. + + Args: + model: a torch Module + """ + training_mode = model.training + model.eval() + yield + model.train(training_mode) diff --git a/custom_oneformer/evaluation/instance_evaluation.py b/custom_oneformer/evaluation/instance_evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..47d8346b28796fd96d2325ae2cea2100afa5fb44 --- /dev/null +++ b/custom_oneformer/evaluation/instance_evaluation.py @@ -0,0 +1,110 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/evaluation/instance_evaluation.py +# ------------------------------------------------------------------------------ + +import contextlib +import copy +import io +import itertools +import json +import logging +import numpy as np +import os +import pickle +from collections import OrderedDict +import custom_pycocotools.mask as mask_util +import torch +from custom_pycocotools.coco import COCO +from custom_pycocotools.cocoeval import COCOeval +from tabulate import tabulate + +import custom_detectron2.utils.comm as comm +from custom_detectron2.config import CfgNode +from custom_detectron2.data import MetadataCatalog +from custom_detectron2.data.datasets.coco import convert_to_coco_json +from custom_detectron2.evaluation.coco_evaluation import COCOEvaluator, _evaluate_predictions_on_coco +from custom_detectron2.evaluation.fast_eval_api import COCOeval_opt +from custom_detectron2.structures import Boxes, BoxMode, pairwise_iou +from custom_detectron2.utils.file_io import PathManager +from custom_detectron2.utils.logger import create_small_table + + +# modified from COCOEvaluator for instance segmetnat +class InstanceSegEvaluator(COCOEvaluator): + """ + Evaluate AR for object proposals, AP for instance detection/segmentation, AP + for keypoint detection outputs using COCO's metrics. + See http://cocodataset.org/#detection-eval and + http://cocodataset.org/#keypoints-eval to understand its metrics. + The metrics range from 0 to 100 (instead of 0 to 1), where a -1 or NaN means + the metric cannot be computed (e.g. due to no predictions made). + + In addition to COCO, this evaluator is able to support any bounding box detection, + instance segmentation, or keypoint detection dataset. + """ + + def _eval_predictions(self, predictions, img_ids=None): + """ + Evaluate predictions. Fill self._results with the metrics of the tasks. 
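+ Predicted category ids are mapped back to the original dataset ids (using the metadata's thing_dataset_id_to_contiguous_id, when available) before the per-task COCO evaluation.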
+ """ + self._logger.info("Preparing results for COCO format ...") + coco_results = list(itertools.chain(*[x["instances"] for x in predictions])) + tasks = self._tasks or self._tasks_from_predictions(coco_results) + + # unmap the category ids for COCO + if hasattr(self._metadata, "thing_dataset_id_to_contiguous_id"): + dataset_id_to_contiguous_id = self._metadata.thing_dataset_id_to_contiguous_id + # all_contiguous_ids = list(dataset_id_to_contiguous_id.values()) + # num_classes = len(all_contiguous_ids) + # assert min(all_contiguous_ids) == 0 and max(all_contiguous_ids) == num_classes - 1 + + reverse_id_mapping = {v: k for k, v in dataset_id_to_contiguous_id.items()} + for result in coco_results: + category_id = result["category_id"] + # assert category_id < num_classes, ( + # f"A prediction has class={category_id}, " + # f"but the dataset only has {num_classes} classes and " + # f"predicted class id should be in [0, {num_classes - 1}]." + # ) + assert category_id in reverse_id_mapping, ( + f"A prediction has class={category_id}, " + f"but the dataset only has class ids in {dataset_id_to_contiguous_id}." + ) + result["category_id"] = reverse_id_mapping[category_id] + + if self._output_dir: + file_path = os.path.join(self._output_dir, "coco_instances_results.json") + self._logger.info("Saving results to {}".format(file_path)) + with PathManager.open(file_path, "w") as f: + f.write(json.dumps(coco_results)) + f.flush() + + if not self._do_evaluation: + self._logger.info("Annotations are not available for evaluation.") + return + + self._logger.info( + "Evaluating predictions with {} COCO API...".format( + "unofficial" if self._use_fast_impl else "official" + ) + ) + for task in sorted(tasks): + assert task in {"bbox", "segm", "keypoints"}, f"Got unknown task: {task}!" + coco_eval = ( + _evaluate_predictions_on_coco( + self._coco_api, + coco_results, + task, + kpt_oks_sigmas=self._kpt_oks_sigmas, + use_fast_impl=self._use_fast_impl, + img_ids=img_ids, + max_dets_per_image=self._max_dets_per_image, + ) + if len(coco_results) > 0 + else None # cocoapi does not handle empty results very well + ) + + res = self._derive_coco_results( + coco_eval, task, class_names=self._metadata.get("thing_classes") + ) + self._results[task] = res diff --git a/custom_oneformer/modeling/__init__.py b/custom_oneformer/modeling/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4e1338369a958062d6ca4a122435b2be6ad27315 --- /dev/null +++ b/custom_oneformer/modeling/__init__.py @@ -0,0 +1,5 @@ +from .backbone.swin import D2SwinTransformer +from .backbone.dinat import D2DiNAT +from .pixel_decoder.fpn import BasePixelDecoder +from .pixel_decoder.msdeformattn import MSDeformAttnPixelDecoder +from .meta_arch.oneformer_head import OneFormerHead diff --git a/custom_oneformer/modeling/backbone/__init__.py b/custom_oneformer/modeling/backbone/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9020c2df23e2af280b7bb168b996ae9eaf312eb8 --- /dev/null +++ b/custom_oneformer/modeling/backbone/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
diff --git a/custom_oneformer/modeling/backbone/dinat.py b/custom_oneformer/modeling/backbone/dinat.py new file mode 100644 index 0000000000000000000000000000000000000000..164d5bc5c9c6d29d95316a1b2a2a4bbfc2f76843 --- /dev/null +++ b/custom_oneformer/modeling/backbone/dinat.py @@ -0,0 +1,324 @@ +# -------------------------------------------------------- +# Neighborhood Attention Transformer +# Licensed under The MIT License +# Written by Ali Hassani +# -------------------------------------------------------- + +# Modified by Jitesh Jain + +import torch +import torch.nn as nn +from custom_timm.models.layers import DropPath +from custom_detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec + +class NeighborhoodAttention(nn.Module): + """ + Neighborhood Attention 2D Module + """ + + def __init__( + self, + dim, + num_heads, + kernel_size, + dilation=1, + bias=True, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + super().__init__() + + + def forward(self, x): + + return x + + def extra_repr(self) -> str: + return ( + f"head_dim={self.head_dim}, num_heads={self.num_heads}, " + + f"kernel_size={self.kernel_size}, dilation={self.dilation}, " + + f"rel_pos_bias={self.rpb is not None}" + ) + +class ConvTokenizer(nn.Module): + def __init__(self, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + self.proj = nn.Sequential( + nn.Conv2d(in_chans, embed_dim // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + nn.Conv2d(embed_dim // 2, embed_dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)), + ) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + x = self.proj(x).permute(0, 2, 3, 1) + if self.norm is not None: + x = self.norm(x) + return x + + +class ConvDownsampler(nn.Module): + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False) + self.norm = norm_layer(2 * dim) + + def forward(self, x): + x = self.reduction(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + x = self.norm(x) + return x + + +class Mlp(nn.Module): + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class NATLayer(nn.Module): + def __init__(self, dim, num_heads, kernel_size=7, dilation=None, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm, layer_scale=None): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.mlp_ratio = mlp_ratio + + self.norm1 = norm_layer(dim) + self.attn = NeighborhoodAttention( + dim, kernel_size=kernel_size, dilation=dilation, num_heads=num_heads, + qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + self.layer_scale = False + if layer_scale is not None and type(layer_scale) in [int, float]: + self.layer_scale = True + self.gamma1 = nn.Parameter(layer_scale * torch.ones(dim), requires_grad=True) + self.gamma2 = nn.Parameter(layer_scale * torch.ones(dim), requires_grad=True) + + def forward(self, x): + if not self.layer_scale: + shortcut = x + x = self.norm1(x) + x = self.attn(x) + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + shortcut = x + x = self.norm1(x) + x = self.attn(x) + x = shortcut + self.drop_path(self.gamma1 * x) + x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x))) + return x + + + +class NATBlock(nn.Module): + def __init__(self, dim, depth, num_heads, kernel_size, dilations=None, + downsample=True, + mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, layer_scale=None): + super().__init__() + self.dim = dim + self.depth = depth + + self.blocks = nn.ModuleList([ + NATLayer(dim=dim, + num_heads=num_heads, + kernel_size=kernel_size, + dilation=None if dilations is None else dilations[i], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + layer_scale=layer_scale) + for i in range(depth)]) + + self.downsample = None if not downsample else ConvDownsampler(dim=dim, norm_layer=norm_layer) + + def forward(self, x): + for blk in self.blocks: + x = blk(x) + if self.downsample is None: + return x, x + return self.downsample(x), x + + +class DiNAT(nn.Module): + def __init__(self, + embed_dim, + mlp_ratio, + depths, + num_heads, + drop_path_rate=0.2, + in_chans=3, + kernel_size=7, + dilations=None, + out_indices=(0, 1, 2, 3), + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + norm_layer=nn.LayerNorm, + frozen_stages=-1, + layer_scale=None, + **kwargs): + super().__init__() + self.num_levels = len(depths) + self.embed_dim = embed_dim + self.num_features = [int(embed_dim * 2 ** i) for i in range(self.num_levels)] + self.mlp_ratio = mlp_ratio + + self.patch_embed = ConvTokenizer(in_chans=in_chans, embed_dim=embed_dim, norm_layer=norm_layer) + + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] + self.levels = nn.ModuleList() + for i in range(self.num_levels): + level = NATBlock(dim=int(embed_dim * 2 ** i), + depth=depths[i], + num_heads=num_heads[i], + kernel_size=kernel_size, + dilations=None if dilations is None else dilations[i], + mlp_ratio=self.mlp_ratio, + qkv_bias=qkv_bias, qk_scale=qk_scale, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])], + norm_layer=norm_layer, + downsample=(i < self.num_levels - 1), + layer_scale=layer_scale) + self.levels.append(level) + + # add a norm layer for each output + self.out_indices = out_indices + for i_layer in self.out_indices: + layer = norm_layer(self.num_features[i_layer]) + layer_name = f'norm{i_layer}' + self.add_module(layer_name, layer) + + self.frozen_stages = frozen_stages + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 2: + for i in range(0, 
self.frozen_stages - 1): + m = self.levels[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def train(self, mode=True): + super(DiNAT, self).train(mode) + self._freeze_stages() + + def forward_embeddings(self, x): + x = self.patch_embed(x) + return x + + def forward_tokens(self, x): + outs = {} + for idx, level in enumerate(self.levels): + x, xo = level(x) + if idx in self.out_indices: + norm_layer = getattr(self, f'norm{idx}') + x_out = norm_layer(xo) + outs["res{}".format(idx + 2)] = x_out.permute(0, 3, 1, 2).contiguous() + return outs + + def forward(self, x): + x = self.forward_embeddings(x) + return self.forward_tokens(x) + + +@BACKBONE_REGISTRY.register() +class D2DiNAT(DiNAT, Backbone): + def __init__(self, cfg, input_shape): + + embed_dim = cfg.MODEL.DiNAT.EMBED_DIM + mlp_ratio = cfg.MODEL.DiNAT.MLP_RATIO + depths = cfg.MODEL.DiNAT.DEPTHS + num_heads = cfg.MODEL.DiNAT.NUM_HEADS + drop_path_rate = cfg.MODEL.DiNAT.DROP_PATH_RATE + kernel_size = cfg.MODEL.DiNAT.KERNEL_SIZE + out_indices = cfg.MODEL.DiNAT.OUT_INDICES + dilations = cfg.MODEL.DiNAT.DILATIONS + + super().__init__( + embed_dim=embed_dim, + mlp_ratio=mlp_ratio, + depths=depths, + num_heads=num_heads, + drop_path_rate=drop_path_rate, + kernel_size=kernel_size, + out_indices=out_indices, + dilations=dilations, + ) + + self._out_features = cfg.MODEL.DiNAT.OUT_FEATURES + + self._out_feature_strides = { + "res2": 4, + "res3": 8, + "res4": 16, + "res5": 32, + } + self._out_feature_channels = { + "res2": self.num_features[0], + "res3": self.num_features[1], + "res4": self.num_features[2], + "res5": self.num_features[3], + } + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert ( + x.dim() == 4 + ), f"DiNAT takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
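+ # super().forward(x) returns NCHW stage features keyed "res2".."res5"; keep only those named in cfg.MODEL.DiNAT.OUT_FEATURES.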
+ outputs = {} + y = super().forward(x) + for k in y.keys(): + if k in self._out_features: + outputs[k] = y[k] + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } + + @property + def size_divisibility(self): + return 32 diff --git a/custom_oneformer/modeling/backbone/swin.py b/custom_oneformer/modeling/backbone/swin.py new file mode 100644 index 0000000000000000000000000000000000000000..06ce67a5041d5bfd1efe3c02321b1cb423209d38 --- /dev/null +++ b/custom_oneformer/modeling/backbone/swin.py @@ -0,0 +1,771 @@ +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu, Yutong Lin, Yixuan Wei +# -------------------------------------------------------- + +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former +# ------------------------------------------------------------------------------ + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from custom_timm.models.layers import DropPath, to_2tuple, trunc_normal_ + +from custom_detectron2.modeling import BACKBONE_REGISTRY, Backbone, ShapeSpec + + +class Mlp(nn.Module): + """Multilayer perceptron.""" + + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.0 + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +def window_reverse(windows, window_size, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + """Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. 
Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim ** -0.5 + + # define a parameter table of relative position bias + self.relative_position_bias_table = nn.Parameter( + torch.zeros((2 * window_size[0] - 1) * (2 * window_size[1] - 1), num_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=0.02) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """Forward function. + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = ( + self.qkv(x) + .reshape(B_, N, 3, self.num_heads, C // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = q @ k.transpose(-2, -1) + + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1) + ].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1 + ) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute( + 2, 0, 1 + ).contiguous() # nH, Wh*Ww, Wh*Ww + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + """Swin Transformer Block. + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. 
Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + num_heads, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop + ) + + self.H = None + self.W = None + + def forward(self, x, mask_matrix): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. + mask_matrix: Attention mask for cyclic shift. + """ + B, L, C = x.shape + H, W = self.H, self.W + assert L == H * W, "input feature has wrong size" + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_l = pad_t = 0 + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + attn_mask = mask_matrix + else: + shifted_x = x + attn_mask = None + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, Hp, Wp) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + """Patch Merging Layer + Args: + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + """ + + def __init__(self, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(4 * dim) + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + + x = x.view(B, H, W, C) + + # padding + pad_input = (H % 2 == 1) or (W % 2 == 1) + if pad_input: + x = F.pad(x, (0, 0, 0, W % 2, 0, H % 2)) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of feature channels + depth (int): Depths of this stage. + num_heads (int): Number of attention head. + window_size (int): Local window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + dim, + depth, + num_heads, + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.window_size = window_size + self.shift_size = window_size // 2 + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + num_heads=num_heads, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(dim=dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x, H, W): + """Forward function. + Args: + x: Input feature, tensor size (B, H*W, C). + H, W: Spatial resolution of the input feature. 
+ """ + + # calculate attention mask for SW-MSA + Hp = int(np.ceil(H / self.window_size)) * self.window_size + Wp = int(np.ceil(W / self.window_size)) * self.window_size + img_mask = torch.zeros((1, Hp, Wp, 1), device=x.device) # 1 Hp Wp 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + for blk in self.blocks: + blk.H, blk.W = H, W + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x, attn_mask) + else: + x = blk(x, attn_mask) + if self.downsample is not None: + x_down = self.downsample(x, H, W) + Wh, Ww = (H + 1) // 2, (W + 1) // 2 + return x, H, W, x_down, Wh, Ww + else: + return x, H, W, x, H, W + + +class PatchEmbed(nn.Module): + """Image to Patch Embedding + Args: + patch_size (int): Patch token size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + norm_layer (nn.Module, optional): Normalization layer. Default: None + """ + + def __init__(self, patch_size=4, in_chans=3, embed_dim=96, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + self.patch_size = patch_size + + self.in_chans = in_chans + self.embed_dim = embed_dim + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + if norm_layer is not None: + self.norm = norm_layer(embed_dim) + else: + self.norm = None + + def forward(self, x): + """Forward function.""" + # padding + _, _, H, W = x.size() + if W % self.patch_size[1] != 0: + x = F.pad(x, (0, self.patch_size[1] - W % self.patch_size[1])) + if H % self.patch_size[0] != 0: + x = F.pad(x, (0, 0, 0, self.patch_size[0] - H % self.patch_size[0])) + + x = self.proj(x) # B C Wh Ww + if self.norm is not None: + Wh, Ww = x.size(2), x.size(3) + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + x = x.transpose(1, 2).view(-1, self.embed_dim, Wh, Ww) + + return x + + +class SwinTransformer(nn.Module): + """Swin Transformer backbone. + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + Args: + pretrain_img_size (int): Input image size for training the pretrained model, + used in absolute postion embedding. Default 224. + patch_size (int | tuple(int)): Patch size. Default: 4. + in_chans (int): Number of input image channels. Default: 3. + embed_dim (int): Number of linear projection output channels. Default: 96. + depths (tuple[int]): Depths of each Swin Transformer stage. + num_heads (tuple[int]): Number of attention head of each stage. + window_size (int): Window size. Default: 7. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4. + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float): Override default qk scale of head_dim ** -0.5 if set. + drop_rate (float): Dropout rate. 
+ attn_drop_rate (float): Attention dropout rate. Default: 0. + drop_path_rate (float): Stochastic depth rate. Default: 0.2. + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False. + patch_norm (bool): If True, add normalization after patch embedding. Default: True. + out_indices (Sequence[int]): Output from which stages. + frozen_stages (int): Stages to be frozen (stop grad and set eval mode). + -1 means not freezing any parameters. + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + """ + + def __init__( + self, + pretrain_img_size=224, + patch_size=4, + in_chans=3, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.2, + norm_layer=nn.LayerNorm, + ape=False, + patch_norm=True, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + use_checkpoint=False, + ): + super().__init__() + + self.pretrain_img_size = pretrain_img_size + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.ape = ape + self.patch_norm = patch_norm + self.out_indices = out_indices + self.frozen_stages = frozen_stages + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None, + ) + + # absolute position embedding + if self.ape: + pretrain_img_size = to_2tuple(pretrain_img_size) + patch_size = to_2tuple(patch_size) + patches_resolution = [ + pretrain_img_size[0] // patch_size[0], + pretrain_img_size[1] // patch_size[1], + ] + + self.absolute_pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, patches_resolution[0], patches_resolution[1]) + ) + trunc_normal_(self.absolute_pos_embed, std=0.02) + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2 ** i_layer), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + ) + self.layers.append(layer) + + num_features = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + self.num_features = num_features + + # add a norm layer for each output + for i_layer in out_indices: + layer = norm_layer(num_features[i_layer]) + layer_name = f"norm{i_layer}" + self.add_module(layer_name, layer) + + self._freeze_stages() + + def _freeze_stages(self): + if self.frozen_stages >= 0: + self.patch_embed.eval() + for param in self.patch_embed.parameters(): + param.requires_grad = False + + if self.frozen_stages >= 1 and self.ape: + self.absolute_pos_embed.requires_grad = False + + if self.frozen_stages >= 2: + self.pos_drop.eval() + for i in range(0, self.frozen_stages - 1): + m = self.layers[i] + m.eval() + for param in m.parameters(): + param.requires_grad = False + + def init_weights(self, pretrained=None): + """Initialize the weights in backbone. 
+ Args: + pretrained (str, optional): Path to pre-trained weights. + Defaults to None. + """ + + def _init_weights(m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def forward(self, x): + """Forward function.""" + x = self.patch_embed(x) + + Wh, Ww = x.size(2), x.size(3) + if self.ape: + # interpolate the position embedding to the corresponding size + absolute_pos_embed = F.interpolate( + self.absolute_pos_embed, size=(Wh, Ww), mode="bicubic" + ) + x = (x + absolute_pos_embed).flatten(2).transpose(1, 2) # B Wh*Ww C + else: + x = x.flatten(2).transpose(1, 2) + x = self.pos_drop(x) + + outs = {} + for i in range(self.num_layers): + layer = self.layers[i] + x_out, H, W, x, Wh, Ww = layer(x, Wh, Ww) + + if i in self.out_indices: + norm_layer = getattr(self, f"norm{i}") + x_out = norm_layer(x_out) + + out = x_out.view(-1, H, W, self.num_features[i]).permute(0, 3, 1, 2).contiguous() + outs["res{}".format(i + 2)] = out + + return outs + + def train(self, mode=True): + """Convert the model into training mode while keep layers freezed.""" + super(SwinTransformer, self).train(mode) + self._freeze_stages() + + +@BACKBONE_REGISTRY.register() +class D2SwinTransformer(SwinTransformer, Backbone): + def __init__(self, cfg, input_shape): + + pretrain_img_size = cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE + patch_size = cfg.MODEL.SWIN.PATCH_SIZE + in_chans = 3 + embed_dim = cfg.MODEL.SWIN.EMBED_DIM + depths = cfg.MODEL.SWIN.DEPTHS + num_heads = cfg.MODEL.SWIN.NUM_HEADS + window_size = cfg.MODEL.SWIN.WINDOW_SIZE + mlp_ratio = cfg.MODEL.SWIN.MLP_RATIO + qkv_bias = cfg.MODEL.SWIN.QKV_BIAS + qk_scale = cfg.MODEL.SWIN.QK_SCALE + drop_rate = cfg.MODEL.SWIN.DROP_RATE + attn_drop_rate = cfg.MODEL.SWIN.ATTN_DROP_RATE + drop_path_rate = cfg.MODEL.SWIN.DROP_PATH_RATE + norm_layer = nn.LayerNorm + ape = cfg.MODEL.SWIN.APE + patch_norm = cfg.MODEL.SWIN.PATCH_NORM + use_checkpoint = cfg.MODEL.SWIN.USE_CHECKPOINT + + super().__init__( + pretrain_img_size, + patch_size, + in_chans, + embed_dim, + depths, + num_heads, + window_size, + mlp_ratio, + qkv_bias, + qk_scale, + drop_rate, + attn_drop_rate, + drop_path_rate, + norm_layer, + ape, + patch_norm, + use_checkpoint=use_checkpoint, + ) + + self._out_features = cfg.MODEL.SWIN.OUT_FEATURES + + self._out_feature_strides = { + "res2": 4, + "res3": 8, + "res4": 16, + "res5": 32, + } + self._out_feature_channels = { + "res2": self.num_features[0], + "res3": self.num_features[1], + "res4": self.num_features[2], + "res5": self.num_features[3], + } + + def forward(self, x): + """ + Args: + x: Tensor of shape (N,C,H,W). H, W must be a multiple of ``self.size_divisibility``. + Returns: + dict[str->Tensor]: names and the corresponding features + """ + assert ( + x.dim() == 4 + ), f"SwinTransformer takes an input of shape (N, C, H, W). Got {x.shape} instead!" 
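+ # size_divisibility is 32: a stride-4 patch embedding followed by three stride-2 patch-merging stages.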
+ outputs = {} + y = super().forward(x) + for k in y.keys(): + if k in self._out_features: + outputs[k] = y[k] + return outputs + + def output_shape(self): + return { + name: ShapeSpec( + channels=self._out_feature_channels[name], stride=self._out_feature_strides[name] + ) + for name in self._out_features + } + + @property + def size_divisibility(self): + return 32 diff --git a/custom_oneformer/modeling/matcher.py b/custom_oneformer/modeling/matcher.py new file mode 100644 index 0000000000000000000000000000000000000000..2564940d610d57a1206ce5232e696fc6b34672da --- /dev/null +++ b/custom_oneformer/modeling/matcher.py @@ -0,0 +1,212 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/matcher.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +""" +Modules to compute the matching cost and solve the corresponding LSAP. +""" +import torch +import torch.nn.functional as F +from scipy.optimize import linear_sum_assignment +from torch import nn +from torch.cuda.amp import autocast +import numpy as np + +# from custom_detectron2.projects.point_rend.point_features import point_sample + + +def linear_sum_assignment_with_nan(cost_matrix): + cost_matrix = np.asarray(cost_matrix) + nan = np.isnan(cost_matrix).any() + nan_all = np.isnan(cost_matrix).all() + empty = cost_matrix.size == 0 + + if not empty: + if nan_all: + print('Matrix contains all NaN values!') + elif nan: + print('Matrix contains NaN values!') + + if nan_all: + cost_matrix = np.empty(shape=(0, 0)) + elif nan: + cost_matrix[np.isnan(cost_matrix)] = 100 + + return linear_sum_assignment(cost_matrix) + +def batch_dice_loss(inputs: torch.Tensor, targets: torch.Tensor): + """ + Compute the DICE loss, similar to generalized IOU for masks + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * torch.einsum("nc,mc->nm", inputs, targets) + denominator = inputs.sum(-1)[:, None] + targets.sum(-1)[None, :] + loss = 1 - (numerator + 1) / (denominator + 1) + return loss + + +batch_dice_loss_jit = torch.jit.script( + batch_dice_loss +) # type: torch.jit.ScriptModule + + +def batch_sigmoid_ce_loss(inputs: torch.Tensor, targets: torch.Tensor): + """ + Args: + inputs: A float tensor of arbitrary shape. + The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs + (0 for the negative class and 1 for the positive class). 
+ Returns: + Loss tensor + """ + hw = inputs.shape[1] + + pos = F.binary_cross_entropy_with_logits( + inputs, torch.ones_like(inputs), reduction="none" + ) + neg = F.binary_cross_entropy_with_logits( + inputs, torch.zeros_like(inputs), reduction="none" + ) + + loss = torch.einsum("nc,mc->nm", pos, targets) + torch.einsum( + "nc,mc->nm", neg, (1 - targets) + ) + + return loss / hw + + +batch_sigmoid_ce_loss_jit = torch.jit.script( + batch_sigmoid_ce_loss +) # type: torch.jit.ScriptModule + + +class HungarianMatcher(nn.Module): + """This class computes an assignment between the targets and the predictions of the network + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, + there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, + while the others are un-matched (and thus treated as non-objects). + """ + + def __init__(self, cost_class: float = 1, cost_mask: float = 1, + cost_dice: float = 1, num_points: int = 0): + """Creates the matcher + + Params: + cost_class: This is the relative weight of the classification error in the matching cost + cost_mask: This is the relative weight of the focal loss of the binary mask in the matching cost + cost_dice: This is the relative weight of the dice loss of the binary mask in the matching cost + """ + super().__init__() + self.cost_class = cost_class + self.cost_mask = cost_mask + self.cost_dice = cost_dice + + assert cost_class != 0 or cost_mask != 0 or cost_dice != 0, "all costs cant be 0" + + self.num_points = num_points + + @torch.no_grad() + def memory_efficient_forward(self, outputs, targets): + """More memory-friendly matching""" + bs, num_queries = outputs["pred_logits"].shape[:2] + + indices = [] + + # Iterate through batch size + for b in range(bs): + out_prob = outputs["pred_logits"][b].softmax(-1) # [num_queries, num_classes] + tgt_ids = targets[b]["labels"] + + # Compute the classification cost. Contrary to the loss, we don't use the NLL, + # but approximate it in 1 - proba[target class]. + # The 1 is a constant that doesn't change the matching, it can be ommitted. + cost_class = -out_prob[:, tgt_ids] + + out_mask = outputs["pred_masks"][b] # [num_queries, H_pred, W_pred] + # gt masks are already padded when preparing target + tgt_mask = targets[b]["masks"].to(out_mask) + + out_mask = out_mask[:, None] + tgt_mask = tgt_mask[:, None] + # all masks share the same set of points for efficient matching! 
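+ # Sample self.num_points random (x, y) locations in [0, 1)^2 once and read every GT and predicted mask at those same locations with PointRend's point_sample.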
+ point_coords = torch.rand(1, self.num_points, 2, device=out_mask.device) + # get gt labels + tgt_mask = point_sample( + tgt_mask, + point_coords.repeat(tgt_mask.shape[0], 1, 1), + align_corners=False, + ).squeeze(1) + + out_mask = point_sample( + out_mask, + point_coords.repeat(out_mask.shape[0], 1, 1), + align_corners=False, + ).squeeze(1) + + with autocast(enabled=False): + out_mask = out_mask.float() + tgt_mask = tgt_mask.float() + # Compute the focal loss between masks + cost_mask = batch_sigmoid_ce_loss_jit(out_mask, tgt_mask) + # Compute the dice loss betwen masks + cost_dice = batch_dice_loss(out_mask, tgt_mask) + + # Final cost matrix + C = ( + self.cost_mask * cost_mask + + self.cost_class * cost_class + + self.cost_dice * cost_dice + ) + C = C.reshape(num_queries, -1).cpu() + + indices.append(linear_sum_assignment_with_nan(C)) + + return [ + (torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) + for i, j in indices + ] + + @torch.no_grad() + def forward(self, outputs, targets): + """Performs the matching + + Params: + outputs: This is a dict that contains at least these entries: + "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + "pred_masks": Tensor of dim [batch_size, num_queries, H_pred, W_pred] with the predicted masks + + targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: + "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth + objects in the target) containing the class labels + "masks": Tensor of dim [num_target_boxes, H_gt, W_gt] containing the target masks + + Returns: + A list of size batch_size, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: + len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + + return self.memory_efficient_forward(outputs, targets) + + def __repr__(self, _repr_indent=4): + head = "Matcher " + self.__class__.__name__ + body = [ + "cost_class: {}".format(self.cost_class), + "cost_mask: {}".format(self.cost_mask), + "cost_dice: {}".format(self.cost_dice), + ] + lines = [head] + [" " * _repr_indent + line for line in body] + return "\n".join(lines) diff --git a/custom_oneformer/modeling/meta_arch/__init__.py b/custom_oneformer/modeling/meta_arch/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/custom_oneformer/modeling/meta_arch/__init__.py @@ -0,0 +1 @@ + diff --git a/custom_oneformer/modeling/meta_arch/oneformer_head.py b/custom_oneformer/modeling/meta_arch/oneformer_head.py new file mode 100644 index 0000000000000000000000000000000000000000..e7fba1624dd351d263b90187d3fcece33027cf34 --- /dev/null +++ b/custom_oneformer/modeling/meta_arch/oneformer_head.py @@ -0,0 +1,135 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/meta_arch/mask_former_head.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +import logging +from copy import deepcopy +from typing import Callable, Dict, List, Optional, Tuple, Union + +import fvcore.nn.weight_init as weight_init +from torch import nn 
+from torch.nn import functional as F + +from custom_detectron2.config import configurable +from custom_detectron2.layers import Conv2d, ShapeSpec, get_norm +from custom_detectron2.modeling import SEM_SEG_HEADS_REGISTRY +from ..pixel_decoder.fpn import build_pixel_decoder +from ..transformer_decoder.oneformer_transformer_decoder import build_transformer_decoder + +@SEM_SEG_HEADS_REGISTRY.register() +class OneFormerHead(nn.Module): + + _version = 2 + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + version = local_metadata.get("version", None) + if version is None or version < 2: + # Do not warn if train from scratch + scratch = True + logger = logging.getLogger(__name__) + for k in list(state_dict.keys()): + newk = k + if "sem_seg_head" in k and not k.startswith(prefix + "predictor"): + newk = k.replace(prefix, prefix + "pixel_decoder.") + # logger.debug(f"{k} ==> {newk}") + if newk != k: + state_dict[newk] = state_dict[k] + del state_dict[k] + scratch = False + + if not scratch: + logger.warning( + f"Weight format of {self.__class__.__name__} have changed! " + "Please upgrade your models. Applying automatic conversion now ..." + ) + + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + num_classes: int, + pixel_decoder: nn.Module, + loss_weight: float = 1.0, + ignore_value: int = -1, + # extra parameters + transformer_predictor: nn.Module, + transformer_in_feature: str, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + num_classes: number of classes to predict + pixel_decoder: the pixel decoder module + loss_weight: loss weight + ignore_value: category id to be ignored during training. 
+ transformer_predictor: the transformer decoder that makes prediction + transformer_in_feature: input feature name to the transformer_predictor + """ + super().__init__() + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] + feature_strides = [v.stride for k, v in input_shape] + feature_channels = [v.channels for k, v in input_shape] + + self.ignore_value = ignore_value + self.common_stride = 4 + self.loss_weight = loss_weight + + self.pixel_decoder = pixel_decoder + self.predictor = transformer_predictor + self.transformer_in_feature = transformer_in_feature + + self.num_classes = num_classes + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + # figure out in_channels to transformer predictor + if cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "transformer_encoder": + transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM + elif cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "pixel_embedding": + transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM + elif cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE == "multi_scale_pixel_decoder": + transformer_predictor_in_channels = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM + else: + transformer_predictor_in_channels = input_shape[cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE].channels + + return { + "input_shape": { + k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES + }, + "ignore_value": cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE, + "num_classes": cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES, + "pixel_decoder": build_pixel_decoder(cfg, input_shape), + "loss_weight": cfg.MODEL.SEM_SEG_HEAD.LOSS_WEIGHT, + "transformer_in_feature": cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE, + "transformer_predictor": build_transformer_decoder( + cfg, + transformer_predictor_in_channels, + mask_classification=True, + ), + } + + def forward(self, features, tasks, mask=None): + return self.layers(features, tasks, mask) + + def layers(self, features, tasks, mask=None): + mask_features, transformer_encoder_features, multi_scale_features, _, _ = self.pixel_decoder.forward_features(features) + + if self.transformer_in_feature == "multi_scale_pixel_decoder": + predictions = self.predictor(multi_scale_features, mask_features, tasks, mask) + else: + if self.transformer_in_feature == "transformer_encoder": + assert ( + transformer_encoder_features is not None + ), "Please use the TransformerEncoderPixelDecoder." + predictions = self.predictor(transformer_encoder_features, mask_features, mask) + elif self.transformer_in_feature == "pixel_embedding": + predictions = self.predictor(mask_features, mask_features, mask) + else: + predictions = self.predictor(features[self.transformer_in_feature], mask_features, mask) + return predictions diff --git a/custom_oneformer/modeling/pixel_decoder/__init__.py b/custom_oneformer/modeling/pixel_decoder/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9020c2df23e2af280b7bb168b996ae9eaf312eb8 --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/__init__.py @@ -0,0 +1 @@ +# Copyright (c) Facebook, Inc. and its affiliates. diff --git a/custom_oneformer/modeling/pixel_decoder/fpn.py b/custom_oneformer/modeling/pixel_decoder/fpn.py new file mode 100644 index 0000000000000000000000000000000000000000..a7003e39f5254e73351646e766f49080dda7533e --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/fpn.py @@ -0,0 +1,312 @@ +# Copyright (c) Facebook, Inc. and its affiliates. 
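+# FPN-style pixel decoders for OneFormer: BasePixelDecoder and TransformerEncoderPixelDecoder, selected by name via cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME in build_pixel_decoder.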
+import logging +import numpy as np +from typing import Callable, Dict, List, Optional, Tuple, Union + +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ +from torch.cuda.amp import autocast + +from custom_detectron2.config import configurable +from custom_detectron2.layers import Conv2d, DeformConv, ShapeSpec, get_norm +from custom_detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from ..transformer_decoder.position_encoding import PositionEmbeddingSine +from ..transformer_decoder.transformer import TransformerEncoder, TransformerEncoderLayer, _get_clones, _get_activation_fn + + +def build_pixel_decoder(cfg, input_shape): + """ + Build a pixel decoder from `cfg.MODEL.MASK_FORMER.PIXEL_DECODER_NAME`. + """ + name = cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME + model = SEM_SEG_HEADS_REGISTRY.get(name)(cfg, input_shape) + forward_features = getattr(model, "forward_features", None) + if not callable(forward_features): + raise ValueError( + "Only SEM_SEG_HEADS with forward_features method can be used as pixel decoder. " + f"Please implement forward_features for {name} to only return mask features." + ) + return model + + +# This is a modified FPN decoder. +@SEM_SEG_HEADS_REGISTRY.register() +class BasePixelDecoder(nn.Module): + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + conv_dim: int, + mask_dim: int, + norm: Optional[Union[str, Callable]] = None, + ): + """ + NOTE: this interface is experimental. + Args: + input_shape: shapes (channels and stride) of the input features + conv_dims: number of output channels for the intermediate conv layers. + mask_dim: number of output channels for the final conv layer. + norm (str or callable): normalization for all conv layers + """ + super().__init__() + + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" + feature_channels = [v.channels for k, v in input_shape] + + lateral_convs = [] + output_convs = [] + + use_bias = norm == "" + for idx, in_channels in enumerate(feature_channels): + if idx == len(self.in_features) - 1: + output_norm = get_norm(norm, conv_dim) + output_conv = Conv2d( + in_channels, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + weight_init.c2_xavier_fill(output_conv) + self.add_module("layer_{}".format(idx + 1), output_conv) + + lateral_convs.append(None) + output_convs.append(output_conv) + else: + lateral_norm = get_norm(norm, conv_dim) + output_norm = get_norm(norm, conv_dim) + + lateral_conv = Conv2d( + in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm + ) + output_conv = Conv2d( + conv_dim, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + weight_init.c2_xavier_fill(lateral_conv) + weight_init.c2_xavier_fill(output_conv) + self.add_module("adapter_{}".format(idx + 1), lateral_conv) + self.add_module("layer_{}".format(idx + 1), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs into top-down order (from low to high resolution) + # to make the top-down computation in forward clearer. 
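+        # The coarsest level has no lateral conv, so lateral_convs keeps a None
+        # placeholder for it; forward_features checks for None and passes that level
+        # straight through its output conv.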
+ self.lateral_convs = lateral_convs[::-1] + self.output_convs = output_convs[::-1] + + self.mask_dim = mask_dim + self.mask_features = Conv2d( + conv_dim, + mask_dim, + kernel_size=3, + stride=1, + padding=1, + ) + weight_init.c2_xavier_fill(self.mask_features) + + self.oneformer_num_feature_levels = 3 # always use 3 scales + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + ret = {} + ret["input_shape"] = { + k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES + } + ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM + ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM + ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM + return ret + + def forward_features(self, features): + multi_scale_features = [] + num_cur_levels = 0 + # Reverse feature maps into top-down order (from low to high resolution) + for idx, f in enumerate(self.in_features[::-1]): + x = features[f] + lateral_conv = self.lateral_convs[idx] + output_conv = self.output_convs[idx] + if lateral_conv is None: + y = output_conv(x) + else: + cur_fpn = lateral_conv(x) + # Following FPN implementation, we use nearest upsampling here + y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest") + y = output_conv(y) + if num_cur_levels < self.oneformer_num_feature_levels: + multi_scale_features.append(y) + num_cur_levels += 1 + return self.mask_features(y), None, multi_scale_features + + def forward(self, features, targets=None): + logger = logging.getLogger(__name__) + logger.warning("Calling forward() may cause unpredicted behavior of PixelDecoder module.") + return self.forward_features(features) + + +class TransformerEncoderOnly(nn.Module): + def __init__( + self, + d_model=512, + nhead=8, + num_encoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, src, mask, pos_embed): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + if mask is not None: + mask = mask.flatten(1) + + memory = self.encoder(src, src_key_padding_mask=mask, pos=pos_embed) + return memory.permute(1, 2, 0).view(bs, c, h, w) + + +# This is a modified FPN decoder with extra Transformer encoder that processes the lowest-resolution feature map. +@SEM_SEG_HEADS_REGISTRY.register() +class TransformerEncoderPixelDecoder(BasePixelDecoder): + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + transformer_dropout: float, + transformer_nheads: int, + transformer_dim_feedforward: int, + transformer_enc_layers: int, + transformer_pre_norm: bool, + conv_dim: int, + mask_dim: int, + norm: Optional[Union[str, Callable]] = None, + ): + """ + NOTE: this interface is experimental. 
+ Args: + input_shape: shapes (channels and stride) of the input features + transformer_dropout: dropout probability in transformer + transformer_nheads: number of heads in transformer + transformer_dim_feedforward: dimension of feedforward network + transformer_enc_layers: number of transformer encoder layers + transformer_pre_norm: whether to use pre-layernorm or not + conv_dims: number of output channels for the intermediate conv layers. + mask_dim: number of output channels for the final conv layer. + norm (str or callable): normalization for all conv layers + """ + super().__init__(input_shape, conv_dim=conv_dim, mask_dim=mask_dim, norm=norm) + + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" + feature_strides = [v.stride for k, v in input_shape] + feature_channels = [v.channels for k, v in input_shape] + + in_channels = feature_channels[len(self.in_features) - 1] + self.input_proj = Conv2d(in_channels, conv_dim, kernel_size=1) + weight_init.c2_xavier_fill(self.input_proj) + self.transformer = TransformerEncoderOnly( + d_model=conv_dim, + dropout=transformer_dropout, + nhead=transformer_nheads, + dim_feedforward=transformer_dim_feedforward, + num_encoder_layers=transformer_enc_layers, + normalize_before=transformer_pre_norm, + ) + N_steps = conv_dim // 2 + self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) + + # update layer + use_bias = norm == "" + output_norm = get_norm(norm, conv_dim) + output_conv = Conv2d( + conv_dim, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + weight_init.c2_xavier_fill(output_conv) + delattr(self, "layer_{}".format(len(self.in_features))) + self.add_module("layer_{}".format(len(self.in_features)), output_conv) + self.output_convs[0] = output_conv + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + ret = super().from_config(cfg, input_shape) + ret["transformer_dropout"] = cfg.MODEL.MASK_FORMER.DROPOUT + ret["transformer_nheads"] = cfg.MODEL.MASK_FORMER.NHEADS + ret["transformer_dim_feedforward"] = cfg.MODEL.MASK_FORMER.DIM_FEEDFORWARD + ret[ + "transformer_enc_layers" + ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config + ret["transformer_pre_norm"] = cfg.MODEL.MASK_FORMER.PRE_NORM + return ret + + def forward_features(self, features): + multi_scale_features = [] + num_cur_levels = 0 + # Reverse feature maps into top-down order (from low to high resolution) + for idx, f in enumerate(self.in_features[::-1]): + x = features[f] + lateral_conv = self.lateral_convs[idx] + output_conv = self.output_convs[idx] + if lateral_conv is None: + transformer = self.input_proj(x) + pos = self.pe_layer(x) + transformer = self.transformer(transformer, None, pos) + y = output_conv(transformer) + # save intermediate feature as input to Transformer decoder + transformer_encoder_features = transformer + else: + cur_fpn = lateral_conv(x) + # Following FPN implementation, we use nearest upsampling here + y = cur_fpn + F.interpolate(y, size=cur_fpn.shape[-2:], mode="nearest") + y = output_conv(y) + if num_cur_levels < self.oneformer_num_feature_levels: + multi_scale_features.append(y) + num_cur_levels += 1 + return self.mask_features(y), transformer_encoder_features, multi_scale_features + + def forward(self, features, targets=None): + logger = logging.getLogger(__name__) + logger.warning("Calling forward() may cause unpredicted behavior of 
PixelDecoder module.") + return self.forward_features(features) diff --git a/custom_oneformer/modeling/pixel_decoder/msdeformattn.py b/custom_oneformer/modeling/pixel_decoder/msdeformattn.py new file mode 100644 index 0000000000000000000000000000000000000000..69a094da773c7873cf66f61a9afbd9dcdc4b6c8e --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/msdeformattn.py @@ -0,0 +1,358 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +import logging +import numpy as np +from typing import Callable, Dict, List, Optional, Tuple, Union + +import fvcore.nn.weight_init as weight_init +import torch +from torch import nn +from torch.nn import functional as F +from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ +from torch.cuda.amp import autocast + +from custom_detectron2.config import configurable +from custom_detectron2.layers import Conv2d, ShapeSpec, get_norm +from custom_detectron2.modeling import SEM_SEG_HEADS_REGISTRY + +from ..transformer_decoder.position_encoding import PositionEmbeddingSine +from ..transformer_decoder.transformer import _get_clones, _get_activation_fn +from .ops.modules import MSDeformAttn + + +# MSDeformAttn Transformer encoder in deformable detr +class MSDeformAttnTransformerEncoderOnly(nn.Module): + def __init__(self, d_model=256, nhead=8, + num_encoder_layers=6, dim_feedforward=1024, dropout=0.1, + activation="relu", + num_feature_levels=4, enc_n_points=4, + ): + super().__init__() + + self.d_model = d_model + self.nhead = nhead + + encoder_layer = MSDeformAttnTransformerEncoderLayer(d_model, dim_feedforward, + dropout, activation, + num_feature_levels, nhead, enc_n_points) + self.encoder = MSDeformAttnTransformerEncoder(encoder_layer, num_encoder_layers) + + self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + for m in self.modules(): + if isinstance(m, MSDeformAttn): + m._reset_parameters() + normal_(self.level_embed) + + def get_valid_ratio(self, mask): + _, H, W = mask.shape + valid_H = torch.sum(~mask[:, :, 0], 1) + valid_W = torch.sum(~mask[:, 0, :], 1) + valid_ratio_h = valid_H.float() / H + valid_ratio_w = valid_W.float() / W + valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1) + return valid_ratio + + def forward(self, srcs, pos_embeds): + masks = [torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) for x in srcs] + # prepare input for encoder + src_flatten = [] + mask_flatten = [] + lvl_pos_embed_flatten = [] + spatial_shapes = [] + for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): + bs, c, h, w = src.shape + spatial_shape = (h, w) + spatial_shapes.append(spatial_shape) + src = src.flatten(2).transpose(1, 2) + mask = mask.flatten(1) + pos_embed = pos_embed.flatten(2).transpose(1, 2) + lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) + lvl_pos_embed_flatten.append(lvl_pos_embed) + src_flatten.append(src) + mask_flatten.append(mask) + src_flatten = torch.cat(src_flatten, 1) + mask_flatten = torch.cat(mask_flatten, 1) + lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1) + spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=src_flatten.device) + level_start_index = torch.cat((spatial_shapes.new_zeros((1, )), spatial_shapes.prod(1).cumsum(0)[:-1])) + valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1) + + # encoder + memory = 
self.encoder(src_flatten, spatial_shapes, level_start_index, valid_ratios, lvl_pos_embed_flatten, mask_flatten) + + return memory, spatial_shapes, level_start_index, valid_ratios + + +class MSDeformAttnTransformerEncoderLayer(nn.Module): + def __init__(self, + d_model=256, d_ffn=1024, + dropout=0.1, activation="relu", + n_levels=4, n_heads=8, n_points=4): + super().__init__() + + # self attention + self.self_attn = MSDeformAttn(d_model, n_levels, n_heads, n_points) + self.dropout1 = nn.Dropout(dropout) + self.norm1 = nn.LayerNorm(d_model) + + # ffn + self.linear1 = nn.Linear(d_model, d_ffn) + self.activation = _get_activation_fn(activation) + self.dropout2 = nn.Dropout(dropout) + self.linear2 = nn.Linear(d_ffn, d_model) + self.dropout3 = nn.Dropout(dropout) + self.norm2 = nn.LayerNorm(d_model) + + @staticmethod + def with_pos_embed(tensor, pos): + return tensor if pos is None else tensor + pos + + def forward_ffn(self, src): + src2 = self.linear2(self.dropout2(self.activation(self.linear1(src)))) + src = src + self.dropout3(src2) + src = self.norm2(src) + return src + + def forward(self, src, pos, reference_points, spatial_shapes, level_start_index, padding_mask=None): + # self attention + src2 = self.self_attn(self.with_pos_embed(src, pos), reference_points, src, spatial_shapes, level_start_index, padding_mask) + src = src + self.dropout1(src2) + src = self.norm1(src) + + # ffn + src = self.forward_ffn(src) + + return src + + +class MSDeformAttnTransformerEncoder(nn.Module): + def __init__(self, encoder_layer, num_layers): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + reference_points_list = [] + for lvl, (H_, W_) in enumerate(spatial_shapes): + + ref_y, ref_x = torch.meshgrid(torch.linspace(0.5, H_ - 0.5, H_, dtype=torch.float32, device=device), + torch.linspace(0.5, W_ - 0.5, W_, dtype=torch.float32, device=device)) + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] * H_) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] * W_) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward(self, src, spatial_shapes, level_start_index, valid_ratios, pos=None, padding_mask=None): + output = src + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=src.device) + for _, layer in enumerate(self.layers): + output = layer(output, pos, reference_points, spatial_shapes, level_start_index, padding_mask) + + return output + + +@SEM_SEG_HEADS_REGISTRY.register() +class MSDeformAttnPixelDecoder(nn.Module): + @configurable + def __init__( + self, + input_shape: Dict[str, ShapeSpec], + *, + transformer_dropout: float, + transformer_nheads: int, + transformer_dim_feedforward: int, + transformer_enc_layers: int, + conv_dim: int, + mask_dim: int, + norm: Optional[Union[str, Callable]] = None, + # deformable transformer encoder args + transformer_in_features: List[str], + common_stride: int, + ): + """ + NOTE: this interface is experimental. 
+ Args: + input_shape: shapes (channels and stride) of the input features + transformer_dropout: dropout probability in transformer + transformer_nheads: number of heads in transformer + transformer_dim_feedforward: dimension of feedforward network + transformer_enc_layers: number of transformer encoder layers + conv_dims: number of output channels for the intermediate conv layers. + mask_dim: number of output channels for the final conv layer. + norm (str or callable): normalization for all conv layers + """ + super().__init__() + transformer_input_shape = { + k: v for k, v in input_shape.items() if k in transformer_in_features + } + + # this is the input shape of pixel decoder + input_shape = sorted(input_shape.items(), key=lambda x: x[1].stride) + self.in_features = [k for k, v in input_shape] # starting from "res2" to "res5" + self.feature_strides = [v.stride for k, v in input_shape] + self.feature_channels = [v.channels for k, v in input_shape] + + # this is the input shape of transformer encoder (could use less features than pixel decoder + transformer_input_shape = sorted(transformer_input_shape.items(), key=lambda x: x[1].stride) + self.transformer_in_features = [k for k, v in transformer_input_shape] # starting from "res2" to "res5" + transformer_in_channels = [v.channels for k, v in transformer_input_shape] + self.transformer_feature_strides = [v.stride for k, v in transformer_input_shape] # to decide extra FPN layers + + self.transformer_num_feature_levels = len(self.transformer_in_features) + if self.transformer_num_feature_levels > 1: + input_proj_list = [] + # from low resolution to high resolution (res5 -> res2) + for in_channels in transformer_in_channels[::-1]: + input_proj_list.append(nn.Sequential( + nn.Conv2d(in_channels, conv_dim, kernel_size=1), + nn.GroupNorm(32, conv_dim), + )) + self.input_proj = nn.ModuleList(input_proj_list) + else: + self.input_proj = nn.ModuleList([ + nn.Sequential( + nn.Conv2d(transformer_in_channels[-1], conv_dim, kernel_size=1), + nn.GroupNorm(32, conv_dim), + )]) + + for proj in self.input_proj: + nn.init.xavier_uniform_(proj[0].weight, gain=1) + nn.init.constant_(proj[0].bias, 0) + + self.transformer = MSDeformAttnTransformerEncoderOnly( + d_model=conv_dim, + dropout=transformer_dropout, + nhead=transformer_nheads, + dim_feedforward=transformer_dim_feedforward, + num_encoder_layers=transformer_enc_layers, + num_feature_levels=self.transformer_num_feature_levels, + ) + N_steps = conv_dim // 2 + self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) + + self.mask_dim = mask_dim + # use 1x1 conv instead + self.mask_features = Conv2d( + conv_dim, + mask_dim, + kernel_size=1, + stride=1, + padding=0, + ) + weight_init.c2_xavier_fill(self.mask_features) + + self.oneformer_num_feature_levels = 3 # always use 3 scales + self.common_stride = common_stride + + # extra fpn levels + stride = min(self.transformer_feature_strides) + self.num_fpn_levels = int(np.log2(stride) - np.log2(self.common_stride)) + + lateral_convs = [] + output_convs = [] + + use_bias = norm == "" + for idx, in_channels in enumerate(self.feature_channels[:self.num_fpn_levels]): + lateral_norm = get_norm(norm, conv_dim) + output_norm = get_norm(norm, conv_dim) + + lateral_conv = Conv2d( + in_channels, conv_dim, kernel_size=1, bias=use_bias, norm=lateral_norm + ) + output_conv = Conv2d( + conv_dim, + conv_dim, + kernel_size=3, + stride=1, + padding=1, + bias=use_bias, + norm=output_norm, + activation=F.relu, + ) + weight_init.c2_xavier_fill(lateral_conv) + 
weight_init.c2_xavier_fill(output_conv) + self.add_module("adapter_{}".format(idx + 1), lateral_conv) + self.add_module("layer_{}".format(idx + 1), output_conv) + + lateral_convs.append(lateral_conv) + output_convs.append(output_conv) + # Place convs into top-down order (from low to high resolution) + # to make the top-down computation in forward clearer. + self.lateral_convs = lateral_convs[::-1] + self.output_convs = output_convs[::-1] + + @classmethod + def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec]): + ret = {} + ret["input_shape"] = { + k: v for k, v in input_shape.items() if k in cfg.MODEL.SEM_SEG_HEAD.IN_FEATURES + } + ret["conv_dim"] = cfg.MODEL.SEM_SEG_HEAD.CONVS_DIM + ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM + ret["norm"] = cfg.MODEL.SEM_SEG_HEAD.NORM + ret["transformer_dropout"] = cfg.MODEL.ONE_FORMER.DROPOUT + ret["transformer_nheads"] = cfg.MODEL.ONE_FORMER.NHEADS + # ret["transformer_dim_feedforward"] = cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD + ret["transformer_dim_feedforward"] = 1024 # use 1024 for deformable transformer encoder + ret[ + "transformer_enc_layers" + ] = cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS # a separate config + ret["transformer_in_features"] = cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES + ret["common_stride"] = cfg.MODEL.SEM_SEG_HEAD.COMMON_STRIDE + return ret + + @autocast(enabled=False) + def forward_features(self, features): + srcs = [] + pos = [] + # Reverse feature maps into top-down order (from low to high resolution) + for idx, f in enumerate(self.transformer_in_features[::-1]): + x = features[f].float() # deformable detr does not support half precision + srcs.append(self.input_proj[idx](x)) + pos.append(self.pe_layer(x)) + + y, spatial_shapes, level_start_index, valid_ratios = self.transformer(srcs, pos) + bs = y.shape[0] + + split_size_or_sections = [None] * self.transformer_num_feature_levels + for i in range(self.transformer_num_feature_levels): + if i < self.transformer_num_feature_levels - 1: + split_size_or_sections[i] = level_start_index[i + 1] - level_start_index[i] + else: + split_size_or_sections[i] = y.shape[1] - level_start_index[i] + y = torch.split(y, split_size_or_sections, dim=1) + + out = [] + multi_scale_features = [] + num_cur_levels = 0 + for i, z in enumerate(y): + out.append(z.transpose(1, 2).view(bs, -1, spatial_shapes[i][0], spatial_shapes[i][1])) + + # append `out` with extra FPN levels + # Reverse feature maps into top-down order (from low to high resolution) + for idx, f in enumerate(self.in_features[:self.num_fpn_levels][::-1]): + x = features[f].float() + lateral_conv = self.lateral_convs[idx] + output_conv = self.output_convs[idx] + cur_fpn = lateral_conv(x) + # Following FPN implementation, we use nearest upsampling here + y = cur_fpn + F.interpolate(out[-1], size=cur_fpn.shape[-2:], mode="bilinear", align_corners=False) + y = output_conv(y) + out.append(y) + + for o in out: + if num_cur_levels < self.oneformer_num_feature_levels: + multi_scale_features.append(o) + num_cur_levels += 1 + + return self.mask_features(out[-1]), out[0], multi_scale_features, spatial_shapes, level_start_index diff --git a/custom_oneformer/modeling/pixel_decoder/ops/functions/__init__.py b/custom_oneformer/modeling/pixel_decoder/ops/functions/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2b06b5ac538b63bdb9a6c82e4635b95bb5491d5b --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/functions/__init__.py @@ -0,0 +1,13 @@ +# 
------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +from .ms_deform_attn_func import MSDeformAttnFunction + diff --git a/custom_oneformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py b/custom_oneformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py new file mode 100644 index 0000000000000000000000000000000000000000..e074eb69819151add821a8ff9ed215ed9b874070 --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/functions/ms_deform_attn_func.py @@ -0,0 +1,77 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import torch +import torch.nn.functional as F +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +# if torch.cuda.is_available(): +# try: +# import MultiScaleDeformableAttention as MSDA +# except ModuleNotFoundError as e: +# info_string = ( +# "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" +# "\t`cd oneformer/modeling/pixel_decoder/ops`\n" +# "\t`sh make.sh`\n" +# ) +# raise ModuleNotFoundError(info_string) +# else: +# MultiScaleDeformableAttention = None + + + +class MSDeformAttnFunction(Function): + @staticmethod + def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): + # ctx.im2col_step = im2col_step + output = ms_deform_attn_core_pytorch( + value, value_spatial_shapes, sampling_locations, attention_weights) + # ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) + return output + + # @staticmethod + # @once_differentiable + # def backward(ctx, grad_output): + # value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors + # grad_value, grad_sampling_loc, grad_attn_weight = \ + # MSDA.ms_deform_attn_backward( + # value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) + # + # return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, 
attention_weights): + # for debug and test only, + # need to use cuda version instead + N_, S_, M_, D_ = value.shape + _, Lq_, M_, L_, P_, _ = sampling_locations.shape + value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for lid_, (H_, W_) in enumerate(value_spatial_shapes): + # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ + value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) + # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 + sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) + # N_*M_, D_, Lq_, P_ + sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, + mode='bilinear', padding_mode='zeros', align_corners=False) + sampling_value_list.append(sampling_value_l_) + # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) + attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) + output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) + return output.transpose(1, 2).contiguous() diff --git a/custom_oneformer/modeling/pixel_decoder/ops/make.sh b/custom_oneformer/modeling/pixel_decoder/ops/make.sh new file mode 100644 index 0000000000000000000000000000000000000000..ca5c0b469da786c847ba04d437bb31ee0fc938da --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/make.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +FORCE_CUDA=1 python setup.py build install diff --git a/custom_oneformer/modeling/pixel_decoder/ops/modules/__init__.py b/custom_oneformer/modeling/pixel_decoder/ops/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6fdbf03359958f3d67ab00f879bf6b61a6c8f06a --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/modules/__init__.py @@ -0,0 +1,12 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +from .ms_deform_attn import MSDeformAttn diff --git a/custom_oneformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py b/custom_oneformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..5bc471d2da550c839a3446a6041e40d338425129 --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/modules/ms_deform_attn.py @@ -0,0 +1,120 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import warnings +import math + +import torch +from torch import nn +import torch.nn.functional as F +from torch.nn.init import xavier_uniform_, constant_ + +MSDeformAttnFunction = None +from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch + + +def _is_power_of_2(n): + if (not isinstance(n, int)) or (n < 0): + raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) + return (n & (n-1) == 0) and n != 0 + + +class MSDeformAttn(nn.Module): + def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): + """ + Multi-Scale Deformable Attention Module + :param d_model hidden dimension + :param n_levels number of feature levels + :param n_heads number of attention heads + :param n_points number of sampling points per attention head per feature level + """ + super().__init__() + if d_model % n_heads != 0: + raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) + _d_per_head = d_model // n_heads + # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation + if not _is_power_of_2(_d_per_head): + warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " + "which is more efficient in our CUDA implementation.") + + self.im2col_step = 128 + + self.d_model = d_model + self.n_levels = n_levels + self.n_heads = n_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) + self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) + self.value_proj = nn.Linear(d_model, d_model) + self.output_proj = nn.Linear(d_model, d_model) + + self._reset_parameters() + + def _reset_parameters(self): + constant_(self.sampling_offsets.weight.data, 0.) 
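+        # The bias computed below gives every head a distinct initial offset direction
+        # (one angle per head) and scales the i-th sampling point by (i + 1), so the
+        # initial sampling locations are spread around the reference point instead of
+        # collapsing onto it.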
+ thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + constant_(self.attention_weights.weight.data, 0.) + constant_(self.attention_weights.bias.data, 0.) + xavier_uniform_(self.value_proj.weight.data) + constant_(self.value_proj.bias.data, 0.) + xavier_uniform_(self.output_proj.weight.data) + constant_(self.output_proj.bias.data, 0.) + + def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): + """ + :param query (N, Length_{query}, C) + :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area + or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes + :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) + :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] + :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] + :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements + :return output (N, Length_{query}, C) + """ + N, Len_q, _ = query.shape + N, Len_in, _ = input_flatten.shape + assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in + + value = self.value_proj(input_flatten) + if input_padding_mask is not None: + value = value.masked_fill(input_padding_mask[..., None], float(0)) + value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) + attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) + attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) + # N, Len_q, n_heads, n_levels, n_points, 2 + if reference_points.shape[-1] == 2: + offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) + sampling_locations = reference_points[:, :, None, :, None, :] \ + + sampling_offsets / offset_normalizer[None, None, None, :, None, :] + elif reference_points.shape[-1] == 4: + sampling_locations = reference_points[:, :, None, :, None, :2] \ + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + else: + raise ValueError( + 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) + # try: + output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) + # # For FLOPs calculation only + # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) + output = self.output_proj(output) + return output \ No newline at end of file diff --git a/custom_oneformer/modeling/pixel_decoder/ops/setup.py b/custom_oneformer/modeling/pixel_decoder/ops/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..3b57ad313ac8f9b6586892142da8ba943e516cec --- /dev/null +++ 
b/custom_oneformer/modeling/pixel_decoder/ops/setup.py @@ -0,0 +1,78 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +import os +import glob + +import torch + +from torch.utils.cpp_extension import CUDA_HOME +from torch.utils.cpp_extension import CppExtension +from torch.utils.cpp_extension import CUDAExtension + +from setuptools import find_packages +from setuptools import setup + +requirements = ["torch", "torchvision"] + +def get_extensions(): + this_dir = os.path.dirname(os.path.abspath(__file__)) + extensions_dir = os.path.join(this_dir, "src") + + main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) + source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) + + sources = main_file + source_cpu + extension = CppExtension + extra_compile_args = {"cxx": []} + define_macros = [] + + # Force cuda since torch ask for a device, not if cuda is in fact available. + if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: + extension = CUDAExtension + sources += source_cuda + define_macros += [("WITH_CUDA", None)] + extra_compile_args["nvcc"] = [ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ] + else: + if CUDA_HOME is None: + raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') + else: + raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') + + sources = [os.path.join(extensions_dir, s) for s in sources] + include_dirs = [extensions_dir] + ext_modules = [ + extension( + "MultiScaleDeformableAttention", + sources, + include_dirs=include_dirs, + define_macros=define_macros, + extra_compile_args=extra_compile_args, + ) + ] + return ext_modules + +setup( + name="MultiScaleDeformableAttention", + version="1.0", + author="Weijie Su", + url="https://github.com/fundamentalvision/Deformable-DETR", + description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", + packages=find_packages(exclude=("configs", "tests",)), + ext_modules=get_extensions(), + cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, +) diff --git a/custom_oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp b/custom_oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp new file mode 100644 index 0000000000000000000000000000000000000000..48757e2b0156b2c1513b615d2a17e5aee5172ae7 --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.cpp @@ -0,0 +1,46 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#include <vector>
+
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+
+
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step)
+{
+    AT_ERROR("Not implemented on cpu");
+}
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step)
+{
+    AT_ERROR("Not implemented on cpu");
+}
+
diff --git a/custom_oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h b/custom_oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..51bb27e9ee828f967e8aa854c2d55574040c6d7e
--- /dev/null
+++ b/custom_oneformer/modeling/pixel_decoder/ops/src/cpu/ms_deform_attn_cpu.h
@@ -0,0 +1,38 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor
+ms_deform_attn_cpu_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+
+std::vector<at::Tensor>
+ms_deform_attn_cpu_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);
+
+
diff --git a/custom_oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu b/custom_oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0c465dab3d636dfd6a44523c63f148b6e15084d9
--- /dev/null
+++ b/custom_oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.cu
@@ -0,0 +1,158 @@
+/*!
+**************************************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +/*! +* Copyright (c) Facebook, Inc. and its affiliates. +* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR +*/ + +#include +#include "cuda/ms_deform_im2col_cuda.cuh" + +#include +#include +#include +#include + + +at::Tensor ms_deform_attn_cuda_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); + + const int batch_n = im2col_step_; + auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto columns = output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { + ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + columns.data()); + + })); + } + + output = output.view({batch, num_query, num_heads*channels}); + + return output; +} + + +std::vector ms_deform_attn_cuda_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int 
im2col_step) +{ + + AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); + AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); + AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); + AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); + AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); + AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); + + AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); + AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); + AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); + AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); + AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); + AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); + + const int batch = value.size(0); + const int spatial_size = value.size(1); + const int num_heads = value.size(2); + const int channels = value.size(3); + + const int num_levels = spatial_shapes.size(0); + + const int num_query = sampling_loc.size(1); + const int num_point = sampling_loc.size(4); + + const int im2col_step_ = std::min(batch, im2col_step); + + AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); + + auto grad_value = at::zeros_like(value); + auto grad_sampling_loc = at::zeros_like(sampling_loc); + auto grad_attn_weight = at::zeros_like(attn_weight); + + const int batch_n = im2col_step_; + auto per_value_size = spatial_size * num_heads * channels; + auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; + auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; + auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); + + for (int n = 0; n < batch/im2col_step_; ++n) + { + auto grad_output_g = grad_output_n.select(0, n); + AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { + ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), + grad_output_g.data(), + value.data() + n * im2col_step_ * per_value_size, + spatial_shapes.data(), + level_start_index.data(), + sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + attn_weight.data() + n * im2col_step_ * per_attn_weight_size, + batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, + grad_value.data() + n * im2col_step_ * per_value_size, + grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, + grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); + + })); + } + + return { + grad_value, grad_sampling_loc, grad_attn_weight + }; +} \ No newline at end of file diff --git a/custom_oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h b/custom_oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..4f0658e8668a11f0e7d71deff9adac71884f2e87 --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_attn_cuda.h @@ -0,0 +1,35 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. 
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************************************
+* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
+**************************************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
+*/
+
+#pragma once
+#include <torch/extension.h>
+
+at::Tensor ms_deform_attn_cuda_forward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const int im2col_step);
+
+std::vector<at::Tensor> ms_deform_attn_cuda_backward(
+    const at::Tensor &value,
+    const at::Tensor &spatial_shapes,
+    const at::Tensor &level_start_index,
+    const at::Tensor &sampling_loc,
+    const at::Tensor &attn_weight,
+    const at::Tensor &grad_output,
+    const int im2col_step);
+
diff --git a/custom_oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh b/custom_oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..c04e0d4ab97d25c1756fcd8d08dd1e5a6d280b7c
--- /dev/null
+++ b/custom_oneformer/modeling/pixel_decoder/ops/src/cuda/ms_deform_im2col_cuda.cuh
@@ -0,0 +1,1332 @@
+/*!
+**************************************************************************
+* Deformable DETR
+* Copyright (c) 2020 SenseTime. All Rights Reserved.
+* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
+**************************************************************************
+* Modified from DCN (https://github.com/msracver/Deformable-ConvNets)
+* Copyright (c) 2018 Microsoft
+**************************************************************************
+*/
+
+/*!
+* Copyright (c) Facebook, Inc. and its affiliates.
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR +*/ + +#include +#include +#include + +#include +#include + +#include + +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); \ + i += blockDim.x * gridDim.x) + +const int CUDA_NUM_THREADS = 1024; +inline int GET_BLOCKS(const int N, const int num_threads) +{ + return (N + num_threads - 1) / num_threads; +} + + +template +__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + } + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + return val; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + 
const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + *grad_attn_weight = top_grad * val; + *grad_sampling_loc = width * grad_w_weight * top_grad_value; + *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; +} + + +template +__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, + const int &height, const int &width, const int &nheads, const int &channels, + const scalar_t &h, const scalar_t &w, const int &m, const int &c, + const scalar_t &top_grad, + const scalar_t &attn_weight, + scalar_t* &grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int h_low = floor(h); + const int w_low = floor(w); + const int h_high = h_low + 1; + const int w_high = w_low + 1; + + const scalar_t lh = h - h_low; + const scalar_t lw = w - w_low; + const scalar_t hh = 1 - lh, hw = 1 - lw; + + const int w_stride = nheads * channels; + const int h_stride = width * w_stride; + const int h_low_ptr_offset = h_low * h_stride; + const int h_high_ptr_offset = h_low_ptr_offset + h_stride; + const int w_low_ptr_offset = w_low * w_stride; + const int w_high_ptr_offset = w_low_ptr_offset + w_stride; + const int base_ptr = m * channels + c; + + const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; + const scalar_t top_grad_value = top_grad * attn_weight; + scalar_t grad_h_weight = 0, grad_w_weight = 0; + + scalar_t v1 = 0; + if (h_low >= 0 && w_low >= 0) + { + const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; + v1 = bottom_data[ptr1]; + grad_h_weight -= hw * v1; + grad_w_weight -= hh * v1; + atomicAdd(grad_value+ptr1, w1*top_grad_value); + } + scalar_t v2 = 0; + if (h_low >= 0 && w_high <= width - 1) + { + const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; + v2 = bottom_data[ptr2]; + grad_h_weight -= lw * v2; + grad_w_weight += hh * v2; + atomicAdd(grad_value+ptr2, w2*top_grad_value); + } + scalar_t v3 = 0; + if (h_high <= height - 1 && w_low >= 0) + { + const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; + v3 = bottom_data[ptr3]; + grad_h_weight += hw * v3; + grad_w_weight -= lh * v3; + atomicAdd(grad_value+ptr3, w3*top_grad_value); + } + scalar_t v4 = 0; + if (h_high <= height - 1 && w_high <= width - 1) + { + const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; + v4 = bottom_data[ptr4]; + grad_h_weight += lw * v4; + grad_w_weight += lh * v4; + atomicAdd(grad_value+ptr4, w4*top_grad_value); + } + + const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); + atomicAdd(grad_attn_weight, top_grad * val); + atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); + atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); +} + + +template +__global__ void ms_deformable_im2col_gpu_kernel(const int n, + 
const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *data_col) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + scalar_t *data_col_ptr = data_col + index; + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + scalar_t col = 0; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; + } + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + } + } + *data_col_ptr = col; + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { 
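+      // For each feature level: read its (H, W) and the offset of its flattened values, then for every
+      // sampling point accumulate gradients w.r.t. the value tensor, the sampling location and the
+      // attention weight. Per-thread partial sums are staged in shared memory and reduced by thread 0.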
+ const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockSize; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; + __shared__ scalar_t cache_grad_attn_weight[blockSize]; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = 
l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockSize/2; s>0; s>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = 
data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + if (tid == 0) + { + scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; + int sid=2; + for (unsigned int tid = 1; tid < blockDim.x; ++tid) + { + _grad_w += cache_grad_sampling_loc[sid]; + _grad_h += cache_grad_sampling_loc[sid + 1]; + _grad_a += cache_grad_attn_weight[tid]; + sid += 2; + } + + + *grad_sampling_loc = _grad_w; + *(grad_sampling_loc + 1) = _grad_h; + *grad_attn_weight = _grad_a; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = 
l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + *grad_sampling_loc = cache_grad_sampling_loc[0]; + *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; + *grad_attn_weight = cache_grad_attn_weight[0]; + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + +template +__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + extern __shared__ int _s[]; + scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; + scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; + unsigned int tid = threadIdx.x; + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += 
grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; + *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; + *(cache_grad_attn_weight+threadIdx.x)=0; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); + } + + __syncthreads(); + + for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) + { + if (tid < s) { + const unsigned int xid1 = tid << 1; + const unsigned int xid2 = (tid + s) << 1; + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; + if (tid + (s << 1) < spre) + { + cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; + cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; + cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; + } + } + __syncthreads(); + } + + if (tid == 0) + { + atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); + atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); + atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); + } + __syncthreads(); + + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, + const scalar_t *grad_col, + const scalar_t *data_value, + const int64_t *data_spatial_shapes, + const int64_t *data_level_start_index, + const scalar_t *data_sampling_loc, + const scalar_t *data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t *grad_value, + scalar_t *grad_sampling_loc, + scalar_t *grad_attn_weight) +{ + CUDA_KERNEL_LOOP(index, n) + { + int _temp = index; + const int c_col = _temp % channels; + _temp /= channels; + const int sampling_index = _temp; + const int m_col = _temp % num_heads; + _temp /= num_heads; + const int q_col = _temp % num_query; + _temp /= num_query; + const int b_col = _temp; + + const scalar_t top_grad = grad_col[index]; + + int data_weight_ptr = sampling_index * num_levels * num_point; + 
int data_loc_w_ptr = data_weight_ptr << 1; + const int grad_sampling_ptr = data_weight_ptr; + grad_sampling_loc += grad_sampling_ptr << 1; + grad_attn_weight += grad_sampling_ptr; + const int grad_weight_stride = 1; + const int grad_loc_stride = 2; + const int qid_stride = num_heads * channels; + const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; + + for (int l_col=0; l_col < num_levels; ++l_col) + { + const int level_start_id = data_level_start_index[l_col]; + const int spatial_h_ptr = l_col << 1; + const int spatial_h = data_spatial_shapes[spatial_h_ptr]; + const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; + const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; + const scalar_t *data_value_ptr = data_value + value_ptr_offset; + scalar_t *grad_value_ptr = grad_value + value_ptr_offset; + + for (int p_col=0; p_col < num_point; ++p_col) + { + const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; + const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; + const scalar_t weight = data_attn_weight[data_weight_ptr]; + + const scalar_t h_im = loc_h * spatial_h - 0.5; + const scalar_t w_im = loc_w * spatial_w - 0.5; + if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) + { + ms_deform_attn_col2im_bilinear_gm( + data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, + top_grad, weight, grad_value_ptr, + grad_sampling_loc, grad_attn_weight); + } + data_weight_ptr += 1; + data_loc_w_ptr += 2; + grad_attn_weight += grad_weight_stride; + grad_sampling_loc += grad_loc_stride; + } + } + } +} + + +template +void ms_deformable_im2col_cuda(cudaStream_t stream, + const scalar_t* data_value, + const int64_t* data_spatial_shapes, + const int64_t* data_level_start_index, + const scalar_t* data_sampling_loc, + const scalar_t* data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* data_col) +{ + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + const int num_threads = CUDA_NUM_THREADS; + ms_deformable_im2col_gpu_kernel + <<>>( + num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, + batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); + + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); + } + +} + +template +void ms_deformable_col2im_cuda(cudaStream_t stream, + const scalar_t* grad_col, + const scalar_t* data_value, + const int64_t * data_spatial_shapes, + const int64_t * data_level_start_index, + const scalar_t * data_sampling_loc, + const scalar_t * data_attn_weight, + const int batch_size, + const int spatial_size, + const int num_heads, + const int channels, + const int num_levels, + const int num_query, + const int num_point, + scalar_t* grad_value, + scalar_t* grad_sampling_loc, + scalar_t* grad_attn_weight) +{ + const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; + const int num_kernels = batch_size * num_query * num_heads * channels; + const int num_actual_kernels = batch_size * num_query * num_heads * channels; + if (channels > 1024) + { + if ((channels & 1023) == 0) + { + 
ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_gm + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + else{ + switch(channels) + { + case 1: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 2: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 4: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 8: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 16: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 32: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 64: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 128: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + 
data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 256: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 512: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + case 1024: + ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + break; + default: + if (channels < 64) + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v1 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + else + { + ms_deformable_col2im_gpu_kernel_shm_reduce_v2 + <<>>( + num_kernels, + grad_col, + data_value, + data_spatial_shapes, + data_level_start_index, + data_sampling_loc, + data_attn_weight, + batch_size, + spatial_size, + num_heads, + channels, + num_levels, + num_query, + num_point, + grad_value, + grad_sampling_loc, + grad_attn_weight); + } + } + } + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) + { + printf("error in ms_deformable_col2im_cuda: %s\n", cudaGetErrorString(err)); + } + +} \ No newline at end of file diff --git a/custom_oneformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h b/custom_oneformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h new file mode 100644 index 0000000000000000000000000000000000000000..2f80a1b294c55b37d13bb3558ff7aeadba3b37de --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/src/ms_deform_attn.h @@ -0,0 +1,67 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +/*! +* Copyright (c) Facebook, Inc. and its affiliates. 
+* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR +*/ + +#pragma once + +#include "cpu/ms_deform_attn_cpu.h" + +#ifdef WITH_CUDA +#include "cuda/ms_deform_attn_cuda.h" +#endif + + +at::Tensor +ms_deform_attn_forward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const int im2col_step) +{ + if (value.type().is_cuda()) + { +#ifdef WITH_CUDA + return ms_deform_attn_cuda_forward( + value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + +std::vector +ms_deform_attn_backward( + const at::Tensor &value, + const at::Tensor &spatial_shapes, + const at::Tensor &level_start_index, + const at::Tensor &sampling_loc, + const at::Tensor &attn_weight, + const at::Tensor &grad_output, + const int im2col_step) +{ + if (value.type().is_cuda()) + { +#ifdef WITH_CUDA + return ms_deform_attn_cuda_backward( + value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); +#else + AT_ERROR("Not compiled with GPU support"); +#endif + } + AT_ERROR("Not implemented on the CPU"); +} + diff --git a/custom_oneformer/modeling/pixel_decoder/ops/src/vision.cpp b/custom_oneformer/modeling/pixel_decoder/ops/src/vision.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4a08821e0121a77556aa7a263ec8ebfa928b13b6 --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/src/vision.cpp @@ -0,0 +1,21 @@ +/*! +************************************************************************************************** +* Deformable DETR +* Copyright (c) 2020 SenseTime. All Rights Reserved. +* Licensed under the Apache License, Version 2.0 [see LICENSE for details] +************************************************************************************************** +* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +************************************************************************************************** +*/ + +/*! +* Copyright (c) Facebook, Inc. and its affiliates. +* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR +*/ + +#include "ms_deform_attn.h" + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); + m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); +} diff --git a/custom_oneformer/modeling/pixel_decoder/ops/test.py b/custom_oneformer/modeling/pixel_decoder/ops/test.py new file mode 100644 index 0000000000000000000000000000000000000000..6e1b545459f6fd3235767e721eb5a1090ae14bef --- /dev/null +++ b/custom_oneformer/modeling/pixel_decoder/ops/test.py @@ -0,0 +1,92 @@ +# ------------------------------------------------------------------------------------------------ +# Deformable DETR +# Copyright (c) 2020 SenseTime. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 [see LICENSE for details] +# ------------------------------------------------------------------------------------------------ +# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 +# ------------------------------------------------------------------------------------------------ + +# Copyright (c) Facebook, Inc. and its affiliates. 
+# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR + +from __future__ import absolute_import +from __future__ import print_function +from __future__ import division + +import time +import torch +import torch.nn as nn +from torch.autograd import gradcheck + +from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch + + +N, M, D = 1, 2, 2 +Lq, L, P = 2, 2, 2 +shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() +level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) +S = sum([(H*W).item() for H, W in shapes]) + + +torch.manual_seed(3) + + +@torch.no_grad() +def check_forward_equal_with_pytorch_double(): + value = torch.rand(N, S, M, D).cuda() * 0.01 + sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() + attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) + im2col_step = 2 + output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() + output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() + fwdok = torch.allclose(output_cuda, output_pytorch) + max_abs_err = (output_cuda - output_pytorch).abs().max() + max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() + + print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + +@torch.no_grad() +def check_forward_equal_with_pytorch_float(): + value = torch.rand(N, S, M, D).cuda() * 0.01 + sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() + attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) + im2col_step = 2 + output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() + output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() + fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) + max_abs_err = (output_cuda - output_pytorch).abs().max() + max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() + + print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') + + +def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): + + value = torch.rand(N, S, M, channels).cuda() * 0.01 + sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() + attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 + attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) + im2col_step = 2 + func = MSDeformAttnFunction.apply + + value.requires_grad = grad_value + sampling_locations.requires_grad = grad_sampling_loc + attention_weights.requires_grad = grad_attn_weight + + gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) + + print(f'* {gradok} check_gradient_numerical(D={channels})') + + +if __name__ == '__main__': + check_forward_equal_with_pytorch_double() + check_forward_equal_with_pytorch_float() + + for channels in [30, 32, 64, 71, 1025, 2048, 3096]: + 
check_gradient_numerical(channels, True, True, True)
+
+
+
diff --git a/custom_oneformer/modeling/transformer_decoder/__init__.py b/custom_oneformer/modeling/transformer_decoder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b84bd4ecb48f134ccc218c4d5f02c50f7033bcd9
--- /dev/null
+++ b/custom_oneformer/modeling/transformer_decoder/__init__.py
@@ -0,0 +1,2 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+from .oneformer_transformer_decoder import ContrastiveMultiScaleMaskedTransformerDecoder
\ No newline at end of file
diff --git a/custom_oneformer/modeling/transformer_decoder/oneformer_transformer_decoder.py b/custom_oneformer/modeling/transformer_decoder/oneformer_transformer_decoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..39374b77578b1676f0976d0e8bd2ec048db4785c
--- /dev/null
+++ b/custom_oneformer/modeling/transformer_decoder/oneformer_transformer_decoder.py
@@ -0,0 +1,528 @@
+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/mask2former_transformer_decoder.py
+# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+# ------------------------------------------------------------------------------
+
+import logging
+import fvcore.nn.weight_init as weight_init
+from typing import Optional
+import torch
+from torch import nn, Tensor
+from torch.nn import functional as F
+
+from custom_detectron2.config import configurable
+from custom_detectron2.layers import Conv2d
+
+from .position_encoding import PositionEmbeddingSine
+from .transformer import Transformer
+
+from custom_detectron2.utils.registry import Registry
+
+
+TRANSFORMER_DECODER_REGISTRY = Registry("TRANSFORMER_MODULE")
+TRANSFORMER_DECODER_REGISTRY.__doc__ = """
+Registry for transformer module in OneFormer.
+"""
+
+
+def build_transformer_decoder(cfg, in_channels, mask_classification=True):
+    """
+    Build the OneFormer transformer decoder named by `cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME`.
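+    The name is resolved through `TRANSFORMER_DECODER_REGISTRY`, so alternative decoder
+    implementations can be registered and selected from the config.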
+ """ + name = cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME + return TRANSFORMER_DECODER_REGISTRY.get(name)(cfg, in_channels, mask_classification) + + +class SelfAttentionLayer(nn.Module): + + def __init__(self, d_model, nhead, dropout=0.0, + activation="relu", normalize_before=False): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt, + tgt_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + def forward_pre(self, tgt, + tgt_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, + key_padding_mask=tgt_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + + return tgt + + def forward(self, tgt, + tgt_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, tgt_mask, + tgt_key_padding_mask, query_pos) + return self.forward_post(tgt, tgt_mask, + tgt_key_padding_mask, query_pos) + + +class CrossAttentionLayer(nn.Module): + + def __init__(self, d_model, nhead, dropout=0.0, + activation="relu", normalize_before=False): + super().__init__() + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + + self.norm = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt, memory, + memory_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + + return tgt + + def forward_pre(self, tgt, memory, + memory_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + tgt2 = self.norm(tgt) + tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask)[0] + tgt = tgt + self.dropout(tgt2) + + return tgt + + def forward(self, tgt, memory, + memory_mask: 
Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None): + if self.normalize_before: + return self.forward_pre(tgt, memory, memory_mask, + memory_key_padding_mask, pos, query_pos) + return self.forward_post(tgt, memory, memory_mask, + memory_key_padding_mask, pos, query_pos) + + +class FFNLayer(nn.Module): + + def __init__(self, d_model, dim_feedforward=2048, dropout=0.0, + activation="relu", normalize_before=False): + super().__init__() + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm = nn.LayerNorm(d_model) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post(self, tgt): + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout(tgt2) + tgt = self.norm(tgt) + return tgt + + def forward_pre(self, tgt): + tgt2 = self.norm(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout(tgt2) + return tgt + + def forward(self, tgt): + if self.normalize_before: + return self.forward_pre(tgt) + return self.forward_post(tgt) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(F"activation should be relu/gelu, not {activation}.") + + +class MLP(nn.Module): + """ Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, input_dim, hidden_dim, output_dim, num_layers): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x + + +@TRANSFORMER_DECODER_REGISTRY.register() +class ContrastiveMultiScaleMaskedTransformerDecoder(nn.Module): + + _version = 2 + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + version = local_metadata.get("version", None) + if version is None or version < 2: + # Do not warn if train from scratch + scratch = True + logger = logging.getLogger(__name__) + for k in list(state_dict.keys()): + newk = k + if "static_query" in k: + newk = k.replace("static_query", "query_feat") + if newk != k: + state_dict[newk] = state_dict[k] + del state_dict[k] + scratch = False + + if not scratch: + logger.warning( + f"Weight format of {self.__class__.__name__} have changed! " + "Please upgrade your models. Applying automatic conversion now ..." 
+ ) + + @configurable + def __init__( + self, + in_channels, + mask_classification=True, + *, + num_classes: int, + hidden_dim: int, + num_queries: int, + nheads: int, + dropout: float, + dim_feedforward: int, + enc_layers: int, + is_train: bool, + dec_layers: int, + class_dec_layers: int, + pre_norm: bool, + mask_dim: int, + enforce_input_project: bool, + use_task_norm: bool, + ): + """ + NOTE: this interface is experimental. + Args: + in_channels: channels of the input features + mask_classification: whether to add mask classifier or not + num_classes: number of classes + hidden_dim: Transformer feature dimension + num_queries: number of queries + nheads: number of heads + dim_feedforward: feature dimension in feedforward network + enc_layers: number of Transformer encoder layers + dec_layers: number of Transformer decoder layers + pre_norm: whether to use pre-LayerNorm or not + mask_dim: mask feature dimension + enforce_input_project: add input project 1x1 conv even if input + channels and hidden dim is identical + """ + super().__init__() + + assert mask_classification, "Only support mask classification model" + self.mask_classification = mask_classification + self.is_train = is_train + self.use_task_norm = use_task_norm + + # positional encoding + N_steps = hidden_dim // 2 + self.pe_layer = PositionEmbeddingSine(N_steps, normalize=True) + + self.class_transformer = Transformer( + d_model=hidden_dim, + dropout=dropout, + nhead=nheads, + dim_feedforward=dim_feedforward, + num_encoder_layers=enc_layers, + num_decoder_layers=class_dec_layers, + normalize_before=pre_norm, + return_intermediate_dec=False, + ) + + # define Transformer decoder here + self.num_heads = nheads + self.num_layers = dec_layers + self.transformer_self_attention_layers = nn.ModuleList() + self.transformer_cross_attention_layers = nn.ModuleList() + self.transformer_ffn_layers = nn.ModuleList() + + for _ in range(self.num_layers): + self.transformer_self_attention_layers.append( + SelfAttentionLayer( + d_model=hidden_dim, + nhead=nheads, + dropout=0.0, + normalize_before=pre_norm, + ) + ) + + self.transformer_cross_attention_layers.append( + CrossAttentionLayer( + d_model=hidden_dim, + nhead=nheads, + dropout=0.0, + normalize_before=pre_norm, + ) + ) + + self.transformer_ffn_layers.append( + FFNLayer( + d_model=hidden_dim, + dim_feedforward=dim_feedforward, + dropout=0.0, + normalize_before=pre_norm, + ) + ) + + self.decoder_norm = nn.LayerNorm(hidden_dim) + + self.num_queries = num_queries + # learnable query p.e. 
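+        # One embedding per object query, used as the query positional encoding in the decoder layers;
+        # forward() also reuses all but the last slot (reserved for the task token) to seed the class transformer.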
+ self.query_embed = nn.Embedding(num_queries, hidden_dim) + + # level embedding (we always use 3 scales) + self.num_feature_levels = 3 + self.level_embed = nn.Embedding(self.num_feature_levels, hidden_dim) + self.input_proj = nn.ModuleList() + for _ in range(self.num_feature_levels): + if in_channels != hidden_dim or enforce_input_project: + self.input_proj.append(Conv2d(in_channels, hidden_dim, kernel_size=1)) + weight_init.c2_xavier_fill(self.input_proj[-1]) + else: + self.input_proj.append(nn.Sequential()) + + self.class_input_proj = Conv2d(in_channels, hidden_dim, kernel_size=1) + weight_init.c2_xavier_fill(self.class_input_proj) + + # output FFNs + if self.mask_classification: + self.class_embed = nn.Linear(hidden_dim, num_classes + 1) + self.mask_embed = MLP(hidden_dim, hidden_dim, mask_dim, 3) + + @classmethod + def from_config(cls, cfg, in_channels, mask_classification): + ret = {} + ret["in_channels"] = in_channels + ret["mask_classification"] = mask_classification + + ret["num_classes"] = cfg.MODEL.SEM_SEG_HEAD.NUM_CLASSES + ret["hidden_dim"] = cfg.MODEL.ONE_FORMER.HIDDEN_DIM + ret["num_queries"] = cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES + # Transformer parameters: + ret["nheads"] = cfg.MODEL.ONE_FORMER.NHEADS + ret["dim_feedforward"] = cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD + + # NOTE: because we add learnable query features which requires supervision, + # we add minus 1 to decoder layers to be consistent with our loss + # implementation: that is, number of auxiliary losses is always + # equal to number of decoder layers. With learnable query features, the number of + # auxiliary losses equals number of decoders plus 1. + assert cfg.MODEL.ONE_FORMER.DEC_LAYERS >= 1 + ret["dec_layers"] = cfg.MODEL.ONE_FORMER.DEC_LAYERS - 1 + ret["class_dec_layers"] = cfg.MODEL.ONE_FORMER.CLASS_DEC_LAYERS + ret["enc_layers"] = cfg.MODEL.ONE_FORMER.ENC_LAYERS + ret["dropout"] = cfg.MODEL.ONE_FORMER.DROPOUT + ret["pre_norm"] = cfg.MODEL.ONE_FORMER.PRE_NORM + ret["enforce_input_project"] = cfg.MODEL.ONE_FORMER.ENFORCE_INPUT_PROJ + ret["is_train"] = cfg.MODEL.IS_TRAIN + ret["mask_dim"] = cfg.MODEL.SEM_SEG_HEAD.MASK_DIM + ret["use_task_norm"] = cfg.MODEL.ONE_FORMER.USE_TASK_NORM + + return ret + + def forward(self, x, mask_features, tasks, mask = None): + # x is a list of multi-scale feature + assert len(x) == self.num_feature_levels + src = [] + pos = [] + size_list = [] + + # disable mask, it does not affect performance + del mask + + for i in range(self.num_feature_levels): + size_list.append(x[i].shape[-2:]) + pos.append(self.pe_layer(x[i], None).flatten(2)) + src.append(self.input_proj[i](x[i]).flatten(2) + self.level_embed.weight[i][None, :, None]) + + # flatten NxCxHxW to HWxNxC + pos[-1] = pos[-1].permute(2, 0, 1) + src[-1] = src[-1].permute(2, 0, 1) + + _, bs, _ = src[0].shape + + # QxNxC + query_embed = self.query_embed.weight.unsqueeze(1).repeat(1, bs, 1) + tasks = tasks.unsqueeze(0) + if self.use_task_norm: + tasks = self.decoder_norm(tasks) + + feats = self.pe_layer(mask_features, None) + + out_t, _ = self.class_transformer(feats, None, + self.query_embed.weight[:-1], + self.class_input_proj(mask_features), + tasks if self.use_task_norm else None) + out_t = out_t[0].permute(1, 0, 2) + + out = torch.cat([out_t, tasks], dim=0) + + output = out.clone() + + predictions_class = [] + predictions_mask = [] + + # prediction heads on learnable query features + outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[0], i=0) + 
predictions_class.append(outputs_class) + predictions_mask.append(outputs_mask) + + for i in range(self.num_layers): + level_index = i % self.num_feature_levels + attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False + # attention: cross-attention first + output = self.transformer_cross_attention_layers[i]( + output, src[level_index], + memory_mask=attn_mask, + memory_key_padding_mask=None, # here we do not apply masking on padded region + pos=pos[level_index], query_pos=query_embed + ) + + output = self.transformer_self_attention_layers[i]( + output, tgt_mask=None, + tgt_key_padding_mask=None, + query_pos=query_embed + ) + + # FFN + output = self.transformer_ffn_layers[i]( + output + ) + + outputs_class, outputs_mask, attn_mask = self.forward_prediction_heads(output, mask_features, attn_mask_target_size=size_list[(i + 1) % self.num_feature_levels], i=i+1) + predictions_class.append(outputs_class) + predictions_mask.append(outputs_mask) + + assert len(predictions_class) == self.num_layers + 1 + if self.is_train: + query_class = out.permute(1, 0, 2) + else: + query_class = None + out = { + 'contrastive_logits': query_class, + 'pred_logits': predictions_class[-1], + 'pred_masks': predictions_mask[-1], + 'aux_outputs': self._set_aux_loss( + predictions_class if self.mask_classification else None, + predictions_mask, + ) + } + + return out + + def forward_prediction_heads(self, output, mask_features, attn_mask_target_size, i): + decoder_output = self.decoder_norm(output) + decoder_output = decoder_output.transpose(0, 1) + outputs_class = self.class_embed(decoder_output) + mask_embed = self.mask_embed(decoder_output) + outputs_mask = torch.einsum("bqc,bchw->bqhw", mask_embed, mask_features) + + # NOTE: prediction is of higher-resolution + # [B, Q, H, W] -> [B, Q, H*W] -> [B, h, Q, H*W] -> [B*h, Q, HW] + attn_mask = F.interpolate(outputs_mask, size=attn_mask_target_size, mode="bilinear", align_corners=False) + + # save_attn_masks(attn_mask.sigmoid() < 0.5, fname=f'demo/maps/{i}_pre_bool') + + # must use bool type + # If a BoolTensor is provided, positions with ``True`` are not allowed to attend while ``False`` values will be unchanged. + attn_mask = (attn_mask.sigmoid().flatten(2).unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1) < 0.5).bool() + attn_mask = attn_mask.detach() + + return outputs_class, outputs_mask, attn_mask + + @torch.jit.unused + def _set_aux_loss(self, outputs_class, outputs_seg_masks): + # this is a workaround to make torchscript happy, as torchscript + # doesn't support dictionary with non-homogeneous values, such + # as a dict having both a Tensor and a list. 
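+        # Illustrative structure of the returned list (one dict per
+        # intermediate decoder layer; final-layer outputs are reported
+        # separately in forward()):
+        #   [{"pred_logits": (B, Q, num_classes + 1), "pred_masks": (B, Q, H, W)}, ...]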
+ if self.mask_classification: + aux_list = [ + {"pred_logits": a, "pred_masks": b} + for a, b in zip(outputs_class[:-1], outputs_seg_masks[:-1]) + ] + else: + aux_list = [{"pred_masks": b} for b, in outputs_seg_masks[:-1]] + + return aux_list \ No newline at end of file diff --git a/custom_oneformer/modeling/transformer_decoder/position_encoding.py b/custom_oneformer/modeling/transformer_decoder/position_encoding.py new file mode 100644 index 0000000000000000000000000000000000000000..051984d9ea6e04e834f6fae3daf7d8317c2f0819 --- /dev/null +++ b/custom_oneformer/modeling/transformer_decoder/position_encoding.py @@ -0,0 +1,67 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/position_encoding.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +""" +Various positional encodings for the transformer. +""" +import math + +import torch +from torch import nn + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + + def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): + super().__init__() + self.num_pos_feats = num_pos_feats + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + def forward(self, x, mask=None): + if mask is None: + mask = torch.zeros((x.size(0), x.size(2), x.size(3)), device=x.device, dtype=torch.bool) + not_mask = ~mask + y_embed = not_mask.cumsum(1, dtype=torch.float32) + x_embed = not_mask.cumsum(2, dtype=torch.float32) + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + def __repr__(self, _repr_indent=4): + head = "Positional encoding " + self.__class__.__name__ + body = [ + "num_pos_feats: {}".format(self.num_pos_feats), + "temperature: {}".format(self.temperature), + "normalize: {}".format(self.normalize), + "scale: {}".format(self.scale), + ] + # _repr_indent = 4 + lines = [head] + [" " * _repr_indent + line for line in body] + return "\n".join(lines) diff --git a/custom_oneformer/modeling/transformer_decoder/text_transformer.py b/custom_oneformer/modeling/transformer_decoder/text_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..2edc3058d14f83a7d1c07e78e27cd3a73bb47090 --- /dev/null +++ b/custom_oneformer/modeling/transformer_decoder/text_transformer.py @@ -0,0 +1,257 @@ +# ------------------------------------------------------------------------- +# MIT License +# +# Copyright (c) 2021 OpenAI +# +# 
Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +# ------------------------------------------------------------------------- + +import torch +import torch.utils.checkpoint as checkpoint +from torch import nn +from collections import OrderedDict +from custom_timm.models.layers import trunc_normal_ + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.q_proj = nn.Linear(dim, dim, bias=qkv_bias) + self.k_proj = nn.Linear(dim, dim, bias=qkv_bias) + self.v_proj = nn.Linear(dim, dim, bias=qkv_bias) + + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, q, k, v): + B, N, C = q.shape + assert k.shape == v.shape + B, M, C = k.shape + q = self.q_proj(q).reshape(B, N, self.num_heads, C // self.num_heads) + k = self.k_proj(k).reshape(B, M, self.num_heads, C // self.num_heads) + v = self.v_proj(v).reshape(B, M, self.num_heads, C // self.num_heads) + + attn = torch.einsum('bnkc,bmkc->bknm', q, k) * self.scale + + attn = attn.softmax(dim=-1) + + x = torch.einsum('bknm,bmkc->bnkc', attn, v).reshape(B, N, C) + + x = self.proj(x) + x = self.proj_drop(x) + return x + +class TransformerDecoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dropout=0.1, + ): + super().__init__() + self.self_attn = Attention(d_model, nhead, proj_drop=dropout) + self.cross_attn = Attention(d_model, nhead, proj_drop=dropout) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout = nn.Dropout(dropout) + + self.mlp = nn.Sequential( + nn.Linear(d_model, d_model * 4), + nn.GELU(), + nn.Dropout(dropout), + nn.Linear(d_model * 4, d_model) + ) + + def forward(self, x, mem): + q = k = v = self.norm1(x) + x = x + self.self_attn(q, k, v) + q = self.norm2(x) + x = x + self.cross_attn(q, mem, mem) + x = x + self.dropout(self.mlp(self.norm3(x))) + return x + + +class ContextDecoder(nn.Module): + def __init__(self, + transformer_width=256, + transformer_heads=4, + transformer_layers=6, + visual_dim=1024, + dropout=0.1, + **kwargs): + super().__init__() + + self.memory_proj = nn.Sequential( + nn.LayerNorm(visual_dim), + nn.Linear(visual_dim, transformer_width), 
+ nn.LayerNorm(transformer_width), + ) + + self.text_proj = nn.Sequential( + nn.LayerNorm(visual_dim), + nn.Linear(visual_dim, transformer_width), + ) + + self.decoder = nn.ModuleList([ + TransformerDecoderLayer(transformer_width, transformer_heads, dropout) for _ in range(transformer_layers) + ]) + + self.out_proj = nn.Sequential( + nn.LayerNorm(transformer_width), + nn.Linear(transformer_width, visual_dim) + ) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + + def forward(self, text, visual): + B, N, C = visual.shape + visual = self.memory_proj(visual) + x = self.text_proj(text) + + for layer in self.decoder: + x = layer(x, visual) + + return self.out_proj(x) + + +class QuickGELU(nn.Module): + + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + + def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = nn.LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = nn.LayerNorm(d_model) + self.attn_mask = attn_mask + + def attention(self, x: torch.Tensor, key_padding_mask: torch.Tensor): + self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None + return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask, key_padding_mask=key_padding_mask)[0] + + def forward(self, x: torch.Tensor, key_padding_mask=None): + x = x + self.attention(self.ln_1(x), key_padding_mask=key_padding_mask) + x = x + self.mlp(self.ln_2(x)) + return x + +class Transformer(nn.Module): + + def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, use_checkpoint=False): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]) + proj_std = (self.width**-0.5) * ((2 * self.layers)**-0.5) + attn_std = self.width**-0.5 + fc_std = (2 * self.width)**-0.5 + for block in self.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std) + + self.use_checkpoint = use_checkpoint + + def forward(self, x: torch.Tensor): + for resblock in self.resblocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(resblock, x) + else: + x = resblock(x) + return x + + +class TextTransformer(nn.Module): + + def __init__( + self, + context_length: int, + width: int, + layers: int, + vocab_size, + use_checkpoint=False, + ): + + super().__init__() + heads = width // 64 + self.context_length = context_length + self.width = width + self.transformer = Transformer( + width=width, + layers=layers, + heads=heads, + attn_mask=self.build_attention_mask(), + use_checkpoint=use_checkpoint) + + self.positional_embedding = nn.Parameter(torch.empty(self.context_length, width)) + self.ln_final = nn.LayerNorm(width) + self.token_embedding = nn.Embedding(vocab_size, width) + 
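+        # CLIP-style initialization (the normal_ calls below): token
+        # embeddings are drawn from N(0, 0.02**2) and positional embeddings
+        # from N(0, 0.01**2).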
nn.init.normal_(self.token_embedding.weight, std=0.02) + + # initialization + nn.init.normal_(self.positional_embedding, std=0.01) + + def build_attention_mask(self): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + def forward(self, text): + x = self.token_embedding(text) + x = x + self.positional_embedding + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] + + return x \ No newline at end of file diff --git a/custom_oneformer/modeling/transformer_decoder/transformer.py b/custom_oneformer/modeling/transformer_decoder/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..cd07525673b9b1165e1fdd0c9990a8f29c84f199 --- /dev/null +++ b/custom_oneformer/modeling/transformer_decoder/transformer.py @@ -0,0 +1,376 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/modeling/transformer_decoder/transformer.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +""" +Transformer class. + +Copy-paste from torch.nn.Transformer with modifications: + * positional encodings are passed in MHattention + * extra LN at the end of encoder is removed + * decoder returns a stack of activations from all decoding layers +""" +import copy +from typing import List, Optional + +import torch +import torch.nn.functional as F +from torch import Tensor, nn + + +class Transformer(nn.Module): + def __init__( + self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + return_intermediate_dec=False, + ): + super().__init__() + + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + encoder_norm = nn.LayerNorm(d_model) if normalize_before else None + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) + + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation, normalize_before + ) + decoder_norm = nn.LayerNorm(d_model) + self.decoder = TransformerDecoder( + decoder_layer, + num_decoder_layers, + decoder_norm, + return_intermediate=return_intermediate_dec, + ) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + nn.init.xavier_uniform_(p) + + def forward(self, src, mask, query_embed, pos_embed, task_token=None): + # flatten NxCxHxW to HWxNxC + bs, c, h, w = src.shape + src = src.flatten(2).permute(2, 0, 1) + pos_embed = pos_embed.flatten(2).permute(2, 0, 1) + query_embed = query_embed.unsqueeze(1).repeat(1, bs, 1) + if mask is not None: + mask = mask.flatten(1) + + if task_token is None: + tgt = torch.zeros_like(query_embed) + else: + tgt = task_token.repeat(query_embed.shape[0], 1, 1) + + memory = self.encoder(src, 
src_key_padding_mask=mask, pos=pos_embed) + hs = self.decoder( + tgt, memory, memory_key_padding_mask=mask, pos=pos_embed, query_pos=query_embed + ) + return hs.transpose(1, 2), memory.permute(1, 2, 0).view(bs, c, h, w) + + +class TransformerEncoder(nn.Module): + def __init__(self, encoder_layer, num_layers, norm=None): + super().__init__() + self.layers = _get_clones(encoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + + def forward( + self, + src, + mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + output = src + + for layer in self.layers: + output = layer( + output, src_mask=mask, src_key_padding_mask=src_key_padding_mask, pos=pos + ) + + if self.norm is not None: + output = self.norm(output) + + return output + + +class TransformerDecoder(nn.Module): + def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): + super().__init__() + self.layers = _get_clones(decoder_layer, num_layers) + self.num_layers = num_layers + self.norm = norm + self.return_intermediate = return_intermediate + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + output = tgt + + intermediate = [] + + for layer in self.layers: + output = layer( + output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + tgt_key_padding_mask=tgt_key_padding_mask, + memory_key_padding_mask=memory_key_padding_mask, + pos=pos, + query_pos=query_pos, + ) + if self.return_intermediate: + intermediate.append(self.norm(output)) + + if self.norm is not None: + output = self.norm(output) + if self.return_intermediate: + intermediate.pop() + intermediate.append(output) + + if self.return_intermediate: + return torch.stack(intermediate) + + return output.unsqueeze(0) + + +class TransformerEncoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + q = k = self.with_pos_embed(src, pos) + src2 = self.self_attn( + q, k, value=src, attn_mask=src_mask, key_padding_mask=src_key_padding_mask + )[0] + src = src + self.dropout1(src2) + src = self.norm1(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = src + self.dropout2(src2) + src = self.norm2(src) + return src + + def forward_pre( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + src2 = self.norm1(src) + q = k = 
self.with_pos_embed(src2, pos) + src2 = self.self_attn( + q, k, value=src2, attn_mask=src_mask, key_padding_mask=src_key_padding_mask + )[0] + src = src + self.dropout1(src2) + src2 = self.norm2(src) + src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) + src = src + self.dropout2(src2) + return src + + def forward( + self, + src, + src_mask: Optional[Tensor] = None, + src_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + ): + if self.normalize_before: + return self.forward_pre(src, src_mask, src_key_padding_mask, pos) + return self.forward_post(src, src_mask, src_key_padding_mask, pos) + + +class TransformerDecoderLayer(nn.Module): + def __init__( + self, + d_model, + nhead, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + normalize_before=False, + ): + super().__init__() + self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation = _get_activation_fn(activation) + self.normalize_before = normalize_before + + def with_pos_embed(self, tensor, pos: Optional[Tensor]): + return tensor if pos is None else tensor + pos + + def forward_post( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + q = k = self.with_pos_embed(tgt, query_pos) + tgt2 = self.self_attn( + q, k, value=tgt, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask + )[0] + tgt = tgt + self.dropout1(tgt2) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + )[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + + def forward_pre( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + tgt2 = self.norm1(tgt) + q = k = self.with_pos_embed(tgt2, query_pos) + tgt2 = self.self_attn( + q, k, value=tgt2, attn_mask=tgt_mask, key_padding_mask=tgt_key_padding_mask + )[0] + tgt = tgt + self.dropout1(tgt2) + tgt2 = self.norm2(tgt) + tgt2 = self.multihead_attn( + query=self.with_pos_embed(tgt2, query_pos), + key=self.with_pos_embed(memory, pos), + value=memory, + attn_mask=memory_mask, + key_padding_mask=memory_key_padding_mask, + )[0] + tgt = tgt + self.dropout2(tgt2) + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + def forward( + self, + tgt, + memory, + tgt_mask: Optional[Tensor] = None, + memory_mask: 
Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + ): + if self.normalize_before: + return self.forward_pre( + tgt, + memory, + tgt_mask, + memory_mask, + tgt_key_padding_mask, + memory_key_padding_mask, + pos, + query_pos, + ) + return self.forward_post( + tgt, + memory, + tgt_mask, + memory_mask, + tgt_key_padding_mask, + memory_key_padding_mask, + pos, + query_pos, + ) + + +def _get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +def _get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") diff --git a/custom_oneformer/oneformer_model.py b/custom_oneformer/oneformer_model.py new file mode 100644 index 0000000000000000000000000000000000000000..1a84485f86f57d4794a48ba8a93c742a9004f601 --- /dev/null +++ b/custom_oneformer/oneformer_model.py @@ -0,0 +1,470 @@ +# ------------------------------------------------------------------------------ +# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/maskformer_model.py +# Modified by Jitesh Jain (https://github.com/praeclarumjj3) +# ------------------------------------------------------------------------------ + +from typing import Tuple + +import torch +from torch import nn +from torch.nn import functional as F + +from custom_detectron2.config import configurable +from custom_detectron2.data import MetadataCatalog +from custom_detectron2.modeling import META_ARCH_REGISTRY, build_backbone, build_sem_seg_head +from custom_detectron2.modeling.backbone import Backbone +from custom_detectron2.modeling.postprocessing import sem_seg_postprocess +from custom_detectron2.structures import Boxes, ImageList, Instances, BitMasks +from custom_detectron2.utils.memory import retry_if_cuda_oom + +from .modeling.matcher import HungarianMatcher +from einops import rearrange +from .modeling.transformer_decoder.text_transformer import TextTransformer +from .modeling.transformer_decoder.oneformer_transformer_decoder import MLP +from custom_oneformer.data.tokenizer import SimpleTokenizer, Tokenize + +@META_ARCH_REGISTRY.register() +class OneFormer(nn.Module): + """ + Main class for mask classification semantic segmentation architectures. 
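+
+    A single set of weights serves semantic, instance and panoptic segmentation:
+    every input dict carries a "task" string that is tokenized, embedded by a
+    small MLP and used to condition the transformer decoder (see forward()).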
+ """ + + @configurable + def __init__( + self, + *, + backbone: Backbone, + sem_seg_head: nn.Module, + task_mlp: nn.Module, + text_encoder: nn.Module, + text_projector: nn.Module, + prompt_ctx: nn.Embedding, + num_queries: int, + object_mask_threshold: float, + overlap_threshold: float, + metadata, + size_divisibility: int, + sem_seg_postprocess_before_inference: bool, + pixel_mean: Tuple[float], + pixel_std: Tuple[float], + # inference + semantic_on: bool, + panoptic_on: bool, + instance_on: bool, + detection_on: bool, + test_topk_per_image: int, + task_seq_len: int, + max_seq_len: int, + is_demo: bool, + ): + """ + Args: + backbone: a backbone module, must follow detectron2's backbone interface + sem_seg_head: a module that predicts semantic segmentation from backbone features + criterion: a module that defines the loss + num_queries: int, number of queries + object_mask_threshold: float, threshold to filter query based on classification score + for panoptic segmentation inference + overlap_threshold: overlap threshold used in general inference for panoptic segmentation + metadata: dataset meta, get `thing` and `stuff` category names for panoptic + segmentation inference + size_divisibility: Some backbones require the input height and width to be divisible by a + specific integer. We can use this to override such requirement. + sem_seg_postprocess_before_inference: whether to resize the prediction back + to original input size before semantic segmentation inference or after. + For high-resolution dataset like Mapillary, resizing predictions before + inference will cause OOM error. + pixel_mean, pixel_std: list or tuple with #channels element, representing + the per-channel mean and std to be used to normalize the input image + semantic_on: bool, whether to output semantic segmentation prediction + instance_on: bool, whether to output instance segmentation prediction + panoptic_on: bool, whether to output panoptic segmentation prediction + test_topk_per_image: int, instance segmentation parameter, keep topk instances per image + """ + super().__init__() + self.backbone = backbone + self.sem_seg_head = sem_seg_head + self.task_mlp = task_mlp + self.text_encoder = text_encoder + self.text_projector = text_projector + self.prompt_ctx = prompt_ctx + self.num_queries = num_queries + self.overlap_threshold = overlap_threshold + self.object_mask_threshold = object_mask_threshold + self.metadata = metadata + if size_divisibility < 0: + # use backbone size_divisibility if not set + size_divisibility = self.backbone.size_divisibility + self.size_divisibility = size_divisibility + self.sem_seg_postprocess_before_inference = sem_seg_postprocess_before_inference + self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) + + # additional args + self.semantic_on = semantic_on + self.instance_on = instance_on + self.panoptic_on = panoptic_on + self.detection_on = detection_on + self.test_topk_per_image = test_topk_per_image + + self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len) + self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len) + self.is_demo = is_demo + + self.thing_indices = [k for k in self.metadata.thing_dataset_id_to_contiguous_id.keys()] + + if not self.semantic_on: + assert self.sem_seg_postprocess_before_inference + + @classmethod + def from_config(cls, cfg): + backbone = build_backbone(cfg) + sem_seg_head = build_sem_seg_head(cfg, 
backbone.output_shape()) + + if cfg.MODEL.IS_TRAIN: + text_encoder = TextTransformer(context_length=cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH, + width=cfg.MODEL.TEXT_ENCODER.WIDTH, + layers=cfg.MODEL.TEXT_ENCODER.NUM_LAYERS, + vocab_size=cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE) + text_projector = MLP(text_encoder.width, cfg.MODEL.ONE_FORMER.HIDDEN_DIM, + cfg.MODEL.ONE_FORMER.HIDDEN_DIM, cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS) + if cfg.MODEL.TEXT_ENCODER.N_CTX > 0: + prompt_ctx = nn.Embedding(cfg.MODEL.TEXT_ENCODER.N_CTX, cfg.MODEL.TEXT_ENCODER.WIDTH) + else: + prompt_ctx = None + else: + text_encoder = None + text_projector = None + prompt_ctx = None + + task_mlp = MLP(cfg.INPUT.TASK_SEQ_LEN, cfg.MODEL.ONE_FORMER.HIDDEN_DIM, + cfg.MODEL.ONE_FORMER.HIDDEN_DIM, 2) + + # Loss parameters: + deep_supervision = cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION + no_object_weight = cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT + + # loss weights + class_weight = cfg.MODEL.ONE_FORMER.CLASS_WEIGHT + dice_weight = cfg.MODEL.ONE_FORMER.DICE_WEIGHT + mask_weight = cfg.MODEL.ONE_FORMER.MASK_WEIGHT + contrastive_weight = cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT + + # building criterion + matcher = HungarianMatcher( + cost_class=class_weight, + cost_mask=mask_weight, + cost_dice=dice_weight, + num_points=cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS, + ) + + weight_dict = {"loss_ce": class_weight, "loss_mask": mask_weight, + "loss_dice": dice_weight, "loss_contrastive": contrastive_weight} + + + if deep_supervision: + dec_layers = cfg.MODEL.ONE_FORMER.DEC_LAYERS + aux_weight_dict = {} + for i in range(dec_layers - 1): + aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()}) + weight_dict.update(aux_weight_dict) + + losses = ["labels", "masks", "contrastive"] + + return { + "backbone": backbone, + "sem_seg_head": sem_seg_head, + "task_mlp": task_mlp, + "prompt_ctx": prompt_ctx, + "text_encoder": text_encoder, + "text_projector": text_projector, + "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES, + "object_mask_threshold": cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD, + "overlap_threshold": cfg.MODEL.TEST.OVERLAP_THRESHOLD, + "metadata": MetadataCatalog.get(cfg.DATASETS.TRAIN[0]), + "size_divisibility": cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY, + "sem_seg_postprocess_before_inference": ( + cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE + or cfg.MODEL.TEST.PANOPTIC_ON + or cfg.MODEL.TEST.INSTANCE_ON + ), + "pixel_mean": cfg.MODEL.PIXEL_MEAN, + "pixel_std": cfg.MODEL.PIXEL_STD, + # inference + "semantic_on": cfg.MODEL.TEST.SEMANTIC_ON, + "instance_on": cfg.MODEL.TEST.INSTANCE_ON, + "panoptic_on": cfg.MODEL.TEST.PANOPTIC_ON, + "detection_on": cfg.MODEL.TEST.DETECTION_ON, + "test_topk_per_image": cfg.TEST.DETECTIONS_PER_IMAGE, + "task_seq_len": cfg.INPUT.TASK_SEQ_LEN, + "max_seq_len": cfg.INPUT.MAX_SEQ_LEN, + "is_demo": cfg.MODEL.IS_DEMO, + } + + @property + def device(self): + return self.pixel_mean.device + + def encode_text(self, text): + assert text.ndim in [2, 3], text.ndim + b = text.shape[0] + squeeze_dim = False + num_text = 1 + if text.ndim == 3: + num_text = text.shape[1] + text = rearrange(text, 'b n l -> (b n) l', n=num_text) + squeeze_dim = True + + # [B, C] + x = self.text_encoder(text) + + text_x = self.text_projector(x) + + if squeeze_dim: + text_x = rearrange(text_x, '(b n) c -> b n c', n=num_text) + if self.prompt_ctx is not None: + text_ctx = self.prompt_ctx.weight.unsqueeze(0).repeat(text_x.shape[0], 1, 1) + text_x = torch.cat([text_x, text_ctx], dim=1) + + return {"texts": text_x} + + def forward(self, 
batched_inputs): + """ + Args: + batched_inputs: a list, batched outputs of :class:`DatasetMapper`. + Each item in the list contains the inputs for one image. + For now, each item in the list is a dict that contains: + * "image": Tensor, image in (C, H, W) format. + * "instances": per-region ground truth + * Other information that's included in the original dicts, such as: + "height", "width" (int): the output resolution of the model (may be different + from input resolution), used in inference. + Returns: + list[dict]: + each dict has the results for one image. The dict contains the following keys: + * "sem_seg": + A Tensor that represents the + per-pixel segmentation prediced by the head. + The prediction has shape KxHxW that represents the logits of + each class for each pixel. + * "panoptic_seg": + A tuple that represent panoptic output + panoptic_seg (Tensor): of shape (height, width) where the values are ids for each segment. + segments_info (list[dict]): Describe each segment in `panoptic_seg`. + Each dict contains keys "id", "category_id", "isthing". + """ + images = [x["image"].to(self.device) for x in batched_inputs] + images = [(x - self.pixel_mean) / self.pixel_std for x in images] + images = ImageList.from_tensors(images, self.size_divisibility) + + tasks = torch.cat([self.task_tokenizer(x["task"]).to(self.device).unsqueeze(0) for x in batched_inputs], dim=0) + tasks = self.task_mlp(tasks.float()) + + features = self.backbone(images.tensor) + outputs = self.sem_seg_head(features, tasks) + + if self.training: + texts = torch.cat([self.text_tokenizer(x["text"]).to(self.device).unsqueeze(0) for x in batched_inputs], dim=0) + texts_x = self.encode_text(texts) + + outputs = {**outputs, **texts_x} + + # mask classification target + if "instances" in batched_inputs[0]: + gt_instances = [x["instances"].to(self.device) for x in batched_inputs] + targets = self.prepare_targets(gt_instances, images) + else: + targets = None + + # bipartite matching-based loss + losses = self.criterion(outputs, targets) + + for k in list(losses.keys()): + if k in self.criterion.weight_dict: + losses[k] *= self.criterion.weight_dict[k] + else: + # remove this loss if not specified in `weight_dict` + losses.pop(k) + return losses + else: + mask_cls_results = outputs["pred_logits"] + mask_pred_results = outputs["pred_masks"] + # upsample masks + mask_pred_results = F.interpolate( + mask_pred_results, + size=(images.tensor.shape[-2], images.tensor.shape[-1]), + mode="bilinear", + align_corners=False, + ) + + del outputs + + processed_results = [] + for i, data in enumerate(zip( + mask_cls_results, mask_pred_results, batched_inputs, images.image_sizes + )): + mask_cls_result, mask_pred_result, input_per_image, image_size = data + height = input_per_image.get("height", image_size[0]) + width = input_per_image.get("width", image_size[1]) + processed_results.append({}) + + if self.sem_seg_postprocess_before_inference: + mask_pred_result = retry_if_cuda_oom(sem_seg_postprocess)( + mask_pred_result, image_size, height, width + ) + mask_cls_result = mask_cls_result.to(mask_pred_result) + + # semantic segmentation inference + if self.semantic_on: + r = retry_if_cuda_oom(self.semantic_inference)(mask_cls_result, mask_pred_result) + if not self.sem_seg_postprocess_before_inference: + r = retry_if_cuda_oom(sem_seg_postprocess)(r, image_size, height, width) + processed_results[-1]["sem_seg"] = r + + # panoptic segmentation inference + if self.panoptic_on: + panoptic_r = 
retry_if_cuda_oom(self.panoptic_inference)(mask_cls_result, mask_pred_result) + processed_results[-1]["panoptic_seg"] = panoptic_r + + # instance segmentation inference + if self.instance_on: + instance_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result) + processed_results[-1]["instances"] = instance_r + + if self.detection_on: + bbox_r = retry_if_cuda_oom(self.instance_inference)(mask_cls_result, mask_pred_result) + processed_results[-1]["box_instances"] = bbox_r + + return processed_results + + def prepare_targets(self, targets, images): + h_pad, w_pad = images.tensor.shape[-2:] + new_targets = [] + for targets_per_image in targets: + # pad gt + gt_masks = targets_per_image.gt_masks + padded_masks = torch.zeros((gt_masks.shape[0], h_pad, w_pad), dtype=gt_masks.dtype, device=gt_masks.device) + padded_masks[:, : gt_masks.shape[1], : gt_masks.shape[2]] = gt_masks + new_targets.append( + { + "labels": targets_per_image.gt_classes, + "masks": padded_masks, + } + ) + return new_targets + + def semantic_inference(self, mask_cls, mask_pred): + mask_cls = F.softmax(mask_cls, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + semseg = torch.einsum("qc,qhw->chw", mask_cls, mask_pred) + return semseg + + def panoptic_inference(self, mask_cls, mask_pred): + scores, labels = F.softmax(mask_cls, dim=-1).max(-1) + mask_pred = mask_pred.sigmoid() + + keep = labels.ne(self.sem_seg_head.num_classes) & (scores > self.object_mask_threshold) + cur_scores = scores[keep] + cur_classes = labels[keep] + cur_masks = mask_pred[keep] + cur_mask_cls = mask_cls[keep] + cur_mask_cls = cur_mask_cls[:, :-1] + + cur_prob_masks = cur_scores.view(-1, 1, 1) * cur_masks + + h, w = cur_masks.shape[-2:] + panoptic_seg = torch.zeros((h, w), dtype=torch.int32, device=cur_masks.device) + segments_info = [] + + current_segment_id = 0 + + if cur_masks.shape[0] == 0: + # We didn't detect any mask :( + return panoptic_seg, segments_info + else: + # take argmax + cur_mask_ids = cur_prob_masks.argmax(0) + stuff_memory_list = {} + for k in range(cur_classes.shape[0]): + pred_class = cur_classes[k].item() + isthing = pred_class in self.metadata.thing_dataset_id_to_contiguous_id.values() + mask_area = (cur_mask_ids == k).sum().item() + original_area = (cur_masks[k] >= 0.5).sum().item() + mask = (cur_mask_ids == k) & (cur_masks[k] >= 0.5) + + if mask_area > 0 and original_area > 0 and mask.sum().item() > 0: + if mask_area / original_area < self.overlap_threshold: + continue + + # merge stuff regions + if not isthing: + if int(pred_class) in stuff_memory_list.keys(): + panoptic_seg[mask] = stuff_memory_list[int(pred_class)] + continue + else: + stuff_memory_list[int(pred_class)] = current_segment_id + 1 + + current_segment_id += 1 + panoptic_seg[mask] = current_segment_id + + segments_info.append( + { + "id": current_segment_id, + "isthing": bool(isthing), + "category_id": int(pred_class), + } + ) + + return panoptic_seg, segments_info + + def instance_inference(self, mask_cls, mask_pred): + # mask_pred is already processed to have the same shape as original input + image_size = mask_pred.shape[-2:] + + # [Q, K] + scores = F.softmax(mask_cls, dim=-1)[:, :-1] + labels = torch.arange(self.sem_seg_head.num_classes, device=self.device).unsqueeze(0).repeat(self.num_queries, 1).flatten(0, 1) + + # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False) + scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.test_topk_per_image, sorted=False) + labels_per_image = 
labels[topk_indices] + + topk_indices = topk_indices // self.sem_seg_head.num_classes + # mask_pred = mask_pred.unsqueeze(1).repeat(1, self.sem_seg_head.num_classes, 1).flatten(0, 1) + mask_pred = mask_pred[topk_indices] + + # Only consider scores with confidence over [self.object_mask_threshold] for demo + if self.is_demo: + keep = scores_per_image > self.object_mask_threshold + scores_per_image = scores_per_image[keep] + labels_per_image = labels_per_image[keep] + mask_pred = mask_pred[keep] + + # if this is panoptic segmentation, we only keep the "thing" classes + if self.panoptic_on: + keep = torch.zeros_like(scores_per_image).bool() + for i, lab in enumerate(labels_per_image): + keep[i] = lab in self.metadata.thing_dataset_id_to_contiguous_id.values() + + scores_per_image = scores_per_image[keep] + labels_per_image = labels_per_image[keep] + mask_pred = mask_pred[keep] + + if 'ade20k' in self.metadata.name: + for i in range(labels_per_image.shape[0]): + labels_per_image[i] = self.thing_indices.index(labels_per_image[i].item()) + + result = Instances(image_size) + # mask (before sigmoid) + result.pred_masks = (mask_pred > 0).float() + if self.detection_on: + # Uncomment the following to get boxes from masks (this is slow) + result.pred_boxes = BitMasks(mask_pred > 0).get_bounding_boxes() + else: + result.pred_boxes = Boxes(torch.zeros(mask_pred.size(0), 4)) + + # calculate average mask prob + mask_scores_per_image = (mask_pred.sigmoid().flatten(1) * result.pred_masks.flatten(1)).sum(1) / (result.pred_masks.flatten(1).sum(1) + 1e-6) + result.scores = scores_per_image * mask_scores_per_image + result.pred_classes = labels_per_image + return result \ No newline at end of file diff --git a/custom_oneformer/utils/__init__.py b/custom_oneformer/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..130d3011b032f91df1a9cf965625e54922f6c81b --- /dev/null +++ b/custom_oneformer/utils/__init__.py @@ -0,0 +1,2 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +from .events import setup_wandb, WandbWriter \ No newline at end of file diff --git a/custom_oneformer/utils/box_ops.py b/custom_oneformer/utils/box_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..a2b62ad99ed1fc35cdb10a9e11acdeb0ff1abcc4 --- /dev/null +++ b/custom_oneformer/utils/box_ops.py @@ -0,0 +1,133 @@ +# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved +""" +Utilities for bounding box manipulation and GIoU. 
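+
+Boxes are handled either in center format (cx, cy, w, h) or in corner format
+(x0, y0, x1, y1); the two helpers below convert between them.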
+""" +import torch, os +from torchvision.ops.boxes import box_area + + +def box_cxcywh_to_xyxy(x): + x_c, y_c, w, h = x.unbind(-1) + b = [(x_c - 0.5 * w), (y_c - 0.5 * h), + (x_c + 0.5 * w), (y_c + 0.5 * h)] + return torch.stack(b, dim=-1) + + +def box_xyxy_to_cxcywh(x): + x0, y0, x1, y1 = x.unbind(-1) + b = [(x0 + x1) / 2, (y0 + y1) / 2, + (x1 - x0), (y1 - y0)] + return torch.stack(b, dim=-1) + + +# modified from torchvision to also return the union +def box_iou(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + # import ipdb; ipdb.set_trace() + lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] + rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] + + wh = (rb - lt).clamp(min=0) # [N,M,2] + inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] + + union = area1[:, None] + area2 - inter + + iou = inter / (union + 1e-6) + return iou, union + + +def generalized_box_iou(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/ + The boxes should be in [x0, y0, x1, y1] format + Returns a [N, M] pairwise matrix, where N = len(boxes1) + and M = len(boxes2) + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + # except: + # import ipdb; ipdb.set_trace() + iou, union = box_iou(boxes1, boxes2) + + lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,M,2] + area = wh[:, :, 0] * wh[:, :, 1] + + return iou - (area - union) / (area + 1e-6) + + + +# modified from torchvision to also return the union +def box_iou_pairwise(boxes1, boxes2): + area1 = box_area(boxes1) + area2 = box_area(boxes2) + + lt = torch.max(boxes1[:, :2], boxes2[:, :2]) # [N,2] + rb = torch.min(boxes1[:, 2:], boxes2[:, 2:]) # [N,2] + + wh = (rb - lt).clamp(min=0) # [N,2] + inter = wh[:, 0] * wh[:, 1] # [N] + + union = area1 + area2 - inter + + iou = inter / union + return iou, union + + +def generalized_box_iou_pairwise(boxes1, boxes2): + """ + Generalized IoU from https://giou.stanford.edu/ + Input: + - boxes1, boxes2: N,4 + Output: + - giou: N, 4 + """ + # degenerate boxes gives inf / nan results + # so do an early check + assert (boxes1[:, 2:] >= boxes1[:, :2]).all() + assert (boxes2[:, 2:] >= boxes2[:, :2]).all() + assert boxes1.shape == boxes2.shape + iou, union = box_iou_pairwise(boxes1, boxes2) # N, 4 + + lt = torch.min(boxes1[:, :2], boxes2[:, :2]) + rb = torch.max(boxes1[:, 2:], boxes2[:, 2:]) + + wh = (rb - lt).clamp(min=0) # [N,2] + area = wh[:, 0] * wh[:, 1] + + return iou - (area - union) / area + +def masks_to_boxes(masks): + """Compute the bounding boxes around the provided masks + The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
+ Returns a [N, 4] tensors, with the boxes in xyxy format + """ + if masks.numel() == 0: + return torch.zeros((0, 4), device=masks.device) + + h, w = masks.shape[-2:] + + y = torch.arange(0, h, dtype=torch.float) + x = torch.arange(0, w, dtype=torch.float) + y, x = torch.meshgrid(y, x) + + x_mask = (masks * x.unsqueeze(0)) + x_max = x_mask.flatten(1).max(-1)[0] + x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + y_mask = (masks * y.unsqueeze(0)) + y_max = y_mask.flatten(1).max(-1)[0] + y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] + + return torch.stack([x_min, y_min, x_max, y_max], 1) + +if __name__ == '__main__': + x = torch.rand(5, 4) + y = torch.rand(3, 4) + iou, union = box_iou(x, y) \ No newline at end of file diff --git a/custom_oneformer/utils/events.py b/custom_oneformer/utils/events.py new file mode 100644 index 0000000000000000000000000000000000000000..9c6519ca86572218eedf465b6fe6bf19472a56e8 --- /dev/null +++ b/custom_oneformer/utils/events.py @@ -0,0 +1,120 @@ +import os +import wandb +from custom_detectron2.utils import comm +from custom_detectron2.utils.events import EventWriter, get_event_storage + + +def setup_wandb(cfg, args): + if comm.is_main_process(): + init_args = { + k.lower(): v + for k, v in cfg.WANDB.items() + if isinstance(k, str) and k not in ["config"] + } + # only include most related part to avoid too big table + # TODO: add configurable params to select which part of `cfg` should be saved in config + if "config_exclude_keys" in init_args: + init_args["config"] = cfg + init_args["config"]["cfg_file"] = args.config_file + else: + init_args["config"] = { + "model": cfg.MODEL, + "solver": cfg.SOLVER, + "cfg_file": args.config_file, + } + if ("name" not in init_args) or (init_args["name"] is None): + init_args["name"] = os.path.basename(args.config_file) + else: + init_args["name"] = init_args["name"] + '_' + os.path.basename(args.config_file) + wandb.init(**init_args) + + +class BaseRule(object): + def __call__(self, target): + return target + + +class IsIn(BaseRule): + def __init__(self, keyword: str): + self.keyword = keyword + + def __call__(self, target): + return self.keyword in target + + +class Prefix(BaseRule): + def __init__(self, keyword: str): + self.keyword = keyword + + def __call__(self, target): + return "/".join([self.keyword, target]) + + +class WandbWriter(EventWriter): + """ + Write all scalars to a tensorboard file. + """ + + def __init__(self): + """ + Args: + log_dir (str): the directory to save the output events + kwargs: other arguments passed to `torch.utils.tensorboard.SummaryWriter(...)` + """ + self._last_write = -1 + self._group_rules = [ + (IsIn("/"), BaseRule()), + (IsIn("loss"), Prefix("train")), + ] + + def write(self): + + storage = get_event_storage() + + def _group_name(scalar_name): + for (rule, op) in self._group_rules: + if rule(scalar_name): + return op(scalar_name) + return scalar_name + + stats = { + _group_name(name): scalars[0] + for name, scalars in storage.latest().items() + if scalars[1] > self._last_write + } + if len(stats) > 0: + self._last_write = max([v[1] for k, v in storage.latest().items()]) + + # storage.put_{image,histogram} is only meant to be used by + # tensorboard writer. So we access its internal fields directly from here. + if len(storage._vis_data) >= 1: + stats["image"] = [ + wandb.Image(img, caption=img_name) + for img_name, img, step_num in storage._vis_data + ] + # Storage stores all image data and rely on this writer to clear them. 
+ # As a result it assumes only one writer will use its image data. + # An alternative design is to let storage store limited recent + # data (e.g. only the most recent image) that all writers can access. + # In that case a writer may not see all image data if its period is long. + storage.clear_images() + + if len(storage._histograms) >= 1: + + def create_bar(tag, bucket_limits, bucket_counts, **kwargs): + data = [ + [label, val] for (label, val) in zip(bucket_limits, bucket_counts) + ] + table = wandb.Table(data=data, columns=["label", "value"]) + return wandb.plot.bar(table, "label", "value", title=tag) + + stats["hist"] = [create_bar(**params) for params in storage._histograms] + + storage.clear_histograms() + + if len(stats) == 0: + return + wandb.log(stats, step=storage.iter) + + def close(self): + wandb.finish() \ No newline at end of file diff --git a/custom_oneformer/utils/misc.py b/custom_oneformer/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..f2bca7733278c3a4b1f145bd7e5da23683b74961 --- /dev/null +++ b/custom_oneformer/utils/misc.py @@ -0,0 +1,197 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# Modified by Bowen Cheng from https://github.com/facebookresearch/detr/blob/master/util/misc.py +""" +Misc functions, including distributed helpers. + +Mostly copy-paste from torchvision references. +""" +from typing import List, Optional + +import torch +import torch.distributed as dist +import torchvision +from torch import Tensor +import warnings +import torch.nn.functional as F +import math + +def inverse_sigmoid(x, eps=1e-3): + x = x.clamp(min=0, max=1) + x1 = x.clamp(min=eps) + x2 = (1 - x).clamp(min=eps) + return torch.log(x1/x2) + +def _no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
+ Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) + +def resize(input, + size=None, + scale_factor=None, + mode='nearest', + align_corners=None, + warning=True): + if warning: + if size is not None and align_corners: + input_h, input_w = tuple(int(x) for x in input.shape[2:]) + output_h, output_w = tuple(int(x) for x in size) + if output_h > input_h or output_w > output_h: + if ((output_h > 1 and output_w > 1 and input_h > 1 + and input_w > 1) and (output_h - 1) % (input_h - 1) + and (output_w - 1) % (input_w - 1)): + warnings.warn( + f'When align_corners={align_corners}, ' + 'the output would more aligned if ' + f'input size {(input_h, input_w)} is `x+1` and ' + f'out size {(output_h, output_w)} is `nx+1`') + if isinstance(size, torch.Size): + size = tuple(int(x) for x in size) + return F.interpolate(input, size, scale_factor, mode, align_corners) + +def _max_by_axis(the_list): + # type: (List[List[int]]) -> List[int] + maxes = the_list[0] + for sublist in the_list[1:]: + for index, item in enumerate(sublist): + maxes[index] = max(maxes[index], item) + return maxes + + +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + # type: (Device) -> NestedTensor # noqa + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + assert mask is not None + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + # TODO make this more general + if tensor_list[0].ndim == 3: + if torchvision._is_tracing(): + # nested_tensor_from_tensor_list() does not export well to ONNX + # call _onnx_nested_tensor_from_tensor_list() instead + return _onnx_nested_tensor_from_tensor_list(tensor_list) + + # TODO make it support different-sized images + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) + batch_shape = [len(tensor_list)] + max_size + b, c, h, w = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((b, h, w), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("not supported") + return NestedTensor(tensor, mask) + + +# _onnx_nested_tensor_from_tensor_list() is an implementation of +# nested_tensor_from_tensor_list() that is supported by ONNX tracing. 
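+# Both variants behave the same way. Illustrative example (hypothetical sizes):
+# for two images of shape (3, 480, 640) and (3, 512, 600),
+# nested_tensor_from_tensor_list() pads both into a single (2, 3, 512, 640)
+# batch tensor and returns a NestedTensor whose boolean mask is True exactly on
+# the padded pixels.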
+@torch.jit.unused +def _onnx_nested_tensor_from_tensor_list(tensor_list: List[Tensor]) -> NestedTensor: + max_size = [] + for i in range(tensor_list[0].dim()): + max_size_i = torch.max( + torch.stack([img.shape[i] for img in tensor_list]).to(torch.float32) + ).to(torch.int64) + max_size.append(max_size_i) + max_size = tuple(max_size) + + # work around for + # pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + # m[: img.shape[1], :img.shape[2]] = False + # which is not yet supported in onnx + padded_imgs = [] + padded_masks = [] + for img in tensor_list: + padding = [(s1 - s2) for s1, s2 in zip(max_size, tuple(img.shape))] + padded_img = torch.nn.functional.pad(img, (0, padding[2], 0, padding[1], 0, padding[0])) + padded_imgs.append(padded_img) + + m = torch.zeros_like(img[0], dtype=torch.int, device=img.device) + padded_mask = torch.nn.functional.pad(m, (0, padding[2], 0, padding[1]), "constant", 1) + padded_masks.append(padded_mask.to(torch.bool)) + + tensor = torch.stack(padded_imgs) + mask = torch.stack(padded_masks) + + return NestedTensor(tensor, mask=mask) + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True diff --git a/custom_oneformer/utils/pos_embed.py b/custom_oneformer/utils/pos_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..aa11d60db65fa98c140e7d75bdf985ff7ece8f18 --- /dev/null +++ b/custom_oneformer/utils/pos_embed.py @@ -0,0 +1,122 @@ +# -------------------------------------------------------- +# Position embedding utils +# -------------------------------------------------------- + +from typing import Tuple + +import numpy as np +import torch + + +# -------------------------------------------------------- +# 2D sine-cosine position embedding +# References: +# Transformer: https://github.com/tensorflow/models/blob/master/official/nlp/transformer/model_utils.py +# MoCo v3: https://github.com/facebookresearch/moco-v3 +# -------------------------------------------------------- +def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False): + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + grid_h = np.arange(grid_size, dtype=np.float32) + grid_w = np.arange(grid_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + + grid = grid.reshape([2, 1, grid_size, grid_size]) + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0) + return pos_embed + + +def get_2d_sincos_pos_embed_from_grid(embed_dim, grid): + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2) + + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + return emb + + +def get_1d_sincos_pos_embed_from_grid(embed_dim, pos): + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) + out: (M, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000 ** omega # (D/2,) + + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + + emb_sin = 
np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + return emb + + +# -------------------------------------------------------- +# Interpolate position embeddings for high-resolution +# References: +# DeiT: https://github.com/facebookresearch/deit +# -------------------------------------------------------- +def interpolate_pos_embed(model, checkpoint_model, pos_embed_key): + if pos_embed_key in checkpoint_model: + pos_embed_checkpoint = checkpoint_model[pos_embed_key] + embedding_size = pos_embed_checkpoint.shape[-1] + num_patches = model.num_patches + if pos_embed_key.startswith("decoder"): + num_extra_tokens = model.decoder_pos_embed.shape[-2] - num_patches + else: + num_extra_tokens = model.pos_embed.shape[-2] - num_patches + # height (== width) for the checkpoint position embedding + orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5) + # height (== width) for the new position embedding + new_size = int(num_patches ** 0.5) + # class_token and dist_token are kept unchanged + if orig_size != new_size: + print( + "Position interpolate from %dx%d to %dx%d" + % (orig_size, orig_size, new_size, new_size) + ) + extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens] + # only the position tokens are interpolated + pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:] + pos_tokens = pos_tokens.reshape( + -1, orig_size, orig_size, embedding_size + ).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, + size=(new_size, new_size), + mode="bicubic", + align_corners=False, + ) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + checkpoint_model[pos_embed_key] = new_pos_embed + + +def interpolate_pos_embed_online( + pos_embed, orig_size: Tuple[int], new_size: Tuple[int], num_extra_tokens: int +): + extra_tokens = pos_embed[:, :num_extra_tokens] + pos_tokens = pos_embed[:, num_extra_tokens:] + embedding_size = pos_tokens.shape[-1] + pos_tokens = pos_tokens.reshape( + -1, orig_size[0], orig_size[1], embedding_size + ).permute(0, 3, 1, 2) + pos_tokens = torch.nn.functional.interpolate( + pos_tokens, size=new_size, mode="bicubic", align_corners=False, + ) + pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2) + new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1) + return new_pos_embed diff --git a/custom_pycocotools/__init__.py b/custom_pycocotools/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3f7d85bba884ea8f83fc6ab2a1e6ade80d98d4d9 --- /dev/null +++ b/custom_pycocotools/__init__.py @@ -0,0 +1 @@ +__author__ = 'tylin' diff --git a/custom_pycocotools/coco.py b/custom_pycocotools/coco.py new file mode 100644 index 0000000000000000000000000000000000000000..7485e23ab38fdd3cca9d4b86bb6c5a9bb17e8c92 --- /dev/null +++ b/custom_pycocotools/coco.py @@ -0,0 +1,444 @@ +__author__ = 'tylin' +__version__ = '2.0' +# Interface for accessing the Microsoft COCO dataset. + +# Microsoft COCO is a large image dataset designed for object detection, +# segmentation, and caption generation. custom_pycocotools is a Python API that +# assists in loading, parsing and visualizing the annotations in COCO. +# Please visit http://mscoco.org/ for more information on COCO, including +# for the data, paper, and tutorials. The exact format of the annotations +# is also described on the COCO website. For example usage of the custom_pycocotools +# please see custom_pycocotools_demo.ipynb. 
In addition to this API, please download both +# the COCO images and annotations in order to run the demo. + +# An alternative to using the API is to load the annotations directly +# into Python dictionary +# Using the API provides additional utility functions. Note that this API +# supports both *instance* and *caption* annotations. In the case of +# captions not all functions are defined (e.g. categories are undefined). + +# The following API functions are defined: +# COCO - COCO api class that loads COCO annotation file and prepare data structures. +# decodeMask - Decode binary mask M encoded via run-length encoding. +# encodeMask - Encode binary mask M using run-length encoding. +# getAnnIds - Get ann ids that satisfy given filter conditions. +# getCatIds - Get cat ids that satisfy given filter conditions. +# getImgIds - Get img ids that satisfy given filter conditions. +# loadAnns - Load anns with the specified ids. +# loadCats - Load cats with the specified ids. +# loadImgs - Load imgs with the specified ids. +# annToMask - Convert segmentation in an annotation to binary mask. +# showAnns - Display the specified annotations. +# loadRes - Load algorithm results and create API for accessing them. +# download - Download COCO images from mscoco.org server. +# Throughout the API "ann"=annotation, "cat"=category, and "img"=image. +# Help on each functions can be accessed by: "help COCO>function". + +# See also COCO>decodeMask, +# COCO>encodeMask, COCO>getAnnIds, COCO>getCatIds, +# COCO>getImgIds, COCO>loadAnns, COCO>loadCats, +# COCO>loadImgs, COCO>annToMask, COCO>showAnns + +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2014. +# Licensed under the Simplified BSD License [see bsd.txt] + +import json +import time +import numpy as np +import copy +import itertools +from . import mask as maskUtils +import os +from collections import defaultdict +import sys +PYTHON_VERSION = sys.version_info[0] +if PYTHON_VERSION == 2: + from urllib import urlretrieve +elif PYTHON_VERSION == 3: + from urllib.request import urlretrieve + + +def _isArrayLike(obj): + return hasattr(obj, '__iter__') and hasattr(obj, '__len__') + + +class COCO: + def __init__(self, annotation_file=None): + """ + Constructor of Microsoft COCO helper class for reading and visualizing annotations. + :param annotation_file (str): location of annotation file + :param image_folder (str): location to the folder that hosts images. 
+ :return: + """ + # load dataset + self.dataset,self.anns,self.cats,self.imgs = dict(),dict(),dict(),dict() + self.imgToAnns, self.catToImgs = defaultdict(list), defaultdict(list) + if not annotation_file == None: + print('loading annotations into memory...') + tic = time.time() + with open(annotation_file, 'r') as f: + dataset = json.load(f) + assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset)) + print('Done (t={:0.2f}s)'.format(time.time()- tic)) + self.dataset = dataset + self.createIndex() + + def createIndex(self): + # create index + print('creating index...') + anns, cats, imgs = {}, {}, {} + imgToAnns,catToImgs = defaultdict(list),defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + imgToAnns[ann['image_id']].append(ann) + anns[ann['id']] = ann + + if 'images' in self.dataset: + for img in self.dataset['images']: + imgs[img['id']] = img + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + catToImgs[ann['category_id']].append(ann['image_id']) + + print('index created!') + + # create class members + self.anns = anns + self.imgToAnns = imgToAnns + self.catToImgs = catToImgs + self.imgs = imgs + self.cats = cats + + def info(self): + """ + Print information about the annotation file. + :return: + """ + for key, value in self.dataset['info'].items(): + print('{}: {}'.format(key, value)) + + def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None): + """ + Get ann ids that satisfy given filter conditions. default skips that filter + :param imgIds (int array) : get anns for given imgs + catIds (int array) : get anns for given cats + areaRng (float array) : get anns for given area range (e.g. [0 inf]) + iscrowd (boolean) : get anns for given crowd label (False or True) + :return: ids (int array) : integer array of ann ids + """ + imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(imgIds) == len(catIds) == len(areaRng) == 0: + anns = self.dataset['annotations'] + else: + if not len(imgIds) == 0: + lists = [self.imgToAnns[imgId] for imgId in imgIds if imgId in self.imgToAnns] + anns = list(itertools.chain.from_iterable(lists)) + else: + anns = self.dataset['annotations'] + anns = anns if len(catIds) == 0 else [ann for ann in anns if ann['category_id'] in catIds] + anns = anns if len(areaRng) == 0 else [ann for ann in anns if ann['area'] > areaRng[0] and ann['area'] < areaRng[1]] + if not iscrowd == None: + ids = [ann['id'] for ann in anns if ann['iscrowd'] == iscrowd] + else: + ids = [ann['id'] for ann in anns] + return ids + + def getCatIds(self, catNms=[], supNms=[], catIds=[]): + """ + filtering parameters. default skips that filter. 
+ :param catNms (str array) : get cats for given cat names + :param supNms (str array) : get cats for given supercategory names + :param catIds (int array) : get cats for given cat ids + :return: ids (int array) : integer array of cat ids + """ + catNms = catNms if _isArrayLike(catNms) else [catNms] + supNms = supNms if _isArrayLike(supNms) else [supNms] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(catNms) == len(supNms) == len(catIds) == 0: + cats = self.dataset['categories'] + else: + cats = self.dataset['categories'] + cats = cats if len(catNms) == 0 else [cat for cat in cats if cat['name'] in catNms] + cats = cats if len(supNms) == 0 else [cat for cat in cats if cat['supercategory'] in supNms] + cats = cats if len(catIds) == 0 else [cat for cat in cats if cat['id'] in catIds] + ids = [cat['id'] for cat in cats] + return ids + + def getImgIds(self, imgIds=[], catIds=[]): + ''' + Get img ids that satisfy given filter conditions. + :param imgIds (int array) : get imgs for given ids + :param catIds (int array) : get imgs with all given cats + :return: ids (int array) : integer array of img ids + ''' + imgIds = imgIds if _isArrayLike(imgIds) else [imgIds] + catIds = catIds if _isArrayLike(catIds) else [catIds] + + if len(imgIds) == len(catIds) == 0: + ids = self.imgs.keys() + else: + ids = set(imgIds) + for i, catId in enumerate(catIds): + if i == 0 and len(ids) == 0: + ids = set(self.catToImgs[catId]) + else: + ids &= set(self.catToImgs[catId]) + return list(ids) + + def loadAnns(self, ids=[]): + """ + Load anns with the specified ids. + :param ids (int array) : integer ids specifying anns + :return: anns (object array) : loaded ann objects + """ + if _isArrayLike(ids): + return [self.anns[id] for id in ids] + elif type(ids) == int: + return [self.anns[ids]] + + def loadCats(self, ids=[]): + """ + Load cats with the specified ids. + :param ids (int array) : integer ids specifying cats + :return: cats (object array) : loaded cat objects + """ + if _isArrayLike(ids): + return [self.cats[id] for id in ids] + elif type(ids) == int: + return [self.cats[ids]] + + def loadImgs(self, ids=[]): + """ + Load anns with the specified ids. + :param ids (int array) : integer ids specifying img + :return: imgs (object array) : loaded img objects + """ + if _isArrayLike(ids): + return [self.imgs[id] for id in ids] + elif type(ids) == int: + return [self.imgs[ids]] + + def showAnns(self, anns, draw_bbox=False): + """ + Display the specified annotations. 
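+ Handles polygon, RLE and keypoint annotations as well as captions; pass draw_bbox=True to also draw the bounding boxes.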
+ :param anns (array of object): annotations to display + :return: None + """ + if len(anns) == 0: + return 0 + if 'segmentation' in anns[0] or 'keypoints' in anns[0]: + datasetType = 'instances' + elif 'caption' in anns[0]: + datasetType = 'captions' + else: + raise Exception('datasetType not supported') + if datasetType == 'instances': + import matplotlib.pyplot as plt + from matplotlib.collections import PatchCollection + from matplotlib.patches import Polygon + + ax = plt.gca() + ax.set_autoscale_on(False) + polygons = [] + color = [] + for ann in anns: + c = (np.random.random((1, 3))*0.6+0.4).tolist()[0] + if 'segmentation' in ann: + if type(ann['segmentation']) == list: + # polygon + for seg in ann['segmentation']: + poly = np.array(seg).reshape((int(len(seg)/2), 2)) + polygons.append(Polygon(poly)) + color.append(c) + else: + # mask + t = self.imgs[ann['image_id']] + if type(ann['segmentation']['counts']) == list: + rle = maskUtils.frPyObjects([ann['segmentation']], t['height'], t['width']) + else: + rle = [ann['segmentation']] + m = maskUtils.decode(rle) + img = np.ones( (m.shape[0], m.shape[1], 3) ) + if ann['iscrowd'] == 1: + color_mask = np.array([2.0,166.0,101.0])/255 + if ann['iscrowd'] == 0: + color_mask = np.random.random((1, 3)).tolist()[0] + for i in range(3): + img[:,:,i] = color_mask[i] + ax.imshow(np.dstack( (img, m*0.5) )) + if 'keypoints' in ann and type(ann['keypoints']) == list: + # turn skeleton into zero-based index + sks = np.array(self.loadCats(ann['category_id'])[0]['skeleton'])-1 + kp = np.array(ann['keypoints']) + x = kp[0::3] + y = kp[1::3] + v = kp[2::3] + for sk in sks: + if np.all(v[sk]>0): + plt.plot(x[sk],y[sk], linewidth=3, color=c) + plt.plot(x[v>0], y[v>0],'o',markersize=8, markerfacecolor=c, markeredgecolor='k',markeredgewidth=2) + plt.plot(x[v>1], y[v>1],'o',markersize=8, markerfacecolor=c, markeredgecolor=c, markeredgewidth=2) + + if draw_bbox: + [bbox_x, bbox_y, bbox_w, bbox_h] = ann['bbox'] + poly = [[bbox_x, bbox_y], [bbox_x, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y+bbox_h], [bbox_x+bbox_w, bbox_y]] + np_poly = np.array(poly).reshape((4,2)) + polygons.append(Polygon(np_poly)) + color.append(c) + + p = PatchCollection(polygons, facecolor=color, linewidths=0, alpha=0.4) + ax.add_collection(p) + p = PatchCollection(polygons, facecolor='none', edgecolors=color, linewidths=2) + ax.add_collection(p) + elif datasetType == 'captions': + for ann in anns: + print(ann['caption']) + + def loadRes(self, resFile): + """ + Load result file and return a result api object. 
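+ Accepts a path to a JSON result file, a numpy [Nx7] array, or an already loaded list of result dicts (caption, bbox, segmentation or keypoint results).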
+ :param resFile (str) : file name of result file + :return: res (obj) : result api object + """ + res = COCO() + res.dataset['images'] = [img for img in self.dataset['images']] + + print('Loading and preparing results...') + tic = time.time() + if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode): + with open(resFile) as f: + anns = json.load(f) + elif type(resFile) == np.ndarray: + anns = self.loadNumpyAnnotations(resFile) + else: + anns = resFile + assert type(anns) == list, 'results in not an array of objects' + annsImgIds = [ann['image_id'] for ann in anns] + assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ + 'Results do not correspond to current coco set' + if 'caption' in anns[0]: + imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) + res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] + for id, ann in enumerate(anns): + ann['id'] = id+1 + elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + bb = ann['bbox'] + x1, x2, y1, y2 = [bb[0], bb[0]+bb[2], bb[1], bb[1]+bb[3]] + if not 'segmentation' in ann: + ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann['area'] = bb[2]*bb[3] + ann['id'] = id+1 + ann['iscrowd'] = 0 + elif 'segmentation' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + # now only support compressed RLE format as segmentation results + ann['area'] = maskUtils.area(ann['segmentation']) + if not 'bbox' in ann: + ann['bbox'] = maskUtils.toBbox(ann['segmentation']) + ann['id'] = id+1 + ann['iscrowd'] = 0 + elif 'keypoints' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + s = ann['keypoints'] + x = s[0::3] + y = s[1::3] + x0,x1,y0,y1 = np.min(x), np.max(x), np.min(y), np.max(y) + ann['area'] = (x1-x0)*(y1-y0) + ann['id'] = id + 1 + ann['bbox'] = [x0,y0,x1-x0,y1-y0] + print('DONE (t={:0.2f}s)'.format(time.time()- tic)) + + res.dataset['annotations'] = anns + res.createIndex() + return res + + def download(self, tarDir = None, imgIds = [] ): + ''' + Download COCO images from mscoco.org server. 
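+ Images that already exist in tarDir are not downloaded again.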
+ :param tarDir (str): COCO results directory name + imgIds (list): images to be downloaded + :return: + ''' + if tarDir is None: + print('Please specify target directory') + return -1 + if len(imgIds) == 0: + imgs = self.imgs.values() + else: + imgs = self.loadImgs(imgIds) + N = len(imgs) + if not os.path.exists(tarDir): + os.makedirs(tarDir) + for i, img in enumerate(imgs): + tic = time.time() + fname = os.path.join(tarDir, img['file_name']) + if not os.path.exists(fname): + urlretrieve(img['coco_url'], fname) + print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)) + + def loadNumpyAnnotations(self, data): + """ + Convert result data from a numpy array [Nx7] where each row contains {imageID,x1,y1,w,h,score,class} + :param data (numpy.ndarray) + :return: annotations (python nested list) + """ + print('Converting ndarray to lists...') + assert(type(data) == np.ndarray) + print(data.shape) + assert(data.shape[1] == 7) + N = data.shape[0] + ann = [] + for i in range(N): + if i % 1000000 == 0: + print('{}/{}'.format(i,N)) + ann += [{ + 'image_id' : int(data[i, 0]), + 'bbox' : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ], + 'score' : data[i, 5], + 'category_id': int(data[i, 6]), + }] + return ann + + def annToRLE(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE to RLE. + :return: binary mask (numpy 2D array) + """ + t = self.imgs[ann['image_id']] + h, w = t['height'], t['width'] + segm = ann['segmentation'] + if type(segm) == list: + # polygon -- a single object might consist of multiple parts + # we merge all parts into one mask rle code + rles = maskUtils.frPyObjects(segm, h, w) + rle = maskUtils.merge(rles) + elif type(segm['counts']) == list: + # uncompressed RLE + rle = maskUtils.frPyObjects(segm, h, w) + else: + # rle + rle = ann['segmentation'] + return rle + + def annToMask(self, ann): + """ + Convert annotation which can be polygons, uncompressed RLE, or RLE to binary mask. + :return: binary mask (numpy 2D array) + """ + rle = self.annToRLE(ann) + m = maskUtils.decode(rle) + return m diff --git a/custom_pycocotools/cocoeval.py b/custom_pycocotools/cocoeval.py new file mode 100644 index 0000000000000000000000000000000000000000..89c251e1652a0cfc7e8ff1bbb1024a801ed2ebe7 --- /dev/null +++ b/custom_pycocotools/cocoeval.py @@ -0,0 +1,534 @@ +__author__ = 'tsungyi' + +import numpy as np +import datetime +import time +from collections import defaultdict +from . import mask as maskUtils +import copy + +class COCOeval: + # Interface for evaluating detection on the Microsoft COCO dataset. + # + # The usage for CocoEval is as follows: + # cocoGt=..., cocoDt=... # load dataset and results + # E = CocoEval(cocoGt,cocoDt); # initialize CocoEval object + # E.params.recThrs = ...; # set parameters as desired + # E.evaluate(); # run per image evaluation + # E.accumulate(); # accumulate per image results + # E.summarize(); # display summary metrics of results + # For example usage see evalDemo.m and http://mscoco.org/. + # + # The evaluation parameters are as follows (defaults in brackets): + # imgIds - [all] N img ids to use for evaluation + # catIds - [all] K cat ids to use for evaluation + # iouThrs - [.5:.05:.95] T=10 IoU thresholds for evaluation + # recThrs - [0:.01:1] R=101 recall thresholds for evaluation + # areaRng - [...] 
A=4 object area ranges for evaluation + # maxDets - [1 10 100] M=3 thresholds on max detections per image + # iouType - ['segm'] set iouType to 'segm', 'bbox' or 'keypoints' + # iouType replaced the now DEPRECATED useSegm parameter. + # useCats - [1] if true use category labels for evaluation + # Note: if useCats=0 category labels are ignored as in proposal scoring. + # Note: multiple areaRngs [Ax2] and maxDets [Mx1] can be specified. + # + # evaluate(): evaluates detections on every image and every category and + # concats the results into the "evalImgs" with fields: + # dtIds - [1xD] id for each of the D detections (dt) + # gtIds - [1xG] id for each of the G ground truths (gt) + # dtMatches - [TxD] matching gt id at each IoU or 0 + # gtMatches - [TxG] matching dt id at each IoU or 0 + # dtScores - [1xD] confidence of each dt + # gtIgnore - [1xG] ignore flag for each gt + # dtIgnore - [TxD] ignore flag for each dt at each IoU + # + # accumulate(): accumulates the per-image, per-category evaluation + # results in "evalImgs" into the dictionary "eval" with fields: + # params - parameters used for evaluation + # date - date evaluation was performed + # counts - [T,R,K,A,M] parameter dimensions (see above) + # precision - [TxRxKxAxM] precision for every evaluation setting + # recall - [TxKxAxM] max recall for every evaluation setting + # Note: precision and recall==-1 for settings with no gt objects. + # + # See also coco, mask, pycocoDemo, pycocoEvalDemo + # + # Microsoft COCO Toolbox. version 2.0 + # Data, paper, and tutorials available at: http://mscoco.org/ + # Code written by Piotr Dollar and Tsung-Yi Lin, 2015. + # Licensed under the Simplified BSD License [see coco/license.txt] + def __init__(self, cocoGt=None, cocoDt=None, iouType='segm'): + ''' + Initialize CocoEval using coco APIs for gt and dt + :param cocoGt: coco object with ground truth annotations + :param cocoDt: coco object with detection results + :return: None + ''' + if not iouType: + print('iouType not specified. 
use default iouType segm') + self.cocoGt = cocoGt # ground truth COCO API + self.cocoDt = cocoDt # detections COCO API + self.evalImgs = defaultdict(list) # per-image per-category evaluation results [KxAxI] elements + self.eval = {} # accumulated evaluation results + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + self.params = Params(iouType=iouType) # parameters + self._paramsEval = {} # parameters for evaluation + self.stats = [] # result summarization + self.ious = {} # ious between all gts and dts + if not cocoGt is None: + self.params.imgIds = sorted(cocoGt.getImgIds()) + self.params.catIds = sorted(cocoGt.getCatIds()) + + + def _prepare(self): + ''' + Prepare ._gts and ._dts for evaluation based on params + :return: None + ''' + def _toMask(anns, coco): + # modify ann['segmentation'] by reference + for ann in anns: + rle = coco.annToRLE(ann) + ann['segmentation'] = rle + p = self.params + if p.useCats: + gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds, catIds=p.catIds)) + else: + gts=self.cocoGt.loadAnns(self.cocoGt.getAnnIds(imgIds=p.imgIds)) + dts=self.cocoDt.loadAnns(self.cocoDt.getAnnIds(imgIds=p.imgIds)) + + # convert ground truth to mask if iouType == 'segm' + if p.iouType == 'segm': + _toMask(gts, self.cocoGt) + _toMask(dts, self.cocoDt) + # set ignore flag + for gt in gts: + gt['ignore'] = gt['ignore'] if 'ignore' in gt else 0 + gt['ignore'] = 'iscrowd' in gt and gt['iscrowd'] + if p.iouType == 'keypoints': + gt['ignore'] = (gt['num_keypoints'] == 0) or gt['ignore'] + self._gts = defaultdict(list) # gt for evaluation + self._dts = defaultdict(list) # dt for evaluation + for gt in gts: + self._gts[gt['image_id'], gt['category_id']].append(gt) + for dt in dts: + self._dts[dt['image_id'], dt['category_id']].append(dt) + self.evalImgs = defaultdict(list) # per-image per-category evaluation results + self.eval = {} # accumulated evaluation results + + def evaluate(self): + ''' + Run per image evaluation on given images and store results (a list of dict) in self.evalImgs + :return: None + ''' + tic = time.time() + print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if not p.useSegm is None: + p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. 
Running {} evaluation'.format(p.iouType)) + print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params=p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = {(imgId, catId): computeIoU(imgId, catId) \ + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + self.evalImgs = [evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + self._paramsEval = copy.deepcopy(self.params) + toc = time.time() + print('DONE (t={:0.2f}s).'.format(toc-tic)) + + def computeIoU(self, imgId, catId): + p = self.params + if p.useCats: + gt = self._gts[imgId,catId] + dt = self._dts[imgId,catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]] + dt = [_ for cId in p.catIds for _ in self._dts[imgId,cId]] + if len(gt) == 0 and len(dt) ==0: + return [] + inds = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in inds] + if len(dt) > p.maxDets[-1]: + dt=dt[0:p.maxDets[-1]] + + if p.iouType == 'segm': + g = [g['segmentation'] for g in gt] + d = [d['segmentation'] for d in dt] + elif p.iouType == 'bbox': + g = [g['bbox'] for g in gt] + d = [d['bbox'] for d in dt] + else: + raise Exception('unknown iouType for iou computation') + + # compute iou between each dt and gt region + iscrowd = [int(o['iscrowd']) for o in gt] + ious = maskUtils.iou(d,g,iscrowd) + return ious + + def computeOks(self, imgId, catId): + p = self.params + # dimention here should be Nxm + gts = self._gts[imgId, catId] + dts = self._dts[imgId, catId] + inds = np.argsort([-d['score'] for d in dts], kind='mergesort') + dts = [dts[i] for i in inds] + if len(dts) > p.maxDets[-1]: + dts = dts[0:p.maxDets[-1]] + # if len(gts) == 0 and len(dts) == 0: + if len(gts) == 0 or len(dts) == 0: + return [] + ious = np.zeros((len(dts), len(gts))) + sigmas = p.kpt_oks_sigmas + vars = (sigmas * 2)**2 + k = len(sigmas) + # compute oks between each detection and ground truth object + for j, gt in enumerate(gts): + # create bounds for ignore regions(double the gt bbox) + g = np.array(gt['keypoints']) + xg = g[0::3]; yg = g[1::3]; vg = g[2::3] + k1 = np.count_nonzero(vg > 0) + bb = gt['bbox'] + x0 = bb[0] - bb[2]; x1 = bb[0] + bb[2] * 2 + y0 = bb[1] - bb[3]; y1 = bb[1] + bb[3] * 2 + for i, dt in enumerate(dts): + d = np.array(dt['keypoints']) + xd = d[0::3]; yd = d[1::3] + if k1>0: + # measure the per-keypoint distance if keypoints visible + dx = xd - xg + dy = yd - yg + else: + # measure minimum distance to keypoints in (x0,y0) & (x1,y1) + z = np.zeros((k)) + dx = np.max((z, x0-xd),axis=0)+np.max((z, xd-x1),axis=0) + dy = np.max((z, y0-yd),axis=0)+np.max((z, yd-y1),axis=0) + e = (dx**2 + dy**2) / vars / (gt['area']+np.spacing(1)) / 2 + if k1 > 0: + e=e[vg > 0] + ious[i, j] = np.sum(np.exp(-e)) / e.shape[0] + return ious + + def evaluateImg(self, imgId, catId, aRng, maxDet): + ''' + perform evaluation for single category and image + :return: dict (single image results) + ''' + p = self.params + if p.useCats: + gt = self._gts[imgId,catId] + dt = self._dts[imgId,catId] + else: + gt = [_ for cId in p.catIds for _ in self._gts[imgId,cId]] + dt = [_ for cId 
in p.catIds for _ in self._dts[imgId,cId]] + if len(gt) == 0 and len(dt) ==0: + return None + + for g in gt: + if g['ignore'] or (g['area']<aRng[0] or g['area']>aRng[1]): + g['_ignore'] = 1 + else: + g['_ignore'] = 0 + + # sort dt highest score first, sort gt ignore last + gtind = np.argsort([g['_ignore'] for g in gt], kind='mergesort') + gt = [gt[i] for i in gtind] + dtind = np.argsort([-d['score'] for d in dt], kind='mergesort') + dt = [dt[i] for i in dtind[0:maxDet]] + iscrowd = [int(o['iscrowd']) for o in gt] + # load computed ious + ious = self.ious[imgId, catId][:, gtind] if len(self.ious[imgId, catId]) > 0 else self.ious[imgId, catId] + + T = len(p.iouThrs) + G = len(gt) + D = len(dt) + gtm = np.zeros((T,G)) + dtm = np.zeros((T,D)) + gtIg = np.array([g['_ignore'] for g in gt]) + dtIg = np.zeros((T,D)) + if not len(ious)==0: + for tind, t in enumerate(p.iouThrs): + for dind, d in enumerate(dt): + # information about best match so far (m=-1 -> unmatched) + iou = min([t,1-1e-10]) + m = -1 + for gind, g in enumerate(gt): + # if this gt already matched, and not a crowd, continue + if gtm[tind,gind]>0 and not iscrowd[gind]: + continue + # if dt matched to reg gt, and on ignore gt, stop + if m>-1 and gtIg[m]==0 and gtIg[gind]==1: + break + # continue to next gt unless better match made + if ious[dind,gind] < iou: + continue + # if match successful and best so far, store appropriately + iou=ious[dind,gind] + m=gind + # if match made store id of match for both dt and gt + if m ==-1: + continue + dtIg[tind,dind] = gtIg[m] + dtm[tind,dind] = gt[m]['id'] + gtm[tind,m] = d['id'] + # set unmatched detections outside of area range to ignore + a = np.array([d['area']<aRng[0] or d['area']>aRng[1] for d in dt]).reshape((1, len(dt))) + dtIg = np.logical_or(dtIg, np.logical_and(dtm==0, np.repeat(a,T,0))) + # store results for given image and category + return { + 'image_id': imgId, + 'category_id': catId, + 'aRng': aRng, + 'maxDet': maxDet, + 'dtIds': [d['id'] for d in dt], + 'gtIds': [g['id'] for g in gt], + 'dtMatches': dtm, + 'gtMatches': gtm, + 'dtScores': [d['score'] for d in dt], + 'gtIgnore': gtIg, + 'dtIgnore': dtIg, + } + + def accumulate(self, p = None): + ''' + Accumulate per image evaluation results and store the result in self.eval + :param p: input params for evaluation + :return: None + ''' + print('Accumulating evaluation results...') + tic = time.time() + if not self.evalImgs: + print('Please run evaluate() first') + # allows input customized parameters + if p is None: + p = self.params + p.catIds = p.catIds if p.useCats == 1 else [-1] + T = len(p.iouThrs) + R = len(p.recThrs) + K = len(p.catIds) if p.useCats else 1 + A = len(p.areaRng) + M = len(p.maxDets) + precision = -np.ones((T,R,K,A,M)) # -1 for the precision of absent categories + recall = -np.ones((T,K,A,M)) + scores = -np.ones((T,R,K,A,M)) + + # create dictionary for future indexing + _pe = self._paramsEval + catIds = _pe.catIds if _pe.useCats else [-1] + setK = set(catIds) + setA = set(map(tuple, _pe.areaRng)) + setM = set(_pe.maxDets) + setI = set(_pe.imgIds) + # get inds to evaluate + k_list = [n for n, k in enumerate(p.catIds) if k in setK] + m_list = [m for n, m in enumerate(p.maxDets) if m in setM] + a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA] + i_list = [n for n, i in enumerate(p.imgIds) if i in setI] + I0 = len(_pe.imgIds) + A0 = len(_pe.areaRng) + # retrieve E at each category, area range, and max number of detections + for k, k0 in enumerate(k_list): + Nk = k0*A0*I0 + for a, a0 in enumerate(a_list): + Na = a0*I0 + for m,
maxDet in enumerate(m_list): + E = [self.evalImgs[Nk + Na + i] for i in i_list] + E = [e for e in E if not e is None] + if len(E) == 0: + continue + dtScores = np.concatenate([e['dtScores'][0:maxDet] for e in E]) + + # different sorting method generates slightly different results. + # mergesort is used to be consistent as Matlab implementation. + inds = np.argsort(-dtScores, kind='mergesort') + dtScoresSorted = dtScores[inds] + + dtm = np.concatenate([e['dtMatches'][:,0:maxDet] for e in E], axis=1)[:,inds] + dtIg = np.concatenate([e['dtIgnore'][:,0:maxDet] for e in E], axis=1)[:,inds] + gtIg = np.concatenate([e['gtIgnore'] for e in E]) + npig = np.count_nonzero(gtIg==0 ) + if npig == 0: + continue + tps = np.logical_and( dtm, np.logical_not(dtIg) ) + fps = np.logical_and(np.logical_not(dtm), np.logical_not(dtIg) ) + + tp_sum = np.cumsum(tps, axis=1).astype(dtype=float) + fp_sum = np.cumsum(fps, axis=1).astype(dtype=float) + for t, (tp, fp) in enumerate(zip(tp_sum, fp_sum)): + tp = np.array(tp) + fp = np.array(fp) + nd = len(tp) + rc = tp / npig + pr = tp / (fp+tp+np.spacing(1)) + q = np.zeros((R,)) + ss = np.zeros((R,)) + + if nd: + recall[t,k,a,m] = rc[-1] + else: + recall[t,k,a,m] = 0 + + # numpy is slow without cython optimization for accessing elements + # use python array gets significant speed improvement + pr = pr.tolist(); q = q.tolist() + + for i in range(nd-1, 0, -1): + if pr[i] > pr[i-1]: + pr[i-1] = pr[i] + + inds = np.searchsorted(rc, p.recThrs, side='left') + try: + for ri, pi in enumerate(inds): + q[ri] = pr[pi] + ss[ri] = dtScoresSorted[pi] + except: + pass + precision[t,:,k,a,m] = np.array(q) + scores[t,:,k,a,m] = np.array(ss) + self.eval = { + 'params': p, + 'counts': [T, R, K, A, M], + 'date': datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'), + 'precision': precision, + 'recall': recall, + 'scores': scores, + } + toc = time.time() + print('DONE (t={:0.2f}s).'.format( toc-tic)) + + def summarize(self): + ''' + Compute and display summary metrics for evaluation results. 
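+ For bbox/segm evaluation this prints the 12 standard COCO AP/AR metrics; for keypoints it prints 10 OKS-based metrics.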
+ Note this functin can *only* be applied on the default parameter setting + ''' + def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ): + p = self.params + iStr = ' {:<18} {} @[ IoU={:<9} | area={:>6s} | maxDets={:>3d} ] = {:0.3f}' + titleStr = 'Average Precision' if ap == 1 else 'Average Recall' + typeStr = '(AP)' if ap==1 else '(AR)' + iouStr = '{:0.2f}:{:0.2f}'.format(p.iouThrs[0], p.iouThrs[-1]) \ + if iouThr is None else '{:0.2f}'.format(iouThr) + + aind = [i for i, aRng in enumerate(p.areaRngLbl) if aRng == areaRng] + mind = [i for i, mDet in enumerate(p.maxDets) if mDet == maxDets] + if ap == 1: + # dimension of precision: [TxRxKxAxM] + s = self.eval['precision'] + # IoU + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:,:,:,aind,mind] + else: + # dimension of recall: [TxKxAxM] + s = self.eval['recall'] + if iouThr is not None: + t = np.where(iouThr == p.iouThrs)[0] + s = s[t] + s = s[:,:,aind,mind] + if len(s[s>-1])==0: + mean_s = -1 + else: + mean_s = np.mean(s[s>-1]) + print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)) + return mean_s + def _summarizeDets(): + stats = np.zeros((12,)) + stats[0] = _summarize(1) + stats[1] = _summarize(1, iouThr=.5, maxDets=self.params.maxDets[2]) + stats[2] = _summarize(1, iouThr=.75, maxDets=self.params.maxDets[2]) + stats[3] = _summarize(1, areaRng='small', maxDets=self.params.maxDets[2]) + stats[4] = _summarize(1, areaRng='medium', maxDets=self.params.maxDets[2]) + stats[5] = _summarize(1, areaRng='large', maxDets=self.params.maxDets[2]) + stats[6] = _summarize(0, maxDets=self.params.maxDets[0]) + stats[7] = _summarize(0, maxDets=self.params.maxDets[1]) + stats[8] = _summarize(0, maxDets=self.params.maxDets[2]) + stats[9] = _summarize(0, areaRng='small', maxDets=self.params.maxDets[2]) + stats[10] = _summarize(0, areaRng='medium', maxDets=self.params.maxDets[2]) + stats[11] = _summarize(0, areaRng='large', maxDets=self.params.maxDets[2]) + return stats + def _summarizeKps(): + stats = np.zeros((10,)) + stats[0] = _summarize(1, maxDets=20) + stats[1] = _summarize(1, maxDets=20, iouThr=.5) + stats[2] = _summarize(1, maxDets=20, iouThr=.75) + stats[3] = _summarize(1, maxDets=20, areaRng='medium') + stats[4] = _summarize(1, maxDets=20, areaRng='large') + stats[5] = _summarize(0, maxDets=20) + stats[6] = _summarize(0, maxDets=20, iouThr=.5) + stats[7] = _summarize(0, maxDets=20, iouThr=.75) + stats[8] = _summarize(0, maxDets=20, areaRng='medium') + stats[9] = _summarize(0, maxDets=20, areaRng='large') + return stats + if not self.eval: + raise Exception('Please run accumulate() first') + iouType = self.params.iouType + if iouType == 'segm' or iouType == 'bbox': + summarize = _summarizeDets + elif iouType == 'keypoints': + summarize = _summarizeKps + self.stats = summarize() + + def __str__(self): + self.summarize() + +class Params: + ''' + Params for coco evaluation api + ''' + def setDetParams(self): + self.imgIds = [] + self.catIds = [] + # np.arange causes trouble. 
the data point on arange is slightly larger than the true value + self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) + self.maxDets = [1, 10, 100] + self.areaRng = [[0 ** 2, 1e5 ** 2], [0 ** 2, 32 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] + self.areaRngLbl = ['all', 'small', 'medium', 'large'] + self.useCats = 1 + + def setKpParams(self): + self.imgIds = [] + self.catIds = [] + # np.arange causes trouble. the data point on arange is slightly larger than the true value + self.iouThrs = np.linspace(.5, 0.95, int(np.round((0.95 - .5) / .05)) + 1, endpoint=True) + self.recThrs = np.linspace(.0, 1.00, int(np.round((1.00 - .0) / .01)) + 1, endpoint=True) + self.maxDets = [20] + self.areaRng = [[0 ** 2, 1e5 ** 2], [32 ** 2, 96 ** 2], [96 ** 2, 1e5 ** 2]] + self.areaRngLbl = ['all', 'medium', 'large'] + self.useCats = 1 + self.kpt_oks_sigmas = np.array([.26, .25, .25, .35, .35, .79, .79, .72, .72, .62,.62, 1.07, 1.07, .87, .87, .89, .89])/10.0 + + def __init__(self, iouType='segm'): + if iouType == 'segm' or iouType == 'bbox': + self.setDetParams() + elif iouType == 'keypoints': + self.setKpParams() + else: + raise Exception('iouType not supported') + self.iouType = iouType + # useSegm is deprecated + self.useSegm = None diff --git a/custom_pycocotools/mask.py b/custom_pycocotools/mask.py new file mode 100644 index 0000000000000000000000000000000000000000..8ed8b91bbc8f23c94c495d66780ba88fac342445 --- /dev/null +++ b/custom_pycocotools/mask.py @@ -0,0 +1,107 @@ +__author__ = 'tsungyi' + +# import custom_pycocotools._mask as _mask + +# Interface for manipulating masks stored in RLE format. +# +# RLE is a simple yet efficient format for storing binary masks. RLE +# first divides a vector (or vectorized image) into a series of piecewise +# constant regions and then for each piece simply stores the length of +# that piece. For example, given M=[0 0 1 1 1 0 1] the RLE counts would +# be [2 3 1 1], or for M=[1 1 1 1 1 1 0] the counts would be [0 6 1] +# (note that the odd counts are always the numbers of zeros). Instead of +# storing the counts directly, additional compression is achieved with a +# variable bitrate representation based on a common scheme called LEB128. +# +# Compression is greatest given large piecewise constant regions. +# Specifically, the size of the RLE is proportional to the number of +# *boundaries* in M (or for an image the number of boundaries in the y +# direction). Assuming fairly simple shapes, the RLE representation is +# O(sqrt(n)) where n is number of pixels in the object. Hence space usage +# is substantially lower, especially for large simple objects (large n). +# +# Many common operations on masks can be computed directly using the RLE +# (without need for decoding). This includes computations such as area, +# union, intersection, etc. All of these operations are linear in the +# size of the RLE, in other words they are O(sqrt(n)) where n is the area +# of the object. Computing these operations on the original mask is O(n). +# Thus, using the RLE can result in substantial computational savings. +# +# The following API functions are defined: +# encode - Encode binary masks using RLE. +# decode - Decode binary masks encoded via RLE. +# merge - Compute union or intersection of encoded masks. +# iou - Compute intersection over union between masks. +# area - Compute area of encoded masks. 
+# toBbox - Get bounding boxes surrounding encoded masks. +# frPyObjects - Convert polygon, bbox, and uncompressed RLE to encoded RLE mask. +# +# Usage: +# Rs = encode( masks ) +# masks = decode( Rs ) +# R = merge( Rs, intersect=false ) +# o = iou( dt, gt, iscrowd ) +# a = area( Rs ) +# bbs = toBbox( Rs ) +# Rs = frPyObjects( [pyObjects], h, w ) +# +# In the API the following formats are used: +# Rs - [dict] Run-length encoding of binary masks +# R - dict Run-length encoding of binary mask +# masks - [hxwxn] Binary mask(s) (must have type np.ndarray(dtype=uint8) in column-major order) +# iscrowd - [nx1] list of np.ndarray. 1 indicates corresponding gt image has crowd region to ignore +# bbs - [nx4] Bounding box(es) stored as [x y w h] +# poly - Polygon stored as [[x1 y1 x2 y2...],[x1 y1 ...],...] (2D list) +# dt,gt - May be either bounding boxes or encoded masks +# Both poly and bbs are 0-indexed (bbox=[0 0 1 1] encloses first pixel). +# +# Finally, a note about the intersection over union (iou) computation. +# The standard iou of a ground truth (gt) and detected (dt) object is +# iou(gt,dt) = area(intersect(gt,dt)) / area(union(gt,dt)) +# For "crowd" regions, we use a modified criteria. If a gt object is +# marked as "iscrowd", we allow a dt to match any subregion of the gt. +# Choosing gt' in the crowd gt that best matches the dt can be done using +# gt'=intersect(dt,gt). Since by definition union(gt',dt)=dt, computing +# iou(gt,dt,iscrowd) = iou(gt',dt) = area(intersect(gt,dt)) / area(dt) +# For crowd gt regions we use this modified criteria above for the iou. +# +# To compile run "python setup.py build_ext --inplace" +# Please do not contact us for help with compiling. +# +# Microsoft COCO Toolbox. version 2.0 +# Data, paper, and tutorials available at: http://mscoco.org/ +# Code written by Piotr Dollar and Tsung-Yi Lin, 2015. +# Licensed under the Simplified BSD License [see coco/license.txt] + +# iou = _mask.iou +# merge = _mask.merge +# frPyObjects = _mask.frPyObjects + +def encode(bimask): + pass + # if len(bimask.shape) == 3: + # return _mask.encode(bimask) + # elif len(bimask.shape) == 2: + # h, w = bimask.shape + # return _mask.encode(bimask.reshape((h, w, 1), order='F'))[0] + +def decode(rleObjs): + pass + # if type(rleObjs) == list: + # return _mask.decode(rleObjs) + # else: + # return _mask.decode([rleObjs])[:,:,0] + +def area(rleObjs): + pass + # if type(rleObjs) == list: + # return _mask.area(rleObjs) + # else: + # return _mask.area([rleObjs])[0] + +def toBbox(rleObjs): + pass + # if type(rleObjs) == list: + # return _mask.toBbox(rleObjs) + # else: + # return _mask.toBbox([rleObjs])[0] \ No newline at end of file diff --git a/custom_qudida/LICENSE b/custom_qudida/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..c0d82af9687db97bf2488d08aafa6b04f9f97110 --- /dev/null +++ b/custom_qudida/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Arseny Kravchenko + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/custom_qudida/__init__.py b/custom_qudida/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ff1f1ee8e00870e0a58aa52dd827a37116e5c5a4 --- /dev/null +++ b/custom_qudida/__init__.py @@ -0,0 +1,70 @@ +import abc +from copy import deepcopy + +import cv2 +import numpy as np +from sklearn.decomposition import PCA +from typing_extensions import Protocol + + +class TransformerInterface(Protocol): + @abc.abstractmethod + def inverse_transform(self, X: np.ndarray) -> np.ndarray: + ... + + @abc.abstractmethod + def fit(self, X: np.ndarray, y=None): + ... + + @abc.abstractmethod + def transform(self, X: np.ndarray, y=None) -> np.ndarray: + ... + + +class DomainAdapter: + def __init__(self, + transformer: TransformerInterface, + ref_img: np.ndarray, + color_conversions=(None, None), + ): + self.color_in, self.color_out = color_conversions + self.source_transformer = deepcopy(transformer) + self.target_transformer = transformer + self.target_transformer.fit(self.flatten(ref_img)) + + def to_colorspace(self, img): + if self.color_in is None: + return img + return cv2.cvtColor(img, self.color_in) + + def from_colorspace(self, img): + if self.color_out is None: + return img + return cv2.cvtColor(img.astype('uint8'), self.color_out) + + def flatten(self, img): + img = self.to_colorspace(img) + img = img.astype('float32') / 255. 
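+ # scale to [0, 1] and flatten the HxWx3 image into an (N, 3) matrix of pixels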
+ return img.reshape(-1, 3) + + def reconstruct(self, pixels, h, w): + pixels = (np.clip(pixels, 0, 1) * 255).astype('uint8') + return self.from_colorspace(pixels.reshape(h, w, 3)) + + @staticmethod + def _pca_sign(x): + return np.sign(np.trace(x.components_)) + + def __call__(self, image: np.ndarray): + h, w, _ = image.shape + pixels = self.flatten(image) + self.source_transformer.fit(pixels) + + if self.target_transformer.__class__ in (PCA,): + # dirty hack to make sure colors are not inverted + if self._pca_sign(self.target_transformer) != self._pca_sign(self.source_transformer): + self.target_transformer.components_ *= -1 + + representation = self.source_transformer.transform(pixels) + result = self.target_transformer.inverse_transform(representation) + return self.reconstruct(result, h, w) diff --git a/custom_qudida/__version__.py b/custom_qudida/__version__.py new file mode 100644 index 0000000000000000000000000000000000000000..81f0fdeccf66f1b74184b96bd53b64be1b903aae --- /dev/null +++ b/custom_qudida/__version__.py @@ -0,0 +1 @@ +__version__ = "0.0.4" diff --git a/custom_timm/__init__.py b/custom_timm/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c5f797b156d939831ba0173ce29e33583b0a05a3 --- /dev/null +++ b/custom_timm/__init__.py @@ -0,0 +1,4 @@ +from .version import __version__ +from .models import create_model, list_models, is_model, list_modules, model_entrypoint, \ + is_scriptable, is_exportable, set_scriptable, set_exportable, has_pretrained_cfg_key, is_pretrained_cfg_key, \ + get_pretrained_cfg_value, is_model_pretrained diff --git a/custom_timm/data/__init__.py b/custom_timm/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0eb10a660c1195250fc418884fc93482efd4f144 --- /dev/null +++ b/custom_timm/data/__init__.py @@ -0,0 +1,13 @@ +from .auto_augment import RandAugment, AutoAugment, rand_augment_ops, auto_augment_policy,\ + rand_augment_transform, auto_augment_transform +from .config import resolve_data_config +from .constants import * +from .dataset import ImageDataset, IterableImageDataset, AugMixDataset +from .dataset_factory import create_dataset +from .loader import create_loader +from .mixup import Mixup, FastCollateMixup +from .parsers import create_parser,\ + get_img_extensions, is_img_extension, set_img_extensions, add_img_extensions, del_img_extensions +from .real_labels import RealLabelsImagenet +from .transforms import * +from .transforms_factory import create_transform diff --git a/custom_timm/data/auto_augment.py b/custom_timm/data/auto_augment.py new file mode 100644 index 0000000000000000000000000000000000000000..1b51ccb458fe8f614bcd3389cc943c7d201f63de --- /dev/null +++ b/custom_timm/data/auto_augment.py @@ -0,0 +1,870 @@ +""" AutoAugment, RandAugment, and AugMix for PyTorch + +This code implements the searched ImageNet policies with various tweaks and improvements and +does not include any of the search code. + +AA and RA Implementation adapted from: + https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/autoaugment.py + +AugMix adapted from: + https://github.com/google-research/augmix + +Papers: + AutoAugment: Learning Augmentation Policies from Data - https://arxiv.org/abs/1805.09501 + Learning Data Augmentation Strategies for Object Detection - https://arxiv.org/abs/1906.11172 + RandAugment: Practical automated data augmentation... 
- https://arxiv.org/abs/1909.13719 + AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - https://arxiv.org/abs/1912.02781 + +Hacked together by / Copyright 2019, Ross Wightman +""" +import random +import math +import re +from PIL import Image, ImageOps, ImageEnhance, ImageChops +import PIL +import numpy as np + + +_PIL_VER = tuple([int(x) for x in PIL.__version__.split('.')[:2]]) + +_FILL = (128, 128, 128) + +_LEVEL_DENOM = 10. # denominator for conversion from 'Mx' magnitude scale to fractional aug level for op arguments + +_HPARAMS_DEFAULT = dict( + translate_const=250, + img_mean=_FILL, +) + +if hasattr(Image, "Resampling"): + _RANDOM_INTERPOLATION = (Image.Resampling.BILINEAR, Image.Resampling.BICUBIC) + _DEFAULT_INTERPOLATION = Image.Resampling.BICUBIC +else: + _RANDOM_INTERPOLATION = (Image.BILINEAR, Image.BICUBIC) + _DEFAULT_INTERPOLATION = Image.BICUBIC + + +def _interpolation(kwargs): + interpolation = kwargs.pop('resample', _DEFAULT_INTERPOLATION) + if isinstance(interpolation, (list, tuple)): + return random.choice(interpolation) + else: + return interpolation + + +def _check_args_tf(kwargs): + if 'fillcolor' in kwargs and _PIL_VER < (5, 0): + kwargs.pop('fillcolor') + kwargs['resample'] = _interpolation(kwargs) + + +def shear_x(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, factor, 0, 0, 1, 0), **kwargs) + + +def shear_y(img, factor, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, factor, 1, 0), **kwargs) + + +def translate_x_rel(img, pct, **kwargs): + pixels = pct * img.size[0] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_rel(img, pct, **kwargs): + pixels = pct * img.size[1] + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def translate_x_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, pixels, 0, 1, 0), **kwargs) + + +def translate_y_abs(img, pixels, **kwargs): + _check_args_tf(kwargs) + return img.transform(img.size, Image.AFFINE, (1, 0, 0, 0, 1, pixels), **kwargs) + + +def rotate(img, degrees, **kwargs): + _check_args_tf(kwargs) + if _PIL_VER >= (5, 2): + return img.rotate(degrees, **kwargs) + elif _PIL_VER >= (5, 0): + w, h = img.size + post_trans = (0, 0) + rotn_center = (w / 2.0, h / 2.0) + angle = -math.radians(degrees) + matrix = [ + round(math.cos(angle), 15), + round(math.sin(angle), 15), + 0.0, + round(-math.sin(angle), 15), + round(math.cos(angle), 15), + 0.0, + ] + + def transform(x, y, matrix): + (a, b, c, d, e, f) = matrix + return a * x + b * y + c, d * x + e * y + f + + matrix[2], matrix[5] = transform( + -rotn_center[0] - post_trans[0], -rotn_center[1] - post_trans[1], matrix + ) + matrix[2] += rotn_center[0] + matrix[5] += rotn_center[1] + return img.transform(img.size, Image.AFFINE, matrix, **kwargs) + else: + return img.rotate(degrees, resample=kwargs['resample']) + + +def auto_contrast(img, **__): + return ImageOps.autocontrast(img) + + +def invert(img, **__): + return ImageOps.invert(img) + + +def equalize(img, **__): + return ImageOps.equalize(img) + + +def solarize(img, thresh, **__): + return ImageOps.solarize(img, thresh) + + +def solarize_add(img, add, thresh=128, **__): + lut = [] + for i in range(256): + if i < thresh: + lut.append(min(255, i + add)) + else: + lut.append(i) + if img.mode in ("L", "RGB"): + if img.mode == 
"RGB" and len(lut) == 256: + lut = lut + lut + lut + return img.point(lut) + else: + return img + + +def posterize(img, bits_to_keep, **__): + if bits_to_keep >= 8: + return img + return ImageOps.posterize(img, bits_to_keep) + + +def contrast(img, factor, **__): + return ImageEnhance.Contrast(img).enhance(factor) + + +def color(img, factor, **__): + return ImageEnhance.Color(img).enhance(factor) + + +def brightness(img, factor, **__): + return ImageEnhance.Brightness(img).enhance(factor) + + +def sharpness(img, factor, **__): + return ImageEnhance.Sharpness(img).enhance(factor) + + +def _randomly_negate(v): + """With 50% prob, negate the value""" + return -v if random.random() > 0.5 else v + + +def _rotate_level_to_arg(level, _hparams): + # range [-30, 30] + level = (level / _LEVEL_DENOM) * 30. + level = _randomly_negate(level) + return level, + + +def _enhance_level_to_arg(level, _hparams): + # range [0.1, 1.9] + return (level / _LEVEL_DENOM) * 1.8 + 0.1, + + +def _enhance_increasing_level_to_arg(level, _hparams): + # the 'no change' level is 1.0, moving away from that towards 0. or 2.0 increases the enhancement blend + # range [0.1, 1.9] if level <= _LEVEL_DENOM + level = (level / _LEVEL_DENOM) * .9 + level = max(0.1, 1.0 + _randomly_negate(level)) # keep it >= 0.1 + return level, + + +def _shear_level_to_arg(level, _hparams): + # range [-0.3, 0.3] + level = (level / _LEVEL_DENOM) * 0.3 + level = _randomly_negate(level) + return level, + + +def _translate_abs_level_to_arg(level, hparams): + translate_const = hparams['translate_const'] + level = (level / _LEVEL_DENOM) * float(translate_const) + level = _randomly_negate(level) + return level, + + +def _translate_rel_level_to_arg(level, hparams): + # default range [-0.45, 0.45] + translate_pct = hparams.get('translate_pct', 0.45) + level = (level / _LEVEL_DENOM) * translate_pct + level = _randomly_negate(level) + return level, + + +def _posterize_level_to_arg(level, _hparams): + # As per Tensorflow TPU EfficientNet impl + # range [0, 4], 'keep 0 up to 4 MSB of original image' + # intensity/severity of augmentation decreases with level + return int((level / _LEVEL_DENOM) * 4), + + +def _posterize_increasing_level_to_arg(level, hparams): + # As per Tensorflow models research and UDA impl + # range [4, 0], 'keep 4 down to 0 MSB of original image', + # intensity/severity of augmentation increases with level + return 4 - _posterize_level_to_arg(level, hparams)[0], + + +def _posterize_original_level_to_arg(level, _hparams): + # As per original AutoAugment paper description + # range [4, 8], 'keep 4 up to 8 MSB of image' + # intensity/severity of augmentation decreases with level + return int((level / _LEVEL_DENOM) * 4) + 4, + + +def _solarize_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation decreases with level + return int((level / _LEVEL_DENOM) * 256), + + +def _solarize_increasing_level_to_arg(level, _hparams): + # range [0, 256] + # intensity/severity of augmentation increases with level + return 256 - _solarize_level_to_arg(level, _hparams)[0], + + +def _solarize_add_level_to_arg(level, _hparams): + # range [0, 110] + return int((level / _LEVEL_DENOM) * 110), + + +LEVEL_TO_ARG = { + 'AutoContrast': None, + 'Equalize': None, + 'Invert': None, + 'Rotate': _rotate_level_to_arg, + # There are several variations of the posterize level scaling in various Tensorflow/Google repositories/papers + 'Posterize': _posterize_level_to_arg, + 'PosterizeIncreasing': _posterize_increasing_level_to_arg, + 
'PosterizeOriginal': _posterize_original_level_to_arg, + 'Solarize': _solarize_level_to_arg, + 'SolarizeIncreasing': _solarize_increasing_level_to_arg, + 'SolarizeAdd': _solarize_add_level_to_arg, + 'Color': _enhance_level_to_arg, + 'ColorIncreasing': _enhance_increasing_level_to_arg, + 'Contrast': _enhance_level_to_arg, + 'ContrastIncreasing': _enhance_increasing_level_to_arg, + 'Brightness': _enhance_level_to_arg, + 'BrightnessIncreasing': _enhance_increasing_level_to_arg, + 'Sharpness': _enhance_level_to_arg, + 'SharpnessIncreasing': _enhance_increasing_level_to_arg, + 'ShearX': _shear_level_to_arg, + 'ShearY': _shear_level_to_arg, + 'TranslateX': _translate_abs_level_to_arg, + 'TranslateY': _translate_abs_level_to_arg, + 'TranslateXRel': _translate_rel_level_to_arg, + 'TranslateYRel': _translate_rel_level_to_arg, +} + + +NAME_TO_OP = { + 'AutoContrast': auto_contrast, + 'Equalize': equalize, + 'Invert': invert, + 'Rotate': rotate, + 'Posterize': posterize, + 'PosterizeIncreasing': posterize, + 'PosterizeOriginal': posterize, + 'Solarize': solarize, + 'SolarizeIncreasing': solarize, + 'SolarizeAdd': solarize_add, + 'Color': color, + 'ColorIncreasing': color, + 'Contrast': contrast, + 'ContrastIncreasing': contrast, + 'Brightness': brightness, + 'BrightnessIncreasing': brightness, + 'Sharpness': sharpness, + 'SharpnessIncreasing': sharpness, + 'ShearX': shear_x, + 'ShearY': shear_y, + 'TranslateX': translate_x_abs, + 'TranslateY': translate_y_abs, + 'TranslateXRel': translate_x_rel, + 'TranslateYRel': translate_y_rel, +} + + +class AugmentOp: + + def __init__(self, name, prob=0.5, magnitude=10, hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + self.name = name + self.aug_fn = NAME_TO_OP[name] + self.level_fn = LEVEL_TO_ARG[name] + self.prob = prob + self.magnitude = magnitude + self.hparams = hparams.copy() + self.kwargs = dict( + fillcolor=hparams['img_mean'] if 'img_mean' in hparams else _FILL, + resample=hparams['interpolation'] if 'interpolation' in hparams else _RANDOM_INTERPOLATION, + ) + + # If magnitude_std is > 0, we introduce some randomness + # in the usually fixed policy and sample magnitude from a normal distribution + # with mean `magnitude` and std-dev of `magnitude_std`. + # NOTE This is my own hack, being tested, not in papers or reference impls. 
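+ # If magnitude_std is 0 (the default), the magnitude stays fixed at the configured value.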
+ # If magnitude_std is inf, we sample magnitude from a uniform distribution + self.magnitude_std = self.hparams.get('magnitude_std', 0) + self.magnitude_max = self.hparams.get('magnitude_max', None) + + def __call__(self, img): + if self.prob < 1.0 and random.random() > self.prob: + return img + magnitude = self.magnitude + if self.magnitude_std > 0: + # magnitude randomization enabled + if self.magnitude_std == float('inf'): + magnitude = random.uniform(0, magnitude) + elif self.magnitude_std > 0: + magnitude = random.gauss(magnitude, self.magnitude_std) + # default upper_bound for the timm RA impl is _LEVEL_DENOM (10) + # setting magnitude_max overrides this to allow M > 10 (behaviour closer to Google TF RA impl) + upper_bound = self.magnitude_max or _LEVEL_DENOM + magnitude = max(0., min(magnitude, upper_bound)) + level_args = self.level_fn(magnitude, self.hparams) if self.level_fn is not None else tuple() + return self.aug_fn(img, *level_args, **self.kwargs) + + def __repr__(self): + fs = self.__class__.__name__ + f'(name={self.name}, p={self.prob}' + fs += f', m={self.magnitude}, mstd={self.magnitude_std}' + if self.magnitude_max is not None: + fs += f', mmax={self.magnitude_max}' + fs += ')' + return fs + + +def auto_augment_policy_v0(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, cannot find a paper reference. + policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('Posterize', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('Posterize', 0.8, 2), ('Solarize', 0.6, 10)], # This results in black image with Tpu posterize + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_v0r(hparams): + # ImageNet v0 policy from TPU EfficientNet impl, with variation of Posterize used + # in Google research implementation (number of bits discarded increases with magnitude) + policy = [ + [('Equalize', 0.8, 1), ('ShearY', 0.8, 4)], + [('Color', 0.4, 9), ('Equalize', 0.6, 3)], + [('Color', 0.4, 1), ('Rotate', 0.6, 8)], + [('Solarize', 0.8, 3), ('Equalize', 0.4, 7)], + [('Solarize', 0.4, 2), ('Solarize', 0.6, 2)], + [('Color', 0.2, 0), ('Equalize', 0.8, 8)], + [('Equalize', 0.4, 8), ('SolarizeAdd', 0.8, 3)], + [('ShearX', 0.2, 9), ('Rotate', 0.6, 8)], + [('Color', 0.6, 1), ('Equalize', 1.0, 2)], + [('Invert', 0.4, 9), ('Rotate', 0.6, 0)], + [('Equalize', 1.0, 9), ('ShearY', 0.6, 3)], + [('Color', 0.4, 7), ('Equalize', 0.6, 0)], + [('PosterizeIncreasing', 0.4, 6), ('AutoContrast', 0.4, 7)], + [('Solarize', 0.6, 8), ('Color', 0.6, 
9)], + [('Solarize', 0.2, 4), ('Rotate', 0.8, 9)], + [('Rotate', 1.0, 7), ('TranslateYRel', 0.8, 9)], + [('ShearX', 0.0, 0), ('Solarize', 0.8, 4)], + [('ShearY', 0.8, 0), ('Color', 0.6, 4)], + [('Color', 1.0, 0), ('Rotate', 0.6, 2)], + [('Equalize', 0.8, 4), ('Equalize', 0.0, 8)], + [('Equalize', 1.0, 4), ('AutoContrast', 0.6, 2)], + [('ShearY', 0.4, 7), ('SolarizeAdd', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 2), ('Solarize', 0.6, 10)], + [('Solarize', 0.6, 8), ('Equalize', 0.6, 1)], + [('Color', 0.8, 6), ('Rotate', 0.4, 5)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_original(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 + policy = [ + [('PosterizeOriginal', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeOriginal', 0.6, 7), ('PosterizeOriginal', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeOriginal', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeOriginal', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy_originalr(hparams): + # ImageNet policy from https://arxiv.org/abs/1805.09501 with research posterize variation + policy = [ + [('PosterizeIncreasing', 0.4, 8), ('Rotate', 0.6, 9)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + [('PosterizeIncreasing', 0.6, 7), ('PosterizeIncreasing', 0.6, 6)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Equalize', 0.4, 4), ('Rotate', 0.8, 8)], + [('Solarize', 0.6, 3), ('Equalize', 0.6, 7)], + [('PosterizeIncreasing', 0.8, 5), ('Equalize', 1.0, 2)], + [('Rotate', 0.2, 3), ('Solarize', 0.6, 8)], + [('Equalize', 0.6, 8), ('PosterizeIncreasing', 0.4, 6)], + [('Rotate', 0.8, 8), ('Color', 0.4, 0)], + [('Rotate', 0.4, 9), ('Equalize', 0.6, 2)], + [('Equalize', 0.0, 7), ('Equalize', 0.8, 8)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Rotate', 0.8, 8), ('Color', 1.0, 2)], + [('Color', 0.8, 8), ('Solarize', 0.8, 7)], + [('Sharpness', 0.4, 7), ('Invert', 0.6, 8)], + [('ShearX', 0.6, 5), ('Equalize', 1.0, 9)], + [('Color', 0.4, 0), ('Equalize', 0.6, 3)], + [('Equalize', 0.4, 7), ('Solarize', 0.2, 4)], + [('Solarize', 0.6, 5), ('AutoContrast', 0.6, 5)], + [('Invert', 0.6, 4), ('Equalize', 1.0, 8)], + [('Color', 0.6, 4), ('Contrast', 1.0, 8)], + [('Equalize', 0.8, 8), ('Equalize', 0.6, 3)], + ] + pc = [[AugmentOp(*a, hparams=hparams) for a in sp] for sp in policy] + return pc + + +def auto_augment_policy(name='v0', 
hparams=None): + hparams = hparams or _HPARAMS_DEFAULT + if name == 'original': + return auto_augment_policy_original(hparams) + elif name == 'originalr': + return auto_augment_policy_originalr(hparams) + elif name == 'v0': + return auto_augment_policy_v0(hparams) + elif name == 'v0r': + return auto_augment_policy_v0r(hparams) + else: + assert False, 'Unknown AA policy (%s)' % name + + +class AutoAugment: + + def __init__(self, policy): + self.policy = policy + + def __call__(self, img): + sub_policy = random.choice(self.policy) + for op in sub_policy: + img = op(img) + return img + + def __repr__(self): + fs = self.__class__.__name__ + f'(policy=' + for p in self.policy: + fs += '\n\t[' + fs += ', '.join([str(op) for op in p]) + fs += ']' + fs += ')' + return fs + + +def auto_augment_transform(config_str, hparams): + """ + Create a AutoAugment transform + + :param config_str: String defining configuration of auto augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the AutoAugment policy (one of 'v0', 'v0r', 'original', 'originalr'). + The remaining sections, not order sepecific determine + 'mstd' - float std deviation of magnitude noise applied + Ex 'original-mstd0.5' results in AutoAugment with original policy, magnitude_std 0.5 + + :param hparams: Other hparams (kwargs) for the AutoAugmentation scheme + + :return: A PyTorch compatible Transform + """ + config = config_str.split('-') + policy_name = config[0] + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + else: + assert False, 'Unknown AutoAugment config section' + aa_policy = auto_augment_policy(policy_name, hparams=hparams) + return AutoAugment(aa_policy) + + +_RAND_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'Posterize', + 'Solarize', + 'SolarizeAdd', + 'Color', + 'Contrast', + 'Brightness', + 'Sharpness', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + + +_RAND_INCREASING_TRANSFORMS = [ + 'AutoContrast', + 'Equalize', + 'Invert', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'SolarizeAdd', + 'ColorIncreasing', + 'ContrastIncreasing', + 'BrightnessIncreasing', + 'SharpnessIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', + #'Cutout' # NOTE I've implement this as random erasing separately +] + + + +# These experimental weights are based loosely on the relative improvements mentioned in paper. +# They may not result in increased performance, but could likely be tuned to so. 
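The experimental per-op weights defined just below feed into a weighted, no-replacement sampling of ops. The sketch here is illustrative only (a made-up subset of ops and weights); the normalization mirrors `_select_rand_weights` and the choice call mirrors `RandAugment.__call__`.

```python
import numpy as np

# illustrative weights for a subset of ops; the real table follows below
weights = {'Rotate': 0.3, 'ShearX': 0.2, 'ShearY': 0.2,
           'TranslateXRel': 0.1, 'TranslateYRel': 0.1, 'Color': 0.1}
ops = list(weights)
probs = np.array([weights[k] for k in ops], dtype=np.float64)
probs /= probs.sum()                      # normalize, as in _select_rand_weights

# weighted choice is done without replacement, as in RandAugment.__call__
picked = np.random.choice(ops, size=2, replace=False, p=probs)
print(picked)
```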
+_RAND_CHOICE_WEIGHTS_0 = { + 'Rotate': 0.3, + 'ShearX': 0.2, + 'ShearY': 0.2, + 'TranslateXRel': 0.1, + 'TranslateYRel': 0.1, + 'Color': .025, + 'Sharpness': 0.025, + 'AutoContrast': 0.025, + 'Solarize': .005, + 'SolarizeAdd': .005, + 'Contrast': .005, + 'Brightness': .005, + 'Equalize': .005, + 'Posterize': 0, + 'Invert': 0, +} + + +def _select_rand_weights(weight_idx=0, transforms=None): + transforms = transforms or _RAND_TRANSFORMS + assert weight_idx == 0 # only one set of weights currently + rand_weights = _RAND_CHOICE_WEIGHTS_0 + probs = [rand_weights[k] for k in transforms] + probs /= np.sum(probs) + return probs + + +def rand_augment_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _RAND_TRANSFORMS + return [AugmentOp( + name, prob=0.5, magnitude=magnitude, hparams=hparams) for name in transforms] + + +class RandAugment: + def __init__(self, ops, num_layers=2, choice_weights=None): + self.ops = ops + self.num_layers = num_layers + self.choice_weights = choice_weights + + def __call__(self, img): + # no replacement when using weighted choice + ops = np.random.choice( + self.ops, self.num_layers, replace=self.choice_weights is None, p=self.choice_weights) + for op in ops: + img = op(img) + return img + + def __repr__(self): + fs = self.__class__.__name__ + f'(n={self.num_layers}, ops=' + for op in self.ops: + fs += f'\n\t{op}' + fs += ')' + return fs + + +def rand_augment_transform(config_str, hparams): + """ + Create a RandAugment transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining + sections, not order sepecific determine + 'm' - integer magnitude of rand augment + 'n' - integer num layers (number of transform ops selected per image) + 'w' - integer probabiliy weight index (index of a set of weights to influence choice of op) + 'mstd' - float std deviation of magnitude noise applied, or uniform sampling if infinity (or > 100) + 'mmax' - set upper bound for magnitude to something other than default of _LEVEL_DENOM (10) + 'inc' - integer (bool), use augmentations that increase in severity with magnitude (default: 0) + Ex 'rand-m9-n3-mstd0.5' results in RandAugment with magnitude 9, num_layers 3, magnitude_std 0.5 + 'rand-mstd1-w0' results in magnitude_std 1.0, weights 0, default magnitude of 10 and num_layers 2 + + :param hparams: Other hparams (kwargs) for the RandAugmentation scheme + + :return: A PyTorch compatible Transform + """ + magnitude = _LEVEL_DENOM # default to _LEVEL_DENOM for magnitude (currently 10) + num_layers = 2 # default to 2 ops per image + weight_idx = None # default to no probability weights for op choice + transforms = _RAND_TRANSFORMS + config = config_str.split('-') + assert config[0] == 'rand' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param / randomization of magnitude values + mstd = float(val) + if mstd > 100: + # use uniform sampling in 0 to magnitude if mstd is > 100 + mstd = float('inf') + hparams.setdefault('magnitude_std', mstd) + elif key == 'mmax': + # clip magnitude between [0, mmax] instead of default [0, _LEVEL_DENOM] + hparams.setdefault('magnitude_max', int(val)) + elif key == 'inc': + if bool(val): + transforms = _RAND_INCREASING_TRANSFORMS + elif key == 'm': + magnitude = 
int(val) + elif key == 'n': + num_layers = int(val) + elif key == 'w': + weight_idx = int(val) + else: + assert False, 'Unknown RandAugment config section' + ra_ops = rand_augment_ops(magnitude=magnitude, hparams=hparams, transforms=transforms) + choice_weights = None if weight_idx is None else _select_rand_weights(weight_idx) + return RandAugment(ra_ops, num_layers, choice_weights=choice_weights) + + +_AUGMIX_TRANSFORMS = [ + 'AutoContrast', + 'ColorIncreasing', # not in paper + 'ContrastIncreasing', # not in paper + 'BrightnessIncreasing', # not in paper + 'SharpnessIncreasing', # not in paper + 'Equalize', + 'Rotate', + 'PosterizeIncreasing', + 'SolarizeIncreasing', + 'ShearX', + 'ShearY', + 'TranslateXRel', + 'TranslateYRel', +] + + +def augmix_ops(magnitude=10, hparams=None, transforms=None): + hparams = hparams or _HPARAMS_DEFAULT + transforms = transforms or _AUGMIX_TRANSFORMS + return [AugmentOp( + name, prob=1.0, magnitude=magnitude, hparams=hparams) for name in transforms] + + +class AugMixAugment: + """ AugMix Transform + Adapted and improved from impl here: https://github.com/google-research/augmix/blob/master/imagenet.py + From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - + https://arxiv.org/abs/1912.02781 + """ + def __init__(self, ops, alpha=1., width=3, depth=-1, blended=False): + self.ops = ops + self.alpha = alpha + self.width = width + self.depth = depth + self.blended = blended # blended mode is faster but not well tested + + def _calc_blended_weights(self, ws, m): + ws = ws * m + cump = 1. + rws = [] + for w in ws[::-1]: + alpha = w / cump + cump *= (1 - alpha) + rws.append(alpha) + return np.array(rws[::-1], dtype=np.float32) + + def _apply_blended(self, img, mixing_weights, m): + # This is my first crack and implementing a slightly faster mixed augmentation. Instead + # of accumulating the mix for each chain in a Numpy array and then blending with original, + # it recomputes the blending coefficients and applies one PIL image blend per chain. + # TODO the results appear in the right ballpark but they differ by more than rounding. + img_orig = img.copy() + ws = self._calc_blended_weights(mixing_weights, m) + for w in ws: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img_orig # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + img = Image.blend(img, img_aug, w) + return img + + def _apply_basic(self, img, mixing_weights, m): + # This is a literal adaptation of the paper/official implementation without normalizations and + # PIL <-> Numpy conversions between every op. It is still quite CPU compute heavy compared to the + # typical augmentation transforms, could use a GPU / Kornia implementation. 
+ img_shape = img.size[0], img.size[1], len(img.getbands()) + mixed = np.zeros(img_shape, dtype=np.float32) + for mw in mixing_weights: + depth = self.depth if self.depth > 0 else np.random.randint(1, 4) + ops = np.random.choice(self.ops, depth, replace=True) + img_aug = img # no ops are in-place, deep copy not necessary + for op in ops: + img_aug = op(img_aug) + mixed += mw * np.asarray(img_aug, dtype=np.float32) + np.clip(mixed, 0, 255., out=mixed) + mixed = Image.fromarray(mixed.astype(np.uint8)) + return Image.blend(img, mixed, m) + + def __call__(self, img): + mixing_weights = np.float32(np.random.dirichlet([self.alpha] * self.width)) + m = np.float32(np.random.beta(self.alpha, self.alpha)) + if self.blended: + mixed = self._apply_blended(img, mixing_weights, m) + else: + mixed = self._apply_basic(img, mixing_weights, m) + return mixed + + def __repr__(self): + fs = self.__class__.__name__ + f'(alpha={self.alpha}, width={self.width}, depth={self.depth}, ops=' + for op in self.ops: + fs += f'\n\t{op}' + fs += ')' + return fs + + +def augment_and_mix_transform(config_str, hparams): + """ Create AugMix PyTorch transform + + :param config_str: String defining configuration of random augmentation. Consists of multiple sections separated by + dashes ('-'). The first section defines the specific variant of rand augment (currently only 'rand'). The remaining + sections, not order sepecific determine + 'm' - integer magnitude (severity) of augmentation mix (default: 3) + 'w' - integer width of augmentation chain (default: 3) + 'd' - integer depth of augmentation chain (-1 is random [1, 3], default: -1) + 'b' - integer (bool), blend each branch of chain into end result without a final blend, less CPU (default: 0) + 'mstd' - float std deviation of magnitude noise applied (default: 0) + Ex 'augmix-m5-w4-d2' results in AugMix with severity 5, chain width 4, chain depth 2 + + :param hparams: Other hparams (kwargs) for the Augmentation transforms + + :return: A PyTorch compatible Transform + """ + magnitude = 3 + width = 3 + depth = -1 + alpha = 1. 
+ blended = False + config = config_str.split('-') + assert config[0] == 'augmix' + config = config[1:] + for c in config: + cs = re.split(r'(\d.*)', c) + if len(cs) < 2: + continue + key, val = cs[:2] + if key == 'mstd': + # noise param injected via hparams for now + hparams.setdefault('magnitude_std', float(val)) + elif key == 'm': + magnitude = int(val) + elif key == 'w': + width = int(val) + elif key == 'd': + depth = int(val) + elif key == 'a': + alpha = float(val) + elif key == 'b': + blended = bool(val) + else: + assert False, 'Unknown AugMix config section' + hparams.setdefault('magnitude_std', float('inf')) # default to uniform sampling (if not set via mstd arg) + ops = augmix_ops(magnitude=magnitude, hparams=hparams) + return AugMixAugment(ops, alpha=alpha, width=width, depth=depth, blended=blended) diff --git a/custom_timm/data/config.py b/custom_timm/data/config.py new file mode 100644 index 0000000000000000000000000000000000000000..78176e4ba9a16fab4373395edef1fb1dee313c33 --- /dev/null +++ b/custom_timm/data/config.py @@ -0,0 +1,82 @@ +import logging +from .constants import * + + +_logger = logging.getLogger(__name__) + + +def resolve_data_config(args, default_cfg={}, model=None, use_test_size=False, verbose=False): + new_config = {} + default_cfg = default_cfg + if not default_cfg and model is not None and hasattr(model, 'default_cfg'): + default_cfg = model.default_cfg + + # Resolve input/image size + in_chans = 3 + if 'chans' in args and args['chans'] is not None: + in_chans = args['chans'] + + input_size = (in_chans, 224, 224) + if 'input_size' in args and args['input_size'] is not None: + assert isinstance(args['input_size'], (tuple, list)) + assert len(args['input_size']) == 3 + input_size = tuple(args['input_size']) + in_chans = input_size[0] # input_size overrides in_chans + elif 'img_size' in args and args['img_size'] is not None: + assert isinstance(args['img_size'], int) + input_size = (in_chans, args['img_size'], args['img_size']) + else: + if use_test_size and 'test_input_size' in default_cfg: + input_size = default_cfg['test_input_size'] + elif 'input_size' in default_cfg: + input_size = default_cfg['input_size'] + new_config['input_size'] = input_size + + # resolve interpolation method + new_config['interpolation'] = 'bicubic' + if 'interpolation' in args and args['interpolation']: + new_config['interpolation'] = args['interpolation'] + elif 'interpolation' in default_cfg: + new_config['interpolation'] = default_cfg['interpolation'] + + # resolve dataset + model mean for normalization + new_config['mean'] = IMAGENET_DEFAULT_MEAN + if 'mean' in args and args['mean'] is not None: + mean = tuple(args['mean']) + if len(mean) == 1: + mean = tuple(list(mean) * in_chans) + else: + assert len(mean) == in_chans + new_config['mean'] = mean + elif 'mean' in default_cfg: + new_config['mean'] = default_cfg['mean'] + + # resolve dataset + model std deviation for normalization + new_config['std'] = IMAGENET_DEFAULT_STD + if 'std' in args and args['std'] is not None: + std = tuple(args['std']) + if len(std) == 1: + std = tuple(list(std) * in_chans) + else: + assert len(std) == in_chans + new_config['std'] = std + elif 'std' in default_cfg: + new_config['std'] = default_cfg['std'] + + # resolve default crop percentage + crop_pct = DEFAULT_CROP_PCT + if 'crop_pct' in args and args['crop_pct'] is not None: + crop_pct = args['crop_pct'] + else: + if use_test_size and 'test_crop_pct' in default_cfg: + crop_pct = default_cfg['test_crop_pct'] + elif 'crop_pct' in default_cfg: + 
crop_pct = default_cfg['crop_pct'] + new_config['crop_pct'] = crop_pct + + if verbose: + _logger.info('Data processing configuration for current model + dataset:') + for n, v in new_config.items(): + _logger.info('\t%s: %s' % (n, str(v))) + + return new_config diff --git a/custom_timm/data/constants.py b/custom_timm/data/constants.py new file mode 100644 index 0000000000000000000000000000000000000000..e4d8bb7ed4419bda758bfc22448572cdc32e8f27 --- /dev/null +++ b/custom_timm/data/constants.py @@ -0,0 +1,9 @@ +DEFAULT_CROP_PCT = 0.875 +IMAGENET_DEFAULT_MEAN = (0.485, 0.456, 0.406) +IMAGENET_DEFAULT_STD = (0.229, 0.224, 0.225) +IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5) +IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5) +IMAGENET_DPN_MEAN = (124 / 255, 117 / 255, 104 / 255) +IMAGENET_DPN_STD = tuple([1 / (.0167 * 255)] * 3) +OPENAI_CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073) +OPENAI_CLIP_STD = (0.26862954, 0.26130258, 0.27577711) diff --git a/custom_timm/data/dataset.py b/custom_timm/data/dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..20b663cecaacacf7efbe8a05bb00414610c810f7 --- /dev/null +++ b/custom_timm/data/dataset.py @@ -0,0 +1,152 @@ +""" Quick n Simple Image Folder, Tarfile based DataSet + +Hacked together by / Copyright 2019, Ross Wightman +""" +import torch.utils.data as data +import os +import torch +import logging + +from PIL import Image + +from .parsers import create_parser + +_logger = logging.getLogger(__name__) + + +_ERROR_RETRY = 50 + + +class ImageDataset(data.Dataset): + + def __init__( + self, + root, + parser=None, + class_map=None, + load_bytes=False, + transform=None, + target_transform=None, + ): + if parser is None or isinstance(parser, str): + parser = create_parser(parser or '', root=root, class_map=class_map) + self.parser = parser + self.load_bytes = load_bytes + self.transform = transform + self.target_transform = target_transform + self._consecutive_errors = 0 + + def __getitem__(self, index): + img, target = self.parser[index] + try: + img = img.read() if self.load_bytes else Image.open(img).convert('RGB') + except Exception as e: + _logger.warning(f'Skipped sample (index {index}, file {self.parser.filename(index)}). 
{str(e)}') + self._consecutive_errors += 1 + if self._consecutive_errors < _ERROR_RETRY: + return self.__getitem__((index + 1) % len(self.parser)) + else: + raise e + self._consecutive_errors = 0 + if self.transform is not None: + img = self.transform(img) + if target is None: + target = -1 + elif self.target_transform is not None: + target = self.target_transform(target) + return img, target + + def __len__(self): + return len(self.parser) + + def filename(self, index, basename=False, absolute=False): + return self.parser.filename(index, basename, absolute) + + def filenames(self, basename=False, absolute=False): + return self.parser.filenames(basename, absolute) + + +class IterableImageDataset(data.IterableDataset): + + def __init__( + self, + root, + parser=None, + split='train', + is_training=False, + batch_size=None, + repeats=0, + download=False, + transform=None, + target_transform=None, + ): + assert parser is not None + if isinstance(parser, str): + self.parser = create_parser( + parser, root=root, split=split, is_training=is_training, + batch_size=batch_size, repeats=repeats, download=download) + else: + self.parser = parser + self.transform = transform + self.target_transform = target_transform + self._consecutive_errors = 0 + + def __iter__(self): + for img, target in self.parser: + if self.transform is not None: + img = self.transform(img) + if self.target_transform is not None: + target = self.target_transform(target) + yield img, target + + def __len__(self): + if hasattr(self.parser, '__len__'): + return len(self.parser) + else: + return 0 + + def filename(self, index, basename=False, absolute=False): + assert False, 'Filename lookup by index not supported, use filenames().' + + def filenames(self, basename=False, absolute=False): + return self.parser.filenames(basename, absolute) + + +class AugMixDataset(torch.utils.data.Dataset): + """Dataset wrapper to perform AugMix or other clean/augmentation mixes""" + + def __init__(self, dataset, num_splits=2): + self.augmentation = None + self.normalize = None + self.dataset = dataset + if self.dataset.transform is not None: + self._set_transforms(self.dataset.transform) + self.num_splits = num_splits + + def _set_transforms(self, x): + assert isinstance(x, (list, tuple)) and len(x) == 3, 'Expecting a tuple/list of 3 transforms' + self.dataset.transform = x[0] + self.augmentation = x[1] + self.normalize = x[2] + + @property + def transform(self): + return self.dataset.transform + + @transform.setter + def transform(self, x): + self._set_transforms(x) + + def _normalize(self, x): + return x if self.normalize is None else self.normalize(x) + + def __getitem__(self, i): + x, y = self.dataset[i] # all splits share the same dataset base transform + x_list = [self._normalize(x)] # first split only normalizes (this is the 'clean' split) + # run the full augmentation on the remaining splits + for _ in range(self.num_splits - 1): + x_list.append(self._normalize(self.augmentation(x))) + return tuple(x_list), y + + def __len__(self): + return len(self.dataset) diff --git a/custom_timm/data/dataset_factory.py b/custom_timm/data/dataset_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..d0ac30b168080a1d359dfe9fb57e3a4e5901ada4 --- /dev/null +++ b/custom_timm/data/dataset_factory.py @@ -0,0 +1,143 @@ +""" Dataset Factory + +Hacked together by / Copyright 2021, Ross Wightman +""" +import os + +from torchvision.datasets import CIFAR100, CIFAR10, MNIST, QMNIST, KMNIST, FashionMNIST, ImageNet, ImageFolder +try: + from 
torchvision.datasets import Places365 + has_places365 = True +except ImportError: + has_places365 = False +try: + from torchvision.datasets import INaturalist + has_inaturalist = True +except ImportError: + has_inaturalist = False + +from .dataset import IterableImageDataset, ImageDataset + +_TORCH_BASIC_DS = dict( + cifar10=CIFAR10, + cifar100=CIFAR100, + mnist=MNIST, + qmist=QMNIST, + kmnist=KMNIST, + fashion_mnist=FashionMNIST, +) +_TRAIN_SYNONYM = dict(train=None, training=None) +_EVAL_SYNONYM = dict(val=None, valid=None, validation=None, eval=None, evaluation=None) + + +def _search_split(root, split): + # look for sub-folder with name of split in root and use that if it exists + split_name = split.split('[')[0] + try_root = os.path.join(root, split_name) + if os.path.exists(try_root): + return try_root + + def _try(syn): + for s in syn: + try_root = os.path.join(root, s) + if os.path.exists(try_root): + return try_root + return root + if split_name in _TRAIN_SYNONYM: + root = _try(_TRAIN_SYNONYM) + elif split_name in _EVAL_SYNONYM: + root = _try(_EVAL_SYNONYM) + return root + + +def create_dataset( + name, + root, + split='validation', + search_split=True, + class_map=None, + load_bytes=False, + is_training=False, + download=False, + batch_size=None, + repeats=0, + **kwargs +): + """ Dataset factory method + + In parenthesis after each arg are the type of dataset supported for each arg, one of: + * folder - default, timm folder (or tar) based ImageDataset + * torch - torchvision based datasets + * TFDS - Tensorflow-datasets wrapper in IterabeDataset interface via IterableImageDataset + * all - any of the above + + Args: + name: dataset name, empty is okay for folder based datasets + root: root folder of dataset (all) + split: dataset split (all) + search_split: search for split specific child fold from root so one can specify + `imagenet/` instead of `/imagenet/val`, etc on cmd line / config. (folder, torch/folder) + class_map: specify class -> index mapping via text file or dict (folder) + load_bytes: load data, return images as undecoded bytes (folder) + download: download dataset if not present and supported (TFDS, torch) + is_training: create dataset in train mode, this is different from the split. + For Iterable / TDFS it enables shuffle, ignored for other datasets. (TFDS) + batch_size: batch size hint for (TFDS) + repeats: dataset repeats per iteration i.e. epoch (TFDS) + **kwargs: other args to pass to dataset + + Returns: + Dataset object + """ + name = name.lower() + if name.startswith('torch/'): + name = name.split('/', 2)[-1] + torch_kwargs = dict(root=root, download=download, **kwargs) + if name in _TORCH_BASIC_DS: + ds_class = _TORCH_BASIC_DS[name] + use_train = split in _TRAIN_SYNONYM + ds = ds_class(train=use_train, **torch_kwargs) + elif name == 'inaturalist' or name == 'inat': + assert has_inaturalist, 'Please update to PyTorch 1.10, torchvision 0.11+ for Inaturalist' + target_type = 'full' + split_split = split.split('/') + if len(split_split) > 1: + target_type = split_split[0].split('_') + if len(target_type) == 1: + target_type = target_type[0] + split = split_split[-1] + if split in _TRAIN_SYNONYM: + split = '2021_train' + elif split in _EVAL_SYNONYM: + split = '2021_valid' + ds = INaturalist(version=split, target_type=target_type, **torch_kwargs) + elif name == 'places365': + assert has_places365, 'Please update to a newer PyTorch and torchvision for Places365 dataset.' 
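A hypothetical usage sketch of the `create_dataset` factory, with placeholder roots and under the assumption that the referenced torchvision datasets are available; only the argument names come from the signature above.

```python
from custom_timm.data.dataset_factory import create_dataset

# plain folder dataset; '/data/imagenet' is a placeholder root, and the 'validation'
# split is resolved to a matching sub-folder (e.g. 'val') if one exists
val_ds = create_dataset('', root='/data/imagenet', split='validation', is_training=False)

# torchvision-backed dataset selected via the 'torch/' prefix
train_ds = create_dataset('torch/cifar10', root='/data', split='train',
                          is_training=True, download=True)

print(len(val_ds), len(train_ds))
```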
+ if split in _TRAIN_SYNONYM: + split = 'train-standard' + elif split in _EVAL_SYNONYM: + split = 'val' + ds = Places365(split=split, **torch_kwargs) + elif name == 'imagenet': + if split in _EVAL_SYNONYM: + split = 'val' + ds = ImageNet(split=split, **torch_kwargs) + elif name == 'image_folder' or name == 'folder': + # in case torchvision ImageFolder is preferred over timm ImageDataset for some reason + if search_split and os.path.isdir(root): + # look for split specific sub-folder in root + root = _search_split(root, split) + ds = ImageFolder(root, **kwargs) + else: + assert False, f"Unknown torchvision dataset {name}" + elif name.startswith('tfds/'): + ds = IterableImageDataset( + root, parser=name, split=split, is_training=is_training, + download=download, batch_size=batch_size, repeats=repeats, **kwargs) + else: + # FIXME support more advance split cfg for ImageFolder/Tar datasets in the future + if search_split and os.path.isdir(root): + # look for split specific sub-folder in root + root = _search_split(root, split) + ds = ImageDataset(root, parser=name, class_map=class_map, load_bytes=load_bytes, **kwargs) + return ds diff --git a/custom_timm/data/distributed_sampler.py b/custom_timm/data/distributed_sampler.py new file mode 100644 index 0000000000000000000000000000000000000000..54ff0459504a7f952d701720727ebd50d07e9c7a --- /dev/null +++ b/custom_timm/data/distributed_sampler.py @@ -0,0 +1,135 @@ +import math +import torch +from torch.utils.data import Sampler +import torch.distributed as dist + + +class OrderedDistributedSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset. + It is especially useful in conjunction with + :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each + process can pass a DistributedSampler instance as a DataLoader sampler, + and load a subset of the original dataset that is exclusive to it. + .. note:: + Dataset is assumed to be of constant size. + Arguments: + dataset: Dataset used for sampling. + num_replicas (optional): Number of processes participating in + distributed training. + rank (optional): Rank of the current process within num_replicas. + """ + + def __init__(self, dataset, num_replicas=None, rank=None): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + + def __iter__(self): + indices = list(range(len(self.dataset))) + + # add extra samples to make it evenly divisible + indices += indices[:(self.total_size - len(indices))] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) + + def __len__(self): + return self.num_samples + + +class RepeatAugSampler(Sampler): + """Sampler that restricts data loading to a subset of the dataset for distributed, + with repeated augmentation. + It ensures that different each augmented version of a sample will be visible to a + different process (GPU). 
Heavily based on torch.utils.data.DistributedSampler + + This sampler was taken from https://github.com/facebookresearch/deit/blob/0c4b8f60/samplers.py + Used in + Copyright (c) 2015-present, Facebook, Inc. + """ + + def __init__( + self, + dataset, + num_replicas=None, + rank=None, + shuffle=True, + num_repeats=3, + selected_round=256, + selected_ratio=0, + ): + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + self.shuffle = shuffle + self.num_repeats = num_repeats + self.epoch = 0 + self.num_samples = int(math.ceil(len(self.dataset) * num_repeats / self.num_replicas)) + self.total_size = self.num_samples * self.num_replicas + # Determine the number of samples to select per epoch for each rank. + # num_selected logic defaults to be the same as original RASampler impl, but this one can be tweaked + # via selected_ratio and selected_round args. + selected_ratio = selected_ratio or num_replicas # ratio to reduce selected samples by, num_replicas if 0 + if selected_round: + self.num_selected_samples = int(math.floor( + len(self.dataset) // selected_round * selected_round / selected_ratio)) + else: + self.num_selected_samples = int(math.ceil(len(self.dataset) / selected_ratio)) + + def __iter__(self): + # deterministically shuffle based on epoch + g = torch.Generator() + g.manual_seed(self.epoch) + if self.shuffle: + indices = torch.randperm(len(self.dataset), generator=g) + else: + indices = torch.arange(start=0, end=len(self.dataset)) + + # produce repeats e.g. [0, 0, 0, 1, 1, 1, 2, 2, 2....] 
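As a standalone toy illustration of the repeat-and-stride pattern implemented below, with hypothetical sizes (8 samples, 3 repeats, 4 ranks): each index is repeated, the list is padded to an even length, and each rank then takes every `num_replicas`-th entry, so the repeated copies of a sample end up on different processes.

```python
import math
import torch

dataset_len, num_repeats, num_replicas = 8, 3, 4           # toy sizes
num_samples = int(math.ceil(dataset_len * num_repeats / num_replicas))
total_size = num_samples * num_replicas

indices = torch.repeat_interleave(torch.arange(dataset_len), repeats=num_repeats).tolist()
indices += indices[:total_size - len(indices)]              # pad to an even split across ranks

for rank in range(num_replicas):
    print('rank', rank, indices[rank:total_size:num_replicas])
# in this example every repeated copy of an index is served by a different rank (GPU)
```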
+ if isinstance(self.num_repeats, float) and not self.num_repeats.is_integer(): + # resample for repeats w/ non-integer ratio + repeat_size = math.ceil(self.num_repeats * len(self.dataset)) + indices = indices[torch.tensor([int(i // self.num_repeats) for i in range(repeat_size)])] + else: + indices = torch.repeat_interleave(indices, repeats=int(self.num_repeats), dim=0) + indices = indices.tolist() # leaving as tensor thrashes dataloader memory + # add extra samples to make it evenly divisible + padding_size = self.total_size - len(indices) + if padding_size > 0: + indices += indices[:padding_size] + assert len(indices) == self.total_size + + # subsample per rank + indices = indices[self.rank:self.total_size:self.num_replicas] + assert len(indices) == self.num_samples + + # return up to num selected samples + return iter(indices[:self.num_selected_samples]) + + def __len__(self): + return self.num_selected_samples + + def set_epoch(self, epoch): + self.epoch = epoch diff --git a/custom_timm/data/loader.py b/custom_timm/data/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..ecc075c027b4dd46e63fb1f33d819d2555bff25e --- /dev/null +++ b/custom_timm/data/loader.py @@ -0,0 +1,308 @@ +""" Loader Factory, Fast Collate, CUDA Prefetcher + +Prefetcher and Fast Collate inspired by NVIDIA APEX example at +https://github.com/NVIDIA/apex/commit/d5e2bb4bdeedd27b1dfaf5bb2b24d6c000dee9be#diff-cf86c282ff7fba81fad27a559379d5bf + +Hacked together by / Copyright 2019, Ross Wightman +""" +import random +from functools import partial +from itertools import repeat +from typing import Callable + +import torch.utils.data +import numpy as np + +from .transforms_factory import create_transform +from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .distributed_sampler import OrderedDistributedSampler, RepeatAugSampler +from .random_erasing import RandomErasing +from .mixup import FastCollateMixup + + +def fast_collate(batch): + """ A fast collation function optimized for uint8 images (np array or torch) and int64 targets (labels)""" + assert isinstance(batch[0], tuple) + batch_size = len(batch) + if isinstance(batch[0][0], tuple): + # This branch 'deinterleaves' and flattens tuples of input tensors into one tensor ordered by position + # such that all tuple of position n will end up in a torch.split(tensor, batch_size) in nth position + inner_tuple_size = len(batch[0][0]) + flattened_batch_size = batch_size * inner_tuple_size + targets = torch.zeros(flattened_batch_size, dtype=torch.int64) + tensor = torch.zeros((flattened_batch_size, *batch[0][0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + assert len(batch[i][0]) == inner_tuple_size # all input tensor tuples must be same length + for j in range(inner_tuple_size): + targets[i + j * batch_size] = batch[i][1] + tensor[i + j * batch_size] += torch.from_numpy(batch[i][0][j]) + return tensor, targets + elif isinstance(batch[0][0], np.ndarray): + targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) + assert len(targets) == batch_size + tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + tensor[i] += torch.from_numpy(batch[i][0]) + return tensor, targets + elif isinstance(batch[0][0], torch.Tensor): + targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) + assert len(targets) == batch_size + tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + tensor[i].copy_(batch[i][0]) + return 
tensor, targets + else: + assert False + + +def expand_to_chs(x, n): + if not isinstance(x, (tuple, list)): + x = tuple(repeat(x, n)) + elif len(x) == 1: + x = x * n + else: + assert len(x) == n, 'normalization stats must match image channels' + return x + + +class PrefetchLoader: + + def __init__( + self, + loader, + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + channels=3, + fp16=False, + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0): + + mean = expand_to_chs(mean, channels) + std = expand_to_chs(std, channels) + normalization_shape = (1, channels, 1, 1) + + self.loader = loader + self.mean = torch.tensor([x * 255 for x in mean]).cuda().view(normalization_shape) + self.std = torch.tensor([x * 255 for x in std]).cuda().view(normalization_shape) + self.fp16 = fp16 + if fp16: + self.mean = self.mean.half() + self.std = self.std.half() + if re_prob > 0.: + self.random_erasing = RandomErasing( + probability=re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits) + else: + self.random_erasing = None + + def __iter__(self): + stream = torch.cuda.Stream() + first = True + + for next_input, next_target in self.loader: + with torch.cuda.stream(stream): + next_input = next_input.cuda(non_blocking=True) + next_target = next_target.cuda(non_blocking=True) + if self.fp16: + next_input = next_input.half().sub_(self.mean).div_(self.std) + else: + next_input = next_input.float().sub_(self.mean).div_(self.std) + if self.random_erasing is not None: + next_input = self.random_erasing(next_input) + + if not first: + yield input, target + else: + first = False + + torch.cuda.current_stream().wait_stream(stream) + input = next_input + target = next_target + + yield input, target + + def __len__(self): + return len(self.loader) + + @property + def sampler(self): + return self.loader.sampler + + @property + def dataset(self): + return self.loader.dataset + + @property + def mixup_enabled(self): + if isinstance(self.loader.collate_fn, FastCollateMixup): + return self.loader.collate_fn.mixup_enabled + else: + return False + + @mixup_enabled.setter + def mixup_enabled(self, x): + if isinstance(self.loader.collate_fn, FastCollateMixup): + self.loader.collate_fn.mixup_enabled = x + + +def _worker_init(worker_id, worker_seeding='all'): + worker_info = torch.utils.data.get_worker_info() + assert worker_info.id == worker_id + if isinstance(worker_seeding, Callable): + seed = worker_seeding(worker_info) + random.seed(seed) + torch.manual_seed(seed) + np.random.seed(seed % (2 ** 32 - 1)) + else: + assert worker_seeding in ('all', 'part') + # random / torch seed already called in dataloader iter class w/ worker_info.seed + # to reproduce some old results (same seed + hparam combo), partial seeding is required (skip numpy re-seed) + if worker_seeding == 'all': + np.random.seed(worker_info.seed % (2 ** 32 - 1)) + + +def create_loader( + dataset, + input_size, + batch_size, + is_training=False, + use_prefetcher=True, + no_aug=False, + re_prob=0., + re_mode='const', + re_count=1, + re_split=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + num_aug_repeats=0, + num_aug_splits=0, + interpolation='bilinear', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + num_workers=1, + distributed=False, + crop_pct=None, + collate_fn=None, + pin_memory=False, + fp16=False, + tf_preprocessing=False, + use_multi_epochs_loader=False, + persistent_workers=True, + worker_seeding='all', +): + re_num_splits = 0 + if re_split: + # apply RE to 
second half of batch if no aug split otherwise line up with aug split + re_num_splits = num_aug_splits or 2 + dataset.transform = create_transform( + input_size, + is_training=is_training, + use_prefetcher=use_prefetcher, + no_aug=no_aug, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + mean=mean, + std=std, + crop_pct=crop_pct, + tf_preprocessing=tf_preprocessing, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=num_aug_splits > 0, + ) + + sampler = None + if distributed and not isinstance(dataset, torch.utils.data.IterableDataset): + if is_training: + if num_aug_repeats: + sampler = RepeatAugSampler(dataset, num_repeats=num_aug_repeats) + else: + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + # This will add extra duplicate entries to result in equal num + # of samples per-process, will slightly alter validation results + sampler = OrderedDistributedSampler(dataset) + else: + assert num_aug_repeats == 0, "RepeatAugment not currently supported in non-distributed or IterableDataset use" + + if collate_fn is None: + collate_fn = fast_collate if use_prefetcher else torch.utils.data.dataloader.default_collate + + loader_class = torch.utils.data.DataLoader + if use_multi_epochs_loader: + loader_class = MultiEpochsDataLoader + + loader_args = dict( + batch_size=batch_size, + shuffle=not isinstance(dataset, torch.utils.data.IterableDataset) and sampler is None and is_training, + num_workers=num_workers, + sampler=sampler, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=is_training, + worker_init_fn=partial(_worker_init, worker_seeding=worker_seeding), + persistent_workers=persistent_workers + ) + try: + loader = loader_class(dataset, **loader_args) + except TypeError as e: + loader_args.pop('persistent_workers') # only in Pytorch 1.7+ + loader = loader_class(dataset, **loader_args) + if use_prefetcher: + prefetch_re_prob = re_prob if is_training and not no_aug else 0. + loader = PrefetchLoader( + loader, + mean=mean, + std=std, + channels=input_size[0], + fp16=fp16, + re_prob=prefetch_re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits + ) + + return loader + + +class MultiEpochsDataLoader(torch.utils.data.DataLoader): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._DataLoader__initialized = False + self.batch_sampler = _RepeatSampler(self.batch_sampler) + self._DataLoader__initialized = True + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever. 
+ + Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) diff --git a/custom_timm/data/mixup.py b/custom_timm/data/mixup.py new file mode 100644 index 0000000000000000000000000000000000000000..c8789a0c35cbf545b508cdf513b890cab80eebf9 --- /dev/null +++ b/custom_timm/data/mixup.py @@ -0,0 +1,316 @@ +""" Mixup and Cutmix + +Papers: +mixup: Beyond Empirical Risk Minimization (https://arxiv.org/abs/1710.09412) + +CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features (https://arxiv.org/abs/1905.04899) + +Code Reference: +CutMix: https://github.com/clovaai/CutMix-PyTorch + +Hacked together by / Copyright 2019, Ross Wightman +""" +import numpy as np +import torch + + +def one_hot(x, num_classes, on_value=1., off_value=0., device='cuda'): + x = x.long().view(-1, 1) + return torch.full((x.size()[0], num_classes), off_value, device=device).scatter_(1, x, on_value) + + +def mixup_target(target, num_classes, lam=1., smoothing=0.0, device='cuda'): + off_value = smoothing / num_classes + on_value = 1. - smoothing + off_value + y1 = one_hot(target, num_classes, on_value=on_value, off_value=off_value, device=device) + y2 = one_hot(target.flip(0), num_classes, on_value=on_value, off_value=off_value, device=device) + return y1 * lam + y2 * (1. - lam) + + +def rand_bbox(img_shape, lam, margin=0., count=None): + """ Standard CutMix bounding-box + Generates a random square bbox based on lambda value. This impl includes + support for enforcing a border margin as percent of bbox dimensions. + + Args: + img_shape (tuple): Image shape as tuple + lam (float): Cutmix lambda value + margin (float): Percentage of bbox dimension to enforce as margin (reduce amount of box outside image) + count (int): Number of bbox to generate + """ + ratio = np.sqrt(1 - lam) + img_h, img_w = img_shape[-2:] + cut_h, cut_w = int(img_h * ratio), int(img_w * ratio) + margin_y, margin_x = int(margin * cut_h), int(margin * cut_w) + cy = np.random.randint(0 + margin_y, img_h - margin_y, size=count) + cx = np.random.randint(0 + margin_x, img_w - margin_x, size=count) + yl = np.clip(cy - cut_h // 2, 0, img_h) + yh = np.clip(cy + cut_h // 2, 0, img_h) + xl = np.clip(cx - cut_w // 2, 0, img_w) + xh = np.clip(cx + cut_w // 2, 0, img_w) + return yl, yh, xl, xh + + +def rand_bbox_minmax(img_shape, minmax, count=None): + """ Min-Max CutMix bounding-box + Inspired by Darknet cutmix impl, generates a random rectangular bbox + based on min/max percent values applied to each dimension of the input image. + + Typical defaults for minmax are usually in the .2-.3 for min and .8-.9 range for max. + + Args: + img_shape (tuple): Image shape as tuple + minmax (tuple or list): Min and max bbox ratios (as percent of image size) + count (int): Number of bbox to generate + """ + assert len(minmax) == 2 + img_h, img_w = img_shape[-2:] + cut_h = np.random.randint(int(img_h * minmax[0]), int(img_h * minmax[1]), size=count) + cut_w = np.random.randint(int(img_w * minmax[0]), int(img_w * minmax[1]), size=count) + yl = np.random.randint(0, img_h - cut_h, size=count) + xl = np.random.randint(0, img_w - cut_w, size=count) + yu = yl + cut_h + xu = xl + cut_w + return yl, yu, xl, xu + + +def cutmix_bbox_and_lam(img_shape, lam, ratio_minmax=None, correct_lam=True, count=None): + """ Generate bbox and apply lambda correction. 
+ """ + if ratio_minmax is not None: + yl, yu, xl, xu = rand_bbox_minmax(img_shape, ratio_minmax, count=count) + else: + yl, yu, xl, xu = rand_bbox(img_shape, lam, count=count) + if correct_lam or ratio_minmax is not None: + bbox_area = (yu - yl) * (xu - xl) + lam = 1. - bbox_area / float(img_shape[-2] * img_shape[-1]) + return (yl, yu, xl, xu), lam + + +class Mixup: + """ Mixup/Cutmix that applies different params to each element or whole batch + + Args: + mixup_alpha (float): mixup alpha value, mixup is active if > 0. + cutmix_alpha (float): cutmix alpha value, cutmix is active if > 0. + cutmix_minmax (List[float]): cutmix min/max image ratio, cutmix is active and uses this vs alpha if not None. + prob (float): probability of applying mixup or cutmix per batch or element + switch_prob (float): probability of switching to cutmix instead of mixup when both are active + mode (str): how to apply mixup/cutmix params (per 'batch', 'pair' (pair of elements), 'elem' (element) + correct_lam (bool): apply lambda correction when cutmix bbox clipped by image borders + label_smoothing (float): apply label smoothing to the mixed target tensor + num_classes (int): number of classes for target + """ + def __init__(self, mixup_alpha=1., cutmix_alpha=0., cutmix_minmax=None, prob=1.0, switch_prob=0.5, + mode='batch', correct_lam=True, label_smoothing=0.1, num_classes=1000): + self.mixup_alpha = mixup_alpha + self.cutmix_alpha = cutmix_alpha + self.cutmix_minmax = cutmix_minmax + if self.cutmix_minmax is not None: + assert len(self.cutmix_minmax) == 2 + # force cutmix alpha == 1.0 when minmax active to keep logic simple & safe + self.cutmix_alpha = 1.0 + self.mix_prob = prob + self.switch_prob = switch_prob + self.label_smoothing = label_smoothing + self.num_classes = num_classes + self.mode = mode + self.correct_lam = correct_lam # correct lambda based on clipped area for cutmix + self.mixup_enabled = True # set to false to disable mixing (intended tp be set by train loop) + + def _params_per_elem(self, batch_size): + lam = np.ones(batch_size, dtype=np.float32) + use_cutmix = np.zeros(batch_size, dtype=np.bool) + if self.mixup_enabled: + if self.mixup_alpha > 0. and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand(batch_size) < self.switch_prob + lam_mix = np.where( + use_cutmix, + np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size), + np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size)) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha, size=batch_size) + elif self.cutmix_alpha > 0.: + use_cutmix = np.ones(batch_size, dtype=np.bool) + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha, size=batch_size) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = np.where(np.random.rand(batch_size) < self.mix_prob, lam_mix.astype(np.float32), lam) + return lam, use_cutmix + + def _params_per_batch(self): + lam = 1. + use_cutmix = False + if self.mixup_enabled and np.random.rand() < self.mix_prob: + if self.mixup_alpha > 0. 
and self.cutmix_alpha > 0.: + use_cutmix = np.random.rand() < self.switch_prob + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) if use_cutmix else \ + np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.mixup_alpha > 0.: + lam_mix = np.random.beta(self.mixup_alpha, self.mixup_alpha) + elif self.cutmix_alpha > 0.: + use_cutmix = True + lam_mix = np.random.beta(self.cutmix_alpha, self.cutmix_alpha) + else: + assert False, "One of mixup_alpha > 0., cutmix_alpha > 0., cutmix_minmax not None should be true." + lam = float(lam_mix) + return lam, use_cutmix + + def _mix_elem(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_pair(self, x): + batch_size = len(x) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + x_orig = x.clone() # need to keep an unmodified original for mixing source + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + if lam != 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x[i].shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[i][:, yl:yh, xl:xh] = x_orig[j][:, yl:yh, xl:xh] + x[j][:, yl:yh, xl:xh] = x_orig[i][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + x[i] = x[i] * lam + x_orig[j] * (1 - lam) + x[j] = x[j] * lam + x_orig[i] * (1 - lam) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch, device=x.device, dtype=x.dtype).unsqueeze(1) + + def _mix_batch(self, x): + lam, use_cutmix = self._params_per_batch() + if lam == 1.: + return 1. + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + x.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + x[:, :, yl:yh, xl:xh] = x.flip(0)[:, :, yl:yh, xl:xh] + else: + x_flipped = x.flip(0).mul_(1. - lam) + x.mul_(lam).add_(x_flipped) + return lam + + def __call__(self, x, target): + assert len(x) % 2 == 0, 'Batch size should be even when using this' + if self.mode == 'elem': + lam = self._mix_elem(x) + elif self.mode == 'pair': + lam = self._mix_pair(x) + else: + lam = self._mix_batch(x) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing, x.device) + return x, target + + +class FastCollateMixup(Mixup): + """ Fast Collate w/ Mixup/Cutmix that applies different params to each element or whole batch + + A Mixup impl that's performed while collating the batches. 
+ """ + + def _mix_elem_collate(self, output, batch, half=False): + batch_size = len(batch) + num_elem = batch_size // 2 if half else batch_size + assert len(output) == num_elem + lam_batch, use_cutmix = self._params_per_elem(num_elem) + for i in range(num_elem): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed = batch[i][0] + if lam != 1.: + if use_cutmix[i]: + if not half: + mixed = mixed.copy() + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + lam_batch[i] = lam + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + if half: + lam_batch = np.concatenate((lam_batch, np.ones(num_elem))) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_pair_collate(self, output, batch): + batch_size = len(batch) + lam_batch, use_cutmix = self._params_per_elem(batch_size // 2) + for i in range(batch_size // 2): + j = batch_size - i - 1 + lam = lam_batch[i] + mixed_i = batch[i][0] + mixed_j = batch[j][0] + assert 0 <= lam <= 1.0 + if lam < 1.: + if use_cutmix[i]: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + patch_i = mixed_i[:, yl:yh, xl:xh].copy() + mixed_i[:, yl:yh, xl:xh] = mixed_j[:, yl:yh, xl:xh] + mixed_j[:, yl:yh, xl:xh] = patch_i + lam_batch[i] = lam + else: + mixed_temp = mixed_i.astype(np.float32) * lam + mixed_j.astype(np.float32) * (1 - lam) + mixed_j = mixed_j.astype(np.float32) * lam + mixed_i.astype(np.float32) * (1 - lam) + mixed_i = mixed_temp + np.rint(mixed_j, out=mixed_j) + np.rint(mixed_i, out=mixed_i) + output[i] += torch.from_numpy(mixed_i.astype(np.uint8)) + output[j] += torch.from_numpy(mixed_j.astype(np.uint8)) + lam_batch = np.concatenate((lam_batch, lam_batch[::-1])) + return torch.tensor(lam_batch).unsqueeze(1) + + def _mix_batch_collate(self, output, batch): + batch_size = len(batch) + lam, use_cutmix = self._params_per_batch() + if use_cutmix: + (yl, yh, xl, xh), lam = cutmix_bbox_and_lam( + output.shape, lam, ratio_minmax=self.cutmix_minmax, correct_lam=self.correct_lam) + for i in range(batch_size): + j = batch_size - i - 1 + mixed = batch[i][0] + if lam != 1.: + if use_cutmix: + mixed = mixed.copy() # don't want to modify the original while iterating + mixed[:, yl:yh, xl:xh] = batch[j][0][:, yl:yh, xl:xh] + else: + mixed = mixed.astype(np.float32) * lam + batch[j][0].astype(np.float32) * (1 - lam) + np.rint(mixed, out=mixed) + output[i] += torch.from_numpy(mixed.astype(np.uint8)) + return lam + + def __call__(self, batch, _=None): + batch_size = len(batch) + assert batch_size % 2 == 0, 'Batch size should be even when using this' + half = 'half' in self.mode + if half: + batch_size //= 2 + output = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + if self.mode == 'elem' or self.mode == 'half': + lam = self._mix_elem_collate(output, batch, half=half) + elif self.mode == 'pair': + lam = self._mix_pair_collate(output, batch) + else: + lam = self._mix_batch_collate(output, batch) + target = torch.tensor([b[1] for b in batch], dtype=torch.int64) + target = mixup_target(target, self.num_classes, lam, self.label_smoothing, device='cpu') + target = target[:batch_size] + return output, target + diff --git a/custom_timm/data/parsers/__init__.py b/custom_timm/data/parsers/__init__.py new file mode 100644 index 
0000000000000000000000000000000000000000..4e820d5e027ba82c937829ad50b2b2c9a97d2f28 --- /dev/null +++ b/custom_timm/data/parsers/__init__.py @@ -0,0 +1,2 @@ +from .parser_factory import create_parser +from .img_extensions import * diff --git a/custom_timm/data/parsers/class_map.py b/custom_timm/data/parsers/class_map.py new file mode 100644 index 0000000000000000000000000000000000000000..6cf3f57e014566e165374acae8dec031c02048f8 --- /dev/null +++ b/custom_timm/data/parsers/class_map.py @@ -0,0 +1,22 @@ +import os +import pickle + +def load_class_map(map_or_filename, root=''): + if isinstance(map_or_filename, dict): + assert dict, 'class_map dict must be non-empty' + return map_or_filename + class_map_path = map_or_filename + if not os.path.exists(class_map_path): + class_map_path = os.path.join(root, class_map_path) + assert os.path.exists(class_map_path), 'Cannot locate specified class map file (%s)' % map_or_filename + class_map_ext = os.path.splitext(map_or_filename)[-1].lower() + if class_map_ext == '.txt': + with open(class_map_path) as f: + class_to_idx = {v.strip(): k for k, v in enumerate(f)} + elif class_map_ext == '.pkl': + with open(class_map_path,'rb') as f: + class_to_idx = pickle.load(f) + else: + assert False, f'Unsupported class map file extension ({class_map_ext}).' + return class_to_idx + diff --git a/custom_timm/data/parsers/img_extensions.py b/custom_timm/data/parsers/img_extensions.py new file mode 100644 index 0000000000000000000000000000000000000000..45c85aabd00ca5ebf7bd6fa85c674570fe60f9c8 --- /dev/null +++ b/custom_timm/data/parsers/img_extensions.py @@ -0,0 +1,50 @@ +from copy import deepcopy + +__all__ = ['get_img_extensions', 'is_img_extension', 'set_img_extensions', 'add_img_extensions', 'del_img_extensions'] + + +IMG_EXTENSIONS = ('.png', '.jpg', '.jpeg') # singleton, kept public for bwd compat use +_IMG_EXTENSIONS_SET = set(IMG_EXTENSIONS) # set version, private, kept in sync + + +def _set_extensions(extensions): + global IMG_EXTENSIONS + global _IMG_EXTENSIONS_SET + dedupe = set() # NOTE de-duping tuple while keeping original order + IMG_EXTENSIONS = tuple(x for x in extensions if x not in dedupe and not dedupe.add(x)) + _IMG_EXTENSIONS_SET = set(extensions) + + +def _valid_extension(x: str): + return x and isinstance(x, str) and len(x) >= 2 and x.startswith('.') + + +def is_img_extension(ext): + return ext in _IMG_EXTENSIONS_SET + + +def get_img_extensions(as_set=False): + return deepcopy(_IMG_EXTENSIONS_SET if as_set else IMG_EXTENSIONS) + + +def set_img_extensions(extensions): + assert len(extensions) + for x in extensions: + assert _valid_extension(x) + _set_extensions(extensions) + + +def add_img_extensions(ext): + if not isinstance(ext, (list, tuple, set)): + ext = (ext,) + for x in ext: + assert _valid_extension(x) + extensions = IMG_EXTENSIONS + tuple(ext) + _set_extensions(extensions) + + +def del_img_extensions(ext): + if not isinstance(ext, (list, tuple, set)): + ext = (ext,) + extensions = tuple(x for x in IMG_EXTENSIONS if x not in ext) + _set_extensions(extensions) diff --git a/custom_timm/data/parsers/parser.py b/custom_timm/data/parsers/parser.py new file mode 100644 index 0000000000000000000000000000000000000000..76ab6d18283644702424d0ff2af5832d6d6dd3b7 --- /dev/null +++ b/custom_timm/data/parsers/parser.py @@ -0,0 +1,17 @@ +from abc import abstractmethod + + +class Parser: + def __init__(self): + pass + + @abstractmethod + def _filename(self, index, basename=False, absolute=False): + pass + + def filename(self, index, basename=False, 
absolute=False): + return self._filename(index, basename=basename, absolute=absolute) + + def filenames(self, basename=False, absolute=False): + return [self._filename(index, basename=basename, absolute=absolute) for index in range(len(self))] + diff --git a/custom_timm/data/parsers/parser_factory.py b/custom_timm/data/parsers/parser_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..0665c02a8b4db12b8ac6b7095999751c5b26f384 --- /dev/null +++ b/custom_timm/data/parsers/parser_factory.py @@ -0,0 +1,28 @@ +import os + +from .parser_image_folder import ParserImageFolder +from .parser_image_in_tar import ParserImageInTar + + +def create_parser(name, root, split='train', **kwargs): + name = name.lower() + name = name.split('/', 2) + prefix = '' + if len(name) > 1: + prefix = name[0] + name = name[-1] + + # FIXME improve the selection right now just tfds prefix or fallback path, will need options to + # explicitly select other options shortly + if prefix == 'tfds': + from .parser_tfds import ParserTfds # defer tensorflow import + parser = ParserTfds(root, name, split=split, **kwargs) + else: + assert os.path.exists(root) + # default fallback path (backwards compat), use image tar if root is a .tar file, otherwise image folder + # FIXME support split here, in parser? + if os.path.isfile(root) and os.path.splitext(root)[1] == '.tar': + parser = ParserImageInTar(root, **kwargs) + else: + parser = ParserImageFolder(root, **kwargs) + return parser diff --git a/custom_timm/data/parsers/parser_image_folder.py b/custom_timm/data/parsers/parser_image_folder.py new file mode 100644 index 0000000000000000000000000000000000000000..d82b024377e99a26fb87c92256a076505d894666 --- /dev/null +++ b/custom_timm/data/parsers/parser_image_folder.py @@ -0,0 +1,90 @@ +""" A dataset parser that reads images from folders + +Folders are scannerd recursively to find image files. Labels are based +on the folder hierarchy, just leaf folders by default. + +Hacked together by / Copyright 2020 Ross Wightman +""" +import os +from typing import Dict, List, Optional, Set, Tuple, Union + +from custom_timm.utils.misc import natural_key + +from .class_map import load_class_map +from .img_extensions import get_img_extensions +from .parser import Parser + + +def find_images_and_targets( + folder: str, + types: Optional[Union[List, Tuple, Set]] = None, + class_to_idx: Optional[Dict] = None, + leaf_name_only: bool = True, + sort: bool = True +): + """ Walk folder recursively to discover images and map them to classes by folder names. 
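+
+    Illustrative example (editor's note, not part of the original docstring): with the
+    default ``leaf_name_only=True``, a tree containing ``root/dog/001.jpg`` and
+    ``root/cat/002.jpg`` yields ``class_to_idx == {'cat': 0, 'dog': 1}`` (labels sorted
+    with ``natural_key``) and sample tuples such as ``('root/cat/002.jpg', 0)``.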
+ + Args: + folder: root of folder to recrusively search + types: types (file extensions) to search for in path + class_to_idx: specify mapping for class (folder name) to class index if set + leaf_name_only: use only leaf-name of folder walk for class names + sort: re-sort found images by name (for consistent ordering) + + Returns: + A list of image and target tuples, class_to_idx mapping + """ + types = get_img_extensions(as_set=True) if not types else set(types) + labels = [] + filenames = [] + for root, subdirs, files in os.walk(folder, topdown=False, followlinks=True): + rel_path = os.path.relpath(root, folder) if (root != folder) else '' + label = os.path.basename(rel_path) if leaf_name_only else rel_path.replace(os.path.sep, '_') + for f in files: + base, ext = os.path.splitext(f) + if ext.lower() in types: + filenames.append(os.path.join(root, f)) + labels.append(label) + if class_to_idx is None: + # building class index + unique_labels = set(labels) + sorted_labels = list(sorted(unique_labels, key=natural_key)) + class_to_idx = {c: idx for idx, c in enumerate(sorted_labels)} + images_and_targets = [(f, class_to_idx[l]) for f, l in zip(filenames, labels) if l in class_to_idx] + if sort: + images_and_targets = sorted(images_and_targets, key=lambda k: natural_key(k[0])) + return images_and_targets, class_to_idx + + +class ParserImageFolder(Parser): + + def __init__( + self, + root, + class_map=''): + super().__init__() + + self.root = root + class_to_idx = None + if class_map: + class_to_idx = load_class_map(class_map, root) + self.samples, self.class_to_idx = find_images_and_targets(root, class_to_idx=class_to_idx) + if len(self.samples) == 0: + raise RuntimeError( + f'Found 0 images in subfolders of {root}. ' + f'Supported image extensions are {", ".join(get_img_extensions())}') + + def __getitem__(self, index): + path, target = self.samples[index] + return open(path, 'rb'), target + + def __len__(self): + return len(self.samples) + + def _filename(self, index, basename=False, absolute=False): + filename = self.samples[index][0] + if basename: + filename = os.path.basename(filename) + elif not absolute: + filename = os.path.relpath(filename, self.root) + return filename diff --git a/custom_timm/data/parsers/parser_image_in_tar.py b/custom_timm/data/parsers/parser_image_in_tar.py new file mode 100644 index 0000000000000000000000000000000000000000..7d3c1765b5bd3809f93a5c1707b472f7f54e5eb7 --- /dev/null +++ b/custom_timm/data/parsers/parser_image_in_tar.py @@ -0,0 +1,229 @@ +""" A dataset parser that reads tarfile based datasets + +This parser can read and extract image samples from: +* a single tar of image files +* a folder of multiple tarfiles containing imagefiles +* a tar of tars containing image files + +Labels are based on the combined folder and/or tar name structure. 
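+
+Illustrative layout (editor's example, not part of the original docstring): a folder
+holding ``class_a.tar`` and ``class_b.tar``, each with image files at its top level,
+produces the labels ``class_a`` and ``class_b``; for a single tar, or tars nested inside
+a tar, the leaf folder or child-tar name containing each image is used instead.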
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +import logging +import os +import pickle +import tarfile +from glob import glob +from typing import List, Tuple, Dict, Set, Optional, Union + +import numpy as np + +from custom_timm.utils.misc import natural_key + +from .class_map import load_class_map +from .img_extensions import get_img_extensions +from .parser import Parser + +_logger = logging.getLogger(__name__) +CACHE_FILENAME_SUFFIX = '_tarinfos.pickle' + + +class TarState: + + def __init__(self, tf: tarfile.TarFile = None, ti: tarfile.TarInfo = None): + self.tf: tarfile.TarFile = tf + self.ti: tarfile.TarInfo = ti + self.children: Dict[str, TarState] = {} # child states (tars within tars) + + def reset(self): + self.tf = None + + +def _extract_tarinfo(tf: tarfile.TarFile, parent_info: Dict, extensions: Set[str]): + sample_count = 0 + for i, ti in enumerate(tf): + if not ti.isfile(): + continue + dirname, basename = os.path.split(ti.path) + name, ext = os.path.splitext(basename) + ext = ext.lower() + if ext == '.tar': + with tarfile.open(fileobj=tf.extractfile(ti), mode='r|') as ctf: + child_info = dict( + name=ti.name, path=os.path.join(parent_info['path'], name), ti=ti, children=[], samples=[]) + sample_count += _extract_tarinfo(ctf, child_info, extensions=extensions) + _logger.debug(f'{i}/?. Extracted child tarinfos from {ti.name}. {len(child_info["samples"])} images.') + parent_info['children'].append(child_info) + elif ext in extensions: + parent_info['samples'].append(ti) + sample_count += 1 + return sample_count + + +def extract_tarinfos( + root, + class_name_to_idx: Optional[Dict] = None, + cache_tarinfo: Optional[bool] = None, + extensions: Optional[Union[List, Tuple, Set]] = None, + sort: bool = True +): + extensions = get_img_extensions(as_set=True) if not extensions else set(extensions) + root_is_tar = False + if os.path.isfile(root): + assert os.path.splitext(root)[-1].lower() == '.tar' + tar_filenames = [root] + root, root_name = os.path.split(root) + root_name = os.path.splitext(root_name)[0] + root_is_tar = True + else: + root_name = root.strip(os.path.sep).split(os.path.sep)[-1] + tar_filenames = glob(os.path.join(root, '*.tar'), recursive=True) + num_tars = len(tar_filenames) + tar_bytes = sum([os.path.getsize(f) for f in tar_filenames]) + assert num_tars, f'No .tar files found at specified path ({root}).' + + _logger.info(f'Scanning {tar_bytes/1024**2:.2f}MB of tar files...') + info = dict(tartrees=[]) + cache_path = '' + if cache_tarinfo is None: + cache_tarinfo = True if tar_bytes > 10*1024**3 else False # FIXME magic number, 10GB + if cache_tarinfo: + cache_filename = '_' + root_name + CACHE_FILENAME_SUFFIX + cache_path = os.path.join(root, cache_filename) + if os.path.exists(cache_path): + _logger.info(f'Reading tar info from cache file {cache_path}.') + with open(cache_path, 'rb') as pf: + info = pickle.load(pf) + assert len(info['tartrees']) == num_tars, "Cached tartree len doesn't match number of tarfiles" + else: + for i, fn in enumerate(tar_filenames): + path = '' if root_is_tar else os.path.splitext(os.path.basename(fn))[0] + with tarfile.open(fn, mode='r|') as tf: # tarinfo scans done in streaming mode + parent_info = dict(name=os.path.relpath(fn, root), path=path, ti=None, children=[], samples=[]) + num_samples = _extract_tarinfo(tf, parent_info, extensions=extensions) + num_children = len(parent_info["children"]) + _logger.debug( + f'{i}/{num_tars}. Extracted tarinfos from {fn}. 
{num_children} children, {num_samples} samples.') + info['tartrees'].append(parent_info) + if cache_path: + _logger.info(f'Writing tar info to cache file {cache_path}.') + with open(cache_path, 'wb') as pf: + pickle.dump(info, pf) + + samples = [] + labels = [] + build_class_map = False + if class_name_to_idx is None: + build_class_map = True + + # Flatten tartree info into lists of samples and targets w/ targets based on label id via + # class map arg or from unique paths. + # NOTE: currently only flattening up to two-levels, filesystem .tars and then one level of sub-tar children + # this covers my current use cases and keeps things a little easier to test for now. + tarfiles = [] + + def _label_from_paths(*path, leaf_only=True): + path = os.path.join(*path).strip(os.path.sep) + return path.split(os.path.sep)[-1] if leaf_only else path.replace(os.path.sep, '_') + + def _add_samples(info, fn): + added = 0 + for s in info['samples']: + label = _label_from_paths(info['path'], os.path.dirname(s.path)) + if not build_class_map and label not in class_name_to_idx: + continue + samples.append((s, fn, info['ti'])) + labels.append(label) + added += 1 + return added + + _logger.info(f'Collecting samples and building tar states.') + for parent_info in info['tartrees']: + # if tartree has children, we assume all samples are at the child level + tar_name = None if root_is_tar else parent_info['name'] + tar_state = TarState() + parent_added = 0 + for child_info in parent_info['children']: + child_added = _add_samples(child_info, fn=tar_name) + if child_added: + tar_state.children[child_info['name']] = TarState(ti=child_info['ti']) + parent_added += child_added + parent_added += _add_samples(parent_info, fn=tar_name) + if parent_added: + tarfiles.append((tar_name, tar_state)) + del info + + if build_class_map: + # build class index + sorted_labels = list(sorted(set(labels), key=natural_key)) + class_name_to_idx = {c: idx for idx, c in enumerate(sorted_labels)} + + _logger.info(f'Mapping targets and sorting samples.') + samples_and_targets = [(s, class_name_to_idx[l]) for s, l in zip(samples, labels) if l in class_name_to_idx] + if sort: + samples_and_targets = sorted(samples_and_targets, key=lambda k: natural_key(k[0][0].path)) + samples, targets = zip(*samples_and_targets) + samples = np.array(samples) + targets = np.array(targets) + _logger.info(f'Finished processing {len(samples)} samples across {len(tarfiles)} tar files.') + return samples, targets, class_name_to_idx, tarfiles + + +class ParserImageInTar(Parser): + """ Multi-tarfile dataset parser where there is one .tar file per class + """ + + def __init__(self, root, class_map='', cache_tarfiles=True, cache_tarinfo=None): + super().__init__() + + class_name_to_idx = None + if class_map: + class_name_to_idx = load_class_map(class_map, root) + self.root = root + self.samples, self.targets, self.class_name_to_idx, tarfiles = extract_tarinfos( + self.root, + class_name_to_idx=class_name_to_idx, + cache_tarinfo=cache_tarinfo + ) + self.class_idx_to_name = {v: k for k, v in self.class_name_to_idx.items()} + if len(tarfiles) == 1 and tarfiles[0][0] is None: + self.root_is_tar = True + self.tar_state = tarfiles[0][1] + else: + self.root_is_tar = False + self.tar_state = dict(tarfiles) + self.cache_tarfiles = cache_tarfiles + + def __len__(self): + return len(self.samples) + + def __getitem__(self, index): + sample = self.samples[index] + target = self.targets[index] + sample_ti, parent_fn, child_ti = sample + parent_abs = os.path.join(self.root, 
parent_fn) if parent_fn else self.root + + tf = None + cache_state = None + if self.cache_tarfiles: + cache_state = self.tar_state if self.root_is_tar else self.tar_state[parent_fn] + tf = cache_state.tf + if tf is None: + tf = tarfile.open(parent_abs) + if self.cache_tarfiles: + cache_state.tf = tf + if child_ti is not None: + ctf = cache_state.children[child_ti.name].tf if self.cache_tarfiles else None + if ctf is None: + ctf = tarfile.open(fileobj=tf.extractfile(child_ti)) + if self.cache_tarfiles: + cache_state.children[child_ti.name].tf = ctf + tf = ctf + + return tf.extractfile(sample_ti), target + + def _filename(self, index, basename=False, absolute=False): + filename = self.samples[index][0].name + if basename: + filename = os.path.basename(filename) + return filename diff --git a/custom_timm/data/parsers/parser_image_tar.py b/custom_timm/data/parsers/parser_image_tar.py new file mode 100644 index 0000000000000000000000000000000000000000..c5520ee64c1d798a37d45b5361ab3b800f5adbe6 --- /dev/null +++ b/custom_timm/data/parsers/parser_image_tar.py @@ -0,0 +1,74 @@ +""" A dataset parser that reads single tarfile based datasets + +This parser can read datasets consisting if a single tarfile containing images. +I am planning to deprecated it in favour of ParerImageInTar. + +Hacked together by / Copyright 2020 Ross Wightman +""" +import os +import tarfile + +from custom_timm.utils.misc import natural_key + +from .class_map import load_class_map +from .img_extensions import get_img_extensions +from .parser import Parser + + +def extract_tarinfo(tarfile, class_to_idx=None, sort=True): + extensions = get_img_extensions(as_set=True) + files = [] + labels = [] + for ti in tarfile.getmembers(): + if not ti.isfile(): + continue + dirname, basename = os.path.split(ti.path) + label = os.path.basename(dirname) + ext = os.path.splitext(basename)[1] + if ext.lower() in extensions: + files.append(ti) + labels.append(label) + if class_to_idx is None: + unique_labels = set(labels) + sorted_labels = list(sorted(unique_labels, key=natural_key)) + class_to_idx = {c: idx for idx, c in enumerate(sorted_labels)} + tarinfo_and_targets = [(f, class_to_idx[l]) for f, l in zip(files, labels) if l in class_to_idx] + if sort: + tarinfo_and_targets = sorted(tarinfo_and_targets, key=lambda k: natural_key(k[0].path)) + return tarinfo_and_targets, class_to_idx + + +class ParserImageTar(Parser): + """ Single tarfile dataset where classes are mapped to folders within tar + NOTE: This class is being deprecated in favour of the more capable ParserImageInTar that can + operate on folders of tars or tars in tars. 
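+
+    Editorial example (not in the original docstring): the tar is expected to contain
+    entries such as ``dog/001.jpg`` and ``cat/002.jpg``; the immediate parent folder
+    inside the tar becomes the class label, similar to the folder parser.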
+ """ + def __init__(self, root, class_map=''): + super().__init__() + + class_to_idx = None + if class_map: + class_to_idx = load_class_map(class_map, root) + assert os.path.isfile(root) + self.root = root + + with tarfile.open(root) as tf: # cannot keep this open across processes, reopen later + self.samples, self.class_to_idx = extract_tarinfo(tf, class_to_idx) + self.imgs = self.samples + self.tarfile = None # lazy init in __getitem__ + + def __getitem__(self, index): + if self.tarfile is None: + self.tarfile = tarfile.open(self.root) + tarinfo, target = self.samples[index] + fileobj = self.tarfile.extractfile(tarinfo) + return fileobj, target + + def __len__(self): + return len(self.samples) + + def _filename(self, index, basename=False, absolute=False): + filename = self.samples[index][0].name + if basename: + filename = os.path.basename(filename) + return filename diff --git a/custom_timm/data/parsers/parser_tfds.py b/custom_timm/data/parsers/parser_tfds.py new file mode 100644 index 0000000000000000000000000000000000000000..739f3813d0ad20bcb92676662dad62d53be1fe70 --- /dev/null +++ b/custom_timm/data/parsers/parser_tfds.py @@ -0,0 +1,301 @@ +""" Dataset parser interface that wraps TFDS datasets + +Wraps many (most?) TFDS image-classification datasets +from https://github.com/tensorflow/datasets +https://www.tensorflow.org/datasets/catalog/overview#image_classification + +Hacked together by / Copyright 2020 Ross Wightman +""" +import math +import torch +import torch.distributed as dist +from PIL import Image + +try: + import tensorflow as tf + tf.config.set_visible_devices([], 'GPU') # Hands off my GPU! (or pip install tensorflow-cpu) + import tensorflow_datasets as tfds + try: + tfds.even_splits('', 1, drop_remainder=False) # non-buggy even_splits has drop_remainder arg + has_buggy_even_splits = False + except TypeError: + print("Warning: This version of tfds doesn't have the latest even_splits impl. 
" + "Please update or use tfds-nightly for better fine-grained split behaviour.") + has_buggy_even_splits = True + # NOTE uncomment below if having file limit issues on dataset build (or alter your OS defaults) + # import resource + # low, high = resource.getrlimit(resource.RLIMIT_NOFILE) + # resource.setrlimit(resource.RLIMIT_NOFILE, (high, high)) +except ImportError as e: + print(e) + print("Please install tensorflow_datasets package `pip install tensorflow-datasets`.") + exit(1) +from .parser import Parser + + +MAX_TP_SIZE = 8 # maximum TF threadpool size, only doing jpeg decodes and queuing activities +SHUFFLE_SIZE = 8192 # examples to shuffle in DS queue +PREFETCH_SIZE = 2048 # examples to prefetch + + +def even_split_indices(split, n, num_examples): + partitions = [round(i * num_examples / n) for i in range(n + 1)] + return [f"{split}[{partitions[i]}:{partitions[i + 1]}]" for i in range(n)] + + +def get_class_labels(info): + if 'label' not in info.features: + return {} + class_label = info.features['label'] + class_to_idx = {n: class_label.str2int(n) for n in class_label.names} + return class_to_idx + + +class ParserTfds(Parser): + """ Wrap Tensorflow Datasets for use in PyTorch + + There several things to be aware of: + * To prevent excessive examples being dropped per epoch w/ distributed training or multiplicity of + dataloader workers, the train iterator wraps to avoid returning partial batches that trigger drop_last + https://github.com/pytorch/pytorch/issues/33413 + * With PyTorch IterableDatasets, each worker in each replica operates in isolation, the final batch + from each worker could be a different size. For training this is worked around by option above, for + validation extra examples are inserted iff distributed mode is enabled so that the batches being reduced + across replicas are of same size. This will slightly alter the results, distributed validation will not be + 100% correct. This is similar to common handling in DistributedSampler for normal Datasets but a bit worse + since there are up to N * J extra examples with IterableDatasets. + * The sharding (splitting of dataset into TFRecord) files imposes limitations on the number of + replicas and dataloader workers you can use. For really small datasets that only contain a few shards + you may have to train non-distributed w/ 1-2 dataloader workers. This is likely not a huge concern as the + benefit of distributed training or fast dataloading should be much less for small datasets. + * This wrapper is currently configured to return individual, decompressed image examples from the TFDS + dataset. The augmentation (transforms) and batching is still done in PyTorch. It would be possible + to specify TF augmentation fn and return augmented batches w/ some modifications to other downstream + components. + + """ + + def __init__( + self, + root, + name, + split='train', + is_training=False, + batch_size=None, + download=False, + repeats=0, + seed=42, + input_name='image', + input_image='RGB', + target_name='label', + target_image='', + prefetch_size=None, + shuffle_size=None, + max_threadpool_size=None + ): + """ Tensorflow-datasets Wrapper + + Args: + root: root data dir (ie your TFDS_DATA_DIR. 
not dataset specific sub-dir) + name: tfds dataset name (eg `imagenet2012`) + split: tfds dataset split (can use all TFDS split strings eg `train[:10%]`) + is_training: training mode, shuffle enabled, dataset len rounded by batch_size + batch_size: batch_size to use to unsure total examples % batch_size == 0 in training across all dis nodes + download: download and build TFDS dataset if set, otherwise must use tfds CLI + repeats: iterate through (repeat) the dataset this many times per iteration (once if 0 or 1) + seed: common seed for shard shuffle across all distributed/worker instances + input_name: name of Feature to return as data (input) + input_image: image mode if input is an image (currently PIL mode string) + target_name: name of Feature to return as target (label) + target_image: image mode if target is an image (currently PIL mode string) + prefetch_size: override default tf.data prefetch buffer size + shuffle_size: override default tf.data shuffle buffer size + max_threadpool_size: override default threadpool size for tf.data + """ + super().__init__() + self.root = root + self.split = split + self.is_training = is_training + if self.is_training: + assert batch_size is not None, \ + "Must specify batch_size in training mode for reasonable behaviour w/ TFDS wrapper" + self.batch_size = batch_size + self.repeats = repeats + self.common_seed = seed # a seed that's fixed across all worker / distributed instances + + # performance settings + self.prefetch_size = prefetch_size or PREFETCH_SIZE + self.shuffle_size = shuffle_size or SHUFFLE_SIZE + self.max_threadpool_size = max_threadpool_size or MAX_TP_SIZE + + # TFDS builder and split information + self.input_name = input_name # FIXME support tuples / lists of inputs and targets and full range of Feature + self.input_image = input_image + self.target_name = target_name + self.target_image = target_image + self.builder = tfds.builder(name, data_dir=root) + # NOTE: the tfds command line app can be used download & prepare datasets if you don't enable download flag + if download: + self.builder.download_and_prepare() + self.class_to_idx = get_class_labels(self.builder.info) if self.target_name == 'label' else {} + self.split_info = self.builder.info.splits[split] + self.num_examples = self.split_info.num_examples + + # Distributed world state + self.dist_rank = 0 + self.dist_num_replicas = 1 + if dist.is_available() and dist.is_initialized() and dist.get_world_size() > 1: + self.dist_rank = dist.get_rank() + self.dist_num_replicas = dist.get_world_size() + + # Attributes that are updated in _lazy_init, including the tf.data pipeline itself + self.global_num_workers = 1 + self.worker_info = None + self.worker_seed = 0 # seed unique to each work instance + self.subsplit = None # set when data is distributed across workers using sub-splits + self.ds = None # initialized lazily on each dataloader worker process + + def _lazy_init(self): + """ Lazily initialize the dataset. + + This is necessary to init the Tensorflow dataset pipeline in the (dataloader) process that + will be using the dataset instance. The __init__ method is called on the main process, + this will be called in a dataloader worker process. + + NOTE: There will be problems if you try to re-use this dataset across different loader/worker + instances once it has been initialized. Do not call any dataset methods that can call _lazy_init + before it is passed to dataloader. 
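+
+        Editorial note (not in the original docstring): in practice the parser is wrapped
+        by an IterableDataset and handed to a torch DataLoader with num_workers > 0, so
+        this method runs once per worker process and each worker builds its own tf.data
+        pipeline.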
+ """ + worker_info = torch.utils.data.get_worker_info() + + # setup input context to split dataset across distributed processes + num_workers = 1 + global_worker_id = 0 + if worker_info is not None: + self.worker_info = worker_info + self.worker_seed = worker_info.seed + num_workers = worker_info.num_workers + self.global_num_workers = self.dist_num_replicas * num_workers + global_worker_id = self.dist_rank * num_workers + worker_info.id + + """ Data sharding + InputContext will assign subset of underlying TFRecord files to each 'pipeline' if used. + My understanding is that using split, the underling TFRecord files will shuffle (shuffle_files=True) + between the splits each iteration, but that understanding could be wrong. + + I am currently using a mix of InputContext shard assignment and fine-grained sub-splits for distributing + the data across workers. For training InputContext is used to assign shards to nodes unless num_shards + in dataset < total number of workers. Otherwise sub-split API is used for datasets without enough shards or + for validation where we can't drop examples and need to avoid minimize uneven splits to avoid padding. + """ + should_subsplit = self.global_num_workers > 1 and ( + self.split_info.num_shards < self.global_num_workers or not self.is_training) + if should_subsplit: + # split the dataset w/o using sharding for more even examples / worker, can result in less optimal + # read patterns for distributed training (overlap across shards) so better to use InputContext there + if has_buggy_even_splits: + # my even_split workaround doesn't work on subsplits, upgrade tfds! + if not isinstance(self.split_info, tfds.core.splits.SubSplitInfo): + subsplits = even_split_indices(self.split, self.global_num_workers, self.num_examples) + self.subsplit = subsplits[global_worker_id] + else: + subsplits = tfds.even_splits(self.split, self.global_num_workers) + self.subsplit = subsplits[global_worker_id] + + input_context = None + if self.global_num_workers > 1 and self.subsplit is None: + # set input context to divide shards among distributed replicas + input_context = tf.distribute.InputContext( + num_input_pipelines=self.global_num_workers, + input_pipeline_id=global_worker_id, + num_replicas_in_sync=self.dist_num_replicas # FIXME does this arg have any impact? 
+ ) + read_config = tfds.ReadConfig( + shuffle_seed=self.common_seed, + shuffle_reshuffle_each_iteration=True, + input_context=input_context) + ds = self.builder.as_dataset( + split=self.subsplit or self.split, shuffle_files=self.is_training, read_config=read_config) + # avoid overloading threading w/ combo of TF ds threads + PyTorch workers + options = tf.data.Options() + thread_member = 'threading' if hasattr(options, 'threading') else 'experimental_threading' + getattr(options, thread_member).private_threadpool_size = max(1, self.max_threadpool_size // num_workers) + getattr(options, thread_member).max_intra_op_parallelism = 1 + ds = ds.with_options(options) + if self.is_training or self.repeats > 1: + # to prevent excessive drop_last batch behaviour w/ IterableDatasets + # see warnings at https://pytorch.org/docs/stable/data.html#multi-process-data-loading + ds = ds.repeat() # allow wrap around and break iteration manually + if self.is_training: + ds = ds.shuffle(min(self.num_examples, self.shuffle_size) // self.global_num_workers, seed=self.worker_seed) + ds = ds.prefetch(min(self.num_examples // self.global_num_workers, self.prefetch_size)) + self.ds = tfds.as_numpy(ds) + + def __iter__(self): + if self.ds is None: + self._lazy_init() + + # Compute a rounded up sample count that is used to: + # 1. make batches even cross workers & replicas in distributed validation. + # This adds extra examples and will slightly alter validation results. + # 2. determine loop ending condition in training w/ repeat enabled so that only full batch_size + # batches are produced (underlying tfds iter wraps around) + target_example_count = math.ceil(max(1, self.repeats) * self.num_examples / self.global_num_workers) + if self.is_training: + # round up to nearest batch_size per worker-replica + target_example_count = math.ceil(target_example_count / self.batch_size) * self.batch_size + + # Iterate until exhausted or sample count hits target when training (ds.repeat enabled) + example_count = 0 + for example in self.ds: + input_data = example[self.input_name] + if self.input_image: + input_data = Image.fromarray(input_data, mode=self.input_image) + target_data = example[self.target_name] + if self.target_image: + target_data = Image.fromarray(target_data, mode=self.target_image) + yield input_data, target_data + example_count += 1 + if self.is_training and example_count >= target_example_count: + # Need to break out of loop when repeat() is enabled for training w/ oversampling + # this results in extra examples per epoch but seems more desirable than dropping + # up to N*J batches per epoch (where N = num distributed processes, and J = num worker processes) + break + + # Pad across distributed nodes (make counts equal by adding examples) + if not self.is_training and self.dist_num_replicas > 1 and self.subsplit is not None and \ + 0 < example_count < target_example_count: + # Validation batch padding only done for distributed training where results are reduced across nodes. + # For single process case, it won't matter if workers return different batch sizes. + # If using input_context or % based splits, sample count can vary significantly across workers and this + # approach should not be used (hence disabled if self.subsplit isn't set). 
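+            # NOTE (editorial illustration): e.g. if this worker produced 97 examples but
+            # target_example_count is 100, the last (input_data, target_data) pair is yielded 3 more times.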
+ while example_count < target_example_count: + yield input_data, target_data # yield prev sample again + example_count += 1 + + def __len__(self): + # this is just an estimate and does not factor in extra examples added to pad batches based on + # complete worker & replica info (not available until init in dataloader). + return math.ceil(max(1, self.repeats) * self.num_examples / self.dist_num_replicas) + + def _filename(self, index, basename=False, absolute=False): + assert False, "Not supported" # no random access to examples + + def filenames(self, basename=False, absolute=False): + """ Return all filenames in dataset, overrides base""" + if self.ds is None: + self._lazy_init() + names = [] + for sample in self.ds: + if len(names) > self.num_examples: + break # safety for ds.repeat() case + if 'file_name' in sample: + name = sample['file_name'] + elif 'filename' in sample: + name = sample['filename'] + elif 'id' in sample: + name = sample['id'] + else: + assert False, "No supported name field present" + names.append(name) + return names diff --git a/custom_timm/data/random_erasing.py b/custom_timm/data/random_erasing.py new file mode 100644 index 0000000000000000000000000000000000000000..98108488da5392787d6502e2d21487259fe8c5e3 --- /dev/null +++ b/custom_timm/data/random_erasing.py @@ -0,0 +1,103 @@ +""" Random Erasing (Cutout) + +Originally inspired by impl at https://github.com/zhunzhong07/Random-Erasing, Apache 2.0 +Copyright Zhun Zhong & Liang Zheng + +Hacked together by / Copyright 2019, Ross Wightman +""" +import random +import math +import torch + + +def _get_pixels(per_pixel, rand_color, patch_size, dtype=torch.float32, device='cuda'): + # NOTE I've seen CUDA illegal memory access errors being caused by the normal_() + # paths, flip the order so normal is run on CPU if this becomes a problem + # Issue has been fixed in master https://github.com/pytorch/pytorch/issues/19508 + if per_pixel: + return torch.empty(patch_size, dtype=dtype, device=device).normal_() + elif rand_color: + return torch.empty((patch_size[0], 1, 1), dtype=dtype, device=device).normal_() + else: + return torch.zeros((patch_size[0], 1, 1), dtype=dtype, device=device) + + +class RandomErasing: + """ Randomly selects a rectangle region in an image and erases its pixels. + 'Random Erasing Data Augmentation' by Zhong et al. + See https://arxiv.org/pdf/1708.04896.pdf + + This variant of RandomErasing is intended to be applied to either a batch + or single image tensor after it has been normalized by dataset mean and std. + Args: + probability: Probability that the Random Erasing operation will be performed. + min_area: Minimum percentage of erased area wrt input image area. + max_area: Maximum percentage of erased area wrt input image area. + min_aspect: Minimum aspect ratio of erased area. + mode: pixel color mode, one of 'const', 'rand', or 'pixel' + 'const' - erase block is constant color of 0 for all channels + 'rand' - erase block is same per-channel random (normal) color + 'pixel' - erase block is per-pixel random (normal) color + max_count: maximum number of erasing blocks per image, area per box is scaled by count. + per-image count is randomly chosen between 1 and this value. 
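+
+        Illustrative usage (editor's example, not in the original docstring):
+        ``RandomErasing(probability=0.5, mode='pixel', device='cpu')`` erases at most one
+        rectangle per image, filled with per-pixel random normal values, and is meant to
+        run after normalization (it is appended as the final transform in
+        transforms_imagenet_train when re_prob > 0).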
+ """ + + def __init__( + self, + probability=0.5, min_area=0.02, max_area=1/3, min_aspect=0.3, max_aspect=None, + mode='const', min_count=1, max_count=None, num_splits=0, device='cuda'): + self.probability = probability + self.min_area = min_area + self.max_area = max_area + max_aspect = max_aspect or 1 / min_aspect + self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect)) + self.min_count = min_count + self.max_count = max_count or min_count + self.num_splits = num_splits + self.mode = mode.lower() + self.rand_color = False + self.per_pixel = False + if self.mode == 'rand': + self.rand_color = True # per block random normal + elif self.mode == 'pixel': + self.per_pixel = True # per pixel random normal + else: + assert not self.mode or self.mode == 'const' + self.device = device + + def _erase(self, img, chan, img_h, img_w, dtype): + if random.random() > self.probability: + return + area = img_h * img_w + count = self.min_count if self.min_count == self.max_count else \ + random.randint(self.min_count, self.max_count) + for _ in range(count): + for attempt in range(10): + target_area = random.uniform(self.min_area, self.max_area) * area / count + aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio)) + h = int(round(math.sqrt(target_area * aspect_ratio))) + w = int(round(math.sqrt(target_area / aspect_ratio))) + if w < img_w and h < img_h: + top = random.randint(0, img_h - h) + left = random.randint(0, img_w - w) + img[:, top:top + h, left:left + w] = _get_pixels( + self.per_pixel, self.rand_color, (chan, h, w), + dtype=dtype, device=self.device) + break + + def __call__(self, input): + if len(input.size()) == 3: + self._erase(input, *input.size(), input.dtype) + else: + batch_size, chan, img_h, img_w = input.size() + # skip first slice of batch if num_splits is set (for clean portion of samples) + batch_start = batch_size // self.num_splits if self.num_splits > 1 else 0 + for i in range(batch_start, batch_size): + self._erase(input[i], chan, img_h, img_w, input.dtype) + return input + + def __repr__(self): + # NOTE simplified state for repr + fs = self.__class__.__name__ + f'(p={self.probability}, mode={self.mode}' + fs += f', count=({self.min_count}, {self.max_count}))' + return fs diff --git a/custom_timm/data/real_labels.py b/custom_timm/data/real_labels.py new file mode 100644 index 0000000000000000000000000000000000000000..939c34867e7915ce3e4cc7da04a5bc1653ec4f2c --- /dev/null +++ b/custom_timm/data/real_labels.py @@ -0,0 +1,42 @@ +""" Real labels evaluator for ImageNet +Paper: `Are we done with ImageNet?` - https://arxiv.org/abs/2006.07159 +Based on Numpy example at https://github.com/google-research/reassessed-imagenet + +Hacked together by / Copyright 2020 Ross Wightman +""" +import os +import json +import numpy as np + + +class RealLabelsImagenet: + + def __init__(self, filenames, real_json='real.json', topk=(1, 5)): + with open(real_json) as real_labels: + real_labels = json.load(real_labels) + real_labels = {f'ILSVRC2012_val_{i + 1:08d}.JPEG': labels for i, labels in enumerate(real_labels)} + self.real_labels = real_labels + self.filenames = filenames + assert len(self.filenames) == len(self.real_labels) + self.topk = topk + self.is_correct = {k: [] for k in topk} + self.sample_idx = 0 + + def add_result(self, output): + maxk = max(self.topk) + _, pred_batch = output.topk(maxk, 1, True, True) + pred_batch = pred_batch.cpu().numpy() + for pred in pred_batch: + filename = self.filenames[self.sample_idx] + filename = os.path.basename(filename) + if 
self.real_labels[filename]: + for k in self.topk: + self.is_correct[k].append( + any([p in self.real_labels[filename] for p in pred[:k]])) + self.sample_idx += 1 + + def get_accuracy(self, k=None): + if k is None: + return {k: float(np.mean(self.is_correct[k])) * 100 for k in self.topk} + else: + return float(np.mean(self.is_correct[k])) * 100 diff --git a/custom_timm/data/tf_preprocessing.py b/custom_timm/data/tf_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..44b4a3af7372c6865b1cdddda0a8da0ccc6b93a0 --- /dev/null +++ b/custom_timm/data/tf_preprocessing.py @@ -0,0 +1,232 @@ +""" Tensorflow Preprocessing Adapter + +Allows use of Tensorflow preprocessing pipeline in PyTorch Transform + +Copyright of original Tensorflow code below. + +Hacked together by / Copyright 2020 Ross Wightman +""" + +# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""ImageNet preprocessing for MnasNet.""" +import tensorflow as tf +import numpy as np + +IMAGE_SIZE = 224 +CROP_PADDING = 32 + + +def distorted_bounding_box_crop(image_bytes, + bbox, + min_object_covered=0.1, + aspect_ratio_range=(0.75, 1.33), + area_range=(0.05, 1.0), + max_attempts=100, + scope=None): + """Generates cropped_image using one of the bboxes randomly distorted. + + See `tf.image.sample_distorted_bounding_box` for more documentation. + + Args: + image_bytes: `Tensor` of binary image data. + bbox: `Tensor` of bounding boxes arranged `[1, num_boxes, coords]` + where each coordinate is [0, 1) and the coordinates are arranged + as `[ymin, xmin, ymax, xmax]`. If num_boxes is 0 then use the whole + image. + min_object_covered: An optional `float`. Defaults to `0.1`. The cropped + area of the image must contain at least this fraction of any bounding + box supplied. + aspect_ratio_range: An optional list of `float`s. The cropped area of the + image must have an aspect ratio = width / height within this range. + area_range: An optional list of `float`s. The cropped area of the image + must contain a fraction of the supplied image within in this range. + max_attempts: An optional `int`. Number of attempts at generating a cropped + region of the image of the specified constraints. After `max_attempts` + failures, return the entire image. + scope: Optional `str` for name scope. + Returns: + cropped image `Tensor` + """ + with tf.name_scope(scope, 'distorted_bounding_box_crop', [image_bytes, bbox]): + shape = tf.image.extract_jpeg_shape(image_bytes) + sample_distorted_bounding_box = tf.image.sample_distorted_bounding_box( + shape, + bounding_boxes=bbox, + min_object_covered=min_object_covered, + aspect_ratio_range=aspect_ratio_range, + area_range=area_range, + max_attempts=max_attempts, + use_image_if_no_bounding_boxes=True) + bbox_begin, bbox_size, _ = sample_distorted_bounding_box + + # Crop the image to the specified bounding box. 
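+        # NOTE (editorial): crop_window is ordered [offset_y, offset_x, height, width], the layout
+        # expected by tf.image.decode_and_crop_jpeg below.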
+ offset_y, offset_x, _ = tf.unstack(bbox_begin) + target_height, target_width, _ = tf.unstack(bbox_size) + crop_window = tf.stack([offset_y, offset_x, target_height, target_width]) + image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3) + + return image + + +def _at_least_x_are_equal(a, b, x): + """At least `x` of `a` and `b` `Tensors` are equal.""" + match = tf.equal(a, b) + match = tf.cast(match, tf.int32) + return tf.greater_equal(tf.reduce_sum(match), x) + + +def _decode_and_random_crop(image_bytes, image_size, resize_method): + """Make a random crop of image_size.""" + bbox = tf.constant([0.0, 0.0, 1.0, 1.0], dtype=tf.float32, shape=[1, 1, 4]) + image = distorted_bounding_box_crop( + image_bytes, + bbox, + min_object_covered=0.1, + aspect_ratio_range=(3. / 4, 4. / 3.), + area_range=(0.08, 1.0), + max_attempts=10, + scope=None) + original_shape = tf.image.extract_jpeg_shape(image_bytes) + bad = _at_least_x_are_equal(original_shape, tf.shape(image), 3) + + image = tf.cond( + bad, + lambda: _decode_and_center_crop(image_bytes, image_size), + lambda: tf.image.resize([image], [image_size, image_size], resize_method)[0]) + + return image + + +def _decode_and_center_crop(image_bytes, image_size, resize_method): + """Crops to center of image with padding then scales image_size.""" + shape = tf.image.extract_jpeg_shape(image_bytes) + image_height = shape[0] + image_width = shape[1] + + padded_center_crop_size = tf.cast( + ((image_size / (image_size + CROP_PADDING)) * + tf.cast(tf.minimum(image_height, image_width), tf.float32)), + tf.int32) + + offset_height = ((image_height - padded_center_crop_size) + 1) // 2 + offset_width = ((image_width - padded_center_crop_size) + 1) // 2 + crop_window = tf.stack([offset_height, offset_width, + padded_center_crop_size, padded_center_crop_size]) + image = tf.image.decode_and_crop_jpeg(image_bytes, crop_window, channels=3) + image = tf.image.resize([image], [image_size, image_size], resize_method)[0] + + return image + + +def _flip(image): + """Random horizontal image flip.""" + image = tf.image.random_flip_left_right(image) + return image + + +def preprocess_for_train(image_bytes, use_bfloat16, image_size=IMAGE_SIZE, interpolation='bicubic'): + """Preprocesses the given image for evaluation. + + Args: + image_bytes: `Tensor` representing an image binary of arbitrary size. + use_bfloat16: `bool` for whether to use bfloat16. + image_size: image size. + interpolation: image interpolation method + + Returns: + A preprocessed image `Tensor`. + """ + resize_method = tf.image.ResizeMethod.BICUBIC if interpolation == 'bicubic' else tf.image.ResizeMethod.BILINEAR + image = _decode_and_random_crop(image_bytes, image_size, resize_method) + image = _flip(image) + image = tf.reshape(image, [image_size, image_size, 3]) + image = tf.image.convert_image_dtype( + image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32) + return image + + +def preprocess_for_eval(image_bytes, use_bfloat16, image_size=IMAGE_SIZE, interpolation='bicubic'): + """Preprocesses the given image for evaluation. + + Args: + image_bytes: `Tensor` representing an image binary of arbitrary size. + use_bfloat16: `bool` for whether to use bfloat16. + image_size: image size. + interpolation: image interpolation method + + Returns: + A preprocessed image `Tensor`. 
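+
+    Editorial note (not in the original docstring): with the defaults the center crop keeps
+    image_size / (image_size + CROP_PADDING) = 224 / 256 = 0.875 of the shorter image side
+    before resizing to image_size x image_size.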
+ """ + resize_method = tf.image.ResizeMethod.BICUBIC if interpolation == 'bicubic' else tf.image.ResizeMethod.BILINEAR + image = _decode_and_center_crop(image_bytes, image_size, resize_method) + image = tf.reshape(image, [image_size, image_size, 3]) + image = tf.image.convert_image_dtype( + image, dtype=tf.bfloat16 if use_bfloat16 else tf.float32) + return image + + +def preprocess_image(image_bytes, + is_training=False, + use_bfloat16=False, + image_size=IMAGE_SIZE, + interpolation='bicubic'): + """Preprocesses the given image. + + Args: + image_bytes: `Tensor` representing an image binary of arbitrary size. + is_training: `bool` for whether the preprocessing is for training. + use_bfloat16: `bool` for whether to use bfloat16. + image_size: image size. + interpolation: image interpolation method + + Returns: + A preprocessed image `Tensor` with value range of [0, 255]. + """ + if is_training: + return preprocess_for_train(image_bytes, use_bfloat16, image_size, interpolation) + else: + return preprocess_for_eval(image_bytes, use_bfloat16, image_size, interpolation) + + +class TfPreprocessTransform: + + def __init__(self, is_training=False, size=224, interpolation='bicubic'): + self.is_training = is_training + self.size = size[0] if isinstance(size, tuple) else size + self.interpolation = interpolation + self._image_bytes = None + self.process_image = self._build_tf_graph() + self.sess = None + + def _build_tf_graph(self): + with tf.device('/cpu:0'): + self._image_bytes = tf.placeholder( + shape=[], + dtype=tf.string, + ) + img = preprocess_image( + self._image_bytes, self.is_training, False, self.size, self.interpolation) + return img + + def __call__(self, image_bytes): + if self.sess is None: + self.sess = tf.Session() + img = self.sess.run(self.process_image, feed_dict={self._image_bytes: image_bytes}) + img = img.round().clip(0, 255).astype(np.uint8) + if img.ndim < 3: + img = np.expand_dims(img, axis=-1) + img = np.rollaxis(img, 2) # HWC to CHW + return img diff --git a/custom_timm/data/transforms.py b/custom_timm/data/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..3eb3bc32768f8c153233dc5bf7aa19dff9a80d39 --- /dev/null +++ b/custom_timm/data/transforms.py @@ -0,0 +1,197 @@ +import torch +import torchvision.transforms.functional as F +try: + from torchvision.transforms.functional import InterpolationMode + has_interpolation_mode = True +except ImportError: + has_interpolation_mode = False +from PIL import Image +import warnings +import math +import random +import numpy as np + + +class ToNumpy: + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return np_img + + +class ToTensor: + + def __init__(self, dtype=torch.float32): + self.dtype = dtype + + def __call__(self, pil_img): + np_img = np.array(pil_img, dtype=np.uint8) + if np_img.ndim < 3: + np_img = np.expand_dims(np_img, axis=-1) + np_img = np.rollaxis(np_img, 2) # HWC to CHW + return torch.from_numpy(np_img).to(dtype=self.dtype) + + +# Pillow is deprecating the top-level resampling attributes (e.g., Image.BILINEAR) in +# favor of the Image.Resampling enum. The top-level resampling attributes will be +# removed in Pillow 10. 
+if hasattr(Image, "Resampling"): + _pil_interpolation_to_str = { + Image.Resampling.NEAREST: 'nearest', + Image.Resampling.BILINEAR: 'bilinear', + Image.Resampling.BICUBIC: 'bicubic', + Image.Resampling.BOX: 'box', + Image.Resampling.HAMMING: 'hamming', + Image.Resampling.LANCZOS: 'lanczos', + } +else: + _pil_interpolation_to_str = { + Image.NEAREST: 'nearest', + Image.BILINEAR: 'bilinear', + Image.BICUBIC: 'bicubic', + Image.BOX: 'box', + Image.HAMMING: 'hamming', + Image.LANCZOS: 'lanczos', + } + +_str_to_pil_interpolation = {b: a for a, b in _pil_interpolation_to_str.items()} + + +if has_interpolation_mode: + _torch_interpolation_to_str = { + InterpolationMode.NEAREST: 'nearest', + InterpolationMode.BILINEAR: 'bilinear', + InterpolationMode.BICUBIC: 'bicubic', + InterpolationMode.BOX: 'box', + InterpolationMode.HAMMING: 'hamming', + InterpolationMode.LANCZOS: 'lanczos', + } + _str_to_torch_interpolation = {b: a for a, b in _torch_interpolation_to_str.items()} +else: + _pil_interpolation_to_torch = {} + _torch_interpolation_to_str = {} + + +def str_to_pil_interp(mode_str): + return _str_to_pil_interpolation[mode_str] + + +def str_to_interp_mode(mode_str): + if has_interpolation_mode: + return _str_to_torch_interpolation[mode_str] + else: + return _str_to_pil_interpolation[mode_str] + + +def interp_mode_to_str(mode): + if has_interpolation_mode: + return _torch_interpolation_to_str[mode] + else: + return _pil_interpolation_to_str[mode] + + +_RANDOM_INTERPOLATION = (str_to_interp_mode('bilinear'), str_to_interp_mode('bicubic')) + + +class RandomResizedCropAndInterpolation: + """Crop the given PIL Image to random size and aspect ratio with random interpolation. + + A crop of random size (default: of 0.08 to 1.0) of the original size and a random + aspect ratio (default: of 3/4 to 4/3) of the original aspect ratio is made. This crop + is finally resized to given size. + This is popularly used to train the Inception networks. + + Args: + size: expected output size of each edge + scale: range of size of the origin size cropped + ratio: range of aspect ratio of the origin aspect ratio cropped + interpolation: Default: PIL.Image.BILINEAR + """ + + def __init__(self, size, scale=(0.08, 1.0), ratio=(3. / 4., 4. / 3.), + interpolation='bilinear'): + if isinstance(size, (list, tuple)): + self.size = tuple(size) + else: + self.size = (size, size) + if (scale[0] > scale[1]) or (ratio[0] > ratio[1]): + warnings.warn("range should be of kind (min, max)") + + if interpolation == 'random': + self.interpolation = _RANDOM_INTERPOLATION + else: + self.interpolation = str_to_interp_mode(interpolation) + self.scale = scale + self.ratio = ratio + + @staticmethod + def get_params(img, scale, ratio): + """Get parameters for ``crop`` for a random sized crop. + + Args: + img (PIL Image): Image to be cropped. + scale (tuple): range of size of the origin size cropped + ratio (tuple): range of aspect ratio of the origin aspect ratio cropped + + Returns: + tuple: params (i, j, h, w) to be passed to ``crop`` for a random + sized crop. 
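+
+        Editorial note (not in the original docstring): if no valid crop is found within
+        10 attempts, the method falls back to a central crop whose aspect ratio is clamped
+        to the requested ``ratio`` range.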
+ """ + area = img.size[0] * img.size[1] + + for attempt in range(10): + target_area = random.uniform(*scale) * area + log_ratio = (math.log(ratio[0]), math.log(ratio[1])) + aspect_ratio = math.exp(random.uniform(*log_ratio)) + + w = int(round(math.sqrt(target_area * aspect_ratio))) + h = int(round(math.sqrt(target_area / aspect_ratio))) + + if w <= img.size[0] and h <= img.size[1]: + i = random.randint(0, img.size[1] - h) + j = random.randint(0, img.size[0] - w) + return i, j, h, w + + # Fallback to central crop + in_ratio = img.size[0] / img.size[1] + if in_ratio < min(ratio): + w = img.size[0] + h = int(round(w / min(ratio))) + elif in_ratio > max(ratio): + h = img.size[1] + w = int(round(h * max(ratio))) + else: # whole image + w = img.size[0] + h = img.size[1] + i = (img.size[1] - h) // 2 + j = (img.size[0] - w) // 2 + return i, j, h, w + + def __call__(self, img): + """ + Args: + img (PIL Image): Image to be cropped and resized. + + Returns: + PIL Image: Randomly cropped and resized image. + """ + i, j, h, w = self.get_params(img, self.scale, self.ratio) + if isinstance(self.interpolation, (tuple, list)): + interpolation = random.choice(self.interpolation) + else: + interpolation = self.interpolation + return F.resized_crop(img, i, j, h, w, self.size, interpolation) + + def __repr__(self): + if isinstance(self.interpolation, (tuple, list)): + interpolate_str = ' '.join([interp_mode_to_str(x) for x in self.interpolation]) + else: + interpolate_str = interp_mode_to_str(self.interpolation) + format_string = self.__class__.__name__ + '(size={0}'.format(self.size) + format_string += ', scale={0}'.format(tuple(round(s, 4) for s in self.scale)) + format_string += ', ratio={0}'.format(tuple(round(r, 4) for r in self.ratio)) + format_string += ', interpolation={0})'.format(interpolate_str) + return format_string diff --git a/custom_timm/data/transforms_factory.py b/custom_timm/data/transforms_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..70f05dbf8393d94f41999cfa599b3e4bdf80f8e6 --- /dev/null +++ b/custom_timm/data/transforms_factory.py @@ -0,0 +1,236 @@ +""" Transforms Factory +Factory methods for building image transforms for use with TIMM (PyTorch Image Models) + +Hacked together by / Copyright 2019, Ross Wightman +""" +import math + +import torch +from torchvision import transforms + +from custom_timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT +from custom_timm.data.auto_augment import rand_augment_transform, augment_and_mix_transform, auto_augment_transform +from custom_timm.data.transforms import str_to_interp_mode, str_to_pil_interp, RandomResizedCropAndInterpolation, ToNumpy +from custom_timm.data.random_erasing import RandomErasing + + +def transforms_noaug_train( + img_size=224, + interpolation='bilinear', + use_prefetcher=False, + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, +): + if interpolation == 'random': + # random interpolation not supported with no-aug + interpolation = 'bilinear' + tfl = [ + transforms.Resize(img_size, interpolation=str_to_interp_mode(interpolation)), + transforms.CenterCrop(img_size) + ] + if use_prefetcher: + # prefetcher and collate will handle tensor conversion and norm + tfl += [ToNumpy()] + else: + tfl += [ + transforms.ToTensor(), + transforms.Normalize( + mean=torch.tensor(mean), + std=torch.tensor(std)) + ] + return transforms.Compose(tfl) + + +def transforms_imagenet_train( + img_size=224, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + 
auto_augment=None, + interpolation='random', + use_prefetcher=False, + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0, + separate=False, +): + """ + If separate==True, the transforms are returned as a tuple of 3 separate transforms + for use in a mixing dataset that passes + * all data through the first (primary) transform, called the 'clean' data + * a portion of the data through the secondary transform + * normalizes and converts the branches above with the third, final transform + """ + scale = tuple(scale or (0.08, 1.0)) # default imagenet scale range + ratio = tuple(ratio or (3./4., 4./3.)) # default imagenet ratio range + primary_tfl = [ + RandomResizedCropAndInterpolation(img_size, scale=scale, ratio=ratio, interpolation=interpolation)] + if hflip > 0.: + primary_tfl += [transforms.RandomHorizontalFlip(p=hflip)] + if vflip > 0.: + primary_tfl += [transforms.RandomVerticalFlip(p=vflip)] + + secondary_tfl = [] + if auto_augment: + assert isinstance(auto_augment, str) + if isinstance(img_size, (tuple, list)): + img_size_min = min(img_size) + else: + img_size_min = img_size + aa_params = dict( + translate_const=int(img_size_min * 0.45), + img_mean=tuple([min(255, round(255 * x)) for x in mean]), + ) + if interpolation and interpolation != 'random': + aa_params['interpolation'] = str_to_pil_interp(interpolation) + if auto_augment.startswith('rand'): + secondary_tfl += [rand_augment_transform(auto_augment, aa_params)] + elif auto_augment.startswith('augmix'): + aa_params['translate_pct'] = 0.3 + secondary_tfl += [augment_and_mix_transform(auto_augment, aa_params)] + else: + secondary_tfl += [auto_augment_transform(auto_augment, aa_params)] + elif color_jitter is not None: + # color jitter is enabled when not using AA + if isinstance(color_jitter, (list, tuple)): + # color jitter should be a 3-tuple/list if spec brightness/contrast/saturation + # or 4 if also augmenting hue + assert len(color_jitter) in (3, 4) + else: + # if it's a scalar, duplicate for brightness, contrast, and saturation, no hue + color_jitter = (float(color_jitter),) * 3 + secondary_tfl += [transforms.ColorJitter(*color_jitter)] + + final_tfl = [] + if use_prefetcher: + # prefetcher and collate will handle tensor conversion and norm + final_tfl += [ToNumpy()] + else: + final_tfl += [ + transforms.ToTensor(), + transforms.Normalize( + mean=torch.tensor(mean), + std=torch.tensor(std)) + ] + if re_prob > 0.: + final_tfl.append( + RandomErasing(re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits, device='cpu')) + + if separate: + return transforms.Compose(primary_tfl), transforms.Compose(secondary_tfl), transforms.Compose(final_tfl) + else: + return transforms.Compose(primary_tfl + secondary_tfl + final_tfl) + + +def transforms_imagenet_eval( + img_size=224, + crop_pct=None, + interpolation='bilinear', + use_prefetcher=False, + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD): + crop_pct = crop_pct or DEFAULT_CROP_PCT + + if isinstance(img_size, (tuple, list)): + assert len(img_size) == 2 + if img_size[-1] == img_size[-2]: + # fall-back to older behaviour so Resize scales to shortest edge if target is square + scale_size = int(math.floor(img_size[0] / crop_pct)) + else: + scale_size = tuple([int(x / crop_pct) for x in img_size]) + else: + scale_size = int(math.floor(img_size / crop_pct)) + + tfl = [ + transforms.Resize(scale_size, interpolation=str_to_interp_mode(interpolation)), + transforms.CenterCrop(img_size), + ] + if 
use_prefetcher: + # prefetcher and collate will handle tensor conversion and norm + tfl += [ToNumpy()] + else: + tfl += [ + transforms.ToTensor(), + transforms.Normalize( + mean=torch.tensor(mean), + std=torch.tensor(std)) + ] + + return transforms.Compose(tfl) + + +def create_transform( + input_size, + is_training=False, + use_prefetcher=False, + no_aug=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + interpolation='bilinear', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0, + crop_pct=None, + tf_preprocessing=False, + separate=False): + + if isinstance(input_size, (tuple, list)): + img_size = input_size[-2:] + else: + img_size = input_size + + if tf_preprocessing and use_prefetcher: + assert not separate, "Separate transforms not supported for TF preprocessing" + from custom_timm.data.tf_preprocessing import TfPreprocessTransform + transform = TfPreprocessTransform( + is_training=is_training, size=img_size, interpolation=interpolation) + else: + if is_training and no_aug: + assert not separate, "Cannot perform split augmentation with no_aug" + transform = transforms_noaug_train( + img_size, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + mean=mean, + std=std) + elif is_training: + transform = transforms_imagenet_train( + img_size, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + mean=mean, + std=std, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=separate) + else: + assert not separate, "Separate transforms not supported for validation preprocessing" + transform = transforms_imagenet_eval( + img_size, + interpolation=interpolation, + use_prefetcher=use_prefetcher, + mean=mean, + std=std, + crop_pct=crop_pct) + + return transform diff --git a/custom_timm/loss/__init__.py b/custom_timm/loss/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ea7f15f2f79673c962f68d6d4b06898e73ac1df6 --- /dev/null +++ b/custom_timm/loss/__init__.py @@ -0,0 +1,4 @@ +from .asymmetric_loss import AsymmetricLossMultiLabel, AsymmetricLossSingleLabel +from .binary_cross_entropy import BinaryCrossEntropy +from .cross_entropy import LabelSmoothingCrossEntropy, SoftTargetCrossEntropy +from .jsd import JsdCrossEntropy diff --git a/custom_timm/loss/asymmetric_loss.py b/custom_timm/loss/asymmetric_loss.py new file mode 100644 index 0000000000000000000000000000000000000000..a8b10f9c797c2cb3b2652302717b592dada216f3 --- /dev/null +++ b/custom_timm/loss/asymmetric_loss.py @@ -0,0 +1,97 @@ +import torch +import torch.nn as nn + + +class AsymmetricLossMultiLabel(nn.Module): + def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=False): + super(AsymmetricLossMultiLabel, self).__init__() + + self.gamma_neg = gamma_neg + self.gamma_pos = gamma_pos + self.clip = clip + self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss + self.eps = eps + + def forward(self, x, y): + """" + Parameters + ---------- + x: input logits + y: targets (multi-label binarized vector) + """ + + # Calculating Probabilities + x_sigmoid = torch.sigmoid(x) + xs_pos = x_sigmoid + xs_neg = 1 - x_sigmoid + + # Asymmetric Clipping + if self.clip is not None and self.clip > 0: + xs_neg = (xs_neg + self.clip).clamp(max=1) + + # Basic CE calculation + 
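+        # per-element log-likelihood terms of plain binary cross-entropy, clamped at eps for numerical stability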
los_pos = y * torch.log(xs_pos.clamp(min=self.eps)) + los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps)) + loss = los_pos + los_neg + + # Asymmetric Focusing + if self.gamma_neg > 0 or self.gamma_pos > 0: + if self.disable_torch_grad_focal_loss: + torch._C.set_grad_enabled(False) + pt0 = xs_pos * y + pt1 = xs_neg * (1 - y) # pt = p if t > 0 else 1-p + pt = pt0 + pt1 + one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y) + one_sided_w = torch.pow(1 - pt, one_sided_gamma) + if self.disable_torch_grad_focal_loss: + torch._C.set_grad_enabled(True) + loss *= one_sided_w + + return -loss.sum() + + +class AsymmetricLossSingleLabel(nn.Module): + def __init__(self, gamma_pos=1, gamma_neg=4, eps: float = 0.1, reduction='mean'): + super(AsymmetricLossSingleLabel, self).__init__() + + self.eps = eps + self.logsoftmax = nn.LogSoftmax(dim=-1) + self.targets_classes = [] # prevent gpu repeated memory allocation + self.gamma_pos = gamma_pos + self.gamma_neg = gamma_neg + self.reduction = reduction + + def forward(self, inputs, target, reduction=None): + """" + Parameters + ---------- + x: input logits + y: targets (1-hot vector) + """ + + num_classes = inputs.size()[-1] + log_preds = self.logsoftmax(inputs) + self.targets_classes = torch.zeros_like(inputs).scatter_(1, target.long().unsqueeze(1), 1) + + # ASL weights + targets = self.targets_classes + anti_targets = 1 - targets + xs_pos = torch.exp(log_preds) + xs_neg = 1 - xs_pos + xs_pos = xs_pos * targets + xs_neg = xs_neg * anti_targets + asymmetric_w = torch.pow(1 - xs_pos - xs_neg, + self.gamma_pos * targets + self.gamma_neg * anti_targets) + log_preds = log_preds * asymmetric_w + + if self.eps > 0: # label smoothing + self.targets_classes.mul_(1 - self.eps).add_(self.eps / num_classes) + + # loss calculation + loss = - self.targets_classes.mul(log_preds) + + loss = loss.sum(dim=-1) + if self.reduction == 'mean': + loss = loss.mean() + + return loss diff --git a/custom_timm/loss/binary_cross_entropy.py b/custom_timm/loss/binary_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..ed76c1e8e004ca9a7255cf3650e322e6525c0577 --- /dev/null +++ b/custom_timm/loss/binary_cross_entropy.py @@ -0,0 +1,47 @@ +""" Binary Cross Entropy w/ a few extras + +Hacked together by / Copyright 2021 Ross Wightman +""" +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BinaryCrossEntropy(nn.Module): + """ BCE with optional one-hot from dense targets, label smoothing, thresholding + NOTE for experiments comparing CE to BCE /w label smoothing, may remove + """ + def __init__( + self, smoothing=0.1, target_threshold: Optional[float] = None, weight: Optional[torch.Tensor] = None, + reduction: str = 'mean', pos_weight: Optional[torch.Tensor] = None): + super(BinaryCrossEntropy, self).__init__() + assert 0. <= smoothing < 1.0 + self.smoothing = smoothing + self.target_threshold = target_threshold + self.reduction = reduction + self.register_buffer('weight', weight) + self.register_buffer('pos_weight', pos_weight) + + def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + assert x.shape[0] == target.shape[0] + if target.shape != x.shape: + # NOTE currently assume smoothing or other label softening is applied upstream if targets are already sparse + num_classes = x.shape[-1] + # FIXME should off/on be different for smoothing w/ BCE? Other impl out there differ + off_value = self.smoothing / num_classes + on_value = 1. 
- self.smoothing + off_value + target = target.long().view(-1, 1) + target = torch.full( + (target.size()[0], num_classes), + off_value, + device=x.device, dtype=x.dtype).scatter_(1, target, on_value) + if self.target_threshold is not None: + # Make target 0, or 1 if threshold set + target = target.gt(self.target_threshold).to(dtype=target.dtype) + return F.binary_cross_entropy_with_logits( + x, target, + self.weight, + pos_weight=self.pos_weight, + reduction=self.reduction) diff --git a/custom_timm/loss/cross_entropy.py b/custom_timm/loss/cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..85198107f3ad2a1ff775a677d77c03569ff5d04d --- /dev/null +++ b/custom_timm/loss/cross_entropy.py @@ -0,0 +1,36 @@ +""" Cross Entropy w/ smoothing or soft targets + +Hacked together by / Copyright 2021 Ross Wightman +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class LabelSmoothingCrossEntropy(nn.Module): + """ NLL loss with label smoothing. + """ + def __init__(self, smoothing=0.1): + super(LabelSmoothingCrossEntropy, self).__init__() + assert smoothing < 1.0 + self.smoothing = smoothing + self.confidence = 1. - smoothing + + def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + logprobs = F.log_softmax(x, dim=-1) + nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1)) + nll_loss = nll_loss.squeeze(1) + smooth_loss = -logprobs.mean(dim=-1) + loss = self.confidence * nll_loss + self.smoothing * smooth_loss + return loss.mean() + + +class SoftTargetCrossEntropy(nn.Module): + + def __init__(self): + super(SoftTargetCrossEntropy, self).__init__() + + def forward(self, x: torch.Tensor, target: torch.Tensor) -> torch.Tensor: + loss = torch.sum(-target * F.log_softmax(x, dim=-1), dim=-1) + return loss.mean() diff --git a/custom_timm/loss/jsd.py b/custom_timm/loss/jsd.py new file mode 100644 index 0000000000000000000000000000000000000000..dd64e156c23d27aa03817a587ae367e8175fc126 --- /dev/null +++ b/custom_timm/loss/jsd.py @@ -0,0 +1,39 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .cross_entropy import LabelSmoothingCrossEntropy + + +class JsdCrossEntropy(nn.Module): + """ Jensen-Shannon Divergence + Cross-Entropy Loss + + Based on impl here: https://github.com/google-research/augmix/blob/master/imagenet.py + From paper: 'AugMix: A Simple Data Processing Method to Improve Robustness and Uncertainty - + https://arxiv.org/abs/1912.02781 + + Hacked together by / Copyright 2020 Ross Wightman + """ + def __init__(self, num_splits=3, alpha=12, smoothing=0.1): + super().__init__() + self.num_splits = num_splits + self.alpha = alpha + if smoothing is not None and smoothing > 0: + self.cross_entropy_loss = LabelSmoothingCrossEntropy(smoothing) + else: + self.cross_entropy_loss = torch.nn.CrossEntropyLoss() + + def __call__(self, output, target): + split_size = output.shape[0] // self.num_splits + assert split_size * self.num_splits == output.shape[0] + logits_split = torch.split(output, split_size) + + # Cross-entropy is only computed on clean images + loss = self.cross_entropy_loss(logits_split[0], target[:split_size]) + probs = [F.softmax(logits, dim=1) for logits in logits_split] + + # Clamp mixture distribution to avoid exploding KL divergence + logp_mixture = torch.clamp(torch.stack(probs).mean(axis=0), 1e-7, 1).log() + loss += self.alpha * sum([F.kl_div( + logp_mixture, p_split, reduction='batchmean') for p_split in probs]) / len(probs) + return loss diff --git 
a/custom_timm/models/__init__.py b/custom_timm/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5ff79595d83197ecfb9a164ae9b9125ec3804863 --- /dev/null +++ b/custom_timm/models/__init__.py @@ -0,0 +1,74 @@ +from .beit import * +from .byoanet import * +from .byobnet import * +from .cait import * +from .coat import * +from .convit import * +from .convmixer import * +from .convnext import * +from .crossvit import * +from .cspnet import * +from .deit import * +from .densenet import * +from .dla import * +from .dpn import * +from .edgenext import * +from .efficientformer import * +from .efficientnet import * +from .gcvit import * +from .ghostnet import * +from .gluon_resnet import * +from .gluon_xception import * +from .hardcorenas import * +from .hrnet import * +from .inception_resnet_v2 import * +from .inception_v3 import * +from .inception_v4 import * +from .levit import * +from .maxxvit import * +from .mlp_mixer import * +from .mobilenetv3 import * +from .mobilevit import * +from .mvitv2 import * +from .nasnet import * +from .nest import * +from .nfnet import * +from .pit import * +from .pnasnet import * +from .poolformer import * +from .pvt_v2 import * +from .regnet import * +from .res2net import * +from .resnest import * +from .resnet import * +from .resnetv2 import * +from .rexnet import * +from .selecsls import * +from .senet import * +from .sequencer import * +from .sknet import * +from .swin_transformer import * +from .swin_transformer_v2 import * +from .swin_transformer_v2_cr import * +from .tnt import * +from .tresnet import * +from .twins import * +from .vgg import * +from .visformer import * +from .vision_transformer import * +from .vision_transformer_hybrid import * +from .vision_transformer_relpos import * +from .volo import * +from .vovnet import * +from .xception import * +from .xception_aligned import * +from .xcit import * + +from .factory import create_model, parse_model_name, safe_model_name +from .helpers import load_checkpoint, resume_checkpoint, model_parameters +from .layers import TestTimePoolHead, apply_test_time_pool +from .layers import convert_splitbn_model, convert_sync_batchnorm +from .layers import is_scriptable, is_exportable, set_scriptable, set_exportable, is_no_jit, set_no_jit +from .layers import set_fast_norm +from .registry import register_model, model_entrypoint, list_models, is_model, list_modules, is_model_in_modules,\ + is_model_pretrained, get_pretrained_cfg, has_pretrained_cfg_key, is_pretrained_cfg_key, get_pretrained_cfg_value diff --git a/custom_timm/models/beit.py b/custom_timm/models/beit.py new file mode 100644 index 0000000000000000000000000000000000000000..2f81b008ebfc372aef4c211babc95be32c910629 --- /dev/null +++ b/custom_timm/models/beit.py @@ -0,0 +1,502 @@ +""" BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) + +Model from official source: https://github.com/microsoft/unilm/tree/master/beit +and +https://github.com/microsoft/unilm/tree/master/beit2 + +@inproceedings{beit, +title={{BEiT}: {BERT} Pre-Training of Image Transformers}, +author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei}, +booktitle={International Conference on Learning Representations}, +year={2022}, +url={https://openreview.net/forum?id=p-BhZSz59o4} +} + +@article{beitv2, +title={{BEiT v2}: Masked Image Modeling with Vector-Quantized Visual Tokenizers}, +author={Zhiliang Peng and Li Dong and Hangbo Bao and Qixiang Ye and Furu Wei}, +year={2022}, +eprint={2208.06366}, +archivePrefix={arXiv}, 
+primaryClass={cs.CV} +} + +At this point only the 1k fine-tuned classification weights and model configs have been added, +see original source above for pre-training models and procedure. + +Modifications by / Copyright 2021 Ross Wightman, original copyrights below +""" +# -------------------------------------------------------- +# BEIT: BERT Pre-Training of Image Transformers (https://arxiv.org/abs/2106.08254) +# Github source: https://github.com/microsoft/unilm/tree/master/beit +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# By Hangbo Bao +# Based on timm and DeiT code bases +# https://github.com/rwightman/pytorch-image-models/tree/master/timm +# https://github.com/facebookresearch/deit/ +# https://github.com/facebookresearch/dino +# --------------------------------------------------------' +import math +from functools import partial +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.checkpoint import checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_ +from .registry import register_model +from .vision_transformer import checkpoint_filter_fn + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + 'beit_base_patch16_224': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth'), + 'beit_base_patch16_384': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_384_pt22k_ft22kto1k.pth', + input_size=(3, 384, 384), crop_pct=1.0, + ), + 'beit_base_patch16_224_in22k': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22k.pth', + num_classes=21841, + ), + 'beit_large_patch16_224': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22kto1k.pth'), + 'beit_large_patch16_384': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_384_pt22k_ft22kto1k.pth', + input_size=(3, 384, 384), crop_pct=1.0, + ), + 'beit_large_patch16_512': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_512_pt22k_ft22kto1k.pth', + input_size=(3, 512, 512), crop_pct=1.0, + ), + 'beit_large_patch16_224_in22k': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_large_patch16_224_pt22k_ft22k.pth', + num_classes=21841, + ), + + 'beitv2_base_patch16_224': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21kto1k.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD + ), + 'beitv2_base_patch16_224_in22k': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_base_patch16_224_pt1k_ft21k.pth', + num_classes=21841, + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD + ), + 'beitv2_large_patch16_224': _cfg( + 
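+        # like the other BEiT v2 entries, this checkpoint uses IMAGENET_DEFAULT_MEAN/STD rather than the 0.5 Inception-style defaults from _cfg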
url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21kto1k.pth', + crop_pct=0.95, + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD + ), + 'beitv2_large_patch16_224_in22k': _cfg( + url='https://conversationhub.blob.core.windows.net/beit-share-public/beitv2/beitv2_large_patch16_224_pt1k_ft21k.pth', + num_classes=21841, + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD + ), +} + + +def gen_relative_position_index(window_size: Tuple[int, int]) -> torch.Tensor: + num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + # cls to token & token 2 cls & cls to cls + # get pair-wise relative position index for each token inside the window + window_area = window_size[0] * window_size[1] + coords = torch.stack(torch.meshgrid( + [torch.arange(window_size[0]), + torch.arange(window_size[1])])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = num_relative_distance - 3 + relative_position_index[0:, 0] = num_relative_distance - 2 + relative_position_index[0, 0] = num_relative_distance - 1 + return relative_position_index + + +class Attention(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, attn_drop=0., + proj_drop=0., window_size=None, attn_head_dim=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + if attn_head_dim is not None: + head_dim = attn_head_dim + all_head_dim = head_dim * self.num_heads + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(all_head_dim)) + self.register_buffer('k_bias', torch.zeros(all_head_dim), persistent=False) + self.v_bias = nn.Parameter(torch.zeros(all_head_dim)) + else: + self.q_bias = None + self.k_bias = None + self.v_bias = None + + if window_size: + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, num_heads)) # 2*Wh-1 * 2*Ww-1, nH + self.register_buffer("relative_position_index", gen_relative_position_index(window_size)) + else: + self.window_size = None + self.relative_position_bias_table = None + self.relative_position_index = None + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(all_head_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def _get_rel_pos_bias(self): + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1] + 1, + self.window_size[0] * self.window_size[1] + 1, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + return relative_position_bias.unsqueeze(0) + + def forward(self, x, shared_rel_pos_bias: Optional[torch.Tensor] = None): + B, N, C = x.shape + + qkv_bias = torch.cat((self.q_bias, 
self.k_bias, self.v_bias)) if self.q_bias is not None else None + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if self.relative_position_bias_table is not None: + attn = attn + self._get_rel_pos_bias() + if shared_rel_pos_bias is not None: + attn = attn + shared_rel_pos_bias + + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm, + window_size=None, attn_head_dim=None): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, + window_size=window_size, attn_head_dim=attn_head_dim) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + if init_values: + self.gamma_1 = nn.Parameter(init_values * torch.ones(dim)) + self.gamma_2 = nn.Parameter(init_values * torch.ones(dim)) + else: + self.gamma_1, self.gamma_2 = None, None + + def forward(self, x, shared_rel_pos_bias: Optional[torch.Tensor] = None): + if self.gamma_1 is None: + x = x + self.drop_path(self.attn(self.norm1(x), shared_rel_pos_bias=shared_rel_pos_bias)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + else: + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), shared_rel_pos_bias=shared_rel_pos_bias)) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class RelativePositionBias(nn.Module): + + def __init__(self, window_size, num_heads): + super().__init__() + self.window_size = window_size + self.window_area = window_size[0] * window_size[1] + num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter(torch.zeros(num_relative_distance, num_heads)) + # trunc_normal_(self.relative_position_bias_table, std=.02) + self.register_buffer("relative_position_index", gen_relative_position_index(window_size)) + + def forward(self): + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_area + 1, self.window_area + 1, -1) # Wh*Ww,Wh*Ww,nH + return relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + + +class Beit(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__( + self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='avg', + embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, drop_rate=0., + attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6), + init_values=None, use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False, + head_init_scale=0.001): + super().__init__() + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = embed_dim # 
num_features for consistency with other models + self.grad_checkpointing = False + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + # self.mask_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) if use_abs_pos_emb else None + self.pos_drop = nn.Dropout(p=drop_rate) + + if use_shared_rel_pos_bias: + self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.grid_size, num_heads=num_heads) + else: + self.rel_pos_bias = None + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + init_values=init_values, window_size=self.patch_embed.grid_size if use_rel_pos_bias else None) + for i in range(depth)]) + use_fc_norm = self.global_pool == 'avg' + self.norm = nn.Identity() if use_fc_norm else norm_layer(embed_dim) + self.fc_norm = norm_layer(embed_dim) if use_fc_norm else None + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + if self.pos_embed is not None: + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + # trunc_normal_(self.mask_token, std=.02) + self.fix_init_weight() + if isinstance(self.head, nn.Linear): + trunc_normal_(self.head.weight, std=.02) + self.head.weight.data.mul_(head_init_scale) + self.head.bias.data.mul_(head_init_scale) + + def fix_init_weight(self): + def rescale(param, layer_id): + param.div_(math.sqrt(2.0 * layer_id)) + + for layer_id, layer in enumerate(self.blocks): + rescale(layer.attn.proj.weight.data, layer_id + 1) + rescale(layer.mlp.fc2.weight.data, layer_id + 1) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + nwd = {'pos_embed', 'cls_token'} + for n, _ in self.named_parameters(): + if 'relative_position_bias_table' in n: + nwd.add(n) + return nwd + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^cls_token|pos_embed|patch_embed|rel_pos_bias', # stem and embed + blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))], + ) + return matcher + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + self.global_pool = global_pool + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + if self.pos_embed is not None: + x = x + self.pos_embed + x = self.pos_drop(x) + + rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None + for blk in self.blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): 
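+                # gradient checkpointing: recompute the block in the backward pass instead of caching activations, trading compute for memory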
+ x = checkpoint(blk, x, shared_rel_pos_bias=rel_pos_bias) + else: + x = blk(x, shared_rel_pos_bias=rel_pos_bias) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.fc_norm is not None: + x = x[:, 1:].mean(dim=1) + x = self.fc_norm(x) + else: + x = x[:, 0] + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _beit_checkpoint_filter_fn(state_dict, model): + if 'module' in state_dict: + # beit v2 didn't strip module + state_dict = state_dict['module'] + return checkpoint_filter_fn(state_dict, model) + + +def _create_beit(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Beit models.') + + model = build_model_with_cfg( + Beit, variant, pretrained, + # FIXME an updated filter fn needed to interpolate rel pos emb if fine tuning to diff model sizes + pretrained_filter_fn=_beit_checkpoint_filter_fn, + **kwargs) + return model + + +@register_model +def beit_base_patch16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs) + model = _create_beit('beit_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beit_base_patch16_384(pretrained=False, **kwargs): + model_kwargs = dict( + img_size=384, patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs) + model = _create_beit('beit_base_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beit_base_patch16_224_in22k(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=0.1, **kwargs) + model = _create_beit('beit_base_patch16_224_in22k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beit_large_patch16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs) + model = _create_beit('beit_large_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beit_large_patch16_384(pretrained=False, **kwargs): + model_kwargs = dict( + img_size=384, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs) + model = _create_beit('beit_large_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beit_large_patch16_512(pretrained=False, **kwargs): + model_kwargs = dict( + img_size=512, patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs) + model = _create_beit('beit_large_patch16_512', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beit_large_patch16_224_in22k(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs) + model = 
_create_beit('beit_large_patch16_224_in22k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beitv2_base_patch16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs) + model = _create_beit('beitv2_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beitv2_base_patch16_224_in22k(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs) + model = _create_beit('beitv2_base_patch16_224_in22k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beitv2_large_patch16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs) + model = _create_beit('beitv2_large_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def beitv2_large_patch16_224_in22k(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, mlp_ratio=4, qkv_bias=True, + use_abs_pos_emb=False, use_rel_pos_bias=True, init_values=1e-5, **kwargs) + model = _create_beit('beitv2_large_patch16_224_in22k', pretrained=pretrained, **model_kwargs) + return model diff --git a/custom_timm/models/byoanet.py b/custom_timm/models/byoanet.py new file mode 100644 index 0000000000000000000000000000000000000000..34a557be90fc1af1ed858a08feb1987ed2281dac --- /dev/null +++ b/custom_timm/models/byoanet.py @@ -0,0 +1,442 @@ +""" Bring-Your-Own-Attention Network + +A flexible network w/ dataclass based config for stacking NN blocks including +self-attention (or similar) layers. + +Currently used to implement experimental variants of: + * Bottleneck Transformers + * Lambda ResNets + * HaloNets + +Consider all of the models definitions here as experimental WIP and likely to change. + +Hacked together by / copyright Ross Wightman, 2021. 
+""" +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .byobnet import ByoBlockCfg, ByoModelCfg, ByobNet, interleave_blocks +from .helpers import build_model_with_cfg +from .registry import register_model + +__all__ = [] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.95, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv1.conv', 'classifier': 'head.fc', + 'fixed_input_size': False, 'min_input_size': (3, 224, 224), + **kwargs + } + + +default_cfgs = { + # GPU-Efficient (ResNet) weights + 'botnet26t_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/botnet26t_c1_256-167a0e9f.pth', + fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)), + 'sebotnet33ts_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/sebotnet33ts_a1h2_256-957e3c3e.pth', + fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.94), + 'botnet50ts_256': _cfg( + url='', + fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)), + 'eca_botnext26ts_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_botnext26ts_c_256-95a898f6.pth', + fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)), + + 'halonet_h1': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)), + 'halonet26t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/halonet26t_a1h_256-3083328c.pth', + input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256)), + 'sehalonet33ts': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/sehalonet33ts_256-87e053f9.pth', + input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256), crop_pct=0.94), + 'halonet50ts': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/halonet50ts_a1h2_256-f3a3daee.pth', + input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256), crop_pct=0.94), + 'eca_halonext26ts': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_halonext26ts_c_256-06906299.pth', + input_size=(3, 256, 256), pool_size=(8, 8), min_input_size=(3, 256, 256), crop_pct=0.94), + + 'lambda_resnet26t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/lambda_resnet26t_c_256-e5a5c857.pth', + min_input_size=(3, 128, 128), input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.94), + 'lambda_resnet50ts': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/lambda_resnet50ts_a1h_256-b87370f7.pth', + min_input_size=(3, 128, 128), input_size=(3, 256, 256), pool_size=(8, 8)), + 'lambda_resnet26rpt_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/lambda_resnet26rpt_c_256-ab00292d.pth', + fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.94), + + 'haloregnetz_b': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/haloregnetz_c_raa_256-c8ad7616.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + first_conv='stem.conv', input_size=(3, 224, 224), 
pool_size=(7, 7), min_input_size=(3, 224, 224), crop_pct=0.94), + + 'lamhalobotnet50ts_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/lamhalobotnet50ts_a1h2_256-fe3d9445.pth', + fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)), + 'halo2botnet50ts_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/halo2botnet50ts_a1h2_256-fd9c11a3.pth', + fixed_input_size=True, input_size=(3, 256, 256), pool_size=(8, 8)), +} + + +model_cfgs = dict( + + botnet26t=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + fixed_input_size=True, + self_attn_layer='bottleneck', + self_attn_kwargs=dict() + ), + sebotnet33ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg('self_attn', d=2, c=1536, s=2, gs=0, br=0.333), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + act_layer='silu', + num_features=1280, + attn_layer='se', + self_attn_layer='bottleneck', + self_attn_kwargs=dict() + ), + botnet50ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=4, d=4, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + act_layer='silu', + fixed_input_size=True, + self_attn_layer='bottleneck', + self_attn_kwargs=dict() + ), + eca_botnext26ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=16, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + fixed_input_size=True, + act_layer='silu', + attn_layer='eca', + self_attn_layer='bottleneck', + self_attn_kwargs=dict(dim_head=16) + ), + + halonet_h1=ByoModelCfg( + blocks=( + ByoBlockCfg(type='self_attn', d=3, c=64, s=1, gs=0, br=1.0), + ByoBlockCfg(type='self_attn', d=3, c=128, s=2, gs=0, br=1.0), + ByoBlockCfg(type='self_attn', d=10, c=256, s=2, gs=0, br=1.0), + ByoBlockCfg(type='self_attn', d=3, c=512, s=2, gs=0, br=1.0), + ), + stem_chs=64, + stem_type='7x7', + stem_pool='maxpool', + + self_attn_layer='halo', + self_attn_kwargs=dict(block_size=8, halo_size=3), + ), + halonet26t=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + self_attn_layer='halo', + self_attn_kwargs=dict(block_size=8, halo_size=2) + ), + 
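+    # ResNet33-t style backbone w/ SE attn on the regular bottleneck blocks, halo self-attention interleaved in the later stages, SiLU act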
sehalonet33ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=[2], d=3, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg('self_attn', d=2, c=1536, s=2, gs=0, br=0.333), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + act_layer='silu', + num_features=1280, + attn_layer='se', + self_attn_layer='halo', + self_attn_kwargs=dict(block_size=8, halo_size=3) + ), + halonet50ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), + interleave_blocks( + types=('bottle', 'self_attn'), every=4, d=4, c=512, s=2, gs=0, br=0.25, + self_attn_layer='halo', self_attn_kwargs=dict(block_size=8, halo_size=3, num_heads=4)), + interleave_blocks(types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + act_layer='silu', + self_attn_layer='halo', + self_attn_kwargs=dict(block_size=8, halo_size=3) + ), + eca_halonext26ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=16, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=16, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=16, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=16, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + act_layer='silu', + attn_layer='eca', + self_attn_layer='halo', + self_attn_kwargs=dict(block_size=8, halo_size=2, dim_head=16) + ), + + lambda_resnet26t=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + self_attn_layer='lambda', + self_attn_kwargs=dict(r=9) + ), + lambda_resnet50ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), every=4, d=4, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + act_layer='silu', + self_attn_layer='lambda', + self_attn_kwargs=dict(r=9) + ), + lambda_resnet26rpt_256=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=0, br=0.25), + interleave_blocks(types=('bottle', 'self_attn'), d=2, c=1024, s=2, gs=0, br=0.25), + ByoBlockCfg(type='self_attn', d=2, c=2048, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + self_attn_layer='lambda', + self_attn_kwargs=dict(r=None) + ), + + # experimental + haloregnetz_b=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=3), + ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=3), + interleave_blocks(types=('bottle', 'self_attn'), every=3, d=12, c=192, s=2, gs=16, br=3), + ByoBlockCfg('self_attn', d=2, c=288, s=2, gs=16, br=3), + ), + stem_chs=32, + stem_pool='', + downsample='', + num_features=1536, + act_layer='silu', + attn_layer='se', 
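+        # rd_ratio is the reduction ratio of the SE ('se') attn layer selected above; halo self-attention params follow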
+ attn_kwargs=dict(rd_ratio=0.25), + block_kwargs=dict(bottle_in=True, linear_out=True), + self_attn_layer='halo', + self_attn_kwargs=dict(block_size=7, halo_size=2, qk_ratio=0.33) + ), + + # experimental + lamhalobotnet50ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), + interleave_blocks( + types=('bottle', 'self_attn'), d=4, c=512, s=2, gs=0, br=0.25, + self_attn_layer='lambda', self_attn_kwargs=dict(r=13)), + interleave_blocks( + types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25, + self_attn_layer='halo', self_attn_kwargs=dict(halo_size=3)), + interleave_blocks( + types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25, + self_attn_layer='bottleneck', self_attn_kwargs=dict()), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + act_layer='silu', + ), + halo2botnet50ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=0, br=0.25), + interleave_blocks( + types=('bottle', 'self_attn'), d=4, c=512, s=2, gs=0, br=0.25, + self_attn_layer='halo', self_attn_kwargs=dict(halo_size=3)), + interleave_blocks( + types=('bottle', 'self_attn'), d=6, c=1024, s=2, gs=0, br=0.25, + self_attn_layer='halo', self_attn_kwargs=dict(halo_size=3)), + interleave_blocks( + types=('bottle', 'self_attn'), d=3, c=2048, s=2, gs=0, br=0.25, + self_attn_layer='bottleneck', self_attn_kwargs=dict()), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + act_layer='silu', + ), +) + + +def _create_byoanet(variant, cfg_variant=None, pretrained=False, **kwargs): + return build_model_with_cfg( + ByobNet, variant, pretrained, + model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant], + feature_cfg=dict(flatten_sequential=True), + **kwargs) + + +@register_model +def botnet26t_256(pretrained=False, **kwargs): + """ Bottleneck Transformer w/ ResNet26-T backbone. + """ + kwargs.setdefault('img_size', 256) + return _create_byoanet('botnet26t_256', 'botnet26t', pretrained=pretrained, **kwargs) + + +@register_model +def sebotnet33ts_256(pretrained=False, **kwargs): + """ Bottleneck Transformer w/ a ResNet33-t backbone, SE attn for non Halo blocks, SiLU, + """ + return _create_byoanet('sebotnet33ts_256', 'sebotnet33ts', pretrained=pretrained, **kwargs) + + +@register_model +def botnet50ts_256(pretrained=False, **kwargs): + """ Bottleneck Transformer w/ ResNet50-T backbone, silu act. + """ + kwargs.setdefault('img_size', 256) + return _create_byoanet('botnet50ts_256', 'botnet50ts', pretrained=pretrained, **kwargs) + + +@register_model +def eca_botnext26ts_256(pretrained=False, **kwargs): + """ Bottleneck Transformer w/ ResNet26-T backbone, silu act. + """ + kwargs.setdefault('img_size', 256) + return _create_byoanet('eca_botnext26ts_256', 'eca_botnext26ts', pretrained=pretrained, **kwargs) + + +@register_model +def halonet_h1(pretrained=False, **kwargs): + """ HaloNet-H1. Halo attention in all stages as per the paper. + NOTE: This runs very slowly! + """ + return _create_byoanet('halonet_h1', pretrained=pretrained, **kwargs) + + +@register_model +def halonet26t(pretrained=False, **kwargs): + """ HaloNet w/ a ResNet26-t backbone. Halo attention in final two stages + """ + return _create_byoanet('halonet26t', pretrained=pretrained, **kwargs) + + +@register_model +def sehalonet33ts(pretrained=False, **kwargs): + """ HaloNet w/ a ResNet33-t backbone, SE attn for non Halo blocks, SiLU, 1-2 Halo in stage 2,3,4. 
+ """ + return _create_byoanet('sehalonet33ts', pretrained=pretrained, **kwargs) + + +@register_model +def halonet50ts(pretrained=False, **kwargs): + """ HaloNet w/ a ResNet50-t backbone, silu act. Halo attention in final two stages + """ + return _create_byoanet('halonet50ts', pretrained=pretrained, **kwargs) + + +@register_model +def eca_halonext26ts(pretrained=False, **kwargs): + """ HaloNet w/ a ResNet26-t backbone, silu act. Halo attention in final two stages + """ + return _create_byoanet('eca_halonext26ts', pretrained=pretrained, **kwargs) + + +@register_model +def lambda_resnet26t(pretrained=False, **kwargs): + """ Lambda-ResNet-26-T. Lambda layers w/ conv pos in last two stages. + """ + return _create_byoanet('lambda_resnet26t', pretrained=pretrained, **kwargs) + + +@register_model +def lambda_resnet50ts(pretrained=False, **kwargs): + """ Lambda-ResNet-50-TS. SiLU act. Lambda layers w/ conv pos in last two stages. + """ + return _create_byoanet('lambda_resnet50ts', pretrained=pretrained, **kwargs) + + +@register_model +def lambda_resnet26rpt_256(pretrained=False, **kwargs): + """ Lambda-ResNet-26-R-T. Lambda layers w/ rel pos embed in last two stages. + """ + kwargs.setdefault('img_size', 256) + return _create_byoanet('lambda_resnet26rpt_256', pretrained=pretrained, **kwargs) + + +@register_model +def haloregnetz_b(pretrained=False, **kwargs): + """ Halo + RegNetZ + """ + return _create_byoanet('haloregnetz_b', pretrained=pretrained, **kwargs) + + +@register_model +def lamhalobotnet50ts_256(pretrained=False, **kwargs): + """ Combo Attention (Lambda + Halo + Bot) Network + """ + return _create_byoanet('lamhalobotnet50ts_256', 'lamhalobotnet50ts', pretrained=pretrained, **kwargs) + + +@register_model +def halo2botnet50ts_256(pretrained=False, **kwargs): + """ Combo Attention (Halo + Halo + Bot) Network + """ + return _create_byoanet('halo2botnet50ts_256', 'halo2botnet50ts', pretrained=pretrained, **kwargs) diff --git a/custom_timm/models/byobnet.py b/custom_timm/models/byobnet.py new file mode 100644 index 0000000000000000000000000000000000000000..71b6dd446af4d779012a6ea149fb7862b2ff3e27 --- /dev/null +++ b/custom_timm/models/byobnet.py @@ -0,0 +1,1587 @@ +""" Bring-Your-Own-Blocks Network + +A flexible network w/ dataclass based config for stacking those NN blocks. + +This model is currently used to implement the following networks: + +GPU Efficient (ResNets) - gernet_l/m/s (original versions called genet, but this was already used (by SENet author)). +Paper: `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 +Code and weights: https://github.com/idstcv/GPU-Efficient-Networks, licensed Apache 2.0 + +RepVGG - repvgg_* +Paper: `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 +Code and weights: https://github.com/DingXiaoH/RepVGG, licensed MIT + +In all cases the models have been modified to fit within the design of ByobNet. I've remapped +the original weights and verified accuracies. + +For GPU Efficient nets, I used the original names for the blocks since they were for the most part +the same as original residual blocks in ResNe(X)t, DarkNet, and other existing models. Note also some +changes introduced in RegNet were also present in the stem and bottleneck blocks for this model. + +A significant number of different network archs can be implemented here, including variants of the +above nets that include attention. + +Hacked together by / copyright Ross Wightman, 2021. 
+""" +import math +from dataclasses import dataclass, field, replace +from typing import Tuple, List, Dict, Optional, Union, Any, Callable, Sequence +from functools import partial + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, named_apply, checkpoint_seq +from .layers import ClassifierHead, ConvNormAct, BatchNormAct2d, DropPath, AvgPool2dSame, \ + create_conv2d, get_act_layer, get_norm_act_layer, get_attn, make_divisible, to_2tuple, EvoNorm2dS0, EvoNorm2dS0a,\ + EvoNorm2dS1, EvoNorm2dS1a, EvoNorm2dS2, EvoNorm2dS2a, FilterResponseNormAct2d, FilterResponseNormTlu2d +from .registry import register_model + +__all__ = ['ByobNet', 'ByoModelCfg', 'ByoBlockCfg', 'create_byob_stem', 'create_block'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv', 'classifier': 'head.fc', + **kwargs + } + + +def _cfgr(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 256, 256), 'pool_size': (8, 8), + 'crop_pct': 0.9, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv1.conv', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = { + # GPU-Efficient (ResNet) weights + 'gernet_s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_s-756b4751.pth'), + 'gernet_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_m-0873c53a.pth'), + 'gernet_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_l-f31e2e8d.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + + # RepVGG weights + 'repvgg_a2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_a2-c1ee6d2b.pth', + first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), + 'repvgg_b0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b0-80ac3f1b.pth', + first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), + 'repvgg_b1': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b1-77ca2989.pth', + first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), + 'repvgg_b1g4': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b1g4-abde5d92.pth', + first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), + 'repvgg_b2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b2-25b7494e.pth', + first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), + 'repvgg_b2g4': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b2g4-165a85f2.pth', + first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), + 'repvgg_b3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b3-199bc50d.pth', + first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), + 'repvgg_b3g4': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b3g4-73c370bf.pth', + 
first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')), + + # experimental configs + 'resnet51q': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet51q_ra2-d47dcc76.pth', + first_conv='stem.conv1', input_size=(3, 256, 256), pool_size=(8, 8), + test_input_size=(3, 288, 288), crop_pct=1.0), + 'resnet61q': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet61q_ra2-6afc536c.pth', + test_input_size=(3, 288, 288), crop_pct=1.0), + + 'resnext26ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnext26ts_256_ra2-8bbd9106.pth'), + 'gcresnext26ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnext26ts_256-e414378b.pth'), + 'seresnext26ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/seresnext26ts_256-6f0d74a3.pth'), + 'eca_resnext26ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_resnext26ts_256-5a1d030f.pth'), + 'bat_resnext26ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/bat_resnext26ts_256-fa6fd595.pth', + min_input_size=(3, 256, 256)), + + 'resnet32ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnet32ts_256-aacf5250.pth'), + 'resnet33ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnet33ts_256-e91b09a4.pth'), + 'gcresnet33ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnet33ts_256-0e0cd345.pth'), + 'seresnet33ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/seresnet33ts_256-f8ad44d9.pth'), + 'eca_resnet33ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/eca_resnet33ts_256-8f98face.pth'), + + 'gcresnet50t': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnet50t_256-96374d1c.pth'), + + 'gcresnext50ts': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/gcresnext50ts_256-3e0f515e.pth'), + + # experimental models, likely to change ot be removed + 'regnetz_b16': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_b_raa-677d9606.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 224, 224), pool_size=(7, 7), test_input_size=(3, 288, 288), first_conv='stem.conv', crop_pct=0.94), + 'regnetz_c16': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_c_rab2_256-a54bf36a.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), first_conv='stem.conv', crop_pct=0.94), + 'regnetz_d32': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_d_rab_256-b8073a89.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), crop_pct=0.95), + 'regnetz_d8': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_d8_bh-afc03c55.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), crop_pct=1.0), + 'regnetz_e8': _cfgr( + 
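+        # evaluated at 320x320 w/ crop_pct=1.0 (see the test_input_size / crop_pct kwargs below)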
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/regnetz_e8_bh-aace8e6e.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), crop_pct=1.0), + + 'regnetz_b16_evos': _cfgr( + url='', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 224, 224), pool_size=(7, 7), test_input_size=(3, 288, 288), first_conv='stem.conv', + crop_pct=0.94), + 'regnetz_c16_evos': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetz_c16_evos_ch-d8311942.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), first_conv='stem.conv', crop_pct=0.95), + 'regnetz_d8_evos': _cfgr( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetz_d8_evos_ch-2bc12646.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), test_input_size=(3, 320, 320), crop_pct=0.95), +} + + +@dataclass +class ByoBlockCfg: + type: Union[str, nn.Module] + d: int # block depth (number of block repeats in stage) + c: int # number of output channels for each block in stage + s: int = 2 # stride of stage (first block) + gs: Optional[Union[int, Callable]] = None # group-size of blocks in stage, conv is depthwise if gs == 1 + br: float = 1. # bottleneck-ratio of blocks in stage + + # NOTE: these config items override the model cfgs that are applied to all blocks by default + attn_layer: Optional[str] = None + attn_kwargs: Optional[Dict[str, Any]] = None + self_attn_layer: Optional[str] = None + self_attn_kwargs: Optional[Dict[str, Any]] = None + block_kwargs: Optional[Dict[str, Any]] = None + + +@dataclass +class ByoModelCfg: + blocks: Tuple[Union[ByoBlockCfg, Tuple[ByoBlockCfg, ...]], ...] + downsample: str = 'conv1x1' + stem_type: str = '3x3' + stem_pool: Optional[str] = 'maxpool' + stem_chs: int = 32 + width_factor: float = 1.0 + num_features: int = 0 # num out_channels for final conv, no final 1x1 conv if 0 + zero_init_last: bool = True # zero init last weight (usually bn) in residual path + fixed_input_size: bool = False # model constrained to a fixed-input size / img_size must be provided on creation + + act_layer: str = 'relu' + norm_layer: str = 'batchnorm' + + # NOTE: these config items will be overridden by the block cfg (per-block) if they are set there + attn_layer: Optional[str] = None + attn_kwargs: dict = field(default_factory=lambda: dict()) + self_attn_layer: Optional[str] = None + self_attn_kwargs: dict = field(default_factory=lambda: dict()) + block_kwargs: Dict[str, Any] = field(default_factory=lambda: dict()) + + +def _rep_vgg_bcfg(d=(4, 6, 16, 1), wf=(1., 1., 1., 1.), groups=0): + c = (64, 128, 256, 512) + group_size = 0 + if groups > 0: + group_size = lambda chs, idx: chs // groups if (idx + 1) % 2 == 0 else 0 + bcfg = tuple([ByoBlockCfg(type='rep', d=d, c=c * wf, gs=group_size) for d, c, wf in zip(d, c, wf)]) + return bcfg + + +def interleave_blocks( + types: Tuple[str, str], d, every: Union[int, List[int]] = 1, first: bool = False, **kwargs +) -> Tuple[ByoBlockCfg]: + """ interleave 2 block types in stack + """ + assert len(types) == 2 + if isinstance(every, int): + every = list(range(0 if first else every, d, every + 1)) + if not every: + every = [d - 1] + set(every) + blocks = [] + for i in range(d): + block_type = types[1] if i in every else types[0] + blocks += [ByoBlockCfg(type=block_type, d=1, **kwargs)] + return tuple(blocks) + + +model_cfgs = dict( + gernet_l=ByoModelCfg( + blocks=( + ByoBlockCfg(type='basic', d=1, c=128, s=2, 
gs=0, br=1.), + ByoBlockCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.), + ByoBlockCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4), + ByoBlockCfg(type='bottle', d=5, c=640, s=2, gs=1, br=3.), + ByoBlockCfg(type='bottle', d=4, c=640, s=1, gs=1, br=3.), + ), + stem_chs=32, + stem_pool=None, + num_features=2560, + ), + gernet_m=ByoModelCfg( + blocks=( + ByoBlockCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.), + ByoBlockCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.), + ByoBlockCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4), + ByoBlockCfg(type='bottle', d=4, c=640, s=2, gs=1, br=3.), + ByoBlockCfg(type='bottle', d=1, c=640, s=1, gs=1, br=3.), + ), + stem_chs=32, + stem_pool=None, + num_features=2560, + ), + gernet_s=ByoModelCfg( + blocks=( + ByoBlockCfg(type='basic', d=1, c=48, s=2, gs=0, br=1.), + ByoBlockCfg(type='basic', d=3, c=48, s=2, gs=0, br=1.), + ByoBlockCfg(type='bottle', d=7, c=384, s=2, gs=0, br=1 / 4), + ByoBlockCfg(type='bottle', d=2, c=560, s=2, gs=1, br=3.), + ByoBlockCfg(type='bottle', d=1, c=256, s=1, gs=1, br=3.), + ), + stem_chs=13, + stem_pool=None, + num_features=1920, + ), + + repvgg_a2=ByoModelCfg( + blocks=_rep_vgg_bcfg(d=(2, 4, 14, 1), wf=(1.5, 1.5, 1.5, 2.75)), + stem_type='rep', + stem_chs=64, + ), + repvgg_b0=ByoModelCfg( + blocks=_rep_vgg_bcfg(wf=(1., 1., 1., 2.5)), + stem_type='rep', + stem_chs=64, + ), + repvgg_b1=ByoModelCfg( + blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.)), + stem_type='rep', + stem_chs=64, + ), + repvgg_b1g4=ByoModelCfg( + blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.), groups=4), + stem_type='rep', + stem_chs=64, + ), + repvgg_b2=ByoModelCfg( + blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.)), + stem_type='rep', + stem_chs=64, + ), + repvgg_b2g4=ByoModelCfg( + blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.), groups=4), + stem_type='rep', + stem_chs=64, + ), + repvgg_b3=ByoModelCfg( + blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.)), + stem_type='rep', + stem_chs=64, + ), + repvgg_b3g4=ByoModelCfg( + blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.), groups=4), + stem_type='rep', + stem_chs=64, + ), + + # 4 x conv stem w/ 2 act, no maxpool, 2,4,6,4 repeats, group size 32 in first 3 blocks + # DW convs in last block, 2048 pre-FC, silu act + resnet51q=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=6, c=1536, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=1536, s=2, gs=1, br=1.0), + ), + stem_chs=128, + stem_type='quad2', + stem_pool=None, + num_features=2048, + act_layer='silu', + ), + + # 4 x conv stem w/ 4 act, no maxpool, 1,4,6,4 repeats, edge block first, group size 32 in next 2 blocks + # DW convs in last block, 4 conv for each bottle block, 2048 pre-FC, silu act + resnet61q=ByoModelCfg( + blocks=( + ByoBlockCfg(type='edge', d=1, c=256, s=1, gs=0, br=1.0, block_kwargs=dict()), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=6, c=1536, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=1536, s=2, gs=1, br=1.0), + ), + stem_chs=128, + stem_type='quad', + stem_pool=None, + num_features=2048, + act_layer='silu', + block_kwargs=dict(extra_conv=True), + ), + + # A series of ResNeXt-26 models w/ one of none, GC, SE, ECA, BAT attn, group size 32, SiLU act, + # and a tiered stem w/ maxpool + resnext26ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', 
d=2, c=1024, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + act_layer='silu', + ), + gcresnext26ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + act_layer='silu', + attn_layer='gca', + ), + seresnext26ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + act_layer='silu', + attn_layer='se', + ), + eca_resnext26ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + act_layer='silu', + attn_layer='eca', + ), + bat_resnext26ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=1024, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=2048, s=2, gs=32, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + act_layer='silu', + attn_layer='bat', + attn_kwargs=dict(block_size=8) + ), + + # ResNet-32 (2, 3, 3, 2) models w/ no attn, no groups, SiLU act, no pre-fc feat layer, tiered stem w/o maxpool + resnet32ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + num_features=0, + act_layer='silu', + ), + + # ResNet-33 (2, 3, 3, 2) models w/ no attn, no groups, SiLU act, 1280 pre-FC feat, tiered stem w/o maxpool + resnet33ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + num_features=1280, + act_layer='silu', + ), + + # A series of ResNet-33 (2, 3, 3, 2) models w/ one of GC, SE, ECA attn, no groups, SiLU act, 1280 pre-FC feat + # and a tiered stem w/ no maxpool + gcresnet33ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + num_features=1280, + act_layer='silu', + attn_layer='gca', + ), + seresnet33ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=512, 
s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + num_features=1280, + act_layer='silu', + attn_layer='se', + ), + eca_resnet33ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=256, s=1, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=1536, s=2, gs=0, br=0.25), + ByoBlockCfg(type='bottle', d=2, c=1536, s=2, gs=0, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + num_features=1280, + act_layer='silu', + attn_layer='eca', + ), + + gcresnet50t=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=256, s=1, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, br=0.25), + ByoBlockCfg(type='bottle', d=6, c=1024, s=2, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=2048, s=2, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + attn_layer='gca', + ), + + gcresnext50ts=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=256, s=1, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=4, c=512, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=6, c=1024, s=2, gs=32, br=0.25), + ByoBlockCfg(type='bottle', d=3, c=2048, s=2, gs=32, br=0.25), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='maxpool', + # stem_pool=None, + act_layer='silu', + attn_layer='gca', + ), + + # experimental models, closer to a RegNetZ than a ResNet. Similar to EfficientNets but w/ groups instead of DW + regnetz_b16=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=3), + ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=3), + ByoBlockCfg(type='bottle', d=12, c=192, s=2, gs=16, br=3), + ByoBlockCfg(type='bottle', d=2, c=288, s=2, gs=16, br=3), + ), + stem_chs=32, + stem_pool='', + downsample='', + num_features=1536, + act_layer='silu', + attn_layer='se', + attn_kwargs=dict(rd_ratio=0.25), + block_kwargs=dict(bottle_in=True, linear_out=True), + ), + regnetz_c16=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=4), + ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=4), + ByoBlockCfg(type='bottle', d=12, c=192, s=2, gs=16, br=4), + ByoBlockCfg(type='bottle', d=2, c=288, s=2, gs=16, br=4), + ), + stem_chs=32, + stem_pool='', + downsample='', + num_features=1536, + act_layer='silu', + attn_layer='se', + attn_kwargs=dict(rd_ratio=0.25), + block_kwargs=dict(bottle_in=True, linear_out=True), + ), + regnetz_d32=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=64, s=1, gs=32, br=4), + ByoBlockCfg(type='bottle', d=6, c=128, s=2, gs=32, br=4), + ByoBlockCfg(type='bottle', d=12, c=256, s=2, gs=32, br=4), + ByoBlockCfg(type='bottle', d=3, c=384, s=2, gs=32, br=4), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + downsample='', + num_features=1792, + act_layer='silu', + attn_layer='se', + attn_kwargs=dict(rd_ratio=0.25), + block_kwargs=dict(bottle_in=True, linear_out=True), + ), + regnetz_d8=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=64, s=1, gs=8, br=4), + ByoBlockCfg(type='bottle', d=6, c=128, s=2, gs=8, br=4), + ByoBlockCfg(type='bottle', d=12, c=256, s=2, gs=8, br=4), + ByoBlockCfg(type='bottle', d=3, c=384, s=2, gs=8, br=4), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + downsample='', + num_features=1792, + act_layer='silu', + attn_layer='se', + attn_kwargs=dict(rd_ratio=0.25), + block_kwargs=dict(bottle_in=True, linear_out=True), + ), + 
regnetz_e8=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=96, s=1, gs=8, br=4), + ByoBlockCfg(type='bottle', d=8, c=192, s=2, gs=8, br=4), + ByoBlockCfg(type='bottle', d=16, c=384, s=2, gs=8, br=4), + ByoBlockCfg(type='bottle', d=3, c=512, s=2, gs=8, br=4), + ), + stem_chs=64, + stem_type='tiered', + stem_pool='', + downsample='', + num_features=2048, + act_layer='silu', + attn_layer='se', + attn_kwargs=dict(rd_ratio=0.25), + block_kwargs=dict(bottle_in=True, linear_out=True), + ), + + # experimental EvoNorm configs + regnetz_b16_evos=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=3), + ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=3), + ByoBlockCfg(type='bottle', d=12, c=192, s=2, gs=16, br=3), + ByoBlockCfg(type='bottle', d=2, c=288, s=2, gs=16, br=3), + ), + stem_chs=32, + stem_pool='', + downsample='', + num_features=1536, + act_layer='silu', + norm_layer=partial(EvoNorm2dS0a, group_size=16), + attn_layer='se', + attn_kwargs=dict(rd_ratio=0.25), + block_kwargs=dict(bottle_in=True, linear_out=True), + ), + regnetz_c16_evos=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=2, c=48, s=2, gs=16, br=4), + ByoBlockCfg(type='bottle', d=6, c=96, s=2, gs=16, br=4), + ByoBlockCfg(type='bottle', d=12, c=192, s=2, gs=16, br=4), + ByoBlockCfg(type='bottle', d=2, c=288, s=2, gs=16, br=4), + ), + stem_chs=32, + stem_pool='', + downsample='', + num_features=1536, + act_layer='silu', + norm_layer=partial(EvoNorm2dS0a, group_size=16), + attn_layer='se', + attn_kwargs=dict(rd_ratio=0.25), + block_kwargs=dict(bottle_in=True, linear_out=True), + ), + regnetz_d8_evos=ByoModelCfg( + blocks=( + ByoBlockCfg(type='bottle', d=3, c=64, s=1, gs=8, br=4), + ByoBlockCfg(type='bottle', d=6, c=128, s=2, gs=8, br=4), + ByoBlockCfg(type='bottle', d=12, c=256, s=2, gs=8, br=4), + ByoBlockCfg(type='bottle', d=3, c=384, s=2, gs=8, br=4), + ), + stem_chs=64, + stem_type='deep', + stem_pool='', + downsample='', + num_features=1792, + act_layer='silu', + norm_layer=partial(EvoNorm2dS0a, group_size=16), + attn_layer='se', + attn_kwargs=dict(rd_ratio=0.25), + block_kwargs=dict(bottle_in=True, linear_out=True), + ), +) + +@register_model +def gernet_l(pretrained=False, **kwargs): + """ GEResNet-Large (GENet-Large from official impl) + `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 + """ + return _create_byobnet('gernet_l', pretrained=pretrained, **kwargs) + + +@register_model +def gernet_m(pretrained=False, **kwargs): + """ GEResNet-Medium (GENet-Normal from official impl) + `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 + """ + return _create_byobnet('gernet_m', pretrained=pretrained, **kwargs) + + +@register_model +def gernet_s(pretrained=False, **kwargs): + """ EResNet-Small (GENet-Small from official impl) + `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090 + """ + return _create_byobnet('gernet_s', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_a2(pretrained=False, **kwargs): + """ RepVGG-A2 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_a2', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b0(pretrained=False, **kwargs): + """ RepVGG-B0 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b0', pretrained=pretrained, **kwargs) + + +@register_model +def 
repvgg_b1(pretrained=False, **kwargs): + """ RepVGG-B1 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b1', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b1g4(pretrained=False, **kwargs): + """ RepVGG-B1g4 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b1g4', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b2(pretrained=False, **kwargs): + """ RepVGG-B2 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b2', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b2g4(pretrained=False, **kwargs): + """ RepVGG-B2g4 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b2g4', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b3(pretrained=False, **kwargs): + """ RepVGG-B3 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b3', pretrained=pretrained, **kwargs) + + +@register_model +def repvgg_b3g4(pretrained=False, **kwargs): + """ RepVGG-B3g4 + `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697 + """ + return _create_byobnet('repvgg_b3g4', pretrained=pretrained, **kwargs) + + +@register_model +def resnet51q(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('resnet51q', pretrained=pretrained, **kwargs) + + +@register_model +def resnet61q(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('resnet61q', pretrained=pretrained, **kwargs) + + +@register_model +def resnext26ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('resnext26ts', pretrained=pretrained, **kwargs) + + +@register_model +def gcresnext26ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('gcresnext26ts', pretrained=pretrained, **kwargs) + + +@register_model +def seresnext26ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('seresnext26ts', pretrained=pretrained, **kwargs) + + +@register_model +def eca_resnext26ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('eca_resnext26ts', pretrained=pretrained, **kwargs) + + +@register_model +def bat_resnext26ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('bat_resnext26ts', pretrained=pretrained, **kwargs) + + +@register_model +def resnet32ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('resnet32ts', pretrained=pretrained, **kwargs) + + +@register_model +def resnet33ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('resnet33ts', pretrained=pretrained, **kwargs) + + +@register_model +def gcresnet33ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('gcresnet33ts', pretrained=pretrained, **kwargs) + + +@register_model +def seresnet33ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('seresnet33ts', pretrained=pretrained, **kwargs) + + +@register_model +def eca_resnet33ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('eca_resnet33ts', pretrained=pretrained, **kwargs) + + +@register_model +def gcresnet50t(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('gcresnet50t', pretrained=pretrained, **kwargs) + + +@register_model +def gcresnext50ts(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('gcresnext50ts', pretrained=pretrained, 
**kwargs) + + +@register_model +def regnetz_b16(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('regnetz_b16', pretrained=pretrained, **kwargs) + + +@register_model +def regnetz_c16(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('regnetz_c16', pretrained=pretrained, **kwargs) + + +@register_model +def regnetz_d32(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('regnetz_d32', pretrained=pretrained, **kwargs) + + +@register_model +def regnetz_d8(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('regnetz_d8', pretrained=pretrained, **kwargs) + + +@register_model +def regnetz_e8(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('regnetz_e8', pretrained=pretrained, **kwargs) + + +@register_model +def regnetz_b16_evos(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('regnetz_b16_evos', pretrained=pretrained, **kwargs) + + +@register_model +def regnetz_c16_evos(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('regnetz_c16_evos', pretrained=pretrained, **kwargs) + + +@register_model +def regnetz_d8_evos(pretrained=False, **kwargs): + """ + """ + return _create_byobnet('regnetz_d8_evos', pretrained=pretrained, **kwargs) + + +def expand_blocks_cfg(stage_blocks_cfg: Union[ByoBlockCfg, Sequence[ByoBlockCfg]]) -> List[ByoBlockCfg]: + if not isinstance(stage_blocks_cfg, Sequence): + stage_blocks_cfg = (stage_blocks_cfg,) + block_cfgs = [] + for i, cfg in enumerate(stage_blocks_cfg): + block_cfgs += [replace(cfg, d=1) for _ in range(cfg.d)] + return block_cfgs + + +def num_groups(group_size, channels): + if not group_size: # 0 or None + return 1 # normal conv with 1 group + else: + # NOTE group_size == 1 -> depthwise conv + assert channels % group_size == 0 + return channels // group_size + + +@dataclass +class LayerFn: + conv_norm_act: Callable = ConvNormAct + norm_act: Callable = BatchNormAct2d + act: Callable = nn.ReLU + attn: Optional[Callable] = None + self_attn: Optional[Callable] = None + + +class DownsampleAvg(nn.Module): + def __init__(self, in_chs, out_chs, stride=1, dilation=1, apply_act=False, layers: LayerFn = None): + """ AvgPool Downsampling as in 'D' ResNet variants.""" + super(DownsampleAvg, self).__init__() + layers = layers or LayerFn() + avg_stride = stride if dilation == 1 else 1 + if stride > 1 or dilation > 1: + avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d + self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) + else: + self.pool = nn.Identity() + self.conv = layers.conv_norm_act(in_chs, out_chs, 1, apply_act=apply_act) + + def forward(self, x): + return self.conv(self.pool(x)) + + +def create_shortcut(downsample_type, layers: LayerFn, in_chs, out_chs, stride, dilation, **kwargs): + assert downsample_type in ('avg', 'conv1x1', '') + if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]: + if not downsample_type: + return None # no shortcut + elif downsample_type == 'avg': + return DownsampleAvg(in_chs, out_chs, stride=stride, dilation=dilation[0], **kwargs) + else: + return layers.conv_norm_act(in_chs, out_chs, kernel_size=1, stride=stride, dilation=dilation[0], **kwargs) + else: + return nn.Identity() # identity shortcut + + +class BasicBlock(nn.Module): + """ ResNet Basic Block - kxk + kxk + """ + + def __init__( + self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), group_size=None, bottle_ratio=1.0, + downsample='avg', attn_last=True, linear_out=False, layers: LayerFn = 
None, drop_block=None, + drop_path_rate=0.): + super(BasicBlock, self).__init__() + layers = layers or LayerFn() + mid_chs = make_divisible(out_chs * bottle_ratio) + groups = num_groups(group_size, mid_chs) + + self.shortcut = create_shortcut( + downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation, + apply_act=False, layers=layers) + + self.conv1_kxk = layers.conv_norm_act(in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0]) + self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs) + self.conv2_kxk = layers.conv_norm_act( + mid_chs, out_chs, kernel_size, dilation=dilation[1], groups=groups, drop_layer=drop_block, apply_act=False) + self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() + self.act = nn.Identity() if linear_out else layers.act(inplace=True) + + def init_weights(self, zero_init_last: bool = False): + if zero_init_last and self.shortcut is not None: + nn.init.zeros_(self.conv2_kxk.bn.weight) + for attn in (self.attn, self.attn_last): + if hasattr(attn, 'reset_parameters'): + attn.reset_parameters() + + def forward(self, x): + shortcut = x + x = self.conv1_kxk(x) + x = self.conv2_kxk(x) + x = self.attn(x) + x = self.drop_path(x) + if self.shortcut is not None: + x = x + self.shortcut(shortcut) + return self.act(x) + + +class BottleneckBlock(nn.Module): + """ ResNet-like Bottleneck Block - 1x1 - kxk - 1x1 + """ + + def __init__( + self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None, + downsample='avg', attn_last=False, linear_out=False, extra_conv=False, bottle_in=False, + layers: LayerFn = None, drop_block=None, drop_path_rate=0.): + super(BottleneckBlock, self).__init__() + layers = layers or LayerFn() + mid_chs = make_divisible((in_chs if bottle_in else out_chs) * bottle_ratio) + groups = num_groups(group_size, mid_chs) + + self.shortcut = create_shortcut( + downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation, + apply_act=False, layers=layers) + + self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1) + self.conv2_kxk = layers.conv_norm_act( + mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_layer=drop_block) + if extra_conv: + self.conv2b_kxk = layers.conv_norm_act(mid_chs, mid_chs, kernel_size, dilation=dilation[1], groups=groups) + else: + self.conv2b_kxk = nn.Identity() + self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs) + self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) + self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + self.act = nn.Identity() if linear_out else layers.act(inplace=True) + + def init_weights(self, zero_init_last: bool = False): + if zero_init_last and self.shortcut is not None: + nn.init.zeros_(self.conv3_1x1.bn.weight) + for attn in (self.attn, self.attn_last): + if hasattr(attn, 'reset_parameters'): + attn.reset_parameters() + + def forward(self, x): + shortcut = x + x = self.conv1_1x1(x) + x = self.conv2_kxk(x) + x = self.conv2b_kxk(x) + x = self.attn(x) + x = self.conv3_1x1(x) + x = self.attn_last(x) + x = self.drop_path(x) + if self.shortcut is not None: + x = x + self.shortcut(shortcut) + return self.act(x) + + +class DarkBlock(nn.Module): + """ DarkNet-like (1x1 + 3x3 w/ stride) block + + The GE-Net impl included a 1x1 + 3x3 block in their search space. It was not used in the feature models. + This block is pretty much a DarkNet block (also DenseNet) hence the name. Neither DarkNet or DenseNet + uses strides within the block (external 3x3 or maxpool downsampling is done in front of the block repeats). + + If one does want to use a lot of these blocks w/ stride, I'd recommend using the EdgeBlock (3x3 /w stride + 1x1) + for more optimal compute. + """ + + def __init__( + self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None, + downsample='avg', attn_last=True, linear_out=False, layers: LayerFn = None, drop_block=None, + drop_path_rate=0.): + super(DarkBlock, self).__init__() + layers = layers or LayerFn() + mid_chs = make_divisible(out_chs * bottle_ratio) + groups = num_groups(group_size, mid_chs) + + self.shortcut = create_shortcut( + downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation, + apply_act=False, layers=layers) + + self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1) + self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs) + self.conv2_kxk = layers.conv_norm_act( + mid_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0], + groups=groups, drop_layer=drop_block, apply_act=False) + self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() + self.act = nn.Identity() if linear_out else layers.act(inplace=True) + + def init_weights(self, zero_init_last: bool = False): + if zero_init_last and self.shortcut is not None: + nn.init.zeros_(self.conv2_kxk.bn.weight) + for attn in (self.attn, self.attn_last): + if hasattr(attn, 'reset_parameters'): + attn.reset_parameters() + + def forward(self, x): + shortcut = x + x = self.conv1_1x1(x) + x = self.attn(x) + x = self.conv2_kxk(x) + x = self.attn_last(x) + x = self.drop_path(x) + if self.shortcut is not None: + x = x + self.shortcut(shortcut) + return self.act(x) + + +class EdgeBlock(nn.Module): + """ EdgeResidual-like (3x3 + 1x1) block + + A two layer block like DarkBlock, but with the order of the 3x3 and 1x1 convs reversed. + Very similar to the EfficientNet Edge-Residual block but this block it ends with activations, is + intended to be used with either expansion or bottleneck contraction, and can use DW/group/non-grouped convs. + + FIXME is there a more common 3x3 + 1x1 conv block to name this after? 
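+
+    Layout (matching __init__/forward below): grouped kxk conv (carries the stride) -> optional attn ->
+    1x1 conv w/o activation -> optional attn_last -> drop path -> add shortcut -> final activation
+    (identity if linear_out).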
+ """ + + def __init__( + self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None, + downsample='avg', attn_last=False, linear_out=False, layers: LayerFn = None, + drop_block=None, drop_path_rate=0.): + super(EdgeBlock, self).__init__() + layers = layers or LayerFn() + mid_chs = make_divisible(out_chs * bottle_ratio) + groups = num_groups(group_size, mid_chs) + + self.shortcut = create_shortcut( + downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation, + apply_act=False, layers=layers) + + self.conv1_kxk = layers.conv_norm_act( + in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], groups=groups, drop_layer=drop_block) + self.attn = nn.Identity() if attn_last or layers.attn is None else layers.attn(mid_chs) + self.conv2_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) + self.attn_last = nn.Identity() if not attn_last or layers.attn is None else layers.attn(out_chs) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity() + self.act = nn.Identity() if linear_out else layers.act(inplace=True) + + def init_weights(self, zero_init_last: bool = False): + if zero_init_last and self.shortcut is not None: + nn.init.zeros_(self.conv2_1x1.bn.weight) + for attn in (self.attn, self.attn_last): + if hasattr(attn, 'reset_parameters'): + attn.reset_parameters() + + def forward(self, x): + shortcut = x + x = self.conv1_kxk(x) + x = self.attn(x) + x = self.conv2_1x1(x) + x = self.attn_last(x) + x = self.drop_path(x) + if self.shortcut is not None: + x = x + self.shortcut(shortcut) + return self.act(x) + + +class RepVggBlock(nn.Module): + """ RepVGG Block. + + Adapted from impl at https://github.com/DingXiaoH/RepVGG + + This version does not currently support the deploy optimization. It is currently fixed in 'train' mode. + """ + + def __init__( + self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None, + downsample='', layers: LayerFn = None, drop_block=None, drop_path_rate=0.): + super(RepVggBlock, self).__init__() + layers = layers or LayerFn() + groups = num_groups(group_size, in_chs) + + use_ident = in_chs == out_chs and stride == 1 and dilation[0] == dilation[1] + self.identity = layers.norm_act(out_chs, apply_act=False) if use_ident else None + self.conv_kxk = layers.conv_norm_act( + in_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0], + groups=groups, drop_layer=drop_block, apply_act=False) + self.conv_1x1 = layers.conv_norm_act(in_chs, out_chs, 1, stride=stride, groups=groups, apply_act=False) + self.attn = nn.Identity() if layers.attn is None else layers.attn(out_chs) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
and use_ident else nn.Identity() + self.act = layers.act(inplace=True) + + def init_weights(self, zero_init_last: bool = False): + # NOTE this init overrides that base model init with specific changes for the block type + for m in self.modules(): + if isinstance(m, nn.BatchNorm2d): + nn.init.normal_(m.weight, .1, .1) + nn.init.normal_(m.bias, 0, .1) + if hasattr(self.attn, 'reset_parameters'): + self.attn.reset_parameters() + + def forward(self, x): + if self.identity is None: + x = self.conv_1x1(x) + self.conv_kxk(x) + else: + identity = self.identity(x) + x = self.conv_1x1(x) + self.conv_kxk(x) + x = self.drop_path(x) # not in the paper / official impl, experimental + x = x + identity + x = self.attn(x) # no attn in the paper / official impl, experimental + return self.act(x) + + +class SelfAttnBlock(nn.Module): + """ ResNet-like Bottleneck Block - 1x1 - optional kxk - self attn - 1x1 + """ + + def __init__( + self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None, + downsample='avg', extra_conv=False, linear_out=False, bottle_in=False, post_attn_na=True, + feat_size=None, layers: LayerFn = None, drop_block=None, drop_path_rate=0.): + super(SelfAttnBlock, self).__init__() + assert layers is not None + mid_chs = make_divisible((in_chs if bottle_in else out_chs) * bottle_ratio) + groups = num_groups(group_size, mid_chs) + + self.shortcut = create_shortcut( + downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation, + apply_act=False, layers=layers) + + self.conv1_1x1 = layers.conv_norm_act(in_chs, mid_chs, 1) + if extra_conv: + self.conv2_kxk = layers.conv_norm_act( + mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], + groups=groups, drop_layer=drop_block) + stride = 1 # striding done via conv if enabled + else: + self.conv2_kxk = nn.Identity() + opt_kwargs = {} if feat_size is None else dict(feat_size=feat_size) + # FIXME need to dilate self attn to have dilated network support, moop moop + self.self_attn = layers.self_attn(mid_chs, stride=stride, **opt_kwargs) + self.post_attn = layers.norm_act(mid_chs) if post_attn_na else nn.Identity() + self.conv3_1x1 = layers.conv_norm_act(mid_chs, out_chs, 1, apply_act=False) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. 
else nn.Identity() + self.act = nn.Identity() if linear_out else layers.act(inplace=True) + + def init_weights(self, zero_init_last: bool = False): + if zero_init_last and self.shortcut is not None: + nn.init.zeros_(self.conv3_1x1.bn.weight) + if hasattr(self.self_attn, 'reset_parameters'): + self.self_attn.reset_parameters() + + def forward(self, x): + shortcut = x + x = self.conv1_1x1(x) + x = self.conv2_kxk(x) + x = self.self_attn(x) + x = self.post_attn(x) + x = self.conv3_1x1(x) + x = self.drop_path(x) + if self.shortcut is not None: + x = x + self.shortcut(shortcut) + return self.act(x) + +_block_registry = dict( + basic=BasicBlock, + bottle=BottleneckBlock, + dark=DarkBlock, + edge=EdgeBlock, + rep=RepVggBlock, + self_attn=SelfAttnBlock, +) + + +def register_block(block_type:str, block_fn: nn.Module): + _block_registry[block_type] = block_fn + + +def create_block(block: Union[str, nn.Module], **kwargs): + if isinstance(block, (nn.Module, partial)): + return block(**kwargs) + assert block in _block_registry, f'Unknown block type ({block}' + return _block_registry[block](**kwargs) + + +class Stem(nn.Sequential): + + def __init__( + self, in_chs, out_chs, kernel_size=3, stride=4, pool='maxpool', + num_rep=3, num_act=None, chs_decay=0.5, layers: LayerFn = None): + super().__init__() + assert stride in (2, 4) + layers = layers or LayerFn() + + if isinstance(out_chs, (list, tuple)): + num_rep = len(out_chs) + stem_chs = out_chs + else: + stem_chs = [round(out_chs * chs_decay ** i) for i in range(num_rep)][::-1] + + self.stride = stride + self.feature_info = [] # track intermediate features + prev_feat = '' + stem_strides = [2] + [1] * (num_rep - 1) + if stride == 4 and not pool: + # set last conv in stack to be strided if stride == 4 and no pooling layer + stem_strides[-1] = 2 + + num_act = num_rep if num_act is None else num_act + # if num_act < num_rep, first convs in stack won't have bn + act + stem_norm_acts = [False] * (num_rep - num_act) + [True] * num_act + prev_chs = in_chs + curr_stride = 1 + for i, (ch, s, na) in enumerate(zip(stem_chs, stem_strides, stem_norm_acts)): + layer_fn = layers.conv_norm_act if na else create_conv2d + conv_name = f'conv{i + 1}' + if i > 0 and s > 1: + self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=prev_feat)) + self.add_module(conv_name, layer_fn(prev_chs, ch, kernel_size=kernel_size, stride=s)) + prev_chs = ch + curr_stride *= s + prev_feat = conv_name + + if pool and 'max' in pool.lower(): + self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=prev_feat)) + self.add_module('pool', nn.MaxPool2d(3, 2, 1)) + curr_stride *= 2 + prev_feat = 'pool' + + self.feature_info.append(dict(num_chs=prev_chs, reduction=curr_stride, module=prev_feat)) + assert curr_stride == stride + + +def create_byob_stem(in_chs, out_chs, stem_type='', pool_type='', feat_prefix='stem', layers: LayerFn = None): + layers = layers or LayerFn() + assert stem_type in ('', 'quad', 'quad2', 'tiered', 'deep', 'rep', '7x7', '3x3') + if 'quad' in stem_type: + # based on NFNet stem, stack of 4 3x3 convs + num_act = 2 if 'quad2' in stem_type else None + stem = Stem(in_chs, out_chs, num_rep=4, num_act=num_act, pool=pool_type, layers=layers) + elif 'tiered' in stem_type: + # 3x3 stack of 3 convs as in my ResNet-T + stem = Stem(in_chs, (3 * out_chs // 8, out_chs // 2, out_chs), pool=pool_type, layers=layers) + elif 'deep' in stem_type: + # 3x3 stack of 3 convs as in ResNet-D + stem = Stem(in_chs, out_chs, num_rep=3, chs_decay=1.0, 
pool=pool_type, layers=layers) + elif 'rep' in stem_type: + stem = RepVggBlock(in_chs, out_chs, stride=2, layers=layers) + elif '7x7' in stem_type: + # 7x7 stem conv as in ResNet + if pool_type: + stem = Stem(in_chs, out_chs, 7, num_rep=1, pool=pool_type, layers=layers) + else: + stem = layers.conv_norm_act(in_chs, out_chs, 7, stride=2) + else: + # 3x3 stem conv as in RegNet is the default + if pool_type: + stem = Stem(in_chs, out_chs, 3, num_rep=1, pool=pool_type, layers=layers) + else: + stem = layers.conv_norm_act(in_chs, out_chs, 3, stride=2) + + if isinstance(stem, Stem): + feature_info = [dict(f, module='.'.join([feat_prefix, f['module']])) for f in stem.feature_info] + else: + feature_info = [dict(num_chs=out_chs, reduction=2, module=feat_prefix)] + return stem, feature_info + + +def reduce_feat_size(feat_size, stride=2): + return None if feat_size is None else tuple([s // stride for s in feat_size]) + + +def override_kwargs(block_kwargs, model_kwargs): + """ Override model level attn/self-attn/block kwargs w/ block level + + NOTE: kwargs are NOT merged across levels, block_kwargs will fully replace model_kwargs + for the block if set to anything that isn't None. + + i.e. an empty block_kwargs dict will remove kwargs set at model level for that block + """ + out_kwargs = block_kwargs if block_kwargs is not None else model_kwargs + return out_kwargs or {} # make sure None isn't returned + + +def update_block_kwargs(block_kwargs: Dict[str, Any], block_cfg: ByoBlockCfg, model_cfg: ByoModelCfg, ): + layer_fns = block_kwargs['layers'] + + # override attn layer / args with block local config + attn_set = block_cfg.attn_layer is not None + if attn_set or block_cfg.attn_kwargs is not None: + # override attn layer config + if attn_set and not block_cfg.attn_layer: + # empty string for attn_layer type will disable attn for this block + attn_layer = None + else: + attn_kwargs = override_kwargs(block_cfg.attn_kwargs, model_cfg.attn_kwargs) + attn_layer = block_cfg.attn_layer or model_cfg.attn_layer + attn_layer = partial(get_attn(attn_layer), **attn_kwargs) if attn_layer is not None else None + layer_fns = replace(layer_fns, attn=attn_layer) + + # override self-attn layer / args with block local cfg + self_attn_set = block_cfg.self_attn_layer is not None + if self_attn_set or block_cfg.self_attn_kwargs is not None: + # override attn layer config + if self_attn_set and not block_cfg.self_attn_layer: # attn_layer == '' + # empty string for self_attn_layer type will disable attn for this block + self_attn_layer = None + else: + self_attn_kwargs = override_kwargs(block_cfg.self_attn_kwargs, model_cfg.self_attn_kwargs) + self_attn_layer = block_cfg.self_attn_layer or model_cfg.self_attn_layer + self_attn_layer = partial(get_attn(self_attn_layer), **self_attn_kwargs) \ + if self_attn_layer is not None else None + layer_fns = replace(layer_fns, self_attn=self_attn_layer) + + block_kwargs['layers'] = layer_fns + + # add additional block_kwargs specified in block_cfg or model_cfg, precedence to block if set + block_kwargs.update(override_kwargs(block_cfg.block_kwargs, model_cfg.block_kwargs)) + + +def create_byob_stages( + cfg: ByoModelCfg, drop_path_rate: float, output_stride: int, stem_feat: Dict[str, Any], + feat_size: Optional[int] = None, + layers: Optional[LayerFn] = None, + block_kwargs_fn: Optional[Callable] = update_block_kwargs): + + layers = layers or LayerFn() + feature_info = [] + block_cfgs = [expand_blocks_cfg(s) for s in cfg.blocks] + depths = [sum([bc.d for bc in stage_bcs]) for 
stage_bcs in block_cfgs] + dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)] + dilation = 1 + net_stride = stem_feat['reduction'] + prev_chs = stem_feat['num_chs'] + prev_feat = stem_feat + stages = [] + for stage_idx, stage_block_cfgs in enumerate(block_cfgs): + stride = stage_block_cfgs[0].s + if stride != 1 and prev_feat: + feature_info.append(prev_feat) + if net_stride >= output_stride and stride > 1: + dilation *= stride + stride = 1 + net_stride *= stride + first_dilation = 1 if dilation in (1, 2) else 2 + + blocks = [] + for block_idx, block_cfg in enumerate(stage_block_cfgs): + out_chs = make_divisible(block_cfg.c * cfg.width_factor) + group_size = block_cfg.gs + if isinstance(group_size, Callable): + group_size = group_size(out_chs, block_idx) + block_kwargs = dict( # Blocks used in this model must accept these arguments + in_chs=prev_chs, + out_chs=out_chs, + stride=stride if block_idx == 0 else 1, + dilation=(first_dilation, dilation), + group_size=group_size, + bottle_ratio=block_cfg.br, + downsample=cfg.downsample, + drop_path_rate=dpr[stage_idx][block_idx], + layers=layers, + ) + if block_cfg.type in ('self_attn',): + # add feat_size arg for blocks that support/need it + block_kwargs['feat_size'] = feat_size + block_kwargs_fn(block_kwargs, block_cfg=block_cfg, model_cfg=cfg) + blocks += [create_block(block_cfg.type, **block_kwargs)] + first_dilation = dilation + prev_chs = out_chs + if stride > 1 and block_idx == 0: + feat_size = reduce_feat_size(feat_size, stride) + + stages += [nn.Sequential(*blocks)] + prev_feat = dict(num_chs=prev_chs, reduction=net_stride, module=f'stages.{stage_idx}') + + feature_info.append(prev_feat) + return nn.Sequential(*stages), feature_info + + +def get_layer_fns(cfg: ByoModelCfg): + act = get_act_layer(cfg.act_layer) + norm_act = get_norm_act_layer(norm_layer=cfg.norm_layer, act_layer=act) + conv_norm_act = partial(ConvNormAct, norm_layer=cfg.norm_layer, act_layer=act) + attn = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None + self_attn = partial(get_attn(cfg.self_attn_layer), **cfg.self_attn_kwargs) if cfg.self_attn_layer else None + layer_fn = LayerFn(conv_norm_act=conv_norm_act, norm_act=norm_act, act=act, attn=attn, self_attn=self_attn) + return layer_fn + + +class ByobNet(nn.Module): + """ 'Bring-your-own-blocks' Net + + A flexible network backbone that allows building model stem + blocks via + dataclass cfg definition w/ factory functions for module instantiation. + + Current assumption is that both stem and blocks are in conv-bn-act order (w/ block ending in act). 
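+
+    Overall flow (see __init__ / forward below): stem (create_byob_stem) -> stages (create_byob_stages) ->
+    optional final 1x1 conv (when cfg.num_features > 0) -> ClassifierHead.
+
+    Minimal usage sketch (illustrative only; assumes torch and the model factories registered in this
+    module, e.g. gernet_l above):
+
+        >>> model = gernet_l(pretrained=False, num_classes=10)
+        >>> logits = model(torch.randn(1, 3, 256, 256))  # gernet_l default input size
+        >>> logits.shape
+        torch.Size([1, 10])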
+ """ + def __init__( + self, cfg: ByoModelCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32, + zero_init_last=True, img_size=None, drop_rate=0., drop_path_rate=0.): + super().__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + self.grad_checkpointing = False + layers = get_layer_fns(cfg) + if cfg.fixed_input_size: + assert img_size is not None, 'img_size argument is required for fixed input size model' + feat_size = to_2tuple(img_size) if img_size is not None else None + + self.feature_info = [] + stem_chs = int(round((cfg.stem_chs or cfg.blocks[0].c) * cfg.width_factor)) + self.stem, stem_feat = create_byob_stem(in_chans, stem_chs, cfg.stem_type, cfg.stem_pool, layers=layers) + self.feature_info.extend(stem_feat[:-1]) + feat_size = reduce_feat_size(feat_size, stride=stem_feat[-1]['reduction']) + + self.stages, stage_feat = create_byob_stages( + cfg, drop_path_rate, output_stride, stem_feat[-1], layers=layers, feat_size=feat_size) + self.feature_info.extend(stage_feat[:-1]) + + prev_chs = stage_feat[-1]['num_chs'] + if cfg.num_features: + self.num_features = int(round(cfg.width_factor * cfg.num_features)) + self.final_conv = layers.conv_norm_act(prev_chs, self.num_features, 1) + else: + self.num_features = prev_chs + self.final_conv = nn.Identity() + self.feature_info += [ + dict(num_chs=self.num_features, reduction=stage_feat[-1]['reduction'], module='final_conv')] + + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + # init weights + named_apply(partial(_init_weights, zero_init_last=zero_init_last), self) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^stem', + blocks=[ + (r'^stages\.(\d+)' if coarse else r'^stages\.(\d+)\.(\d+)', None), + (r'^final_conv', (99999,)) + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = self.stem(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.stages, x) + else: + x = self.stages(x) + x = self.final_conv(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _init_weights(module, name='', zero_init_last=False): + if isinstance(module, nn.Conv2d): + fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels + fan_out //= module.groups + module.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=0.01) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.BatchNorm2d): + nn.init.ones_(module.weight) + nn.init.zeros_(module.bias) + elif hasattr(module, 'init_weights'): + module.init_weights(zero_init_last=zero_init_last) + + +def _create_byobnet(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + ByobNet, variant, pretrained, + model_cfg=model_cfgs[variant], + feature_cfg=dict(flatten_sequential=True), + **kwargs) diff --git 
a/custom_timm/models/cait.py b/custom_timm/models/cait.py new file mode 100644 index 0000000000000000000000000000000000000000..3e8ec277e8fa8027b340872ccb7a6179479d4bee --- /dev/null +++ b/custom_timm/models/cait.py @@ -0,0 +1,421 @@ +""" Class-Attention in Image Transformers (CaiT) + +Paper: 'Going deeper with Image Transformers' - https://arxiv.org/abs/2103.17239 + +Original code and weights from https://github.com/facebookresearch/deit, copyright below + +Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman +""" +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. +from copy import deepcopy +from functools import partial + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, checkpoint_seq +from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_ +from .registry import register_model + + +__all__ = ['Cait', 'ClassAttn', 'LayerScaleBlockClassAttn', 'LayerScaleBlock', 'TalkingHeadAttn'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 384, 384), 'pool_size': None, + 'crop_pct': 1.0, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = dict( + cait_xxs24_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/XXS24_224.pth', + input_size=(3, 224, 224), + ), + cait_xxs24_384=_cfg( + url='https://dl.fbaipublicfiles.com/deit/XXS24_384.pth', + ), + cait_xxs36_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/XXS36_224.pth', + input_size=(3, 224, 224), + ), + cait_xxs36_384=_cfg( + url='https://dl.fbaipublicfiles.com/deit/XXS36_384.pth', + ), + cait_xs24_384=_cfg( + url='https://dl.fbaipublicfiles.com/deit/XS24_384.pth', + ), + cait_s24_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/S24_224.pth', + input_size=(3, 224, 224), + ), + cait_s24_384=_cfg( + url='https://dl.fbaipublicfiles.com/deit/S24_384.pth', + ), + cait_s36_384=_cfg( + url='https://dl.fbaipublicfiles.com/deit/S36_384.pth', + ), + cait_m36_384=_cfg( + url='https://dl.fbaipublicfiles.com/deit/M36_384.pth', + ), + cait_m48_448=_cfg( + url='https://dl.fbaipublicfiles.com/deit/M48_448.pth', + input_size=(3, 448, 448), + ), +) + + +class ClassAttn(nn.Module): + # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + # with slight modifications to do CA + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.k = nn.Linear(dim, dim, bias=qkv_bias) + self.v = nn.Linear(dim, dim, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + q = self.q(x[:, 0]).unsqueeze(1).reshape(B, 1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + k = self.k(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + q = q * self.scale + v = self.v(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + attn = (q @ k.transpose(-2, -1)) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x_cls = (attn @ v).transpose(1, 2).reshape(B, 1, C) + x_cls = self.proj(x_cls) + 
x_cls = self.proj_drop(x_cls) + + return x_cls + + +class LayerScaleBlockClassAttn(nn.Module): + # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + # with slight modifications to add CA and LayerScale + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, attn_block=ClassAttn, + mlp_block=Mlp, init_values=1e-4): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = attn_block( + dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = mlp_block(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + self.gamma_1 = nn.Parameter(init_values * torch.ones(dim)) + self.gamma_2 = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x, x_cls): + u = torch.cat((x_cls, x), dim=1) + x_cls = x_cls + self.drop_path(self.gamma_1 * self.attn(self.norm1(u))) + x_cls = x_cls + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x_cls))) + return x_cls + + +class TalkingHeadAttn(nn.Module): + # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + # with slight modifications to add Talking Heads Attention (https://arxiv.org/pdf/2003.02436v1.pdf) + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + + self.num_heads = num_heads + + head_dim = dim // num_heads + + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + + self.proj = nn.Linear(dim, dim) + + self.proj_l = nn.Linear(num_heads, num_heads) + self.proj_w = nn.Linear(num_heads, num_heads) + + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = (q @ k.transpose(-2, -1)) + + attn = self.proj_l(attn.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + + attn = attn.softmax(dim=-1) + + attn = self.proj_w(attn.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class LayerScaleBlock(nn.Module): + # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + # with slight modifications to add layerScale + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, attn_block=TalkingHeadAttn, + mlp_block=Mlp, init_values=1e-4): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = attn_block( + dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = mlp_block(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + self.gamma_1 = nn.Parameter(init_values * torch.ones(dim)) + self.gamma_2 = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x): + x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x))) + x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x))) + return x + + +class Cait(nn.Module): + # taken from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + # with slight modifications to adapt to our cait models + def __init__( + self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='token', + embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + block_layers=LayerScaleBlock, + block_layers_token=LayerScaleBlockClassAttn, + patch_layer=PatchEmbed, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU, + attn_block=TalkingHeadAttn, + mlp_block=Mlp, + init_values=1e-4, + attn_block_token_only=ClassAttn, + mlp_block_token_only=Mlp, + depth_token_only=2, + mlp_ratio_token_only=4.0 + ): + super().__init__() + assert global_pool in ('', 'token', 'avg') + + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = embed_dim + self.grad_checkpointing = False + + self.patch_embed = patch_layer( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [drop_path_rate for i in range(depth)] + self.blocks = nn.Sequential(*[ + block_layers( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + act_layer=act_layer, attn_block=attn_block, mlp_block=mlp_block, init_values=init_values) + for i in range(depth)]) + + self.blocks_token_only = nn.ModuleList([ + block_layers_token( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio_token_only, qkv_bias=qkv_bias, + drop=0.0, attn_drop=0.0, drop_path=0.0, norm_layer=norm_layer, + act_layer=act_layer, attn_block=attn_block_token_only, + mlp_block=mlp_block_token_only, init_values=init_values) + for i in range(depth_token_only)]) + + self.norm = norm_layer(embed_dim) + + self.feature_info = [dict(num_chs=embed_dim, reduction=0, module='head')] + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def group_matcher(self, coarse=False): + def _matcher(name): + if any([name.startswith(n) for n in ('cls_token', 'pos_embed', 'patch_embed')]): + return 
0 + elif name.startswith('blocks.'): + return int(name.split('.')[1]) + 1 + elif name.startswith('blocks_token_only.'): + # overlap token only blocks with last blocks + to_offset = len(self.blocks) - len(self.blocks_token_only) + 1 + return int(name.split('.')[1]) + to_offset + elif name.startswith('norm.'): + return len(self.blocks) + else: + return float('inf') + return _matcher + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'token', 'avg') + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + x = x + self.pos_embed + x = self.pos_drop(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + cls_tokens = self.cls_token.expand(x.shape[0], -1, -1) + for i, blk in enumerate(self.blocks_token_only): + cls_tokens = blk(x, cls_tokens) + x = torch.cat((cls_tokens, x), dim=1) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool: + x = x[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0] + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def checkpoint_filter_fn(state_dict, model=None): + if 'model' in state_dict: + state_dict = state_dict['model'] + checkpoint_no_module = {} + for k, v in state_dict.items(): + checkpoint_no_module[k.replace('module.', '')] = v + return checkpoint_no_module + + +def _create_cait(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + model = build_model_with_cfg( + Cait, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) + return model + + +@register_model +def cait_xxs24_224(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=192, depth=24, num_heads=4, init_values=1e-5, **kwargs) + model = _create_cait('cait_xxs24_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def cait_xxs24_384(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=192, depth=24, num_heads=4, init_values=1e-5, **kwargs) + model = _create_cait('cait_xxs24_384', pretrained=pretrained, **model_args) + return model + + +@register_model +def cait_xxs36_224(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=192, depth=36, num_heads=4, init_values=1e-5, **kwargs) + model = _create_cait('cait_xxs36_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def cait_xxs36_384(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=192, depth=36, num_heads=4, init_values=1e-5, **kwargs) + model = _create_cait('cait_xxs36_384', pretrained=pretrained, **model_args) + return model + + +@register_model +def cait_xs24_384(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=288, depth=24, num_heads=6, init_values=1e-5, **kwargs) + model = _create_cait('cait_xs24_384', pretrained=pretrained, **model_args) + return model + + +@register_model +def cait_s24_224(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=384, depth=24, num_heads=8, init_values=1e-5, 
**kwargs) + model = _create_cait('cait_s24_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def cait_s24_384(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=384, depth=24, num_heads=8, init_values=1e-5, **kwargs) + model = _create_cait('cait_s24_384', pretrained=pretrained, **model_args) + return model + + +@register_model +def cait_s36_384(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=384, depth=36, num_heads=8, init_values=1e-6, **kwargs) + model = _create_cait('cait_s36_384', pretrained=pretrained, **model_args) + return model + + +@register_model +def cait_m36_384(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=768, depth=36, num_heads=16, init_values=1e-6, **kwargs) + model = _create_cait('cait_m36_384', pretrained=pretrained, **model_args) + return model + + +@register_model +def cait_m48_448(pretrained=False, **kwargs): + model_args = dict(patch_size=16, embed_dim=768, depth=48, num_heads=16, init_values=1e-6, **kwargs) + model = _create_cait('cait_m48_448', pretrained=pretrained, **model_args) + return model diff --git a/custom_timm/models/coat.py b/custom_timm/models/coat.py new file mode 100644 index 0000000000000000000000000000000000000000..6af1bd8824141c9bfe4404970606d0d9def9ce6a --- /dev/null +++ b/custom_timm/models/coat.py @@ -0,0 +1,689 @@ +""" +CoaT architecture. + +Paper: Co-Scale Conv-Attentional Image Transformers - https://arxiv.org/abs/2104.06399 + +Official CoaT code at: https://github.com/mlpc-ucsd/CoaT + +Modified from custom_timm/models/vision_transformer.py +""" +from copy import deepcopy +from functools import partial +from typing import Tuple, List, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import PatchEmbed, Mlp, DropPath, to_2tuple, trunc_normal_ +from .registry import register_model +from .layers import _assert + + +__all__ = [ + "coat_tiny", + "coat_mini", + "coat_lite_tiny", + "coat_lite_mini", + "coat_lite_small" +] + + +def _cfg_coat(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed1.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + 'coat_tiny': _cfg_coat( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_tiny-473c2a20.pth' + ), + 'coat_mini': _cfg_coat( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_mini-2c6baf49.pth' + ), + 'coat_lite_tiny': _cfg_coat( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_tiny-461b07a7.pth' + ), + 'coat_lite_mini': _cfg_coat( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_mini-d7842000.pth' + ), + 'coat_lite_small': _cfg_coat( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-coat-weights/coat_lite_small-fea1d5a1.pth' + ), +} + + +class ConvRelPosEnc(nn.Module): + """ Convolutional relative position encoding. """ + def __init__(self, Ch, h, window): + """ + Initialization. + Ch: Channels per head. + h: Number of heads. 
+ window: Window size(s) in convolutional relative positional encoding. It can have two forms: + 1. An integer of window size, which assigns all attention heads with the same window s + size in ConvRelPosEnc. + 2. A dict mapping window size to #attention head splits ( + e.g. {window size 1: #attention head split 1, window size 2: #attention head split 2}) + It will apply different window size to the attention head splits. + """ + super().__init__() + + if isinstance(window, int): + # Set the same window size for all attention heads. + window = {window: h} + self.window = window + elif isinstance(window, dict): + self.window = window + else: + raise ValueError() + + self.conv_list = nn.ModuleList() + self.head_splits = [] + for cur_window, cur_head_split in window.items(): + dilation = 1 + # Determine padding size. + # Ref: https://discuss.pytorch.org/t/how-to-keep-the-shape-of-input-and-output-same-when-dilation-conv/14338 + padding_size = (cur_window + (cur_window - 1) * (dilation - 1)) // 2 + cur_conv = nn.Conv2d(cur_head_split*Ch, cur_head_split*Ch, + kernel_size=(cur_window, cur_window), + padding=(padding_size, padding_size), + dilation=(dilation, dilation), + groups=cur_head_split*Ch, + ) + self.conv_list.append(cur_conv) + self.head_splits.append(cur_head_split) + self.channel_splits = [x*Ch for x in self.head_splits] + + def forward(self, q, v, size: Tuple[int, int]): + B, h, N, Ch = q.shape + H, W = size + _assert(N == 1 + H * W, '') + + # Convolutional relative position encoding. + q_img = q[:, :, 1:, :] # [B, h, H*W, Ch] + v_img = v[:, :, 1:, :] # [B, h, H*W, Ch] + + v_img = v_img.transpose(-1, -2).reshape(B, h * Ch, H, W) + v_img_list = torch.split(v_img, self.channel_splits, dim=1) # Split according to channels + conv_v_img_list = [] + for i, conv in enumerate(self.conv_list): + conv_v_img_list.append(conv(v_img_list[i])) + conv_v_img = torch.cat(conv_v_img_list, dim=1) + conv_v_img = conv_v_img.reshape(B, h, Ch, H * W).transpose(-1, -2) + + EV_hat = q_img * conv_v_img + EV_hat = F.pad(EV_hat, (0, 0, 1, 0, 0, 0)) # [B, h, N, Ch]. + return EV_hat + + +class FactorAttnConvRelPosEnc(nn.Module): + """ Factorized attention with convolutional relative position encoding class. """ + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., shared_crpe=None): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) # Note: attn_drop is actually not used. + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + # Shared convolutional relative position encoding. + self.crpe = shared_crpe + + def forward(self, x, size: Tuple[int, int]): + B, N, C = x.shape + + # Generate Q, K, V. + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] # [B, h, N, Ch] + + # Factorized attention. + k_softmax = k.softmax(dim=2) + factor_att = k_softmax.transpose(-1, -2) @ v + factor_att = q @ factor_att + + # Convolutional relative position encoding. + crpe = self.crpe(q, v, size=size) # [B, h, N, Ch] + + # Merge and reshape. + x = self.scale * factor_att + crpe + x = x.transpose(1, 2).reshape(B, N, C) # [B, h, N, Ch] -> [B, N, h, Ch] -> [B, N, C] + + # Output projection. + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class ConvPosEnc(nn.Module): + """ Convolutional Position Encoding. 
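+        A depthwise k x k convolution (with a residual connection) over the spatial tokens provides the
+        positional signal; the CLS token bypasses the convolution and is concatenated back unchanged.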
+ Note: This module is similar to the conditional position encoding in CPVT. + """ + def __init__(self, dim, k=3): + super(ConvPosEnc, self).__init__() + self.proj = nn.Conv2d(dim, dim, k, 1, k//2, groups=dim) + + def forward(self, x, size: Tuple[int, int]): + B, N, C = x.shape + H, W = size + _assert(N == 1 + H * W, '') + + # Extract CLS token and image tokens. + cls_token, img_tokens = x[:, :1], x[:, 1:] # [B, 1, C], [B, H*W, C] + + # Depthwise convolution. + feat = img_tokens.transpose(1, 2).view(B, C, H, W) + x = self.proj(feat) + feat + x = x.flatten(2).transpose(1, 2) + + # Combine with CLS token. + x = torch.cat((cls_token, x), dim=1) + + return x + + +class SerialBlock(nn.Module): + """ Serial block class. + Note: In this implementation, each serial block only contains a conv-attention and a FFN (MLP) module. """ + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, shared_cpe=None, shared_crpe=None): + super().__init__() + + # Conv-Attention. + self.cpe = shared_cpe + + self.norm1 = norm_layer(dim) + self.factoratt_crpe = FactorAttnConvRelPosEnc( + dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, shared_crpe=shared_crpe) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + # MLP. + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x, size: Tuple[int, int]): + # Conv-Attention. + x = self.cpe(x, size) + cur = self.norm1(x) + cur = self.factoratt_crpe(cur, size) + x = x + self.drop_path(cur) + + # MLP. + cur = self.norm2(x) + cur = self.mlp(cur) + x = x + self.drop_path(cur) + + return x + + +class ParallelBlock(nn.Module): + """ Parallel block class. """ + def __init__(self, dims, num_heads, mlp_ratios=[], qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, shared_crpes=None): + super().__init__() + + # Conv-Attention. + self.norm12 = norm_layer(dims[1]) + self.norm13 = norm_layer(dims[2]) + self.norm14 = norm_layer(dims[3]) + self.factoratt_crpe2 = FactorAttnConvRelPosEnc( + dims[1], num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, + shared_crpe=shared_crpes[1] + ) + self.factoratt_crpe3 = FactorAttnConvRelPosEnc( + dims[2], num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, + shared_crpe=shared_crpes[2] + ) + self.factoratt_crpe4 = FactorAttnConvRelPosEnc( + dims[3], num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, + shared_crpe=shared_crpes[3] + ) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + # MLP. + self.norm22 = norm_layer(dims[1]) + self.norm23 = norm_layer(dims[2]) + self.norm24 = norm_layer(dims[3]) + # In parallel block, we assume dimensions are the same and share the linear transformation. + assert dims[1] == dims[2] == dims[3] + assert mlp_ratios[1] == mlp_ratios[2] == mlp_ratios[3] + mlp_hidden_dim = int(dims[1] * mlp_ratios[1]) + self.mlp2 = self.mlp3 = self.mlp4 = Mlp( + in_features=dims[1], hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def upsample(self, x, factor: float, size: Tuple[int, int]): + """ Feature map up-sampling. """ + return self.interpolate(x, scale_factor=factor, size=size) + + def downsample(self, x, factor: float, size: Tuple[int, int]): + """ Feature map down-sampling. 
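+        Down-sampling by `factor` is performed as bilinear interpolation with scale_factor = 1 / factor;
+        the CLS token is carried through unchanged by `interpolate`.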
""" + return self.interpolate(x, scale_factor=1.0/factor, size=size) + + def interpolate(self, x, scale_factor: float, size: Tuple[int, int]): + """ Feature map interpolation. """ + B, N, C = x.shape + H, W = size + _assert(N == 1 + H * W, '') + + cls_token = x[:, :1, :] + img_tokens = x[:, 1:, :] + + img_tokens = img_tokens.transpose(1, 2).reshape(B, C, H, W) + img_tokens = F.interpolate( + img_tokens, scale_factor=scale_factor, recompute_scale_factor=False, mode='bilinear', align_corners=False) + img_tokens = img_tokens.reshape(B, C, -1).transpose(1, 2) + + out = torch.cat((cls_token, img_tokens), dim=1) + + return out + + def forward(self, x1, x2, x3, x4, sizes: List[Tuple[int, int]]): + _, S2, S3, S4 = sizes + cur2 = self.norm12(x2) + cur3 = self.norm13(x3) + cur4 = self.norm14(x4) + cur2 = self.factoratt_crpe2(cur2, size=S2) + cur3 = self.factoratt_crpe3(cur3, size=S3) + cur4 = self.factoratt_crpe4(cur4, size=S4) + upsample3_2 = self.upsample(cur3, factor=2., size=S3) + upsample4_3 = self.upsample(cur4, factor=2., size=S4) + upsample4_2 = self.upsample(cur4, factor=4., size=S4) + downsample2_3 = self.downsample(cur2, factor=2., size=S2) + downsample3_4 = self.downsample(cur3, factor=2., size=S3) + downsample2_4 = self.downsample(cur2, factor=4., size=S2) + cur2 = cur2 + upsample3_2 + upsample4_2 + cur3 = cur3 + upsample4_3 + downsample2_3 + cur4 = cur4 + downsample3_4 + downsample2_4 + x2 = x2 + self.drop_path(cur2) + x3 = x3 + self.drop_path(cur3) + x4 = x4 + self.drop_path(cur4) + + # MLP. + cur2 = self.norm22(x2) + cur3 = self.norm23(x3) + cur4 = self.norm24(x4) + cur2 = self.mlp2(cur2) + cur3 = self.mlp3(cur3) + cur4 = self.mlp4(cur4) + x2 = x2 + self.drop_path(cur2) + x3 = x3 + self.drop_path(cur3) + x4 = x4 + self.drop_path(cur4) + + return x1, x2, x3, x4 + + +class CoaT(nn.Module): + """ CoaT class. """ + def __init__( + self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dims=(0, 0, 0, 0), + serial_depths=(0, 0, 0, 0), parallel_depth=0, num_heads=0, mlp_ratios=(0, 0, 0, 0), qkv_bias=True, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=partial(nn.LayerNorm, eps=1e-6), + return_interm_layers=False, out_features=None, crpe_window=None, global_pool='token'): + super().__init__() + assert global_pool in ('token', 'avg') + crpe_window = crpe_window or {3: 2, 5: 3, 7: 3} + self.return_interm_layers = return_interm_layers + self.out_features = out_features + self.embed_dims = embed_dims + self.num_features = embed_dims[-1] + self.num_classes = num_classes + self.global_pool = global_pool + + # Patch embeddings. + img_size = to_2tuple(img_size) + self.patch_embed1 = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, + embed_dim=embed_dims[0], norm_layer=nn.LayerNorm) + self.patch_embed2 = PatchEmbed( + img_size=[x // 4 for x in img_size], patch_size=2, in_chans=embed_dims[0], + embed_dim=embed_dims[1], norm_layer=nn.LayerNorm) + self.patch_embed3 = PatchEmbed( + img_size=[x // 8 for x in img_size], patch_size=2, in_chans=embed_dims[1], + embed_dim=embed_dims[2], norm_layer=nn.LayerNorm) + self.patch_embed4 = PatchEmbed( + img_size=[x // 16 for x in img_size], patch_size=2, in_chans=embed_dims[2], + embed_dim=embed_dims[3], norm_layer=nn.LayerNorm) + + # Class tokens. 
+ self.cls_token1 = nn.Parameter(torch.zeros(1, 1, embed_dims[0])) + self.cls_token2 = nn.Parameter(torch.zeros(1, 1, embed_dims[1])) + self.cls_token3 = nn.Parameter(torch.zeros(1, 1, embed_dims[2])) + self.cls_token4 = nn.Parameter(torch.zeros(1, 1, embed_dims[3])) + + # Convolutional position encodings. + self.cpe1 = ConvPosEnc(dim=embed_dims[0], k=3) + self.cpe2 = ConvPosEnc(dim=embed_dims[1], k=3) + self.cpe3 = ConvPosEnc(dim=embed_dims[2], k=3) + self.cpe4 = ConvPosEnc(dim=embed_dims[3], k=3) + + # Convolutional relative position encodings. + self.crpe1 = ConvRelPosEnc(Ch=embed_dims[0] // num_heads, h=num_heads, window=crpe_window) + self.crpe2 = ConvRelPosEnc(Ch=embed_dims[1] // num_heads, h=num_heads, window=crpe_window) + self.crpe3 = ConvRelPosEnc(Ch=embed_dims[2] // num_heads, h=num_heads, window=crpe_window) + self.crpe4 = ConvRelPosEnc(Ch=embed_dims[3] // num_heads, h=num_heads, window=crpe_window) + + # Disable stochastic depth. + dpr = drop_path_rate + assert dpr == 0.0 + + # Serial blocks 1. + self.serial_blocks1 = nn.ModuleList([ + SerialBlock( + dim=embed_dims[0], num_heads=num_heads, mlp_ratio=mlp_ratios[0], qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer, + shared_cpe=self.cpe1, shared_crpe=self.crpe1 + ) + for _ in range(serial_depths[0])] + ) + + # Serial blocks 2. + self.serial_blocks2 = nn.ModuleList([ + SerialBlock( + dim=embed_dims[1], num_heads=num_heads, mlp_ratio=mlp_ratios[1], qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer, + shared_cpe=self.cpe2, shared_crpe=self.crpe2 + ) + for _ in range(serial_depths[1])] + ) + + # Serial blocks 3. + self.serial_blocks3 = nn.ModuleList([ + SerialBlock( + dim=embed_dims[2], num_heads=num_heads, mlp_ratio=mlp_ratios[2], qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer, + shared_cpe=self.cpe3, shared_crpe=self.crpe3 + ) + for _ in range(serial_depths[2])] + ) + + # Serial blocks 4. + self.serial_blocks4 = nn.ModuleList([ + SerialBlock( + dim=embed_dims[3], num_heads=num_heads, mlp_ratio=mlp_ratios[3], qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer, + shared_cpe=self.cpe4, shared_crpe=self.crpe4 + ) + for _ in range(serial_depths[3])] + ) + + # Parallel blocks. + self.parallel_depth = parallel_depth + if self.parallel_depth > 0: + self.parallel_blocks = nn.ModuleList([ + ParallelBlock( + dims=embed_dims, num_heads=num_heads, mlp_ratios=mlp_ratios, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr, norm_layer=norm_layer, + shared_crpes=(self.crpe1, self.crpe2, self.crpe3, self.crpe4) + ) + for _ in range(parallel_depth)] + ) + else: + self.parallel_blocks = None + + # Classification head(s). + if not self.return_interm_layers: + if self.parallel_blocks is not None: + self.norm2 = norm_layer(embed_dims[1]) + self.norm3 = norm_layer(embed_dims[2]) + else: + self.norm2 = self.norm3 = None + self.norm4 = norm_layer(embed_dims[3]) + + if self.parallel_depth > 0: + # CoaT series: Aggregate features of last three scales for classification. + assert embed_dims[1] == embed_dims[2] == embed_dims[3] + self.aggregate = torch.nn.Conv1d(in_channels=3, out_channels=1, kernel_size=1) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + else: + # CoaT-Lite series: Use feature of last scale for classification. 
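+            # No aggregation is needed here: the classifier head consumes only the final-stage
+            # CLS token (or the mean of its spatial tokens when global_pool == 'avg').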
+ self.aggregate = None + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + # Initialize weights. + trunc_normal_(self.cls_token1, std=.02) + trunc_normal_(self.cls_token2, std=.02) + trunc_normal_(self.cls_token3, std=.02) + trunc_normal_(self.cls_token4, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token1', 'cls_token2', 'cls_token3', 'cls_token4'} + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem1=r'^cls_token1|patch_embed1|crpe1|cpe1', + serial_blocks1=r'^serial_blocks1\.(\d+)', + stem2=r'^cls_token2|patch_embed2|crpe2|cpe2', + serial_blocks2=r'^serial_blocks2\.(\d+)', + stem3=r'^cls_token3|patch_embed3|crpe3|cpe3', + serial_blocks3=r'^serial_blocks3\.(\d+)', + stem4=r'^cls_token4|patch_embed4|crpe4|cpe4', + serial_blocks4=r'^serial_blocks4\.(\d+)', + parallel_blocks=[ # FIXME (partially?) overlap parallel w/ serial blocks?? + (r'^parallel_blocks\.(\d+)', None), + (r'^norm|aggregate', (99999,)), + ] + ) + return matcher + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('token', 'avg') + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x0): + B = x0.shape[0] + + # Serial blocks 1. + x1 = self.patch_embed1(x0) + H1, W1 = self.patch_embed1.grid_size + x1 = insert_cls(x1, self.cls_token1) + for blk in self.serial_blocks1: + x1 = blk(x1, size=(H1, W1)) + x1_nocls = remove_cls(x1).reshape(B, H1, W1, -1).permute(0, 3, 1, 2).contiguous() + + # Serial blocks 2. + x2 = self.patch_embed2(x1_nocls) + H2, W2 = self.patch_embed2.grid_size + x2 = insert_cls(x2, self.cls_token2) + for blk in self.serial_blocks2: + x2 = blk(x2, size=(H2, W2)) + x2_nocls = remove_cls(x2).reshape(B, H2, W2, -1).permute(0, 3, 1, 2).contiguous() + + # Serial blocks 3. + x3 = self.patch_embed3(x2_nocls) + H3, W3 = self.patch_embed3.grid_size + x3 = insert_cls(x3, self.cls_token3) + for blk in self.serial_blocks3: + x3 = blk(x3, size=(H3, W3)) + x3_nocls = remove_cls(x3).reshape(B, H3, W3, -1).permute(0, 3, 1, 2).contiguous() + + # Serial blocks 4. + x4 = self.patch_embed4(x3_nocls) + H4, W4 = self.patch_embed4.grid_size + x4 = insert_cls(x4, self.cls_token4) + for blk in self.serial_blocks4: + x4 = blk(x4, size=(H4, W4)) + x4_nocls = remove_cls(x4).reshape(B, H4, W4, -1).permute(0, 3, 1, 2).contiguous() + + # Only serial blocks: Early return. + if self.parallel_blocks is None: + if not torch.jit.is_scripting() and self.return_interm_layers: + # Return intermediate features for down-stream tasks (e.g. Deformable DETR and Detectron2). 
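+                # Only the feature maps requested via `out_features` are returned, as NCHW tensors
+                # with the CLS token removed.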
+ feat_out = {} + if 'x1_nocls' in self.out_features: + feat_out['x1_nocls'] = x1_nocls + if 'x2_nocls' in self.out_features: + feat_out['x2_nocls'] = x2_nocls + if 'x3_nocls' in self.out_features: + feat_out['x3_nocls'] = x3_nocls + if 'x4_nocls' in self.out_features: + feat_out['x4_nocls'] = x4_nocls + return feat_out + else: + # Return features for classification. + x4 = self.norm4(x4) + return x4 + + # Parallel blocks. + for blk in self.parallel_blocks: + x2, x3, x4 = self.cpe2(x2, (H2, W2)), self.cpe3(x3, (H3, W3)), self.cpe4(x4, (H4, W4)) + x1, x2, x3, x4 = blk(x1, x2, x3, x4, sizes=[(H1, W1), (H2, W2), (H3, W3), (H4, W4)]) + + if not torch.jit.is_scripting() and self.return_interm_layers: + # Return intermediate features for down-stream tasks (e.g. Deformable DETR and Detectron2). + feat_out = {} + if 'x1_nocls' in self.out_features: + x1_nocls = remove_cls(x1).reshape(B, H1, W1, -1).permute(0, 3, 1, 2).contiguous() + feat_out['x1_nocls'] = x1_nocls + if 'x2_nocls' in self.out_features: + x2_nocls = remove_cls(x2).reshape(B, H2, W2, -1).permute(0, 3, 1, 2).contiguous() + feat_out['x2_nocls'] = x2_nocls + if 'x3_nocls' in self.out_features: + x3_nocls = remove_cls(x3).reshape(B, H3, W3, -1).permute(0, 3, 1, 2).contiguous() + feat_out['x3_nocls'] = x3_nocls + if 'x4_nocls' in self.out_features: + x4_nocls = remove_cls(x4).reshape(B, H4, W4, -1).permute(0, 3, 1, 2).contiguous() + feat_out['x4_nocls'] = x4_nocls + return feat_out + else: + x2 = self.norm2(x2) + x3 = self.norm3(x3) + x4 = self.norm4(x4) + return [x2, x3, x4] + + def forward_head(self, x_feat: Union[torch.Tensor, List[torch.Tensor]], pre_logits: bool = False): + if isinstance(x_feat, list): + assert self.aggregate is not None + if self.global_pool == 'avg': + x = torch.cat([xl[:, 1:].mean(dim=1, keepdim=True) for xl in x_feat], dim=1) # [B, 3, C] + else: + x = torch.stack([xl[:, 0] for xl in x_feat], dim=1) # [B, 3, C] + x = self.aggregate(x).squeeze(dim=1) # Shape: [B, C] + else: + x = x_feat[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x_feat[:, 0] + return x if pre_logits else self.head(x) + + def forward(self, x) -> torch.Tensor: + if not torch.jit.is_scripting() and self.return_interm_layers: + # Return intermediate features (for down-stream tasks). + return self.forward_features(x) + else: + # Return features for classification. + x_feat = self.forward_features(x) + x = self.forward_head(x_feat) + return x + + +def insert_cls(x, cls_token): + """ Insert CLS token. """ + cls_tokens = cls_token.expand(x.shape[0], -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + return x + + +def remove_cls(x): + """ Remove CLS token. 
""" + return x[:, 1:, :] + + +def checkpoint_filter_fn(state_dict, model): + out_dict = {} + for k, v in state_dict.items(): + # original model had unused norm layers, removing them requires filtering pretrained checkpoints + if k.startswith('norm1') or \ + (model.norm2 is None and k.startswith('norm2')) or \ + (model.norm3 is None and k.startswith('norm3')): + continue + out_dict[k] = v + return out_dict + + +def _create_coat(variant, pretrained=False, default_cfg=None, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + model = build_model_with_cfg( + CoaT, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) + return model + + +@register_model +def coat_tiny(pretrained=False, **kwargs): + model_cfg = dict( + patch_size=4, embed_dims=[152, 152, 152, 152], serial_depths=[2, 2, 2, 2], parallel_depth=6, + num_heads=8, mlp_ratios=[4, 4, 4, 4], **kwargs) + model = _create_coat('coat_tiny', pretrained=pretrained, **model_cfg) + return model + + +@register_model +def coat_mini(pretrained=False, **kwargs): + model_cfg = dict( + patch_size=4, embed_dims=[152, 216, 216, 216], serial_depths=[2, 2, 2, 2], parallel_depth=6, + num_heads=8, mlp_ratios=[4, 4, 4, 4], **kwargs) + model = _create_coat('coat_mini', pretrained=pretrained, **model_cfg) + return model + + +@register_model +def coat_lite_tiny(pretrained=False, **kwargs): + model_cfg = dict( + patch_size=4, embed_dims=[64, 128, 256, 320], serial_depths=[2, 2, 2, 2], parallel_depth=0, + num_heads=8, mlp_ratios=[8, 8, 4, 4], **kwargs) + model = _create_coat('coat_lite_tiny', pretrained=pretrained, **model_cfg) + return model + + +@register_model +def coat_lite_mini(pretrained=False, **kwargs): + model_cfg = dict( + patch_size=4, embed_dims=[64, 128, 320, 512], serial_depths=[2, 2, 2, 2], parallel_depth=0, + num_heads=8, mlp_ratios=[8, 8, 4, 4], **kwargs) + model = _create_coat('coat_lite_mini', pretrained=pretrained, **model_cfg) + return model + + +@register_model +def coat_lite_small(pretrained=False, **kwargs): + model_cfg = dict( + patch_size=4, embed_dims=[64, 128, 320, 512], serial_depths=[3, 4, 6, 3], parallel_depth=0, + num_heads=8, mlp_ratios=[8, 8, 4, 4], **kwargs) + model = _create_coat('coat_lite_small', pretrained=pretrained, **model_cfg) + return model \ No newline at end of file diff --git a/custom_timm/models/convit.py b/custom_timm/models/convit.py new file mode 100644 index 0000000000000000000000000000000000000000..b23e1c5504cfb12a47a651e45eb7ffd488e32acb --- /dev/null +++ b/custom_timm/models/convit.py @@ -0,0 +1,369 @@ +""" ConViT Model + +@article{d2021convit, + title={ConViT: Improving Vision Transformers with Soft Convolutional Inductive Biases}, + author={d'Ascoli, St{\'e}phane and Touvron, Hugo and Leavitt, Matthew and Morcos, Ari and Biroli, Giulio and Sagun, Levent}, + journal={arXiv preprint arXiv:2103.10697}, + year={2021} +} + +Paper link: https://arxiv.org/abs/2103.10697 +Original code: https://github.com/facebookresearch/convit, original copyright below + +Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman +""" +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. +# +# This source code is licensed under the CC-by-NC license found in the +# LICENSE file in the root directory of this source tree. 
+# +'''These modules are adapted from those of timm, see +https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +''' + +import torch +import torch.nn as nn +from functools import partial +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import DropPath, to_2tuple, trunc_normal_, PatchEmbed, Mlp +from .registry import register_model +from .vision_transformer_hybrid import HybridEmbed +from .fx_features import register_notrace_module + +import torch +import torch.nn as nn + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, 'fixed_input_size': True, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # ConViT + 'convit_tiny': _cfg( + url="https://dl.fbaipublicfiles.com/convit/convit_tiny.pth"), + 'convit_small': _cfg( + url="https://dl.fbaipublicfiles.com/convit/convit_small.pth"), + 'convit_base': _cfg( + url="https://dl.fbaipublicfiles.com/convit/convit_base.pth") +} + + +@register_notrace_module # reason: FX can't symbolically trace control flow in forward method +class GPSA(nn.Module): + def __init__( + self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., locality_strength=1.): + super().__init__() + self.num_heads = num_heads + self.dim = dim + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + self.locality_strength = locality_strength + + self.qk = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.v = nn.Linear(dim, dim, bias=qkv_bias) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.pos_proj = nn.Linear(3, num_heads) + self.proj_drop = nn.Dropout(proj_drop) + self.gating_param = nn.Parameter(torch.ones(self.num_heads)) + self.rel_indices: torch.Tensor = torch.zeros(1, 1, 1, 3) # silly torchscript hack, won't work with None + + def forward(self, x): + B, N, C = x.shape + if self.rel_indices is None or self.rel_indices.shape[1] != N: + self.rel_indices = self.get_rel_indices(N) + attn = self.get_attention(x) + v = self.v(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def get_attention(self, x): + B, N, C = x.shape + qk = self.qk(x).reshape(B, N, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k = qk[0], qk[1] + pos_score = self.rel_indices.expand(B, -1, -1, -1) + pos_score = self.pos_proj(pos_score).permute(0, 3, 1, 2) + patch_score = (q @ k.transpose(-2, -1)) * self.scale + patch_score = patch_score.softmax(dim=-1) + pos_score = pos_score.softmax(dim=-1) + + gating = self.gating_param.view(1, -1, 1, 1) + attn = (1. 
- torch.sigmoid(gating)) * patch_score + torch.sigmoid(gating) * pos_score + attn /= attn.sum(dim=-1).unsqueeze(-1) + attn = self.attn_drop(attn) + return attn + + def get_attention_map(self, x, return_map=False): + attn_map = self.get_attention(x).mean(0) # average over batch + distances = self.rel_indices.squeeze()[:, :, -1] ** .5 + dist = torch.einsum('nm,hnm->h', (distances, attn_map)) / distances.size(0) + if return_map: + return dist, attn_map + else: + return dist + + def local_init(self): + self.v.weight.data.copy_(torch.eye(self.dim)) + locality_distance = 1 # max(1,1/locality_strength**.5) + + kernel_size = int(self.num_heads ** .5) + center = (kernel_size - 1) / 2 if kernel_size % 2 == 0 else kernel_size // 2 + for h1 in range(kernel_size): + for h2 in range(kernel_size): + position = h1 + kernel_size * h2 + self.pos_proj.weight.data[position, 2] = -1 + self.pos_proj.weight.data[position, 1] = 2 * (h1 - center) * locality_distance + self.pos_proj.weight.data[position, 0] = 2 * (h2 - center) * locality_distance + self.pos_proj.weight.data *= self.locality_strength + + def get_rel_indices(self, num_patches: int) -> torch.Tensor: + img_size = int(num_patches ** .5) + rel_indices = torch.zeros(1, num_patches, num_patches, 3) + ind = torch.arange(img_size).view(1, -1) - torch.arange(img_size).view(-1, 1) + indx = ind.repeat(img_size, img_size) + indy = ind.repeat_interleave(img_size, dim=0).repeat_interleave(img_size, dim=1) + indd = indx ** 2 + indy ** 2 + rel_indices[:, :, :, 2] = indd.unsqueeze(0) + rel_indices[:, :, :, 1] = indy.unsqueeze(0) + rel_indices[:, :, :, 0] = indx.unsqueeze(0) + device = self.qk.weight.device + return rel_indices.to(device) + + +class MHSA(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def get_attention_map(self, x, return_map=False): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv[0], qkv[1], qkv[2] + attn_map = (q @ k.transpose(-2, -1)) * self.scale + attn_map = attn_map.softmax(dim=-1).mean(0) + + img_size = int(N ** .5) + ind = torch.arange(img_size).view(1, -1) - torch.arange(img_size).view(-1, 1) + indx = ind.repeat(img_size, img_size) + indy = ind.repeat_interleave(img_size, dim=0).repeat_interleave(img_size, dim=1) + indd = indx ** 2 + indy ** 2 + distances = indd ** .5 + distances = distances.to(x.device) + + dist = torch.einsum('nm,hnm->h', (distances, attn_map)) / N + if return_map: + return dist, attn_map + else: + return dist + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, use_gpsa=True, **kwargs): + super().__init__() + self.norm1 = norm_layer(dim) + self.use_gpsa = use_gpsa + if self.use_gpsa: + self.attn = GPSA( + 
dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, **kwargs) + else: + self.attn = MHSA(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class ConViT(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__( + self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='token', + embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=False, drop_rate=0., attn_drop_rate=0., + drop_path_rate=0., hybrid_backbone=None, norm_layer=nn.LayerNorm, + local_up_to_layer=3, locality_strength=1., use_pos_embed=True): + super().__init__() + assert global_pool in ('', 'avg', 'token') + embed_dim *= num_heads + self.num_classes = num_classes + self.global_pool = global_pool + self.local_up_to_layer = local_up_to_layer + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.locality_strength = locality_strength + self.use_pos_embed = use_pos_embed + + if hybrid_backbone is not None: + self.patch_embed = HybridEmbed( + hybrid_backbone, img_size=img_size, in_chans=in_chans, embed_dim=embed_dim) + else: + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + num_patches = self.patch_embed.num_patches + self.num_patches = num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.pos_drop = nn.Dropout(p=drop_rate) + + if self.use_pos_embed: + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.pos_embed, std=.02) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + use_gpsa=True, + locality_strength=locality_strength) + if i < local_up_to_layer else + Block( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + use_gpsa=False) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) + + # Classifier head + self.feature_info = [dict(num_chs=embed_dim, reduction=0, module='head')] + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + for n, m in self.named_modules(): + if hasattr(m, 'local_init'): + m.local_init() + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^cls_token|pos_embed|patch_embed', # stem and embed + blocks=[(r'^blocks\.(\d+)', None), 
(r'^norm', (99999,))] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'token', 'avg') + self.global_pool = global_pool + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + if self.use_pos_embed: + x = x + self.pos_embed + x = self.pos_drop(x) + cls_tokens = self.cls_token.expand(x.shape[0], -1, -1) + for u, blk in enumerate(self.blocks): + if u == self.local_up_to_layer: + x = torch.cat((cls_tokens, x), dim=1) + x = blk(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool: + x = x[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0] + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_convit(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + return build_model_with_cfg(ConViT, variant, pretrained, **kwargs) + + +@register_model +def convit_tiny(pretrained=False, **kwargs): + model_args = dict( + local_up_to_layer=10, locality_strength=1.0, embed_dim=48, + num_heads=4, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model = _create_convit(variant='convit_tiny', pretrained=pretrained, **model_args) + return model + + +@register_model +def convit_small(pretrained=False, **kwargs): + model_args = dict( + local_up_to_layer=10, locality_strength=1.0, embed_dim=48, + num_heads=9, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model = _create_convit(variant='convit_small', pretrained=pretrained, **model_args) + return model + + +@register_model +def convit_base(pretrained=False, **kwargs): + model_args = dict( + local_up_to_layer=10, locality_strength=1.0, embed_dim=48, + num_heads=16, norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + model = _create_convit(variant='convit_base', pretrained=pretrained, **model_args) + return model diff --git a/custom_timm/models/convmixer.py b/custom_timm/models/convmixer.py new file mode 100644 index 0000000000000000000000000000000000000000..e2140241a2af7f6e7a7427d9fc926e9b71c233b0 --- /dev/null +++ b/custom_timm/models/convmixer.py @@ -0,0 +1,125 @@ +""" ConvMixer + +""" +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from custom_timm.models.registry import register_model +from .helpers import build_model_with_cfg, checkpoint_seq +from .layers import SelectAdaptivePool2d + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .96, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, 'classifier': 'head', + 'first_conv': 'stem.0', + **kwargs + } + + +default_cfgs = { + 'convmixer_1536_20': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_1536_20_ks9_p7.pth.tar'), + 'convmixer_768_32': _cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_768_32_ks7_p7_relu.pth.tar'), + 'convmixer_1024_20_ks9_p14': 
_cfg(url='https://github.com/tmp-iclr/convmixer/releases/download/timm-v1.0/convmixer_1024_20_ks9_p14.pth.tar') +} + + +class Residual(nn.Module): + def __init__(self, fn): + super().__init__() + self.fn = fn + + def forward(self, x): + return self.fn(x) + x + + +class ConvMixer(nn.Module): + def __init__( + self, dim, depth, kernel_size=9, patch_size=7, in_chans=3, num_classes=1000, global_pool='avg', + act_layer=nn.GELU, **kwargs): + super().__init__() + self.num_classes = num_classes + self.num_features = dim + self.grad_checkpointing = False + + self.stem = nn.Sequential( + nn.Conv2d(in_chans, dim, kernel_size=patch_size, stride=patch_size), + act_layer(), + nn.BatchNorm2d(dim) + ) + self.blocks = nn.Sequential( + *[nn.Sequential( + Residual(nn.Sequential( + nn.Conv2d(dim, dim, kernel_size, groups=dim, padding="same"), + act_layer(), + nn.BatchNorm2d(dim) + )), + nn.Conv2d(dim, dim, kernel_size=1), + act_layer(), + nn.BatchNorm2d(dim) + ) for i in range(depth)] + ) + self.pooling = SelectAdaptivePool2d(pool_type=global_pool, flatten=True) + self.head = nn.Linear(dim, num_classes) if num_classes > 0 else nn.Identity() + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict(stem=r'^stem', blocks=r'^blocks\.(\d+)') + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + self.pooling = SelectAdaptivePool2d(pool_type=global_pool, flatten=True) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.stem(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.pooling(x) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_convmixer(variant, pretrained=False, **kwargs): + return build_model_with_cfg(ConvMixer, variant, pretrained, **kwargs) + + +@register_model +def convmixer_1536_20(pretrained=False, **kwargs): + model_args = dict(dim=1536, depth=20, kernel_size=9, patch_size=7, **kwargs) + return _create_convmixer('convmixer_1536_20', pretrained, **model_args) + + +@register_model +def convmixer_768_32(pretrained=False, **kwargs): + model_args = dict(dim=768, depth=32, kernel_size=7, patch_size=7, act_layer=nn.ReLU, **kwargs) + return _create_convmixer('convmixer_768_32', pretrained, **model_args) + + +@register_model +def convmixer_1024_20_ks9_p14(pretrained=False, **kwargs): + model_args = dict(dim=1024, depth=20, kernel_size=9, patch_size=14, **kwargs) + return _create_convmixer('convmixer_1024_20_ks9_p14', pretrained, **model_args) \ No newline at end of file diff --git a/custom_timm/models/convnext.py b/custom_timm/models/convnext.py new file mode 100644 index 0000000000000000000000000000000000000000..f76d972236dbae1a8df24d70ee35f05f6207f815 --- /dev/null +++ b/custom_timm/models/convnext.py @@ -0,0 +1,673 @@ +""" ConvNeXt + +Paper: `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf + +Original code and weights from https://github.com/facebookresearch/ConvNeXt, original copyright below + +Model defs atto, femto, pico, nano and _ols / _hnf variants are 
timm specific. + +Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman +""" +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# This source code is licensed under the MIT license +from collections import OrderedDict +from functools import partial + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import named_apply, build_model_with_cfg, checkpoint_seq +from .layers import trunc_normal_, SelectAdaptivePool2d, DropPath, ConvMlp, Mlp, LayerNorm2d, LayerNorm, \ + create_conv2d, get_act_layer, make_divisible, to_ntuple +from .registry import register_model + + +__all__ = ['ConvNeXt'] # model_registry will add each entrypoint fn to this + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.0', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = dict( + # timm specific variants + convnext_atto=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_d2-01bb0f51.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + convnext_atto_ols=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_atto_ols_a2-78d1c8f3.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + convnext_femto=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_d1-d71d5b4c.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + convnext_femto_ols=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_femto_ols_d1-246bf2ed.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + convnext_pico=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_d1-10ad7f0d.pth', + test_input_size=(3, 288, 288), test_crop_pct=0.95), + convnext_pico_ols=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_pico_ols_d1-611f0ca7.pth', + crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_nano=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_d1h-7eb4bdea.pth', + crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_nano_ols=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_nano_ols_d1h-ae424a9a.pth', + crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_tiny_hnf=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/convnext_tiny_hnf_a2h-ab7e9df2.pth', + crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), + + convnext_tiny=_cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_1k_224_ema.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_small=_cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_small_1k_224_ema.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_base=_cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_base_1k_224_ema.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_large=_cfg( + 
url="https://dl.fbaipublicfiles.com/convnext/convnext_large_1k_224_ema.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + + convnext_tiny_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_224.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_small_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_224.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_base_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_224.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_large_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_224.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + convnext_xlarge_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_224_ema.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + + convnext_tiny_384_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_1k_384.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), + convnext_small_384_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_1k_384.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), + convnext_base_384_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_1k_384.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), + convnext_large_384_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_1k_384.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), + convnext_xlarge_384_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_1k_384_ema.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), + + convnext_tiny_in22k=_cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_tiny_22k_224.pth", num_classes=21841), + convnext_small_in22k=_cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_small_22k_224.pth", num_classes=21841), + convnext_base_in22k=_cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_base_22k_224.pth", num_classes=21841), + convnext_large_in22k=_cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_large_22k_224.pth", num_classes=21841), + convnext_xlarge_in22k=_cfg( + url="https://dl.fbaipublicfiles.com/convnext/convnext_xlarge_22k_224.pth", num_classes=21841), +) + + +class ConvNeXtBlock(nn.Module): + """ ConvNeXt Block + There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + + Unlike the official impl, this one allows choice of 1 or 2, 1x1 conv can be faster with appropriate + choice of LayerNorm impl, however as model size increases the tradeoffs appear to change and nn.Linear + is a better choice. This was observed with PyTorch 1.10 on 3090 GPU, it could change over time & w/ different HW. + + Args: + in_chs (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + ls_init_value (float): Init value for Layer Scale. Default: 1e-6. 
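+        out_chs (int): Number of output channels. Defaults to in_chs.
+        kernel_size (int): Depthwise convolution kernel size. Default: 7
+        stride (int): Stride of the depthwise convolution. Default: 1
+        dilation (int): Dilation of the depthwise convolution. Default: 1
+        mlp_ratio (float): Expansion ratio for the MLP hidden dimension. Default: 4
+        conv_mlp (bool): Use the channels-first 1x1-conv MLP variant (implementation 1 above). Default: False
+        conv_bias (bool): Use a bias term in convolution layers. Default: True
+        act_layer (str): Activation layer. Default: 'gelu'
+        norm_layer: Normalization layer. Defaults to LayerNorm2d (conv_mlp=True) or LayerNorm (conv_mlp=False).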
+ """ + + def __init__( + self, + in_chs, + out_chs=None, + kernel_size=7, + stride=1, + dilation=1, + mlp_ratio=4, + conv_mlp=False, + conv_bias=True, + ls_init_value=1e-6, + act_layer='gelu', + norm_layer=None, + drop_path=0., + ): + super().__init__() + out_chs = out_chs or in_chs + act_layer = get_act_layer(act_layer) + if not norm_layer: + norm_layer = LayerNorm2d if conv_mlp else LayerNorm + mlp_layer = ConvMlp if conv_mlp else Mlp + self.use_conv_mlp = conv_mlp + + self.conv_dw = create_conv2d( + in_chs, out_chs, kernel_size=kernel_size, stride=stride, dilation=dilation, depthwise=True, bias=conv_bias) + self.norm = norm_layer(out_chs) + self.mlp = mlp_layer(out_chs, int(mlp_ratio * out_chs), act_layer=act_layer) + self.gamma = nn.Parameter(ls_init_value * torch.ones(out_chs)) if ls_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + shortcut = x + x = self.conv_dw(x) + if self.use_conv_mlp: + x = self.norm(x) + x = self.mlp(x) + else: + x = x.permute(0, 2, 3, 1) + x = self.norm(x) + x = self.mlp(x) + x = x.permute(0, 3, 1, 2) + if self.gamma is not None: + x = x.mul(self.gamma.reshape(1, -1, 1, 1)) + + x = self.drop_path(x) + shortcut + return x + + +class ConvNeXtStage(nn.Module): + + def __init__( + self, + in_chs, + out_chs, + kernel_size=7, + stride=2, + depth=2, + dilation=(1, 1), + drop_path_rates=None, + ls_init_value=1.0, + conv_mlp=False, + conv_bias=True, + act_layer='gelu', + norm_layer=None, + norm_layer_cl=None + ): + super().__init__() + self.grad_checkpointing = False + + if in_chs != out_chs or stride > 1 or dilation[0] != dilation[1]: + ds_ks = 2 if stride > 1 or dilation[0] != dilation[1] else 1 + pad = 'same' if dilation[1] > 1 else 0 # same padding needed if dilation used + self.downsample = nn.Sequential( + norm_layer(in_chs), + create_conv2d( + in_chs, out_chs, kernel_size=ds_ks, stride=stride, + dilation=dilation[0], padding=pad, bias=conv_bias), + ) + in_chs = out_chs + else: + self.downsample = nn.Identity() + + drop_path_rates = drop_path_rates or [0.] * depth + stage_blocks = [] + for i in range(depth): + stage_blocks.append(ConvNeXtBlock( + in_chs=in_chs, + out_chs=out_chs, + kernel_size=kernel_size, + dilation=dilation[1], + drop_path=drop_path_rates[i], + ls_init_value=ls_init_value, + conv_mlp=conv_mlp, + conv_bias=conv_bias, + act_layer=act_layer, + norm_layer=norm_layer if conv_mlp else norm_layer_cl + )) + in_chs = out_chs + self.blocks = nn.Sequential(*stage_blocks) + + def forward(self, x): + x = self.downsample(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + return x + + +class ConvNeXt(nn.Module): + r""" ConvNeXt + A PyTorch impl of : `A ConvNet for the 2020s` - https://arxiv.org/pdf/2201.03545.pdf + + Args: + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + depths (tuple(int)): Number of blocks at each stage. Default: [3, 3, 9, 3] + dims (tuple(int)): Feature dimension at each stage. Default: [96, 192, 384, 768] + drop_rate (float): Head dropout rate + drop_path_rate (float): Stochastic depth rate. Default: 0. + ls_init_value (float): Init value for Layer Scale. Default: 1e-6. + head_init_scale (float): Init scaling value for classifier weights and biases. Default: 1. 
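+        global_pool (str): Global pooling type for the classifier head. Default: 'avg'
+        output_stride (int): Output stride of the network; one of 8, 16, 32. Default: 32
+        kernel_sizes (int or tuple(int)): Depthwise kernel size per stage. Default: 7
+        stem_type (str): One of 'patch', 'overlap', 'overlap_tiered'. Default: 'patch'
+        patch_size (int): Patch size of the stem when stem_type='patch'. Default: 4
+        head_norm_first (bool): Apply norm before global pooling in the head (norm -> pool -> fc). Default: False
+        conv_mlp (bool): Use the channels-first 1x1-conv MLP in blocks. Default: False
+        conv_bias (bool): Use a bias term in convolution layers. Default: True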
+ """ + + def __init__( + self, + in_chans=3, + num_classes=1000, + global_pool='avg', + output_stride=32, + depths=(3, 3, 9, 3), + dims=(96, 192, 384, 768), + kernel_sizes=7, + ls_init_value=1e-6, + stem_type='patch', + patch_size=4, + head_init_scale=1., + head_norm_first=False, + conv_mlp=False, + conv_bias=True, + act_layer='gelu', + norm_layer=None, + drop_rate=0., + drop_path_rate=0., + ): + super().__init__() + assert output_stride in (8, 16, 32) + kernel_sizes = to_ntuple(4)(kernel_sizes) + if norm_layer is None: + norm_layer = LayerNorm2d + norm_layer_cl = norm_layer if conv_mlp else LayerNorm + else: + assert conv_mlp,\ + 'If a norm_layer is specified, conv MLP must be used so all norm expect rank-4, channels-first input' + norm_layer_cl = norm_layer + + self.num_classes = num_classes + self.drop_rate = drop_rate + self.feature_info = [] + + assert stem_type in ('patch', 'overlap', 'overlap_tiered') + if stem_type == 'patch': + # NOTE: this stem is a minimal form of ViT PatchEmbed, as used in SwinTransformer w/ patch_size = 4 + self.stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=patch_size, stride=patch_size, bias=conv_bias), + norm_layer(dims[0]) + ) + stem_stride = patch_size + else: + mid_chs = make_divisible(dims[0] // 2) if 'tiered' in stem_type else dims[0] + self.stem = nn.Sequential( + nn.Conv2d(in_chans, mid_chs, kernel_size=3, stride=2, padding=1, bias=conv_bias), + nn.Conv2d(mid_chs, dims[0], kernel_size=3, stride=2, padding=1, bias=conv_bias), + norm_layer(dims[0]), + ) + stem_stride = 4 + + self.stages = nn.Sequential() + dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)] + stages = [] + prev_chs = dims[0] + curr_stride = stem_stride + dilation = 1 + # 4 feature resolution stages, each consisting of multiple residual blocks + for i in range(4): + stride = 2 if curr_stride == 2 or i > 0 else 1 + if curr_stride >= output_stride and stride > 1: + dilation *= stride + stride = 1 + curr_stride *= stride + first_dilation = 1 if dilation in (1, 2) else 2 + out_chs = dims[i] + stages.append(ConvNeXtStage( + prev_chs, + out_chs, + kernel_size=kernel_sizes[i], + stride=stride, + dilation=(first_dilation, dilation), + depth=depths[i], + drop_path_rates=dp_rates[i], + ls_init_value=ls_init_value, + conv_mlp=conv_mlp, + conv_bias=conv_bias, + act_layer=act_layer, + norm_layer=norm_layer, + norm_layer_cl=norm_layer_cl + )) + prev_chs = out_chs + # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2 + self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{i}')] + self.stages = nn.Sequential(*stages) + self.num_features = prev_chs + + # if head_norm_first == true, norm -> global pool -> fc ordering, like most other nets + # otherwise pool -> norm -> fc, the default ConvNeXt ordering (pretrained FB weights) + self.norm_pre = norm_layer(self.num_features) if head_norm_first else nn.Identity() + self.head = nn.Sequential(OrderedDict([ + ('global_pool', SelectAdaptivePool2d(pool_type=global_pool)), + ('norm', nn.Identity() if head_norm_first else norm_layer(self.num_features)), + ('flatten', nn.Flatten(1) if global_pool else nn.Identity()), + ('drop', nn.Dropout(self.drop_rate)), + ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity())])) + + named_apply(partial(_init_weights, head_init_scale=head_init_scale), self) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^stem', + blocks=r'^stages\.(\d+)' if 
coarse else [ + (r'^stages\.(\d+)\.downsample', (0,)), # blocks + (r'^stages\.(\d+)\.blocks\.(\d+)', None), + (r'^norm_pre', (99999,)) + ] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes=0, global_pool=None): + if global_pool is not None: + self.head.global_pool = SelectAdaptivePool2d(pool_type=global_pool) + self.head.flatten = nn.Flatten(1) if global_pool else nn.Identity() + self.head.fc = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.stem(x) + x = self.stages(x) + x = self.norm_pre(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + # NOTE nn.Sequential in head broken down since can't call head[:-1](x) in torchscript :( + x = self.head.global_pool(x) + x = self.head.norm(x) + x = self.head.flatten(x) + x = self.head.drop(x) + return x if pre_logits else self.head.fc(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _init_weights(module, name=None, head_init_scale=1.0): + if isinstance(module, nn.Conv2d): + trunc_normal_(module.weight, std=.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=.02) + nn.init.zeros_(module.bias) + if name and 'head.' in name: + module.weight.data.mul_(head_init_scale) + module.bias.data.mul_(head_init_scale) + + +def checkpoint_filter_fn(state_dict, model): + """ Remap FB checkpoints -> timm """ + if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict: + return state_dict # non-FB checkpoint + if 'model' in state_dict: + state_dict = state_dict['model'] + out_dict = {} + import re + for k, v in state_dict.items(): + k = k.replace('downsample_layers.0.', 'stem.') + k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k) + k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k) + k = k.replace('dwconv', 'conv_dw') + k = k.replace('pwconv', 'mlp.fc') + k = k.replace('head.', 'head.fc.') + if k.startswith('norm.'): + k = k.replace('norm', 'head.norm') + if v.ndim == 2 and 'head' not in k: + model_shape = model.state_dict()[k].shape + v = v.reshape(model_shape) + out_dict[k] = v + return out_dict + + +def _create_convnext(variant, pretrained=False, **kwargs): + model = build_model_with_cfg( + ConvNeXt, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True), + **kwargs) + return model + + +@register_model +def convnext_atto(pretrained=False, **kwargs): + # timm femto variant (NOTE: still tweaking depths, will vary between 3-4M param, current is 3.7M + model_args = dict( + depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), conv_mlp=True, **kwargs) + model = _create_convnext('convnext_atto', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_atto_ols(pretrained=False, **kwargs): + # timm femto variant with overlapping 3x3 conv stem, wider than non-ols femto above, current param count 3.7M + model_args = dict( + depths=(2, 2, 6, 2), dims=(40, 80, 160, 320), conv_mlp=True, stem_type='overlap_tiered', **kwargs) + model = _create_convnext('convnext_atto_ols', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_femto(pretrained=False, 
**kwargs): + # timm femto variant + model_args = dict( + depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), conv_mlp=True, **kwargs) + model = _create_convnext('convnext_femto', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_femto_ols(pretrained=False, **kwargs): + # timm femto variant + model_args = dict( + depths=(2, 2, 6, 2), dims=(48, 96, 192, 384), conv_mlp=True, stem_type='overlap_tiered', **kwargs) + model = _create_convnext('convnext_femto_ols', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_pico(pretrained=False, **kwargs): + # timm pico variant + model_args = dict( + depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), conv_mlp=True, **kwargs) + model = _create_convnext('convnext_pico', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_pico_ols(pretrained=False, **kwargs): + # timm nano variant with overlapping 3x3 conv stem + model_args = dict( + depths=(2, 2, 6, 2), dims=(64, 128, 256, 512), conv_mlp=True, stem_type='overlap_tiered', **kwargs) + model = _create_convnext('convnext_pico_ols', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_nano(pretrained=False, **kwargs): + # timm nano variant with standard stem and head + model_args = dict( + depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), conv_mlp=True, **kwargs) + model = _create_convnext('convnext_nano', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_nano_ols(pretrained=False, **kwargs): + # experimental nano variant with overlapping conv stem + model_args = dict( + depths=(2, 2, 8, 2), dims=(80, 160, 320, 640), conv_mlp=True, stem_type='overlap', **kwargs) + model = _create_convnext('convnext_nano_ols', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_tiny_hnf(pretrained=False, **kwargs): + # experimental tiny variant with norm before pooling in head (head norm first) + model_args = dict( + depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), head_norm_first=True, conv_mlp=True, **kwargs) + model = _create_convnext('convnext_tiny_hnf', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_tiny(pretrained=False, **kwargs): + model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs) + model = _create_convnext('convnext_tiny', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_small(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) + model = _create_convnext('convnext_small', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_base(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) + model = _create_convnext('convnext_base', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_large(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) + model = _create_convnext('convnext_large', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_tiny_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs) + model = _create_convnext('convnext_tiny_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_small_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 
27, 3], dims=[96, 192, 384, 768], **kwargs) + model = _create_convnext('convnext_small_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_base_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) + model = _create_convnext('convnext_base_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_large_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) + model = _create_convnext('convnext_large_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_xlarge_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs) + model = _create_convnext('convnext_xlarge_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_tiny_384_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs) + model = _create_convnext('convnext_tiny_384_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_small_384_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) + model = _create_convnext('convnext_small_384_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_base_384_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) + model = _create_convnext('convnext_base_384_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_large_384_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) + model = _create_convnext('convnext_large_384_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_xlarge_384_in22ft1k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs) + model = _create_convnext('convnext_xlarge_384_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_tiny_in22k(pretrained=False, **kwargs): + model_args = dict(depths=(3, 3, 9, 3), dims=(96, 192, 384, 768), **kwargs) + model = _create_convnext('convnext_tiny_in22k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_small_in22k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[96, 192, 384, 768], **kwargs) + model = _create_convnext('convnext_small_in22k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_base_in22k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[128, 256, 512, 1024], **kwargs) + model = _create_convnext('convnext_base_in22k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_large_in22k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[192, 384, 768, 1536], **kwargs) + model = _create_convnext('convnext_large_in22k', pretrained=pretrained, **model_args) + return model + + +@register_model +def convnext_xlarge_in22k(pretrained=False, **kwargs): + model_args = dict(depths=[3, 3, 27, 3], dims=[256, 512, 1024, 2048], **kwargs) + model = _create_convnext('convnext_xlarge_in22k', 
pretrained=pretrained, **model_args) + return model diff --git a/custom_timm/models/crossvit.py b/custom_timm/models/crossvit.py new file mode 100644 index 0000000000000000000000000000000000000000..bb996207da81e19b932c44d36af020267e227357 --- /dev/null +++ b/custom_timm/models/crossvit.py @@ -0,0 +1,539 @@ +""" CrossViT Model + +@inproceedings{ + chen2021crossvit, + title={{CrossViT: Cross-Attention Multi-Scale Vision Transformer for Image Classification}}, + author={Chun-Fu (Richard) Chen and Quanfu Fan and Rameswar Panda}, + booktitle={International Conference on Computer Vision (ICCV)}, + year={2021} +} + +Paper link: https://arxiv.org/abs/2103.14899 +Original code: https://github.com/IBM/CrossViT/blob/main/models/crossvit.py + +NOTE: model names have been renamed from originals to represent actual input res all *_224 -> *_240 and *_384 -> *_408 + +Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman +""" + +# Copyright IBM All Rights Reserved. +# SPDX-License-Identifier: Apache-2.0 + + +""" +Modifed from custom_timm. https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py + +""" +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.hub +from functools import partial +from typing import List + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .fx_features import register_notrace_function +from .helpers import build_model_with_cfg +from .layers import DropPath, to_2tuple, trunc_normal_, _assert +from .registry import register_model +from .vision_transformer import Mlp, Block + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 240, 240), 'pool_size': None, 'crop_pct': 0.875, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, 'fixed_input_size': True, + 'first_conv': ('patch_embed.0.proj', 'patch_embed.1.proj'), + 'classifier': ('head.0', 'head.1'), + **kwargs + } + + +default_cfgs = { + 'crossvit_15_240': _cfg(url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_15_224.pth'), + 'crossvit_15_dagger_240': _cfg( + url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_15_dagger_224.pth', + first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'), + ), + 'crossvit_15_dagger_408': _cfg( + url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_15_dagger_384.pth', + input_size=(3, 408, 408), first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'), crop_pct=1.0, + ), + 'crossvit_18_240': _cfg(url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_18_224.pth'), + 'crossvit_18_dagger_240': _cfg( + url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_18_dagger_224.pth', + first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'), + ), + 'crossvit_18_dagger_408': _cfg( + url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_18_dagger_384.pth', + input_size=(3, 408, 408), first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'), crop_pct=1.0, + ), + 'crossvit_9_240': _cfg(url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_9_224.pth'), + 'crossvit_9_dagger_240': _cfg( + url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_9_dagger_224.pth', + first_conv=('patch_embed.0.proj.0', 'patch_embed.1.proj.0'), + ), + 'crossvit_base_240': _cfg( + 
url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_base_224.pth'), + 'crossvit_small_240': _cfg( + url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_small_224.pth'), + 'crossvit_tiny_240': _cfg( + url='https://github.com/IBM/CrossViT/releases/download/weights-0.1/crossvit_tiny_224.pth'), +} + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, multi_conv=False): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0]) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + if multi_conv: + if patch_size[0] == 12: + self.proj = nn.Sequential( + nn.Conv2d(in_chans, embed_dim // 4, kernel_size=7, stride=4, padding=3), + nn.ReLU(inplace=True), + nn.Conv2d(embed_dim // 4, embed_dim // 2, kernel_size=3, stride=3, padding=0), + nn.ReLU(inplace=True), + nn.Conv2d(embed_dim // 2, embed_dim, kernel_size=3, stride=1, padding=1), + ) + elif patch_size[0] == 16: + self.proj = nn.Sequential( + nn.Conv2d(in_chans, embed_dim // 4, kernel_size=7, stride=4, padding=3), + nn.ReLU(inplace=True), + nn.Conv2d(embed_dim // 4, embed_dim // 2, kernel_size=3, stride=2, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(embed_dim // 2, embed_dim, kernel_size=3, stride=2, padding=1), + ) + else: + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + B, C, H, W = x.shape + # FIXME look at relaxing size constraints + _assert(H == self.img_size[0], + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).") + _assert(W == self.img_size[1], + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).") + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +class CrossAttention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + # NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights + self.scale = qk_scale or head_dim ** -0.5 + + self.wq = nn.Linear(dim, dim, bias=qkv_bias) + self.wk = nn.Linear(dim, dim, bias=qkv_bias) + self.wv = nn.Linear(dim, dim, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + # B1C -> B1H(C/H) -> BH1(C/H) + q = self.wq(x[:, 0:1, ...]).reshape(B, 1, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + # BNC -> BNH(C/H) -> BHN(C/H) + k = self.wk(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + # BNC -> BNH(C/H) -> BHN(C/H) + v = self.wv(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + attn = (q @ k.transpose(-2, -1)) * self.scale # BH1(C/H) @ BH(C/H)N -> BH1N + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, 1, C) # (BH1N @ BHN(C/H)) -> BH1(C/H) -> B1H(C/H) -> B1C + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class CrossAttentionBlock(nn.Module): + + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + 
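+ # NOTE: unlike a standard transformer Block there is no MLP sub-layer here; forward() only
+ # updates the CLS token via cross-attention, and mlp_ratio/act_layer appear unused in this impl.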
self.norm1 = norm_layer(dim) + self.attn = CrossAttention( + dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + x = x[:, 0:1, ...] + self.drop_path(self.attn(self.norm1(x))) + return x + + +class MultiScaleBlock(nn.Module): + + def __init__(self, dim, patches, depth, num_heads, mlp_ratio, qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + + num_branches = len(dim) + self.num_branches = num_branches + # different branch could have different embedding size, the first one is the base + self.blocks = nn.ModuleList() + for d in range(num_branches): + tmp = [] + for i in range(depth[d]): + tmp.append(Block( + dim=dim[d], num_heads=num_heads[d], mlp_ratio=mlp_ratio[d], qkv_bias=qkv_bias, + drop=drop, attn_drop=attn_drop, drop_path=drop_path[i], norm_layer=norm_layer)) + if len(tmp) != 0: + self.blocks.append(nn.Sequential(*tmp)) + + if len(self.blocks) == 0: + self.blocks = None + + self.projs = nn.ModuleList() + for d in range(num_branches): + if dim[d] == dim[(d + 1) % num_branches] and False: + tmp = [nn.Identity()] + else: + tmp = [norm_layer(dim[d]), act_layer(), nn.Linear(dim[d], dim[(d + 1) % num_branches])] + self.projs.append(nn.Sequential(*tmp)) + + self.fusion = nn.ModuleList() + for d in range(num_branches): + d_ = (d + 1) % num_branches + nh = num_heads[d_] + if depth[-1] == 0: # backward capability: + self.fusion.append( + CrossAttentionBlock( + dim=dim[d_], num_heads=nh, mlp_ratio=mlp_ratio[d], qkv_bias=qkv_bias, + drop=drop, attn_drop=attn_drop, drop_path=drop_path[-1], norm_layer=norm_layer)) + else: + tmp = [] + for _ in range(depth[-1]): + tmp.append(CrossAttentionBlock( + dim=dim[d_], num_heads=nh, mlp_ratio=mlp_ratio[d], qkv_bias=qkv_bias, + drop=drop, attn_drop=attn_drop, drop_path=drop_path[-1], norm_layer=norm_layer)) + self.fusion.append(nn.Sequential(*tmp)) + + self.revert_projs = nn.ModuleList() + for d in range(num_branches): + if dim[(d + 1) % num_branches] == dim[d] and False: + tmp = [nn.Identity()] + else: + tmp = [norm_layer(dim[(d + 1) % num_branches]), act_layer(), + nn.Linear(dim[(d + 1) % num_branches], dim[d])] + self.revert_projs.append(nn.Sequential(*tmp)) + + def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]: + + outs_b = [] + for i, block in enumerate(self.blocks): + outs_b.append(block(x[i])) + + # only take the cls token out + proj_cls_token = torch.jit.annotate(List[torch.Tensor], []) + for i, proj in enumerate(self.projs): + proj_cls_token.append(proj(outs_b[i][:, 0:1, ...])) + + # cross attention + outs = [] + for i, (fusion, revert_proj) in enumerate(zip(self.fusion, self.revert_projs)): + tmp = torch.cat((proj_cls_token[i], outs_b[(i + 1) % self.num_branches][:, 1:, ...]), dim=1) + tmp = fusion(tmp) + reverted_proj_cls_token = revert_proj(tmp[:, 0:1, ...]) + tmp = torch.cat((reverted_proj_cls_token, outs_b[i][:, 1:, ...]), dim=1) + outs.append(tmp) + return outs + + +def _compute_num_patches(img_size, patches): + return [i[0] // p * i[1] // p for i, p in zip(img_size, patches)] + + +@register_notrace_function +def scale_image(x, ss: Tuple[int, int], crop_scale: bool = False): # annotations for torchscript + """ + Pulled out of CrossViT.forward_features to bury conditional logic in a leaf node for FX tracing. 
+ Args: + x (Tensor): input image + ss (tuple[int, int]): height and width to scale to + crop_scale (bool): whether to crop instead of interpolate to achieve the desired scale. Defaults to False + Returns: + Tensor: the "scaled" image batch tensor + """ + H, W = x.shape[-2:] + if H != ss[0] or W != ss[1]: + if crop_scale and ss[0] <= H and ss[1] <= W: + cu, cl = int(round((H - ss[0]) / 2.)), int(round((W - ss[1]) / 2.)) + x = x[:, :, cu:cu + ss[0], cl:cl + ss[1]] + else: + x = torch.nn.functional.interpolate(x, size=ss, mode='bicubic', align_corners=False) + return x + + +class CrossViT(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + """ + + def __init__( + self, img_size=224, img_scale=(1.0, 1.0), patch_size=(8, 16), in_chans=3, num_classes=1000, + embed_dim=(192, 384), depth=((1, 3, 1), (1, 3, 1), (1, 3, 1)), num_heads=(6, 12), mlp_ratio=(2., 2., 4.), + multi_conv=False, crop_scale=False, qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), global_pool='token', + ): + super().__init__() + assert global_pool in ('token', 'avg') + + self.num_classes = num_classes + self.global_pool = global_pool + self.img_size = to_2tuple(img_size) + img_scale = to_2tuple(img_scale) + self.img_size_scaled = [tuple([int(sj * si) for sj in self.img_size]) for si in img_scale] + self.crop_scale = crop_scale # crop instead of interpolate for scale + num_patches = _compute_num_patches(self.img_size_scaled, patch_size) + self.num_branches = len(patch_size) + self.embed_dim = embed_dim + self.num_features = sum(embed_dim) + self.patch_embed = nn.ModuleList() + + # hard-coded for torch jit script + for i in range(self.num_branches): + setattr(self, f'pos_embed_{i}', nn.Parameter(torch.zeros(1, 1 + num_patches[i], embed_dim[i]))) + setattr(self, f'cls_token_{i}', nn.Parameter(torch.zeros(1, 1, embed_dim[i]))) + + for im_s, p, d in zip(self.img_size_scaled, patch_size, embed_dim): + self.patch_embed.append( + PatchEmbed(img_size=im_s, patch_size=p, in_chans=in_chans, embed_dim=d, multi_conv=multi_conv)) + + self.pos_drop = nn.Dropout(p=drop_rate) + + total_depth = sum([sum(x[-2:]) for x in depth]) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, total_depth)] # stochastic depth decay rule + dpr_ptr = 0 + self.blocks = nn.ModuleList() + for idx, block_cfg in enumerate(depth): + curr_depth = max(block_cfg[:-1]) + block_cfg[-1] + dpr_ = dpr[dpr_ptr:dpr_ptr + curr_depth] + blk = MultiScaleBlock( + embed_dim, num_patches, block_cfg, num_heads=num_heads, mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr_, norm_layer=norm_layer) + dpr_ptr += curr_depth + self.blocks.append(blk) + + self.norm = nn.ModuleList([norm_layer(embed_dim[i]) for i in range(self.num_branches)]) + self.head = nn.ModuleList([ + nn.Linear(embed_dim[i], num_classes) if num_classes > 0 else nn.Identity() + for i in range(self.num_branches)]) + + for i in range(self.num_branches): + trunc_normal_(getattr(self, f'pos_embed_{i}'), std=.02) + trunc_normal_(getattr(self, f'cls_token_{i}'), std=.02) + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + out = set() + for i in 
range(self.num_branches): + out.add(f'cls_token_{i}') + pe = getattr(self, f'pos_embed_{i}', None) + if pe is not None and pe.requires_grad: + out.add(f'pos_embed_{i}') + return out + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^cls_token|pos_embed|patch_embed', # stem and embed + blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('token', 'avg') + self.global_pool = global_pool + self.head = nn.ModuleList( + [nn.Linear(self.embed_dim[i], num_classes) if num_classes > 0 else nn.Identity() for i in + range(self.num_branches)]) + + def forward_features(self, x) -> List[torch.Tensor]: + B = x.shape[0] + xs = [] + for i, patch_embed in enumerate(self.patch_embed): + x_ = x + ss = self.img_size_scaled[i] + x_ = scale_image(x_, ss, self.crop_scale) + x_ = patch_embed(x_) + cls_tokens = self.cls_token_0 if i == 0 else self.cls_token_1 # hard-coded for torch jit script + cls_tokens = cls_tokens.expand(B, -1, -1) + x_ = torch.cat((cls_tokens, x_), dim=1) + pos_embed = self.pos_embed_0 if i == 0 else self.pos_embed_1 # hard-coded for torch jit script + x_ = x_ + pos_embed + x_ = self.pos_drop(x_) + xs.append(x_) + + for i, blk in enumerate(self.blocks): + xs = blk(xs) + + # NOTE: was before branch token section, move to here to assure all branch token are before layer norm + xs = [norm(xs[i]) for i, norm in enumerate(self.norm)] + return xs + + def forward_head(self, xs: List[torch.Tensor], pre_logits: bool = False) -> torch.Tensor: + xs = [x[:, 1:].mean(dim=1) for x in xs] if self.global_pool == 'avg' else [x[:, 0] for x in xs] + if pre_logits or isinstance(self.head[0], nn.Identity): + return torch.cat([x for x in xs], dim=1) + return torch.mean(torch.stack([head(xs[i]) for i, head in enumerate(self.head)], dim=0), dim=0) + + def forward(self, x): + xs = self.forward_features(x) + x = self.forward_head(xs) + return x + + +def _create_crossvit(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + def pretrained_filter_fn(state_dict): + new_state_dict = {} + for key in state_dict.keys(): + if 'pos_embed' in key or 'cls_token' in key: + new_key = key.replace(".", "_") + else: + new_key = key + new_state_dict[new_key] = state_dict[key] + return new_state_dict + + return build_model_with_cfg( + CrossViT, variant, pretrained, + pretrained_filter_fn=pretrained_filter_fn, + **kwargs) + + +@register_model +def crossvit_tiny_240(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[96, 192], depth=[[1, 4, 0], [1, 4, 0], [1, 4, 0]], + num_heads=[3, 3], mlp_ratio=[4, 4, 1], **kwargs) + model = _create_crossvit(variant='crossvit_tiny_240', pretrained=pretrained, **model_args) + return model + + +@register_model +def crossvit_small_240(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[192, 384], depth=[[1, 4, 0], [1, 4, 0], [1, 4, 0]], + num_heads=[6, 6], mlp_ratio=[4, 4, 1], **kwargs) + model = _create_crossvit(variant='crossvit_small_240', pretrained=pretrained, **model_args) + return model 
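+
+ # Usage sketch (illustrative comment, not part of the upstream file): the registered factory
+ # functions above build a CrossViT directly, e.g.
+ # model = crossvit_small_240(pretrained=False)
+ # model.eval()
+ # with torch.no_grad():
+ # logits = model(torch.randn(2, 3, 240, 240)) # -> logits of shape (2, 1000)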
+ + +@register_model +def crossvit_base_240(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[384, 768], depth=[[1, 4, 0], [1, 4, 0], [1, 4, 0]], + num_heads=[12, 12], mlp_ratio=[4, 4, 1], **kwargs) + model = _create_crossvit(variant='crossvit_base_240', pretrained=pretrained, **model_args) + return model + + +@register_model +def crossvit_9_240(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[128, 256], depth=[[1, 3, 0], [1, 3, 0], [1, 3, 0]], + num_heads=[4, 4], mlp_ratio=[3, 3, 1], **kwargs) + model = _create_crossvit(variant='crossvit_9_240', pretrained=pretrained, **model_args) + return model + + +@register_model +def crossvit_15_240(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[192, 384], depth=[[1, 5, 0], [1, 5, 0], [1, 5, 0]], + num_heads=[6, 6], mlp_ratio=[3, 3, 1], **kwargs) + model = _create_crossvit(variant='crossvit_15_240', pretrained=pretrained, **model_args) + return model + + +@register_model +def crossvit_18_240(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 224 / 240), patch_size=[12, 16], embed_dim=[224, 448], depth=[[1, 6, 0], [1, 6, 0], [1, 6, 0]], + num_heads=[7, 7], mlp_ratio=[3, 3, 1], **kwargs) + model = _create_crossvit(variant='crossvit_18_240', pretrained=pretrained, **model_args) + return model + + +@register_model +def crossvit_9_dagger_240(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 224 / 240), patch_size=[12, 16], embed_dim=[128, 256], depth=[[1, 3, 0], [1, 3, 0], [1, 3, 0]], + num_heads=[4, 4], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs) + model = _create_crossvit(variant='crossvit_9_dagger_240', pretrained=pretrained, **model_args) + return model + + +@register_model +def crossvit_15_dagger_240(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[192, 384], depth=[[1, 5, 0], [1, 5, 0], [1, 5, 0]], + num_heads=[6, 6], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs) + model = _create_crossvit(variant='crossvit_15_dagger_240', pretrained=pretrained, **model_args) + return model + + +@register_model +def crossvit_15_dagger_408(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 384/408), patch_size=[12, 16], embed_dim=[192, 384], depth=[[1, 5, 0], [1, 5, 0], [1, 5, 0]], + num_heads=[6, 6], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs) + model = _create_crossvit(variant='crossvit_15_dagger_408', pretrained=pretrained, **model_args) + return model + + +@register_model +def crossvit_18_dagger_240(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 224/240), patch_size=[12, 16], embed_dim=[224, 448], depth=[[1, 6, 0], [1, 6, 0], [1, 6, 0]], + num_heads=[7, 7], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs) + model = _create_crossvit(variant='crossvit_18_dagger_240', pretrained=pretrained, **model_args) + return model + + +@register_model +def crossvit_18_dagger_408(pretrained=False, **kwargs): + model_args = dict( + img_scale=(1.0, 384/408), patch_size=[12, 16], embed_dim=[224, 448], depth=[[1, 6, 0], [1, 6, 0], [1, 6, 0]], + num_heads=[7, 7], mlp_ratio=[3, 3, 1], multi_conv=True, **kwargs) + model = _create_crossvit(variant='crossvit_18_dagger_408', pretrained=pretrained, **model_args) + return model diff --git a/custom_timm/models/cspnet.py b/custom_timm/models/cspnet.py new file mode 100644 index 
0000000000000000000000000000000000000000..8e19ec29f7b14cdf58368a8cbea5cdccee43b07e --- /dev/null +++ b/custom_timm/models/cspnet.py @@ -0,0 +1,1083 @@ +"""PyTorch CspNet + +A PyTorch implementation of Cross Stage Partial Networks including: +* CSPResNet50 +* CSPResNeXt50 +* CSPDarkNet53 +* and DarkNet53 for good measure + +Based on paper `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929 + +Reference impl via darknet cfg files at https://github.com/WongKinYiu/CrossStagePartialNetworks + +Hacked together by / Copyright 2020 Ross Wightman +""" +import collections.abc +from dataclasses import dataclass, field, asdict +from functools import partial +from typing import Any, Callable, Dict, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, named_apply, MATCH_PREV_GROUP +from .layers import ClassifierHead, ConvNormAct, ConvNormActAa, DropPath, get_attn, create_act_layer, make_divisible +from .registry import register_model + + +__all__ = ['CspNet'] # model_registry will add each entrypoint fn to this + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 256, 256), 'pool_size': (8, 8), + 'crop_pct': 0.887, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv1.conv', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = { + 'cspresnet50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnet50_ra-d3e8d487.pth'), + 'cspresnet50d': _cfg(url=''), + 'cspresnet50w': _cfg(url=''), + 'cspresnext50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspresnext50_ra_224-648b4713.pth', + ), + 'cspdarknet53': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/cspdarknet53_ra_256-d05c7c21.pth'), + + 'darknet17': _cfg(url=''), + 'darknet21': _cfg(url=''), + 'sedarknet21': _cfg(url=''), + 'darknet53': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/darknet53_256_c2ns-3aeff817.pth', + interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'darknetaa53': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/darknetaa53_c2ns-5c28ec8a.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0), + + 'cs3darknet_s': _cfg( + url='', interpolation='bicubic'), + 'cs3darknet_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_m_c2ns-43f06604.pth', + interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95, + ), + 'cs3darknet_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_l_c2ns-16220c5d.pth', + interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95), + 'cs3darknet_x': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_x_c2ns-4e4490aa.pth', + interpolation='bicubic', crop_pct=0.95, test_input_size=(3, 288, 288), test_crop_pct=1.0), + + 'cs3darknet_focus_s': _cfg( + url='', interpolation='bicubic'), + 'cs3darknet_focus_m': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_focus_m_c2ns-e23bed41.pth', + interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95), + 'cs3darknet_focus_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3darknet_focus_l_c2ns-65ef8888.pth', + interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95), + 'cs3darknet_focus_x': _cfg( + url='', interpolation='bicubic'), + + 'cs3sedarknet_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3sedarknet_l_c2ns-e8d1dc13.pth', + interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=0.95), + 'cs3sedarknet_x': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3sedarknet_x_c2ns-b4d0abc0.pth', + interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=1.0), + + 'cs3sedarknet_xdw': _cfg( + url='', interpolation='bicubic'), + + 'cs3edgenet_x': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3edgenet_x_c2-2e1610a9.pth', + interpolation='bicubic', test_input_size=(3, 288, 288), test_crop_pct=1.0), + 'cs3se_edgenet_x': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/cs3se_edgenet_x_c2ns-76f8e3ac.pth', + interpolation='bicubic', crop_pct=0.95, test_input_size=(3, 320, 320), test_crop_pct=1.0), +} + + +@dataclass +class CspStemCfg: + out_chs: Union[int, Tuple[int, ...]] = 32 + stride: Union[int, Tuple[int, ...]] = 2 + kernel_size: int = 3 + padding: Union[int, str] = '' + pool: Optional[str] = '' + + +def _pad_arg(x, n): + # pads an argument tuple to specified n by padding with last value + if not isinstance(x, (tuple, list)): + x = (x,) + curr_n = len(x) + pad_n = n - curr_n + if pad_n <= 0: + return x[:n] + return tuple(x + (x[-1],) * pad_n) + + +@dataclass +class CspStagesCfg: + depth: Tuple[int, ...] = (3, 3, 5, 2) # block depth (number of block repeats in stages) + out_chs: Tuple[int, ...] = (128, 256, 512, 1024) # number of output channels for blocks in stage + stride: Union[int, Tuple[int, ...]] = 2 # stride of stage + groups: Union[int, Tuple[int, ...]] = 1 # num kxk conv groups + block_ratio: Union[float, Tuple[float, ...]] = 1.0 + bottle_ratio: Union[float, Tuple[float, ...]] = 1. 
# bottleneck-ratio of blocks in stage + avg_down: Union[bool, Tuple[bool, ...]] = False + attn_layer: Optional[Union[str, Tuple[str, ...]]] = None + attn_kwargs: Optional[Union[Dict, Tuple[Dict]]] = None + stage_type: Union[str, Tuple[str]] = 'csp' # stage type ('csp', 'cs2', 'dark') + block_type: Union[str, Tuple[str]] = 'bottle' # blocks type for stages ('bottle', 'dark') + + # cross-stage only + expand_ratio: Union[float, Tuple[float, ...]] = 1.0 + cross_linear: Union[bool, Tuple[bool, ...]] = False + down_growth: Union[bool, Tuple[bool, ...]] = False + + def __post_init__(self): + n = len(self.depth) + assert len(self.out_chs) == n + self.stride = _pad_arg(self.stride, n) + self.groups = _pad_arg(self.groups, n) + self.block_ratio = _pad_arg(self.block_ratio, n) + self.bottle_ratio = _pad_arg(self.bottle_ratio, n) + self.avg_down = _pad_arg(self.avg_down, n) + self.attn_layer = _pad_arg(self.attn_layer, n) + self.attn_kwargs = _pad_arg(self.attn_kwargs, n) + self.stage_type = _pad_arg(self.stage_type, n) + self.block_type = _pad_arg(self.block_type, n) + + self.expand_ratio = _pad_arg(self.expand_ratio, n) + self.cross_linear = _pad_arg(self.cross_linear, n) + self.down_growth = _pad_arg(self.down_growth, n) + + +@dataclass +class CspModelCfg: + stem: CspStemCfg + stages: CspStagesCfg + zero_init_last: bool = True # zero init last weight (usually bn) in residual path + act_layer: str = 'leaky_relu' + norm_layer: str = 'batchnorm' + aa_layer: Optional[str] = None # FIXME support string factory for this + + +def _cs3_cfg( + width_multiplier=1.0, + depth_multiplier=1.0, + avg_down=False, + act_layer='silu', + focus=False, + attn_layer=None, + attn_kwargs=None, + bottle_ratio=1.0, + block_type='dark', +): + if focus: + stem_cfg = CspStemCfg( + out_chs=make_divisible(64 * width_multiplier), + kernel_size=6, stride=2, padding=2, pool='') + else: + stem_cfg = CspStemCfg( + out_chs=tuple([make_divisible(c * width_multiplier) for c in (32, 64)]), + kernel_size=3, stride=2, pool='') + return CspModelCfg( + stem=stem_cfg, + stages=CspStagesCfg( + out_chs=tuple([make_divisible(c * width_multiplier) for c in (128, 256, 512, 1024)]), + depth=tuple([int(d * depth_multiplier) for d in (3, 6, 9, 3)]), + stride=2, + bottle_ratio=bottle_ratio, + block_ratio=0.5, + avg_down=avg_down, + attn_layer=attn_layer, + attn_kwargs=attn_kwargs, + stage_type='cs3', + block_type=block_type, + ), + act_layer=act_layer, + ) + + +model_cfgs = dict( + cspresnet50=CspModelCfg( + stem=CspStemCfg(out_chs=64, kernel_size=7, stride=4, pool='max'), + stages=CspStagesCfg( + depth=(3, 3, 5, 2), + out_chs=(128, 256, 512, 1024), + stride=(1, 2), + expand_ratio=2., + bottle_ratio=0.5, + cross_linear=True, + ), + ), + cspresnet50d=CspModelCfg( + stem=CspStemCfg(out_chs=(32, 32, 64), kernel_size=3, stride=4, pool='max'), + stages=CspStagesCfg( + depth=(3, 3, 5, 2), + out_chs=(128, 256, 512, 1024), + stride=(1,) + (2,), + expand_ratio=2., + bottle_ratio=0.5, + block_ratio=1., + cross_linear=True, + ), + ), + cspresnet50w=CspModelCfg( + stem=CspStemCfg(out_chs=(32, 32, 64), kernel_size=3, stride=4, pool='max'), + stages=CspStagesCfg( + depth=(3, 3, 5, 2), + out_chs=(256, 512, 1024, 2048), + stride=(1,) + (2,), + expand_ratio=1., + bottle_ratio=0.25, + block_ratio=0.5, + cross_linear=True, + ), + ), + cspresnext50=CspModelCfg( + stem=CspStemCfg(out_chs=64, kernel_size=7, stride=4, pool='max'), + stages=CspStagesCfg( + depth=(3, 3, 5, 2), + out_chs=(256, 512, 1024, 2048), + stride=(1,) + (2,), + groups=32, + expand_ratio=1., + 
bottle_ratio=1., + block_ratio=0.5, + cross_linear=True, + ), + ), + cspdarknet53=CspModelCfg( + stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''), + stages=CspStagesCfg( + depth=(1, 2, 8, 8, 4), + out_chs=(64, 128, 256, 512, 1024), + stride=2, + expand_ratio=(2.,) + (1.,), + bottle_ratio=(0.5,) + (1.,), + block_ratio=(1.,) + (0.5,), + down_growth=True, + block_type='dark', + ), + ), + darknet17=CspModelCfg( + stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''), + stages=CspStagesCfg( + depth=(1,) * 5, + out_chs=(64, 128, 256, 512, 1024), + stride=(2,), + bottle_ratio=(0.5,), + block_ratio=(1.,), + stage_type='dark', + block_type='dark', + ), + ), + darknet21=CspModelCfg( + stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''), + stages=CspStagesCfg( + depth=(1, 1, 1, 2, 2), + out_chs=(64, 128, 256, 512, 1024), + stride=(2,), + bottle_ratio=(0.5,), + block_ratio=(1.,), + stage_type='dark', + block_type='dark', + + ), + ), + sedarknet21=CspModelCfg( + stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''), + stages=CspStagesCfg( + depth=(1, 1, 1, 2, 2), + out_chs=(64, 128, 256, 512, 1024), + stride=2, + bottle_ratio=0.5, + block_ratio=1., + attn_layer='se', + stage_type='dark', + block_type='dark', + + ), + ), + darknet53=CspModelCfg( + stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''), + stages=CspStagesCfg( + depth=(1, 2, 8, 8, 4), + out_chs=(64, 128, 256, 512, 1024), + stride=2, + bottle_ratio=0.5, + block_ratio=1., + stage_type='dark', + block_type='dark', + ), + ), + darknetaa53=CspModelCfg( + stem=CspStemCfg(out_chs=32, kernel_size=3, stride=1, pool=''), + stages=CspStagesCfg( + depth=(1, 2, 8, 8, 4), + out_chs=(64, 128, 256, 512, 1024), + stride=2, + bottle_ratio=0.5, + block_ratio=1., + avg_down=True, + stage_type='dark', + block_type='dark', + ), + ), + + cs3darknet_s=_cs3_cfg(width_multiplier=0.5, depth_multiplier=0.5), + cs3darknet_m=_cs3_cfg(width_multiplier=0.75, depth_multiplier=0.67), + cs3darknet_l=_cs3_cfg(), + cs3darknet_x=_cs3_cfg(width_multiplier=1.25, depth_multiplier=1.33), + + cs3darknet_focus_s=_cs3_cfg(width_multiplier=0.5, depth_multiplier=0.5, focus=True), + cs3darknet_focus_m=_cs3_cfg(width_multiplier=0.75, depth_multiplier=0.67, focus=True), + cs3darknet_focus_l=_cs3_cfg(focus=True), + cs3darknet_focus_x=_cs3_cfg(width_multiplier=1.25, depth_multiplier=1.33, focus=True), + + cs3sedarknet_l=_cs3_cfg(attn_layer='se', attn_kwargs=dict(rd_ratio=.25)), + cs3sedarknet_x=_cs3_cfg(attn_layer='se', width_multiplier=1.25, depth_multiplier=1.33), + + cs3sedarknet_xdw=CspModelCfg( + stem=CspStemCfg(out_chs=(32, 64), kernel_size=3, stride=2, pool=''), + stages=CspStagesCfg( + depth=(3, 6, 12, 4), + out_chs=(256, 512, 1024, 2048), + stride=2, + groups=(1, 1, 256, 512), + bottle_ratio=0.5, + block_ratio=0.5, + attn_layer='se', + ), + act_layer='silu', + ), + + cs3edgenet_x=_cs3_cfg(width_multiplier=1.25, depth_multiplier=1.33, bottle_ratio=1.5, block_type='edge'), + cs3se_edgenet_x=_cs3_cfg( + width_multiplier=1.25, depth_multiplier=1.33, bottle_ratio=1.5, block_type='edge', + attn_layer='se', attn_kwargs=dict(rd_ratio=.25)), +) + + +class BottleneckBlock(nn.Module): + """ ResNe(X)t Bottleneck Block + """ + + def __init__( + self, + in_chs, + out_chs, + dilation=1, + bottle_ratio=0.25, + groups=1, + act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, + attn_last=False, + attn_layer=None, + drop_block=None, + drop_path=0. 
+ ): + super(BottleneckBlock, self).__init__() + mid_chs = int(round(out_chs * bottle_ratio)) + ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer) + attn_last = attn_layer is not None and attn_last + attn_first = attn_layer is not None and not attn_last + + self.conv1 = ConvNormAct(in_chs, mid_chs, kernel_size=1, **ckwargs) + self.conv2 = ConvNormAct( + mid_chs, mid_chs, kernel_size=3, dilation=dilation, groups=groups, + drop_layer=drop_block, **ckwargs) + self.attn2 = attn_layer(mid_chs, act_layer=act_layer) if attn_first else nn.Identity() + self.conv3 = ConvNormAct(mid_chs, out_chs, kernel_size=1, apply_act=False, **ckwargs) + self.attn3 = attn_layer(out_chs, act_layer=act_layer) if attn_last else nn.Identity() + self.drop_path = DropPath(drop_path) if drop_path else nn.Identity() + self.act3 = create_act_layer(act_layer) + + def zero_init_last(self): + nn.init.zeros_(self.conv3.bn.weight) + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.conv2(x) + x = self.attn2(x) + x = self.conv3(x) + x = self.attn3(x) + x = self.drop_path(x) + shortcut + # FIXME partial shortcut needed if first block handled as per original, not used for my current impl + #x[:, :shortcut.size(1)] += shortcut + x = self.act3(x) + return x + + +class DarkBlock(nn.Module): + """ DarkNet Block + """ + + def __init__( + self, + in_chs, + out_chs, + dilation=1, + bottle_ratio=0.5, + groups=1, + act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, + attn_layer=None, + drop_block=None, + drop_path=0. + ): + super(DarkBlock, self).__init__() + mid_chs = int(round(out_chs * bottle_ratio)) + ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer) + + self.conv1 = ConvNormAct(in_chs, mid_chs, kernel_size=1, **ckwargs) + self.attn = attn_layer(mid_chs, act_layer=act_layer) if attn_layer is not None else nn.Identity() + self.conv2 = ConvNormAct( + mid_chs, out_chs, kernel_size=3, dilation=dilation, groups=groups, + drop_layer=drop_block, **ckwargs) + self.drop_path = DropPath(drop_path) if drop_path else nn.Identity() + + def zero_init_last(self): + nn.init.zeros_(self.conv2.bn.weight) + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.attn(x) + x = self.conv2(x) + x = self.drop_path(x) + shortcut + return x + + +class EdgeBlock(nn.Module): + """ EdgeResidual / Fused-MBConv / MobileNetV1-like 3x3 + 1x1 block (w/ activated output) + """ + + def __init__( + self, + in_chs, + out_chs, + dilation=1, + bottle_ratio=0.5, + groups=1, + act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, + attn_layer=None, + drop_block=None, + drop_path=0. 
+ ): + super(EdgeBlock, self).__init__() + mid_chs = int(round(out_chs * bottle_ratio)) + ckwargs = dict(act_layer=act_layer, norm_layer=norm_layer) + + self.conv1 = ConvNormAct( + in_chs, mid_chs, kernel_size=3, dilation=dilation, groups=groups, + drop_layer=drop_block, **ckwargs) + self.attn = attn_layer(mid_chs, act_layer=act_layer) if attn_layer is not None else nn.Identity() + self.conv2 = ConvNormAct(mid_chs, out_chs, kernel_size=1, **ckwargs) + self.drop_path = DropPath(drop_path) if drop_path else nn.Identity() + + def zero_init_last(self): + nn.init.zeros_(self.conv2.bn.weight) + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.attn(x) + x = self.conv2(x) + x = self.drop_path(x) + shortcut + return x + + +class CrossStage(nn.Module): + """Cross Stage.""" + def __init__( + self, + in_chs, + out_chs, + stride, + dilation, + depth, + block_ratio=1., + bottle_ratio=1., + expand_ratio=1., + groups=1, + first_dilation=None, + avg_down=False, + down_growth=False, + cross_linear=False, + block_dpr=None, + block_fn=BottleneckBlock, + **block_kwargs + ): + super(CrossStage, self).__init__() + first_dilation = first_dilation or dilation + down_chs = out_chs if down_growth else in_chs # grow downsample channels to output channels + self.expand_chs = exp_chs = int(round(out_chs * expand_ratio)) + block_out_chs = int(round(out_chs * block_ratio)) + conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer')) + aa_layer = block_kwargs.pop('aa_layer', None) + + if stride != 1 or first_dilation != dilation: + if avg_down: + self.conv_down = nn.Sequential( + nn.AvgPool2d(2) if stride == 2 else nn.Identity(), # FIXME dilation handling + ConvNormActAa(in_chs, out_chs, kernel_size=1, stride=1, groups=groups, **conv_kwargs) + ) + else: + self.conv_down = ConvNormActAa( + in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups, + aa_layer=aa_layer, **conv_kwargs) + prev_chs = down_chs + else: + self.conv_down = nn.Identity() + prev_chs = in_chs + + # FIXME this 1x1 expansion is pushed down into the cross and block paths in the darknet cfgs. Also, + # there is also special case for the first stage for some of the model that results in uneven split + # across the two paths. I did it this way for simplicity for now. + self.conv_exp = ConvNormAct(prev_chs, exp_chs, kernel_size=1, apply_act=not cross_linear, **conv_kwargs) + prev_chs = exp_chs // 2 # output of conv_exp is always split in two + + self.blocks = nn.Sequential() + for i in range(depth): + self.blocks.add_module(str(i), block_fn( + in_chs=prev_chs, + out_chs=block_out_chs, + dilation=dilation, + bottle_ratio=bottle_ratio, + groups=groups, + drop_path=block_dpr[i] if block_dpr is not None else 0., + **block_kwargs + )) + prev_chs = block_out_chs + + # transition convs + self.conv_transition_b = ConvNormAct(prev_chs, exp_chs // 2, kernel_size=1, **conv_kwargs) + self.conv_transition = ConvNormAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs) + + def forward(self, x): + x = self.conv_down(x) + x = self.conv_exp(x) + xs, xb = x.split(self.expand_chs // 2, dim=1) + xb = self.blocks(xb) + xb = self.conv_transition_b(xb).contiguous() + out = self.conv_transition(torch.cat([xs, xb], dim=1)) + return out + + +class CrossStage3(nn.Module): + """Cross Stage 3. + Similar to CrossStage, but with only one transition conv for the output. 
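+ The 1x1 expansion output is split in two: one half passes through the residual blocks, the
+ other is carried across the stage unchanged, and both halves are concatenated before the
+ single transition conv.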
+ """ + def __init__( + self, + in_chs, + out_chs, + stride, + dilation, + depth, + block_ratio=1., + bottle_ratio=1., + expand_ratio=1., + groups=1, + first_dilation=None, + avg_down=False, + down_growth=False, + cross_linear=False, + block_dpr=None, + block_fn=BottleneckBlock, + **block_kwargs + ): + super(CrossStage3, self).__init__() + first_dilation = first_dilation or dilation + down_chs = out_chs if down_growth else in_chs # grow downsample channels to output channels + self.expand_chs = exp_chs = int(round(out_chs * expand_ratio)) + block_out_chs = int(round(out_chs * block_ratio)) + conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer')) + aa_layer = block_kwargs.pop('aa_layer', None) + + if stride != 1 or first_dilation != dilation: + if avg_down: + self.conv_down = nn.Sequential( + nn.AvgPool2d(2) if stride == 2 else nn.Identity(), # FIXME dilation handling + ConvNormActAa(in_chs, out_chs, kernel_size=1, stride=1, groups=groups, **conv_kwargs) + ) + else: + self.conv_down = ConvNormActAa( + in_chs, down_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups, + aa_layer=aa_layer, **conv_kwargs) + prev_chs = down_chs + else: + self.conv_down = None + prev_chs = in_chs + + # expansion conv + self.conv_exp = ConvNormAct(prev_chs, exp_chs, kernel_size=1, apply_act=not cross_linear, **conv_kwargs) + prev_chs = exp_chs // 2 # expanded output is split in 2 for blocks and cross stage + + self.blocks = nn.Sequential() + for i in range(depth): + self.blocks.add_module(str(i), block_fn( + in_chs=prev_chs, + out_chs=block_out_chs, + dilation=dilation, + bottle_ratio=bottle_ratio, + groups=groups, + drop_path=block_dpr[i] if block_dpr is not None else 0., + **block_kwargs + )) + prev_chs = block_out_chs + + # transition convs + self.conv_transition = ConvNormAct(exp_chs, out_chs, kernel_size=1, **conv_kwargs) + + def forward(self, x): + x = self.conv_down(x) + x = self.conv_exp(x) + x1, x2 = x.split(self.expand_chs // 2, dim=1) + x1 = self.blocks(x1) + out = self.conv_transition(torch.cat([x1, x2], dim=1)) + return out + + +class DarkStage(nn.Module): + """DarkNet stage.""" + + def __init__( + self, + in_chs, + out_chs, + stride, + dilation, + depth, + block_ratio=1., + bottle_ratio=1., + groups=1, + first_dilation=None, + avg_down=False, + block_fn=BottleneckBlock, + block_dpr=None, + **block_kwargs + ): + super(DarkStage, self).__init__() + first_dilation = first_dilation or dilation + conv_kwargs = dict(act_layer=block_kwargs.get('act_layer'), norm_layer=block_kwargs.get('norm_layer')) + aa_layer = block_kwargs.pop('aa_layer', None) + + if avg_down: + self.conv_down = nn.Sequential( + nn.AvgPool2d(2) if stride == 2 else nn.Identity(), # FIXME dilation handling + ConvNormActAa(in_chs, out_chs, kernel_size=1, stride=1, groups=groups, **conv_kwargs) + ) + else: + self.conv_down = ConvNormActAa( + in_chs, out_chs, kernel_size=3, stride=stride, dilation=first_dilation, groups=groups, + aa_layer=aa_layer, **conv_kwargs) + + prev_chs = out_chs + block_out_chs = int(round(out_chs * block_ratio)) + self.blocks = nn.Sequential() + for i in range(depth): + self.blocks.add_module(str(i), block_fn( + in_chs=prev_chs, + out_chs=block_out_chs, + dilation=dilation, + bottle_ratio=bottle_ratio, + groups=groups, + drop_path=block_dpr[i] if block_dpr is not None else 0., + **block_kwargs + )) + prev_chs = block_out_chs + + def forward(self, x): + x = self.conv_down(x) + x = self.blocks(x) + return x + + +def create_csp_stem( + in_chans=3, + 
out_chs=32, + kernel_size=3, + stride=2, + pool='', + padding='', + act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, + aa_layer=None +): + stem = nn.Sequential() + feature_info = [] + if not isinstance(out_chs, (tuple, list)): + out_chs = [out_chs] + stem_depth = len(out_chs) + assert stem_depth + assert stride in (1, 2, 4) + prev_feat = None + prev_chs = in_chans + last_idx = stem_depth - 1 + stem_stride = 1 + for i, chs in enumerate(out_chs): + conv_name = f'conv{i + 1}' + conv_stride = 2 if (i == 0 and stride > 1) or (i == last_idx and stride > 2 and not pool) else 1 + if conv_stride > 1 and prev_feat is not None: + feature_info.append(prev_feat) + stem.add_module(conv_name, ConvNormAct( + prev_chs, chs, kernel_size, + stride=conv_stride, + padding=padding if i == 0 else '', + act_layer=act_layer, + norm_layer=norm_layer + )) + stem_stride *= conv_stride + prev_chs = chs + prev_feat = dict(num_chs=prev_chs, reduction=stem_stride, module='.'.join(['stem', conv_name])) + if pool: + assert stride > 2 + if prev_feat is not None: + feature_info.append(prev_feat) + if aa_layer is not None: + stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=1, padding=1)) + stem.add_module('aa', aa_layer(channels=prev_chs, stride=2)) + pool_name = 'aa' + else: + stem.add_module('pool', nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) + pool_name = 'pool' + stem_stride *= 2 + prev_feat = dict(num_chs=prev_chs, reduction=stem_stride, module='.'.join(['stem', pool_name])) + feature_info.append(prev_feat) + return stem, feature_info + + +def _get_stage_fn(stage_args): + stage_type = stage_args.pop('stage_type') + assert stage_type in ('dark', 'csp', 'cs3') + if stage_type == 'dark': + stage_args.pop('expand_ratio', None) + stage_args.pop('cross_linear', None) + stage_args.pop('down_growth', None) + stage_fn = DarkStage + elif stage_type == 'csp': + stage_fn = CrossStage + else: + stage_fn = CrossStage3 + return stage_fn, stage_args + + +def _get_block_fn(stage_args): + block_type = stage_args.pop('block_type') + assert block_type in ('dark', 'edge', 'bottle') + if block_type == 'dark': + return DarkBlock, stage_args + elif block_type == 'edge': + return EdgeBlock, stage_args + else: + return BottleneckBlock, stage_args + + +def _get_attn_fn(stage_args): + attn_layer = stage_args.pop('attn_layer') + attn_kwargs = stage_args.pop('attn_kwargs', None) or {} + if attn_layer is not None: + attn_layer = get_attn(attn_layer) + if attn_kwargs: + attn_layer = partial(attn_layer, **attn_kwargs) + return attn_layer, stage_args + + +def create_csp_stages( + cfg: CspModelCfg, + drop_path_rate: float, + output_stride: int, + stem_feat: Dict[str, Any] +): + cfg_dict = asdict(cfg.stages) + num_stages = len(cfg.stages.depth) + cfg_dict['block_dpr'] = [None] * num_stages if not drop_path_rate else \ + [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.stages.depth)).split(cfg.stages.depth)] + stage_args = [dict(zip(cfg_dict.keys(), values)) for values in zip(*cfg_dict.values())] + block_kwargs = dict( + act_layer=cfg.act_layer, + norm_layer=cfg.norm_layer, + ) + + dilation = 1 + net_stride = stem_feat['reduction'] + prev_chs = stem_feat['num_chs'] + prev_feat = stem_feat + feature_info = [] + stages = [] + for stage_idx, stage_args in enumerate(stage_args): + stage_fn, stage_args = _get_stage_fn(stage_args) + block_fn, stage_args = _get_block_fn(stage_args) + attn_fn, stage_args = _get_attn_fn(stage_args) + stride = stage_args.pop('stride') + if stride != 1 and prev_feat: + feature_info.append(prev_feat) + if 
net_stride >= output_stride and stride > 1: + dilation *= stride + stride = 1 + net_stride *= stride + first_dilation = 1 if dilation in (1, 2) else 2 + + stages += [stage_fn( + prev_chs, + **stage_args, + stride=stride, + first_dilation=first_dilation, + dilation=dilation, + block_fn=block_fn, + aa_layer=cfg.aa_layer, + attn_layer=attn_fn, # will be passed through stage as block_kwargs + **block_kwargs, + )] + prev_chs = stage_args['out_chs'] + prev_feat = dict(num_chs=prev_chs, reduction=net_stride, module=f'stages.{stage_idx}') + + feature_info.append(prev_feat) + return nn.Sequential(*stages), feature_info + + +class CspNet(nn.Module): + """Cross Stage Partial base model. + + Paper: `CSPNet: A New Backbone that can Enhance Learning Capability of CNN` - https://arxiv.org/abs/1911.11929 + Ref Impl: https://github.com/WongKinYiu/CrossStagePartialNetworks + + NOTE: There are differences in the way I handle the 1x1 'expansion' conv in this impl vs the + darknet impl. I did it this way for simplicity and less special cases. + """ + + def __init__( + self, + cfg: CspModelCfg, + in_chans=3, + num_classes=1000, + output_stride=32, + global_pool='avg', + drop_rate=0., + drop_path_rate=0., + zero_init_last=True + ): + super().__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + assert output_stride in (8, 16, 32) + layer_args = dict( + act_layer=cfg.act_layer, + norm_layer=cfg.norm_layer, + aa_layer=cfg.aa_layer + ) + self.feature_info = [] + + # Construct the stem + self.stem, stem_feat_info = create_csp_stem(in_chans, **asdict(cfg.stem), **layer_args) + self.feature_info.extend(stem_feat_info[:-1]) + + # Construct the stages + self.stages, stage_feat_info = create_csp_stages( + cfg, + drop_path_rate=drop_path_rate, + output_stride=output_stride, + stem_feat=stem_feat_info[-1], + ) + prev_chs = stage_feat_info[-1]['num_chs'] + self.feature_info.extend(stage_feat_info) + + # Construct the head + self.num_features = prev_chs + self.head = ClassifierHead( + in_chs=prev_chs, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate) + + named_apply(partial(_init_weights, zero_init_last=zero_init_last), self) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^stem', + blocks=r'^stages\.(\d+)' if coarse else [ + (r'^stages\.(\d+)\.blocks\.(\d+)', None), + (r'^stages\.(\d+)\..*transition', MATCH_PREV_GROUP), # map to last block in stage + (r'^stages\.(\d+)', (0,)), + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = self.stem(x) + x = self.stages(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _init_weights(module, name, zero_init_last=False): + if isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu') + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=0.01) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif 
zero_init_last and hasattr(module, 'zero_init_last'): + module.zero_init_last() + + +def _create_cspnet(variant, pretrained=False, **kwargs): + if variant.startswith('darknet') or variant.startswith('cspdarknet'): + # NOTE: DarkNet is one of few models with stride==1 features w/ 6 out_indices [0..5] + default_out_indices = (0, 1, 2, 3, 4, 5) + else: + default_out_indices = (0, 1, 2, 3, 4) + out_indices = kwargs.pop('out_indices', default_out_indices) + return build_model_with_cfg( + CspNet, variant, pretrained, + model_cfg=model_cfgs[variant], + feature_cfg=dict(flatten_sequential=True, out_indices=out_indices), + **kwargs) + + +@register_model +def cspresnet50(pretrained=False, **kwargs): + return _create_cspnet('cspresnet50', pretrained=pretrained, **kwargs) + + +@register_model +def cspresnet50d(pretrained=False, **kwargs): + return _create_cspnet('cspresnet50d', pretrained=pretrained, **kwargs) + + +@register_model +def cspresnet50w(pretrained=False, **kwargs): + return _create_cspnet('cspresnet50w', pretrained=pretrained, **kwargs) + + +@register_model +def cspresnext50(pretrained=False, **kwargs): + return _create_cspnet('cspresnext50', pretrained=pretrained, **kwargs) + + +@register_model +def cspdarknet53(pretrained=False, **kwargs): + return _create_cspnet('cspdarknet53', pretrained=pretrained, **kwargs) + + +@register_model +def darknet17(pretrained=False, **kwargs): + return _create_cspnet('darknet17', pretrained=pretrained, **kwargs) + + +@register_model +def darknet21(pretrained=False, **kwargs): + return _create_cspnet('darknet21', pretrained=pretrained, **kwargs) + + +@register_model +def sedarknet21(pretrained=False, **kwargs): + return _create_cspnet('sedarknet21', pretrained=pretrained, **kwargs) + + +@register_model +def darknet53(pretrained=False, **kwargs): + return _create_cspnet('darknet53', pretrained=pretrained, **kwargs) + + +@register_model +def darknetaa53(pretrained=False, **kwargs): + return _create_cspnet('darknetaa53', pretrained=pretrained, **kwargs) + + +@register_model +def cs3darknet_s(pretrained=False, **kwargs): + return _create_cspnet('cs3darknet_s', pretrained=pretrained, **kwargs) + + +@register_model +def cs3darknet_m(pretrained=False, **kwargs): + return _create_cspnet('cs3darknet_m', pretrained=pretrained, **kwargs) + + +@register_model +def cs3darknet_l(pretrained=False, **kwargs): + return _create_cspnet('cs3darknet_l', pretrained=pretrained, **kwargs) + + +@register_model +def cs3darknet_x(pretrained=False, **kwargs): + return _create_cspnet('cs3darknet_x', pretrained=pretrained, **kwargs) + + +@register_model +def cs3darknet_focus_s(pretrained=False, **kwargs): + return _create_cspnet('cs3darknet_focus_s', pretrained=pretrained, **kwargs) + + +@register_model +def cs3darknet_focus_m(pretrained=False, **kwargs): + return _create_cspnet('cs3darknet_focus_m', pretrained=pretrained, **kwargs) + + +@register_model +def cs3darknet_focus_l(pretrained=False, **kwargs): + return _create_cspnet('cs3darknet_focus_l', pretrained=pretrained, **kwargs) + + +@register_model +def cs3darknet_focus_x(pretrained=False, **kwargs): + return _create_cspnet('cs3darknet_focus_x', pretrained=pretrained, **kwargs) + + +@register_model +def cs3sedarknet_l(pretrained=False, **kwargs): + return _create_cspnet('cs3sedarknet_l', pretrained=pretrained, **kwargs) + + +@register_model +def cs3sedarknet_x(pretrained=False, **kwargs): + return _create_cspnet('cs3sedarknet_x', pretrained=pretrained, **kwargs) + + +@register_model +def cs3sedarknet_xdw(pretrained=False, 
**kwargs): + return _create_cspnet('cs3sedarknet_xdw', pretrained=pretrained, **kwargs) + + +@register_model +def cs3edgenet_x(pretrained=False, **kwargs): + return _create_cspnet('cs3edgenet_x', pretrained=pretrained, **kwargs) + + +@register_model +def cs3se_edgenet_x(pretrained=False, **kwargs): + return _create_cspnet('cs3se_edgenet_x', pretrained=pretrained, **kwargs) \ No newline at end of file diff --git a/custom_timm/models/deit.py b/custom_timm/models/deit.py new file mode 100644 index 0000000000000000000000000000000000000000..19d9e14d1420b45383829cfe00c822216994b114 --- /dev/null +++ b/custom_timm/models/deit.py @@ -0,0 +1,449 @@ +""" DeiT - Data-efficient Image Transformers + +DeiT model defs and weights from https://github.com/facebookresearch/deit, original copyright below + +paper: `DeiT: Data-efficient Image Transformers` - https://arxiv.org/abs/2012.12877 + +paper: `DeiT III: Revenge of the ViT` - https://arxiv.org/abs/2204.07118 + +Modifications copyright 2021, Ross Wightman +""" +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. +from functools import partial + +import torch +from torch import nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from custom_timm.models.vision_transformer import VisionTransformer, trunc_normal_, checkpoint_filter_fn + +from .helpers import build_model_with_cfg, checkpoint_seq +from .registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # deit models (FB weights) + 'deit_tiny_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth'), + 'deit_small_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_small_patch16_224-cd65a155.pth'), + 'deit_base_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_224-b5f2ef4d.pth'), + 'deit_base_patch16_384': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_patch16_384-8de9b5d1.pth', + input_size=(3, 384, 384), crop_pct=1.0), + + 'deit_tiny_distilled_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_tiny_distilled_patch16_224-b40b3cf7.pth', + classifier=('head', 'head_dist')), + 'deit_small_distilled_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_small_distilled_patch16_224-649709d9.pth', + classifier=('head', 'head_dist')), + 'deit_base_distilled_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_224-df68dfff.pth', + classifier=('head', 'head_dist')), + 'deit_base_distilled_patch16_384': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_base_distilled_patch16_384-d0272ac0.pth', + input_size=(3, 384, 384), crop_pct=1.0, + classifier=('head', 'head_dist')), + + 'deit3_small_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_small_224_1k.pth'), + 'deit3_small_patch16_384': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_small_384_1k.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'deit3_medium_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_medium_224_1k.pth'), + 'deit3_base_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_base_224_1k.pth'), + 'deit3_base_patch16_384': _cfg( + 
url='https://dl.fbaipublicfiles.com/deit/deit_3_base_384_1k.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'deit3_large_patch16_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_large_224_1k.pth'), + 'deit3_large_patch16_384': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_large_384_1k.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'deit3_huge_patch14_224': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_huge_224_1k.pth'), + + 'deit3_small_patch16_224_in21ft1k': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_small_224_21k.pth', + crop_pct=1.0), + 'deit3_small_patch16_384_in21ft1k': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_small_384_21k.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'deit3_medium_patch16_224_in21ft1k': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_medium_224_21k.pth', + crop_pct=1.0), + 'deit3_base_patch16_224_in21ft1k': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_base_224_21k.pth', + crop_pct=1.0), + 'deit3_base_patch16_384_in21ft1k': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_base_384_21k.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'deit3_large_patch16_224_in21ft1k': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_large_224_21k.pth', + crop_pct=1.0), + 'deit3_large_patch16_384_in21ft1k': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_large_384_21k.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'deit3_huge_patch14_224_in21ft1k': _cfg( + url='https://dl.fbaipublicfiles.com/deit/deit_3_huge_224_21k_v1.pth', + crop_pct=1.0), +} + + +class VisionTransformerDistilled(VisionTransformer): + """ Vision Transformer w/ Distillation Token and Head + + Distillation token & head support for `DeiT: Data-efficient Image Transformers` + - https://arxiv.org/abs/2012.12877 + """ + + def __init__(self, *args, **kwargs): + weight_init = kwargs.pop('weight_init', '') + super().__init__(*args, **kwargs, weight_init='skip') + assert self.global_pool in ('token',) + + self.num_prefix_tokens = 2 + self.dist_token = nn.Parameter(torch.zeros(1, 1, self.embed_dim)) + self.pos_embed = nn.Parameter( + torch.zeros(1, self.patch_embed.num_patches + self.num_prefix_tokens, self.embed_dim)) + self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if self.num_classes > 0 else nn.Identity() + self.distilled_training = False # must set this True to train w/ distillation token + + self.init_weights(weight_init) + + def init_weights(self, mode=''): + trunc_normal_(self.dist_token, std=.02) + super().init_weights(mode=mode) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^cls_token|pos_embed|patch_embed|dist_token', + blocks=[ + (r'^blocks\.(\d+)', None), + (r'^norm', (99999,))] # final norm w/ last block + ) + + @torch.jit.ignore + def get_classifier(self): + return self.head, self.head_dist + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() + + @torch.jit.ignore + def set_distilled_training(self, enable=True): + self.distilled_training = enable + + def forward_features(self, x) -> torch.Tensor: + x = self.patch_embed(x) + x = torch.cat(( + self.cls_token.expand(x.shape[0], -1, -1), + self.dist_token.expand(x.shape[0], -1, -1), x), dim=1) + x = self.pos_drop(x + self.pos_embed) + if self.grad_checkpointing 
and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False) -> torch.Tensor: + if pre_logits: + return (x[:, 0] + x[:, 1]) / 2 + x, x_dist = self.head(x[:, 0]), self.head_dist(x[:, 1]) + if self.distilled_training and self.training and not torch.jit.is_scripting(): + # only return separate classification predictions when training in distilled mode + return x, x_dist + else: + # during standard train / finetune, inference average the classifier predictions + return (x + x_dist) / 2 + + +def _create_deit(variant, pretrained=False, distilled=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + model_cls = VisionTransformerDistilled if distilled else VisionTransformer + model = build_model_with_cfg( + model_cls, variant, pretrained, + pretrained_filter_fn=partial(checkpoint_filter_fn, adapt_layer_scale=True), + **kwargs) + return model + + +@register_model +def deit_tiny_patch16_224(pretrained=False, **kwargs): + """ DeiT-tiny model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_deit('deit_tiny_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit_small_patch16_224(pretrained=False, **kwargs): + """ DeiT-small model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_deit('deit_small_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit_base_patch16_224(pretrained=False, **kwargs): + """ DeiT base model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_deit('deit_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit_base_patch16_384(pretrained=False, **kwargs): + """ DeiT base model @ 384x384 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_deit('deit_base_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit_tiny_distilled_patch16_224(pretrained=False, **kwargs): + """ DeiT-tiny distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_deit( + 'deit_tiny_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def deit_small_distilled_patch16_224(pretrained=False, **kwargs): + """ DeiT-small distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. 
+ """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_deit( + 'deit_small_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def deit_base_distilled_patch16_224(pretrained=False, **kwargs): + """ DeiT-base distilled model @ 224x224 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_deit( + 'deit_base_distilled_patch16_224', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def deit_base_distilled_patch16_384(pretrained=False, **kwargs): + """ DeiT-base distilled model @ 384x384 from paper (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_deit( + 'deit_base_distilled_patch16_384', pretrained=pretrained, distilled=True, **model_kwargs) + return model + + +@register_model +def deit3_small_patch16_224(pretrained=False, **kwargs): + """ DeiT-3 small model @ 224x224 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_small_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_small_patch16_384(pretrained=False, **kwargs): + """ DeiT-3 small model @ 384x384 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_small_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_medium_patch16_224(pretrained=False, **kwargs): + """ DeiT-3 medium model @ 224x224 (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=512, depth=12, num_heads=8, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_medium_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_base_patch16_224(pretrained=False, **kwargs): + """ DeiT-3 base model @ 224x224 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_base_patch16_384(pretrained=False, **kwargs): + """ DeiT-3 base model @ 384x384 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-1k weights from https://github.com/facebookresearch/deit. 
+ """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_base_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_large_patch16_224(pretrained=False, **kwargs): + """ DeiT-3 large model @ 224x224 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_large_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_large_patch16_384(pretrained=False, **kwargs): + """ DeiT-3 large model @ 384x384 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_large_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_huge_patch14_224(pretrained=False, **kwargs): + """ DeiT-3 base model @ 384x384 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_huge_patch14_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_small_patch16_224_in21ft1k(pretrained=False, **kwargs): + """ DeiT-3 small model @ 224x224 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_small_patch16_224_in21ft1k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_small_patch16_384_in21ft1k(pretrained=False, **kwargs): + """ DeiT-3 small model @ 384x384 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=6, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_small_patch16_384_in21ft1k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_medium_patch16_224_in21ft1k(pretrained=False, **kwargs): + """ DeiT-3 medium model @ 224x224 (https://arxiv.org/abs/2012.12877). + ImageNet-1k weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=512, depth=12, num_heads=8, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_medium_patch16_224_in21ft1k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_base_patch16_224_in21ft1k(pretrained=False, **kwargs): + """ DeiT-3 base model @ 224x224 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit. 
+ """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_base_patch16_224_in21ft1k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_base_patch16_384_in21ft1k(pretrained=False, **kwargs): + """ DeiT-3 base model @ 384x384 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_base_patch16_384_in21ft1k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_large_patch16_224_in21ft1k(pretrained=False, **kwargs): + """ DeiT-3 large model @ 224x224 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_large_patch16_224_in21ft1k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_large_patch16_384_in21ft1k(pretrained=False, **kwargs): + """ DeiT-3 large model @ 384x384 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=16, embed_dim=1024, depth=24, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_large_patch16_384_in21ft1k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def deit3_huge_patch14_224_in21ft1k(pretrained=False, **kwargs): + """ DeiT-3 base model @ 384x384 from paper (https://arxiv.org/abs/2204.07118). + ImageNet-21k pretrained weights from https://github.com/facebookresearch/deit. + """ + model_kwargs = dict( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, no_embed_class=True, init_values=1e-6, **kwargs) + model = _create_deit('deit3_huge_patch14_224_in21ft1k', pretrained=pretrained, **model_kwargs) + return model diff --git a/custom_timm/models/densenet.py b/custom_timm/models/densenet.py new file mode 100644 index 0000000000000000000000000000000000000000..357afe0a341389787067efd66207108d15400a84 --- /dev/null +++ b/custom_timm/models/densenet.py @@ -0,0 +1,400 @@ +"""Pytorch Densenet implementation w/ tweaks +This file is a copy of https://github.com/pytorch/vision 'densenet.py' (BSD-3-Clause) with +fixed kwargs passthrough and addition of dynamic global avg/max pool. 
+""" +import re +from collections import OrderedDict +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as cp +from torch.jit.annotations import List + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, MATCH_PREV_GROUP +from .layers import BatchNormAct2d, create_norm_act_layer, BlurPool2d, create_classifier +from .registry import register_model + +__all__ = ['DenseNet'] + + +def _cfg(url=''): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'features.conv0', 'classifier': 'classifier', + } + + +default_cfgs = { + 'densenet121': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/densenet121_ra-50efcf5c.pth'), + 'densenet121d': _cfg(url=''), + 'densenetblur121d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/densenetblur121d_ra-100dcfbc.pth'), + 'densenet169': _cfg(url='https://download.pytorch.org/models/densenet169-b2777c0a.pth'), + 'densenet201': _cfg(url='https://download.pytorch.org/models/densenet201-c1103571.pth'), + 'densenet161': _cfg(url='https://download.pytorch.org/models/densenet161-8d451a50.pth'), + 'densenet264': _cfg(url=''), + 'densenet264d_iabn': _cfg(url=''), + 'tv_densenet121': _cfg(url='https://download.pytorch.org/models/densenet121-a639ec97.pth'), +} + + +class DenseLayer(nn.Module): + def __init__( + self, num_input_features, growth_rate, bn_size, norm_layer=BatchNormAct2d, + drop_rate=0., memory_efficient=False): + super(DenseLayer, self).__init__() + self.add_module('norm1', norm_layer(num_input_features)), + self.add_module('conv1', nn.Conv2d( + num_input_features, bn_size * growth_rate, kernel_size=1, stride=1, bias=False)), + self.add_module('norm2', norm_layer(bn_size * growth_rate)), + self.add_module('conv2', nn.Conv2d( + bn_size * growth_rate, growth_rate, kernel_size=3, stride=1, padding=1, bias=False)), + self.drop_rate = float(drop_rate) + self.memory_efficient = memory_efficient + + def bottleneck_fn(self, xs): + # type: (List[torch.Tensor]) -> torch.Tensor + concated_features = torch.cat(xs, 1) + bottleneck_output = self.conv1(self.norm1(concated_features)) # noqa: T484 + return bottleneck_output + + # todo: rewrite when torchscript supports any + def any_requires_grad(self, x): + # type: (List[torch.Tensor]) -> bool + for tensor in x: + if tensor.requires_grad: + return True + return False + + @torch.jit.unused # noqa: T484 + def call_checkpoint_bottleneck(self, x): + # type: (List[torch.Tensor]) -> torch.Tensor + def closure(*xs): + return self.bottleneck_fn(xs) + + return cp.checkpoint(closure, *x) + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (List[torch.Tensor]) -> (torch.Tensor) + pass + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (torch.Tensor) -> (torch.Tensor) + pass + + # torchscript does not yet support *args, so we overload method + # allowing it to take either a List[Tensor] or single Tensor + def forward(self, x): # noqa: F811 + if isinstance(x, torch.Tensor): + prev_features = [x] + else: + prev_features = x + + if self.memory_efficient and self.any_requires_grad(prev_features): + if torch.jit.is_scripting(): + raise Exception("Memory Efficient not supported in JIT") 
+ bottleneck_output = self.call_checkpoint_bottleneck(prev_features) + else: + bottleneck_output = self.bottleneck_fn(prev_features) + + new_features = self.conv2(self.norm2(bottleneck_output)) + if self.drop_rate > 0: + new_features = F.dropout(new_features, p=self.drop_rate, training=self.training) + return new_features + + +class DenseBlock(nn.ModuleDict): + _version = 2 + + def __init__( + self, num_layers, num_input_features, bn_size, growth_rate, norm_layer=BatchNormAct2d, + drop_rate=0., memory_efficient=False): + super(DenseBlock, self).__init__() + for i in range(num_layers): + layer = DenseLayer( + num_input_features + i * growth_rate, + growth_rate=growth_rate, + bn_size=bn_size, + norm_layer=norm_layer, + drop_rate=drop_rate, + memory_efficient=memory_efficient, + ) + self.add_module('denselayer%d' % (i + 1), layer) + + def forward(self, init_features): + features = [init_features] + for name, layer in self.items(): + new_features = layer(features) + features.append(new_features) + return torch.cat(features, 1) + + +class DenseTransition(nn.Sequential): + def __init__(self, num_input_features, num_output_features, norm_layer=BatchNormAct2d, aa_layer=None): + super(DenseTransition, self).__init__() + self.add_module('norm', norm_layer(num_input_features)) + self.add_module('conv', nn.Conv2d( + num_input_features, num_output_features, kernel_size=1, stride=1, bias=False)) + if aa_layer is not None: + self.add_module('pool', aa_layer(num_output_features, stride=2)) + else: + self.add_module('pool', nn.AvgPool2d(kernel_size=2, stride=2)) + + +class DenseNet(nn.Module): + r"""Densenet-BC model class, based on + `"Densely Connected Convolutional Networks" `_ + + Args: + growth_rate (int) - how many filters to add each layer (`k` in paper) + block_config (list of 4 ints) - how many layers in each pooling block + bn_size (int) - multiplicative factor for number of bottle neck layers + (i.e. bn_size * k features in the bottleneck layer) + drop_rate (float) - dropout rate after each dense layer + num_classes (int) - number of classification classes + memory_efficient (bool) - If True, uses checkpointing. Much more memory efficient, + but slower. Default: *False*. 
See `"paper" `_ + """ + + def __init__( + self, growth_rate=32, block_config=(6, 12, 24, 16), num_classes=1000, in_chans=3, global_pool='avg', + bn_size=4, stem_type='', norm_layer=BatchNormAct2d, aa_layer=None, drop_rate=0, + memory_efficient=False, aa_stem_only=True): + self.num_classes = num_classes + self.drop_rate = drop_rate + super(DenseNet, self).__init__() + + # Stem + deep_stem = 'deep' in stem_type # 3x3 deep stem + num_init_features = growth_rate * 2 + if aa_layer is None: + stem_pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + else: + stem_pool = nn.Sequential(*[ + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + aa_layer(channels=num_init_features, stride=2)]) + if deep_stem: + stem_chs_1 = stem_chs_2 = growth_rate + if 'tiered' in stem_type: + stem_chs_1 = 3 * (growth_rate // 4) + stem_chs_2 = num_init_features if 'narrow' in stem_type else 6 * (growth_rate // 4) + self.features = nn.Sequential(OrderedDict([ + ('conv0', nn.Conv2d(in_chans, stem_chs_1, 3, stride=2, padding=1, bias=False)), + ('norm0', norm_layer(stem_chs_1)), + ('conv1', nn.Conv2d(stem_chs_1, stem_chs_2, 3, stride=1, padding=1, bias=False)), + ('norm1', norm_layer(stem_chs_2)), + ('conv2', nn.Conv2d(stem_chs_2, num_init_features, 3, stride=1, padding=1, bias=False)), + ('norm2', norm_layer(num_init_features)), + ('pool0', stem_pool), + ])) + else: + self.features = nn.Sequential(OrderedDict([ + ('conv0', nn.Conv2d(in_chans, num_init_features, kernel_size=7, stride=2, padding=3, bias=False)), + ('norm0', norm_layer(num_init_features)), + ('pool0', stem_pool), + ])) + self.feature_info = [ + dict(num_chs=num_init_features, reduction=2, module=f'features.norm{2 if deep_stem else 0}')] + current_stride = 4 + + # DenseBlocks + num_features = num_init_features + for i, num_layers in enumerate(block_config): + block = DenseBlock( + num_layers=num_layers, + num_input_features=num_features, + bn_size=bn_size, + growth_rate=growth_rate, + norm_layer=norm_layer, + drop_rate=drop_rate, + memory_efficient=memory_efficient + ) + module_name = f'denseblock{(i + 1)}' + self.features.add_module(module_name, block) + num_features = num_features + num_layers * growth_rate + transition_aa_layer = None if aa_stem_only else aa_layer + if i != len(block_config) - 1: + self.feature_info += [ + dict(num_chs=num_features, reduction=current_stride, module='features.' + module_name)] + current_stride *= 2 + trans = DenseTransition( + num_input_features=num_features, num_output_features=num_features // 2, + norm_layer=norm_layer, aa_layer=transition_aa_layer) + self.features.add_module(f'transition{i + 1}', trans) + num_features = num_features // 2 + + # Final batch norm + self.features.add_module('norm5', norm_layer(num_features)) + + self.feature_info += [dict(num_chs=num_features, reduction=current_stride, module='features.norm5')] + self.num_features = num_features + + # Linear layer + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + # Official init from torch repo. 
+ for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.constant_(m.bias, 0) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^features\.conv[012]|features\.norm[012]|features\.pool[012]', + blocks=r'^features\.(?:denseblock|transition)(\d+)' if coarse else [ + (r'^features\.denseblock(\d+)\.denselayer(\d+)', None), + (r'^features\.transition(\d+)', MATCH_PREV_GROUP) # FIXME combine with previous denselayer + ] + ) + return matcher + + @torch.jit.ignore + def get_classifier(self): + return self.classifier + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + return self.features(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.global_pool(x) + # both classifier and block drop? + # if self.drop_rate > 0.: + # x = F.dropout(x, p=self.drop_rate, training=self.training) + x = self.classifier(x) + return x + + +def _filter_torchvision_pretrained(state_dict): + pattern = re.compile( + r'^(.*denselayer\d+\.(?:norm|relu|conv))\.((?:[12])\.(?:weight|bias|running_mean|running_var))$') + + for key in list(state_dict.keys()): + res = pattern.match(key) + if res: + new_key = res.group(1) + res.group(2) + state_dict[new_key] = state_dict[key] + del state_dict[key] + return state_dict + + +def _create_densenet(variant, growth_rate, block_config, pretrained, **kwargs): + kwargs['growth_rate'] = growth_rate + kwargs['block_config'] = block_config + return build_model_with_cfg( + DenseNet, variant, pretrained, + feature_cfg=dict(flatten_sequential=True), pretrained_filter_fn=_filter_torchvision_pretrained, + **kwargs) + + +@register_model +def densenet121(pretrained=False, **kwargs): + r"""Densenet-121 model from + `"Densely Connected Convolutional Networks" ` + """ + model = _create_densenet( + 'densenet121', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, **kwargs) + return model + + +@register_model +def densenetblur121d(pretrained=False, **kwargs): + r"""Densenet-121 model from + `"Densely Connected Convolutional Networks" ` + """ + model = _create_densenet( + 'densenetblur121d', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, stem_type='deep', + aa_layer=BlurPool2d, **kwargs) + return model + + +@register_model +def densenet121d(pretrained=False, **kwargs): + r"""Densenet-121 model from + `"Densely Connected Convolutional Networks" ` + """ + model = _create_densenet( + 'densenet121d', growth_rate=32, block_config=(6, 12, 24, 16), stem_type='deep', + pretrained=pretrained, **kwargs) + return model + + +@register_model +def densenet169(pretrained=False, **kwargs): + r"""Densenet-169 model from + `"Densely Connected Convolutional Networks" ` + """ + model = _create_densenet( + 'densenet169', growth_rate=32, block_config=(6, 12, 32, 32), pretrained=pretrained, **kwargs) + return model + + +@register_model +def densenet201(pretrained=False, **kwargs): + r"""Densenet-201 model from + `"Densely Connected Convolutional Networks" ` + """ + model = _create_densenet( + 'densenet201', growth_rate=32, block_config=(6, 12, 48, 32), pretrained=pretrained, **kwargs) + return model + + +@register_model +def densenet161(pretrained=False, 
**kwargs): + r"""Densenet-161 model from + `"Densely Connected Convolutional Networks" ` + """ + model = _create_densenet( + 'densenet161', growth_rate=48, block_config=(6, 12, 36, 24), pretrained=pretrained, **kwargs) + return model + + +@register_model +def densenet264(pretrained=False, **kwargs): + r"""Densenet-264 model from + `"Densely Connected Convolutional Networks" ` + """ + model = _create_densenet( + 'densenet264', growth_rate=48, block_config=(6, 12, 64, 48), pretrained=pretrained, **kwargs) + return model + + +@register_model +def densenet264d_iabn(pretrained=False, **kwargs): + r"""Densenet-264 model with deep stem and Inplace-ABN + """ + def norm_act_fn(num_features, **kwargs): + return create_norm_act_layer('iabn', num_features, act_layer='leaky_relu', **kwargs) + model = _create_densenet( + 'densenet264d_iabn', growth_rate=48, block_config=(6, 12, 64, 48), stem_type='deep', + norm_layer=norm_act_fn, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tv_densenet121(pretrained=False, **kwargs): + r"""Densenet-121 model with original Torchvision weights, from + `"Densely Connected Convolutional Networks" ` + """ + model = _create_densenet( + 'tv_densenet121', growth_rate=32, block_config=(6, 12, 24, 16), pretrained=pretrained, **kwargs) + return model diff --git a/custom_timm/models/dla.py b/custom_timm/models/dla.py new file mode 100644 index 0000000000000000000000000000000000000000..e61146e2449e6599f4e584578e0550493eb7111a --- /dev/null +++ b/custom_timm/models/dla.py @@ -0,0 +1,474 @@ +""" Deep Layer Aggregation and DLA w/ Res2Net +DLA original adapted from Official Pytorch impl at: +DLA Paper: `Deep Layer Aggregation` - https://arxiv.org/abs/1707.06484 + +Res2Net additions from: https://github.com/gasvn/Res2Net/ +Res2Net Paper: `Res2Net: A New Multi-scale Backbone Architecture` - https://arxiv.org/abs/1904.01169 +""" +import math +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import create_classifier +from .registry import register_model + +__all__ = ['DLA'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'base_layer.0', 'classifier': 'fc', + **kwargs + } + + +default_cfgs = { + 'dla34': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla34-2b83ff04.pth'), + 'dla46_c': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla46_c-9b68d685.pth'), + 'dla46x_c': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla46x_c-6bc5b5c8.pth'), + 'dla60x_c': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla60x_c-a38e054a.pth'), + 'dla60': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla60-9e91bd4d.pth'), + 'dla60x': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla60x-6818f6bb.pth'), + 'dla102': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla102-21f57b54.pth'), + 'dla102x': 
_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla102x-7ec0aa2a.pth'), + 'dla102x2': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla102x2-ac4239c4.pth'), + 'dla169': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dla169-7c767967.pth'), + 'dla60_res2net': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net_dla60_4s-d88db7f9.pth'), + 'dla60_res2next': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2next_dla60_4s-d327927b.pth'), +} + + +class DlaBasic(nn.Module): + """DLA Basic""" + + def __init__(self, inplanes, planes, stride=1, dilation=1, **_): + super(DlaBasic, self).__init__() + self.conv1 = nn.Conv2d( + inplanes, planes, kernel_size=3, stride=stride, padding=dilation, bias=False, dilation=dilation) + self.bn1 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + planes, planes, kernel_size=3, stride=1, padding=dilation, bias=False, dilation=dilation) + self.bn2 = nn.BatchNorm2d(planes) + self.stride = stride + + def forward(self, x, shortcut=None, children: Optional[List[torch.Tensor]] = None): + if shortcut is None: + shortcut = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += shortcut + out = self.relu(out) + + return out + + +class DlaBottleneck(nn.Module): + """DLA/DLA-X Bottleneck""" + expansion = 2 + + def __init__(self, inplanes, outplanes, stride=1, dilation=1, cardinality=1, base_width=64): + super(DlaBottleneck, self).__init__() + self.stride = stride + mid_planes = int(math.floor(outplanes * (base_width / 64)) * cardinality) + mid_planes = mid_planes // self.expansion + + self.conv1 = nn.Conv2d(inplanes, mid_planes, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(mid_planes) + self.conv2 = nn.Conv2d( + mid_planes, mid_planes, kernel_size=3, stride=stride, padding=dilation, + bias=False, dilation=dilation, groups=cardinality) + self.bn2 = nn.BatchNorm2d(mid_planes) + self.conv3 = nn.Conv2d(mid_planes, outplanes, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(outplanes) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, shortcut: Optional[torch.Tensor] = None, children: Optional[List[torch.Tensor]] = None): + if shortcut is None: + shortcut = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += shortcut + out = self.relu(out) + + return out + + +class DlaBottle2neck(nn.Module): + """ Res2Net/Res2NeXT DLA Bottleneck + Adapted from https://github.com/gasvn/Res2Net/blob/master/dla.py + """ + expansion = 2 + + def __init__(self, inplanes, outplanes, stride=1, dilation=1, scale=4, cardinality=8, base_width=4): + super(DlaBottle2neck, self).__init__() + self.is_first = stride > 1 + self.scale = scale + mid_planes = int(math.floor(outplanes * (base_width / 64)) * cardinality) + mid_planes = mid_planes // self.expansion + self.width = mid_planes + + self.conv1 = nn.Conv2d(inplanes, mid_planes * scale, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(mid_planes * scale) + + num_scale_convs = max(1, scale - 1) + convs = [] + bns = [] + for _ in range(num_scale_convs): + convs.append(nn.Conv2d( + mid_planes, mid_planes, kernel_size=3, stride=stride, + 
padding=dilation, dilation=dilation, groups=cardinality, bias=False)) + bns.append(nn.BatchNorm2d(mid_planes)) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1) if self.is_first else None + + self.conv3 = nn.Conv2d(mid_planes * scale, outplanes, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(outplanes) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x, shortcut: Optional[torch.Tensor] = None, children: Optional[List[torch.Tensor]] = None): + if shortcut is None: + shortcut = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + spx = torch.split(out, self.width, 1) + spo = [] + sp = spx[0] # redundant, for torchscript + for i, (conv, bn) in enumerate(zip(self.convs, self.bns)): + if i == 0 or self.is_first: + sp = spx[i] + else: + sp = sp + spx[i] + sp = conv(sp) + sp = bn(sp) + sp = self.relu(sp) + spo.append(sp) + if self.scale > 1: + if self.pool is not None: # self.is_first == True, None check for torchscript + spo.append(self.pool(spx[-1])) + else: + spo.append(spx[-1]) + out = torch.cat(spo, 1) + + out = self.conv3(out) + out = self.bn3(out) + + out += shortcut + out = self.relu(out) + + return out + + +class DlaRoot(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, shortcut): + super(DlaRoot, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, 1, stride=1, bias=False, padding=(kernel_size - 1) // 2) + self.bn = nn.BatchNorm2d(out_channels) + self.relu = nn.ReLU(inplace=True) + self.shortcut = shortcut + + def forward(self, x_children: List[torch.Tensor]): + x = self.conv(torch.cat(x_children, 1)) + x = self.bn(x) + if self.shortcut: + x += x_children[0] + x = self.relu(x) + + return x + + +class DlaTree(nn.Module): + def __init__( + self, levels, block, in_channels, out_channels, stride=1, dilation=1, cardinality=1, + base_width=64, level_root=False, root_dim=0, root_kernel_size=1, root_shortcut=False): + super(DlaTree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + self.downsample = nn.MaxPool2d(stride, stride=stride) if stride > 1 else nn.Identity() + self.project = nn.Identity() + cargs = dict(dilation=dilation, cardinality=cardinality, base_width=base_width) + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, **cargs) + self.tree2 = block(out_channels, out_channels, 1, **cargs) + if in_channels != out_channels: + # NOTE the official impl/weights have project layers in levels > 1 case that are never + # used, I've moved the project layer here to avoid wasted params but old checkpoints will + # need strict=False while loading. 
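+                # 1x1 conv + BN projection so the shortcut matches out_channels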
+ self.project = nn.Sequential( + nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False), + nn.BatchNorm2d(out_channels)) + self.root = DlaRoot(root_dim, out_channels, root_kernel_size, root_shortcut) + else: + cargs.update(dict(root_kernel_size=root_kernel_size, root_shortcut=root_shortcut)) + self.tree1 = DlaTree( + levels - 1, block, in_channels, out_channels, stride, root_dim=0, **cargs) + self.tree2 = DlaTree( + levels - 1, block, out_channels, out_channels, root_dim=root_dim + out_channels, **cargs) + self.root = None + self.level_root = level_root + self.root_dim = root_dim + self.levels = levels + + def forward(self, x, shortcut: Optional[torch.Tensor] = None, children: Optional[List[torch.Tensor]] = None): + if children is None: + children = [] + bottom = self.downsample(x) + shortcut = self.project(bottom) + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, shortcut) + if self.root is not None: # levels == 1 + x2 = self.tree2(x1) + x = self.root([x2, x1] + children) + else: + children.append(x1) + x = self.tree2(x1, None, children) + return x + + +class DLA(nn.Module): + def __init__( + self, levels, channels, output_stride=32, num_classes=1000, in_chans=3, global_pool='avg', + cardinality=1, base_width=64, block=DlaBottle2neck, shortcut_root=False, drop_rate=0.0): + super(DLA, self).__init__() + self.channels = channels + self.num_classes = num_classes + self.cardinality = cardinality + self.base_width = base_width + self.drop_rate = drop_rate + assert output_stride == 32 # FIXME support dilation + + self.base_layer = nn.Sequential( + nn.Conv2d(in_chans, channels[0], kernel_size=7, stride=1, padding=3, bias=False), + nn.BatchNorm2d(channels[0]), + nn.ReLU(inplace=True)) + self.level0 = self._make_conv_level(channels[0], channels[0], levels[0]) + self.level1 = self._make_conv_level(channels[0], channels[1], levels[1], stride=2) + cargs = dict(cardinality=cardinality, base_width=base_width, root_shortcut=shortcut_root) + self.level2 = DlaTree(levels[2], block, channels[1], channels[2], 2, level_root=False, **cargs) + self.level3 = DlaTree(levels[3], block, channels[2], channels[3], 2, level_root=True, **cargs) + self.level4 = DlaTree(levels[4], block, channels[3], channels[4], 2, level_root=True, **cargs) + self.level5 = DlaTree(levels[5], block, channels[4], channels[5], 2, level_root=True, **cargs) + self.feature_info = [ + dict(num_chs=channels[0], reduction=1, module='level0'), # rare to have a meaningful stride 1 level + dict(num_chs=channels[1], reduction=2, module='level1'), + dict(num_chs=channels[2], reduction=4, module='level2'), + dict(num_chs=channels[3], reduction=8, module='level3'), + dict(num_chs=channels[4], reduction=16, module='level4'), + dict(num_chs=channels[5], reduction=32, module='level5'), + ] + + self.num_features = channels[-1] + self.global_pool, self.fc = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool, use_conv=True) + self.flatten = nn.Flatten(1) if global_pool else nn.Identity() + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. 
/ n)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend([ + nn.Conv2d( + inplanes, planes, kernel_size=3, stride=stride if i == 0 else 1, + padding=dilation, bias=False, dilation=dilation), + nn.BatchNorm2d(planes), + nn.ReLU(inplace=True)]) + inplanes = planes + return nn.Sequential(*modules) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^base_layer', + blocks=r'^level(\d+)' if coarse else [ + # an unusual arch, this achieves somewhat more granularity without getting super messy + (r'^level(\d+)\.tree(\d+)', None), + (r'^level(\d+)\.root', (2,)), + (r'^level(\d+)', (1,)) + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.fc = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool, use_conv=True) + self.flatten = nn.Flatten(1) if global_pool else nn.Identity() + + def forward_features(self, x): + x = self.base_layer(x) + x = self.level0(x) + x = self.level1(x) + x = self.level2(x) + x = self.level3(x) + x = self.level4(x) + x = self.level5(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + if pre_logits: + return x.flatten(1) + else: + x = self.fc(x) + return self.flatten(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_dla(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + DLA, variant, pretrained, + pretrained_strict=False, + feature_cfg=dict(out_indices=(1, 2, 3, 4, 5)), + **kwargs) + + +@register_model +def dla60_res2net(pretrained=False, **kwargs): + model_kwargs = dict( + levels=(1, 1, 1, 2, 3, 1), channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottle2neck, cardinality=1, base_width=28, **kwargs) + return _create_dla('dla60_res2net', pretrained, **model_kwargs) + + +@register_model +def dla60_res2next(pretrained=False,**kwargs): + model_kwargs = dict( + levels=(1, 1, 1, 2, 3, 1), channels=(16, 32, 128, 256, 512, 1024), + block=DlaBottle2neck, cardinality=8, base_width=4, **kwargs) + return _create_dla('dla60_res2next', pretrained, **model_kwargs) + + +@register_model +def dla34(pretrained=False, **kwargs): # DLA-34 + model_kwargs = dict( + levels=[1, 1, 1, 2, 2, 1], channels=[16, 32, 64, 128, 256, 512], + block=DlaBasic, **kwargs) + return _create_dla('dla34', pretrained, **model_kwargs) + + +@register_model +def dla46_c(pretrained=False, **kwargs): # DLA-46-C + model_kwargs = dict( + levels=[1, 1, 1, 2, 2, 1], channels=[16, 32, 64, 64, 128, 256], + block=DlaBottleneck, **kwargs) + return _create_dla('dla46_c', pretrained, **model_kwargs) + + +@register_model +def dla46x_c(pretrained=False, **kwargs): # DLA-X-46-C + model_kwargs = dict( + levels=[1, 1, 1, 2, 2, 1], channels=[16, 32, 64, 64, 128, 256], + block=DlaBottleneck, cardinality=32, base_width=4, **kwargs) + return _create_dla('dla46x_c', pretrained, **model_kwargs) + + +@register_model +def dla60x_c(pretrained=False, **kwargs): # DLA-X-60-C + model_kwargs = dict( + levels=[1, 1, 1, 
2, 3, 1], channels=[16, 32, 64, 64, 128, 256], + block=DlaBottleneck, cardinality=32, base_width=4, **kwargs) + return _create_dla('dla60x_c', pretrained, **model_kwargs) + + +@register_model +def dla60(pretrained=False, **kwargs): # DLA-60 + model_kwargs = dict( + levels=[1, 1, 1, 2, 3, 1], channels=[16, 32, 128, 256, 512, 1024], + block=DlaBottleneck, **kwargs) + return _create_dla('dla60', pretrained, **model_kwargs) + + +@register_model +def dla60x(pretrained=False, **kwargs): # DLA-X-60 + model_kwargs = dict( + levels=[1, 1, 1, 2, 3, 1], channels=[16, 32, 128, 256, 512, 1024], + block=DlaBottleneck, cardinality=32, base_width=4, **kwargs) + return _create_dla('dla60x', pretrained, **model_kwargs) + + +@register_model +def dla102(pretrained=False, **kwargs): # DLA-102 + model_kwargs = dict( + levels=[1, 1, 1, 3, 4, 1], channels=[16, 32, 128, 256, 512, 1024], + block=DlaBottleneck, shortcut_root=True, **kwargs) + return _create_dla('dla102', pretrained, **model_kwargs) + + +@register_model +def dla102x(pretrained=False, **kwargs): # DLA-X-102 + model_kwargs = dict( + levels=[1, 1, 1, 3, 4, 1], channels=[16, 32, 128, 256, 512, 1024], + block=DlaBottleneck, cardinality=32, base_width=4, shortcut_root=True, **kwargs) + return _create_dla('dla102x', pretrained, **model_kwargs) + + +@register_model +def dla102x2(pretrained=False, **kwargs): # DLA-X-102 64 + model_kwargs = dict( + levels=[1, 1, 1, 3, 4, 1], channels=[16, 32, 128, 256, 512, 1024], + block=DlaBottleneck, cardinality=64, base_width=4, shortcut_root=True, **kwargs) + return _create_dla('dla102x2', pretrained, **model_kwargs) + + +@register_model +def dla169(pretrained=False, **kwargs): # DLA-169 + model_kwargs = dict( + levels=[1, 1, 2, 3, 5, 1], channels=[16, 32, 128, 256, 512, 1024], + block=DlaBottleneck, shortcut_root=True, **kwargs) + return _create_dla('dla169', pretrained, **model_kwargs) diff --git a/custom_timm/models/dpn.py b/custom_timm/models/dpn.py new file mode 100644 index 0000000000000000000000000000000000000000..4231735672b682fffc0577fe16578950ff3b85bb --- /dev/null +++ b/custom_timm/models/dpn.py @@ -0,0 +1,339 @@ +""" PyTorch implementation of DualPathNetworks +Based on original MXNet implementation https://github.com/cypw/DPNs with +many ideas from another PyTorch implementation https://github.com/oyam/pytorch-DPNs. + +This implementation is compatible with the pretrained weights from cypw's MXNet implementation. 
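+
+Example (a minimal usage sketch, assuming the surrounding `custom_timm` package and its
+dependencies are importable; `dpn92` is one of the model entrypoints registered at the end
+of this file):
+
+    import torch
+    from custom_timm.models.dpn import dpn92
+
+    model = dpn92(pretrained=False)  # randomly initialized DPN-92 with the default 1000-class head
+    model.eval()
+    with torch.no_grad():
+        logits = model(torch.randn(1, 3, 224, 224))  # -> tensor of shape (1, 1000)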
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +from collections import OrderedDict +from functools import partial +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DPN_MEAN, IMAGENET_DPN_STD, IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import BatchNormAct2d, ConvNormAct, create_conv2d, create_classifier +from .registry import register_model + +__all__ = ['DPN'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DPN_MEAN, 'std': IMAGENET_DPN_STD, + 'first_conv': 'features.conv1_1.conv', 'classifier': 'classifier', + **kwargs + } + + +default_cfgs = { + 'dpn68': _cfg( + url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn68-66bebafa7.pth'), + 'dpn68b': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/dpn68b_ra-a31ca160.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + 'dpn92': _cfg( + url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn92_extra-b040e4a9b.pth'), + 'dpn98': _cfg( + url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn98-5b90dec4d.pth'), + 'dpn131': _cfg( + url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn131-71dfe43e0.pth'), + 'dpn107': _cfg( + url='https://github.com/rwightman/pytorch-dpn-pretrained/releases/download/v0.1/dpn107_extra-1ac7121e2.pth') +} + + +class CatBnAct(nn.Module): + def __init__(self, in_chs, norm_layer=BatchNormAct2d): + super(CatBnAct, self).__init__() + self.bn = norm_layer(in_chs, eps=0.001) + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (Tuple[torch.Tensor, torch.Tensor]) -> (torch.Tensor) + pass + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (torch.Tensor) -> (torch.Tensor) + pass + + def forward(self, x): + if isinstance(x, tuple): + x = torch.cat(x, dim=1) + return self.bn(x) + + +class BnActConv2d(nn.Module): + def __init__(self, in_chs, out_chs, kernel_size, stride, groups=1, norm_layer=BatchNormAct2d): + super(BnActConv2d, self).__init__() + self.bn = norm_layer(in_chs, eps=0.001) + self.conv = create_conv2d(in_chs, out_chs, kernel_size, stride=stride, groups=groups) + + def forward(self, x): + return self.conv(self.bn(x)) + + +class DualPathBlock(nn.Module): + def __init__( + self, in_chs, num_1x1_a, num_3x3_b, num_1x1_c, inc, groups, block_type='normal', b=False): + super(DualPathBlock, self).__init__() + self.num_1x1_c = num_1x1_c + self.inc = inc + self.b = b + if block_type == 'proj': + self.key_stride = 1 + self.has_proj = True + elif block_type == 'down': + self.key_stride = 2 + self.has_proj = True + else: + assert block_type == 'normal' + self.key_stride = 1 + self.has_proj = False + + self.c1x1_w_s1 = None + self.c1x1_w_s2 = None + if self.has_proj: + # Using different member names here to allow easier parameter key matching for conversion + if self.key_stride == 2: + self.c1x1_w_s2 = BnActConv2d( + in_chs=in_chs, out_chs=num_1x1_c + 2 * inc, kernel_size=1, stride=2) + else: + self.c1x1_w_s1 = BnActConv2d( + in_chs=in_chs, out_chs=num_1x1_c + 2 * inc, kernel_size=1, stride=1) + + self.c1x1_a = BnActConv2d(in_chs=in_chs, out_chs=num_1x1_a, kernel_size=1, stride=1) + self.c3x3_b = BnActConv2d( 
+ in_chs=num_1x1_a, out_chs=num_3x3_b, kernel_size=3, stride=self.key_stride, groups=groups) + if b: + self.c1x1_c = CatBnAct(in_chs=num_3x3_b) + self.c1x1_c1 = create_conv2d(num_3x3_b, num_1x1_c, kernel_size=1) + self.c1x1_c2 = create_conv2d(num_3x3_b, inc, kernel_size=1) + else: + self.c1x1_c = BnActConv2d(in_chs=num_3x3_b, out_chs=num_1x1_c + inc, kernel_size=1, stride=1) + self.c1x1_c1 = None + self.c1x1_c2 = None + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor] + pass + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor] + pass + + def forward(self, x) -> Tuple[torch.Tensor, torch.Tensor]: + if isinstance(x, tuple): + x_in = torch.cat(x, dim=1) + else: + x_in = x + if self.c1x1_w_s1 is None and self.c1x1_w_s2 is None: + # self.has_proj == False, torchscript requires condition on module == None + x_s1 = x[0] + x_s2 = x[1] + else: + # self.has_proj == True + if self.c1x1_w_s1 is not None: + # self.key_stride = 1 + x_s = self.c1x1_w_s1(x_in) + else: + # self.key_stride = 2 + x_s = self.c1x1_w_s2(x_in) + x_s1 = x_s[:, :self.num_1x1_c, :, :] + x_s2 = x_s[:, self.num_1x1_c:, :, :] + x_in = self.c1x1_a(x_in) + x_in = self.c3x3_b(x_in) + x_in = self.c1x1_c(x_in) + if self.c1x1_c1 is not None: + # self.b == True, using None check for torchscript compat + out1 = self.c1x1_c1(x_in) + out2 = self.c1x1_c2(x_in) + else: + out1 = x_in[:, :self.num_1x1_c, :, :] + out2 = x_in[:, self.num_1x1_c:, :, :] + resid = x_s1 + out1 + dense = torch.cat([x_s2, out2], dim=1) + return resid, dense + + +class DPN(nn.Module): + def __init__( + self, small=False, num_init_features=64, k_r=96, groups=32, global_pool='avg', + b=False, k_sec=(3, 4, 20, 3), inc_sec=(16, 32, 24, 128), output_stride=32, + num_classes=1000, in_chans=3, drop_rate=0., fc_act_layer=nn.ELU): + super(DPN, self).__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + self.b = b + assert output_stride == 32 # FIXME look into dilation support + norm_layer = partial(BatchNormAct2d, eps=.001) + fc_norm_layer = partial(BatchNormAct2d, eps=.001, act_layer=fc_act_layer, inplace=False) + bw_factor = 1 if small else 4 + blocks = OrderedDict() + + # conv1 + blocks['conv1_1'] = ConvNormAct( + in_chans, num_init_features, kernel_size=3 if small else 7, stride=2, norm_layer=norm_layer) + blocks['conv1_pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.feature_info = [dict(num_chs=num_init_features, reduction=2, module='features.conv1_1')] + + # conv2 + bw = 64 * bw_factor + inc = inc_sec[0] + r = (k_r * bw) // (64 * bw_factor) + blocks['conv2_1'] = DualPathBlock(num_init_features, r, r, bw, inc, groups, 'proj', b) + in_chs = bw + 3 * inc + for i in range(2, k_sec[0] + 1): + blocks['conv2_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b) + in_chs += inc + self.feature_info += [dict(num_chs=in_chs, reduction=4, module=f'features.conv2_{k_sec[0]}')] + + # conv3 + bw = 128 * bw_factor + inc = inc_sec[1] + r = (k_r * bw) // (64 * bw_factor) + blocks['conv3_1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b) + in_chs = bw + 3 * inc + for i in range(2, k_sec[1] + 1): + blocks['conv3_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b) + in_chs += inc + self.feature_info += [dict(num_chs=in_chs, reduction=8, module=f'features.conv3_{k_sec[1]}')] + + # conv4 + bw = 256 * bw_factor + inc = inc_sec[2] + r = 
(k_r * bw) // (64 * bw_factor) + blocks['conv4_1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b) + in_chs = bw + 3 * inc + for i in range(2, k_sec[2] + 1): + blocks['conv4_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b) + in_chs += inc + self.feature_info += [dict(num_chs=in_chs, reduction=16, module=f'features.conv4_{k_sec[2]}')] + + # conv5 + bw = 512 * bw_factor + inc = inc_sec[3] + r = (k_r * bw) // (64 * bw_factor) + blocks['conv5_1'] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'down', b) + in_chs = bw + 3 * inc + for i in range(2, k_sec[3] + 1): + blocks['conv5_' + str(i)] = DualPathBlock(in_chs, r, r, bw, inc, groups, 'normal', b) + in_chs += inc + self.feature_info += [dict(num_chs=in_chs, reduction=32, module=f'features.conv5_{k_sec[3]}')] + + blocks['conv5_bn_ac'] = CatBnAct(in_chs, norm_layer=fc_norm_layer) + + self.num_features = in_chs + self.features = nn.Sequential(blocks) + + # Using 1x1 conv for the FC layer to allow the extra pooling scheme + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool, use_conv=True) + self.flatten = nn.Flatten(1) if global_pool else nn.Identity() + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^features\.conv1', + blocks=[ + (r'^features\.conv(\d+)' if coarse else r'^features\.conv(\d+)_(\d+)', None), + (r'^features\.conv5_bn_ac', (99999,)) + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.classifier + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool, use_conv=True) + self.flatten = nn.Flatten(1) if global_pool else nn.Identity() + + def forward_features(self, x): + return self.features(x) + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + if pre_logits: + return x.flatten(1) + else: + x = self.classifier(x) + return self.flatten(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_dpn(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + DPN, variant, pretrained, + feature_cfg=dict(feature_concat=True, flatten_sequential=True), + **kwargs) + + +@register_model +def dpn68(pretrained=False, **kwargs): + model_kwargs = dict( + small=True, num_init_features=10, k_r=128, groups=32, + k_sec=(3, 4, 12, 3), inc_sec=(16, 32, 32, 64), **kwargs) + return _create_dpn('dpn68', pretrained=pretrained, **model_kwargs) + + +@register_model +def dpn68b(pretrained=False, **kwargs): + model_kwargs = dict( + small=True, num_init_features=10, k_r=128, groups=32, + b=True, k_sec=(3, 4, 12, 3), inc_sec=(16, 32, 32, 64), **kwargs) + return _create_dpn('dpn68b', pretrained=pretrained, **model_kwargs) + + +@register_model +def dpn92(pretrained=False, **kwargs): + model_kwargs = dict( + num_init_features=64, k_r=96, groups=32, + k_sec=(3, 4, 20, 3), inc_sec=(16, 32, 24, 128), **kwargs) + return _create_dpn('dpn92', pretrained=pretrained, **model_kwargs) + + +@register_model +def dpn98(pretrained=False, **kwargs): + model_kwargs = dict( + num_init_features=96, k_r=160, groups=40, + k_sec=(3, 6, 20, 3), 
inc_sec=(16, 32, 32, 128), **kwargs) + return _create_dpn('dpn98', pretrained=pretrained, **model_kwargs) + + +@register_model +def dpn131(pretrained=False, **kwargs): + model_kwargs = dict( + num_init_features=128, k_r=160, groups=40, + k_sec=(4, 8, 28, 3), inc_sec=(16, 32, 32, 128), **kwargs) + return _create_dpn('dpn131', pretrained=pretrained, **model_kwargs) + + +@register_model +def dpn107(pretrained=False, **kwargs): + model_kwargs = dict( + num_init_features=128, k_r=200, groups=50, + k_sec=(4, 8, 20, 3), inc_sec=(20, 64, 64, 128), **kwargs) + return _create_dpn('dpn107', pretrained=pretrained, **model_kwargs) diff --git a/custom_timm/models/edgenext.py b/custom_timm/models/edgenext.py new file mode 100644 index 0000000000000000000000000000000000000000..202c89ba8a9cf6c15087efd441a437e85d0ce515 --- /dev/null +++ b/custom_timm/models/edgenext.py @@ -0,0 +1,572 @@ +""" EdgeNeXt + +Paper: `EdgeNeXt: Efficiently Amalgamated CNN-Transformer Architecture for Mobile Vision Applications` + - https://arxiv.org/abs/2206.10589 + +Original code and weights from https://github.com/mmaaz60/EdgeNeXt + +Modifications and additions for timm by / Copyright 2022, Ross Wightman +""" +import math +import torch +from collections import OrderedDict +from functools import partial +from typing import Tuple + +from torch import nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .fx_features import register_notrace_module +from .layers import trunc_normal_tf_, DropPath, LayerNorm2d, Mlp, SelectAdaptivePool2d, create_conv2d +from .helpers import named_apply, build_model_with_cfg, checkpoint_seq +from .registry import register_model + + +__all__ = ['EdgeNeXt'] # model_registry will add each entrypoint fn to this + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 256, 256), 'pool_size': (8, 8), + 'crop_pct': 0.9, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.0', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = dict( + edgenext_xx_small=_cfg( + url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.0/edgenext_xx_small.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + edgenext_x_small=_cfg( + url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.0/edgenext_x_small.pth", + test_input_size=(3, 288, 288), test_crop_pct=1.0), + # edgenext_small=_cfg( + # url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.0/edgenext_small.pth"), + edgenext_small=_cfg( # USI weights + url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.1/edgenext_small_usi.pth", + crop_pct=0.95, test_input_size=(3, 320, 320), test_crop_pct=1.0, + ), + # edgenext_base=_cfg( + # url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.2/edgenext_base_usi.pth"), + edgenext_base=_cfg( # USI weights + url="https://github.com/mmaaz60/EdgeNeXt/releases/download/v1.2/edgenext_base_usi.pth", + crop_pct=0.95, test_input_size=(3, 320, 320), test_crop_pct=1.0, + ), + + edgenext_small_rw=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/edgenext_small_rw-sw-b00041bb.pth', + test_input_size=(3, 320, 320), test_crop_pct=1.0, + ), +) + + +@register_notrace_module # reason: FX can't symbolically trace torch.arange in forward method +class PositionalEncodingFourier(nn.Module): + def __init__(self, hidden_dim=32, dim=768, temperature=10000): + super().__init__() + 
self.token_projection = nn.Conv2d(hidden_dim * 2, dim, kernel_size=1) + self.scale = 2 * math.pi + self.temperature = temperature + self.hidden_dim = hidden_dim + self.dim = dim + + def forward(self, shape: Tuple[int, int, int]): + inv_mask = ~torch.zeros(shape).to(device=self.token_projection.weight.device, dtype=torch.bool) + y_embed = inv_mask.cumsum(1, dtype=torch.float32) + x_embed = inv_mask.cumsum(2, dtype=torch.float32) + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.hidden_dim, dtype=torch.float32, device=inv_mask.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / self.hidden_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), + pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), + pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + pos = self.token_projection(pos) + + return pos + + +class ConvBlock(nn.Module): + def __init__( + self, + dim, + dim_out=None, + kernel_size=7, + stride=1, + conv_bias=True, + expand_ratio=4, + ls_init_value=1e-6, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU, drop_path=0., + ): + super().__init__() + dim_out = dim_out or dim + self.shortcut_after_dw = stride > 1 or dim != dim_out + + self.conv_dw = create_conv2d( + dim, dim_out, kernel_size=kernel_size, stride=stride, depthwise=True, bias=conv_bias) + self.norm = norm_layer(dim_out) + self.mlp = Mlp(dim_out, int(expand_ratio * dim_out), act_layer=act_layer) + self.gamma = nn.Parameter(ls_init_value * torch.ones(dim_out)) if ls_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + shortcut = x + x = self.conv_dw(x) + if self.shortcut_after_dw: + shortcut = x + + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.norm(x) + x = self.mlp(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = shortcut + self.drop_path(x) + return x + + +class CrossCovarianceAttn(nn.Module): + def __init__( + self, + dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0. 
+ ): + super().__init__() + self.num_heads = num_heads + self.temperature = nn.Parameter(torch.ones(num_heads, 1, 1)) + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 4, 1) + q, k, v = qkv.unbind(0) + + # NOTE, this is NOT spatial attn, q, k, v are B, num_heads, C, L --> C x C attn map + attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)) * self.temperature + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).permute(0, 3, 1, 2).reshape(B, N, C) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + @torch.jit.ignore + def no_weight_decay(self): + return {'temperature'} + + +class SplitTransposeBlock(nn.Module): + def __init__( + self, + dim, + num_scales=1, + num_heads=8, + expand_ratio=4, + use_pos_emb=True, + conv_bias=True, + qkv_bias=True, + ls_init_value=1e-6, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU, + drop_path=0., + attn_drop=0., + proj_drop=0. + ): + super().__init__() + width = max(int(math.ceil(dim / num_scales)), int(math.floor(dim // num_scales))) + self.width = width + self.num_scales = max(1, num_scales - 1) + + convs = [] + for i in range(self.num_scales): + convs.append(create_conv2d(width, width, kernel_size=3, depthwise=True, bias=conv_bias)) + self.convs = nn.ModuleList(convs) + + self.pos_embd = None + if use_pos_emb: + self.pos_embd = PositionalEncodingFourier(dim=dim) + self.norm_xca = norm_layer(dim) + self.gamma_xca = nn.Parameter(ls_init_value * torch.ones(dim)) if ls_init_value > 0 else None + self.xca = CrossCovarianceAttn( + dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=proj_drop) + + self.norm = norm_layer(dim, eps=1e-6) + self.mlp = Mlp(dim, int(expand_ratio * dim), act_layer=act_layer) + self.gamma = nn.Parameter(ls_init_value * torch.ones(dim)) if ls_init_value > 0 else None + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, x): + shortcut = x + + # scales code re-written for torchscript as per my res2net fixes -rw + # NOTE torch.split(x, self.width, 1) causing issues with ONNX export + spx = x.chunk(len(self.convs) + 1, dim=1) + spo = [] + sp = spx[0] + for i, conv in enumerate(self.convs): + if i > 0: + sp = sp + spx[i] + sp = conv(sp) + spo.append(sp) + spo.append(spx[-1]) + x = torch.cat(spo, 1) + + # XCA + B, C, H, W = x.shape + x = x.reshape(B, C, H * W).permute(0, 2, 1) + if self.pos_embd is not None: + pos_encoding = self.pos_embd((B, H, W)).reshape(B, -1, x.shape[1]).permute(0, 2, 1) + x = x + pos_encoding + x = x + self.drop_path(self.gamma_xca * self.xca(self.norm_xca(x))) + x = x.reshape(B, H, W, C) + + # Inverted Bottleneck + x = self.norm(x) + x = self.mlp(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = shortcut + self.drop_path(x) + return x + + +class EdgeNeXtStage(nn.Module): + def __init__( + self, + in_chs, + out_chs, + stride=2, + depth=2, + num_global_blocks=1, + num_heads=4, + scales=2, + kernel_size=7, + expand_ratio=4, + use_pos_emb=False, + downsample_block=False, + conv_bias=True, + ls_init_value=1.0, + drop_path_rates=None, + norm_layer=LayerNorm2d, + norm_layer_cl=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU + ): + super().__init__() + self.grad_checkpointing = False + + if downsample_block or stride == 1: + self.downsample = nn.Identity() + else: + self.downsample = nn.Sequential( + norm_layer(in_chs), + nn.Conv2d(in_chs, out_chs, kernel_size=2, stride=2, bias=conv_bias) + ) + in_chs = out_chs + + stage_blocks = [] + for i in range(depth): + if i < depth - num_global_blocks: + stage_blocks.append( + ConvBlock( + dim=in_chs, + dim_out=out_chs, + stride=stride if downsample_block and i == 0 else 1, + conv_bias=conv_bias, + kernel_size=kernel_size, + expand_ratio=expand_ratio, + ls_init_value=ls_init_value, + drop_path=drop_path_rates[i], + norm_layer=norm_layer_cl, + act_layer=act_layer, + ) + ) + else: + stage_blocks.append( + SplitTransposeBlock( + dim=in_chs, + num_scales=scales, + num_heads=num_heads, + expand_ratio=expand_ratio, + use_pos_emb=use_pos_emb, + conv_bias=conv_bias, + ls_init_value=ls_init_value, + drop_path=drop_path_rates[i], + norm_layer=norm_layer_cl, + act_layer=act_layer, + ) + ) + in_chs = out_chs + self.blocks = nn.Sequential(*stage_blocks) + + def forward(self, x): + x = self.downsample(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + return x + + +class EdgeNeXt(nn.Module): + def __init__( + self, + in_chans=3, + num_classes=1000, + global_pool='avg', + dims=(24, 48, 88, 168), + depths=(3, 3, 9, 3), + global_block_counts=(0, 1, 1, 1), + kernel_sizes=(3, 5, 7, 9), + heads=(8, 8, 8, 8), + d2_scales=(2, 2, 3, 4), + use_pos_emb=(False, True, False, False), + ls_init_value=1e-6, + head_init_scale=1., + expand_ratio=4, + downsample_block=False, + conv_bias=True, + stem_type='patch', + head_norm_first=False, + act_layer=nn.GELU, + drop_path_rate=0., + drop_rate=0., + ): + super().__init__() + self.num_classes = num_classes + self.global_pool = global_pool + self.drop_rate = drop_rate + norm_layer = partial(LayerNorm2d, eps=1e-6) + norm_layer_cl = partial(nn.LayerNorm, eps=1e-6) + self.feature_info = [] + + assert stem_type in ('patch', 'overlap') + if stem_type == 'patch': + self.stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=4, stride=4, 
bias=conv_bias), + norm_layer(dims[0]), + ) + else: + self.stem = nn.Sequential( + nn.Conv2d(in_chans, dims[0], kernel_size=9, stride=4, padding=9 // 2, bias=conv_bias), + norm_layer(dims[0]), + ) + + curr_stride = 4 + stages = [] + dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)] + in_chs = dims[0] + for i in range(4): + stride = 2 if curr_stride == 2 or i > 0 else 1 + # FIXME support dilation / output_stride + curr_stride *= stride + stages.append(EdgeNeXtStage( + in_chs=in_chs, + out_chs=dims[i], + stride=stride, + depth=depths[i], + num_global_blocks=global_block_counts[i], + num_heads=heads[i], + drop_path_rates=dp_rates[i], + scales=d2_scales[i], + expand_ratio=expand_ratio, + kernel_size=kernel_sizes[i], + use_pos_emb=use_pos_emb[i], + ls_init_value=ls_init_value, + downsample_block=downsample_block, + conv_bias=conv_bias, + norm_layer=norm_layer, + norm_layer_cl=norm_layer_cl, + act_layer=act_layer, + )) + # NOTE feature_info use currently assumes stage 0 == stride 1, rest are stride 2 + in_chs = dims[i] + self.feature_info += [dict(num_chs=in_chs, reduction=curr_stride, module=f'stages.{i}')] + + self.stages = nn.Sequential(*stages) + + self.num_features = dims[-1] + self.norm_pre = norm_layer(self.num_features) if head_norm_first else nn.Identity() + self.head = nn.Sequential(OrderedDict([ + ('global_pool', SelectAdaptivePool2d(pool_type=global_pool)), + ('norm', nn.Identity() if head_norm_first else norm_layer(self.num_features)), + ('flatten', nn.Flatten(1) if global_pool else nn.Identity()), + ('drop', nn.Dropout(self.drop_rate)), + ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity())])) + + named_apply(partial(_init_weights, head_init_scale=head_init_scale), self) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^stem', + blocks=r'^stages\.(\d+)' if coarse else [ + (r'^stages\.(\d+)\.downsample', (0,)), # blocks + (r'^stages\.(\d+)\.blocks\.(\d+)', None), + (r'^norm_pre', (99999,)) + ] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes=0, global_pool=None): + if global_pool is not None: + self.head.global_pool = SelectAdaptivePool2d(pool_type=global_pool) + self.head.flatten = nn.Flatten(1) if global_pool else nn.Identity() + self.head.fc = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.stem(x) + x = self.stages(x) + x = self.norm_pre(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + # NOTE nn.Sequential in head broken down since can't call head[:-1](x) in torchscript :( + x = self.head.global_pool(x) + x = self.head.norm(x) + x = self.head.flatten(x) + x = self.head.drop(x) + return x if pre_logits else self.head.fc(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _init_weights(module, name=None, head_init_scale=1.0): + if isinstance(module, nn.Conv2d): + trunc_normal_tf_(module.weight, std=.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Linear): + trunc_normal_tf_(module.weight, std=.02) + nn.init.zeros_(module.bias) + if name and 'head.' 
in name: + module.weight.data.mul_(head_init_scale) + module.bias.data.mul_(head_init_scale) + + +def checkpoint_filter_fn(state_dict, model): + """ Remap FB checkpoints -> timm """ + if 'head.norm.weight' in state_dict or 'norm_pre.weight' in state_dict: + return state_dict # non-FB checkpoint + + # models were released as train checkpoints... :/ + if 'model_ema' in state_dict: + state_dict = state_dict['model_ema'] + elif 'model' in state_dict: + state_dict = state_dict['model'] + elif 'state_dict' in state_dict: + state_dict = state_dict['state_dict'] + + out_dict = {} + import re + for k, v in state_dict.items(): + k = k.replace('downsample_layers.0.', 'stem.') + k = re.sub(r'stages.([0-9]+).([0-9]+)', r'stages.\1.blocks.\2', k) + k = re.sub(r'downsample_layers.([0-9]+).([0-9]+)', r'stages.\1.downsample.\2', k) + k = k.replace('dwconv', 'conv_dw') + k = k.replace('pwconv', 'mlp.fc') + k = k.replace('head.', 'head.fc.') + if k.startswith('norm.'): + k = k.replace('norm', 'head.norm') + if v.ndim == 2 and 'head' not in k: + model_shape = model.state_dict()[k].shape + v = v.reshape(model_shape) + out_dict[k] = v + return out_dict + + +def _create_edgenext(variant, pretrained=False, **kwargs): + model = build_model_with_cfg( + EdgeNeXt, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + feature_cfg=dict(out_indices=(0, 1, 2, 3), flatten_sequential=True), + **kwargs) + return model + + +@register_model +def edgenext_xx_small(pretrained=False, **kwargs): + # 1.33M & 260.58M @ 256 resolution + # 71.23% Top-1 accuracy + # No AA, Color Jitter=0.4, No Mixup & Cutmix, DropPath=0.0, BS=4096, lr=0.006, multi-scale-sampler + # Jetson FPS=51.66 versus 47.67 for MobileViT_XXS + # For A100: FPS @ BS=1: 212.13 & @ BS=256: 7042.06 versus FPS @ BS=1: 96.68 & @ BS=256: 4624.71 for MobileViT_XXS + model_kwargs = dict(depths=(2, 2, 6, 2), dims=(24, 48, 88, 168), heads=(4, 4, 4, 4), **kwargs) + return _create_edgenext('edgenext_xx_small', pretrained=pretrained, **model_kwargs) + + +@register_model +def edgenext_x_small(pretrained=False, **kwargs): + # 2.34M & 538.0M @ 256 resolution + # 75.00% Top-1 accuracy + # No AA, No Mixup & Cutmix, DropPath=0.0, BS=4096, lr=0.006, multi-scale-sampler + # Jetson FPS=31.61 versus 28.49 for MobileViT_XS + # For A100: FPS @ BS=1: 179.55 & @ BS=256: 4404.95 versus FPS @ BS=1: 94.55 & @ BS=256: 2361.53 for MobileViT_XS + model_kwargs = dict(depths=(3, 3, 9, 3), dims=(32, 64, 100, 192), heads=(4, 4, 4, 4), **kwargs) + return _create_edgenext('edgenext_x_small', pretrained=pretrained, **model_kwargs) + + +@register_model +def edgenext_small(pretrained=False, **kwargs): + # 5.59M & 1260.59M @ 256 resolution + # 79.43% Top-1 accuracy + # AA=True, No Mixup & Cutmix, DropPath=0.1, BS=4096, lr=0.006, multi-scale-sampler + # Jetson FPS=20.47 versus 18.86 for MobileViT_S + # For A100: FPS @ BS=1: 172.33 & @ BS=256: 3010.25 versus FPS @ BS=1: 93.84 & @ BS=256: 1785.92 for MobileViT_S + model_kwargs = dict(depths=(3, 3, 9, 3), dims=(48, 96, 160, 304), **kwargs) + return _create_edgenext('edgenext_small', pretrained=pretrained, **model_kwargs) + + +@register_model +def edgenext_base(pretrained=False, **kwargs): + # 18.51M & 3840.93M @ 256 resolution + # 82.5% (normal) 83.7% (USI) Top-1 accuracy + # AA=True, Mixup & Cutmix, DropPath=0.1, BS=4096, lr=0.006, multi-scale-sampler + # Jetson FPS=xx.xx versus xx.xx for MobileViT_S + # For A100: FPS @ BS=1: xxx.xx & @ BS=256: xxxx.xx + model_kwargs = dict(depths=[3, 3, 9, 3], dims=[80, 160, 288, 584], **kwargs) + return 
_create_edgenext('edgenext_base', pretrained=pretrained, **model_kwargs) + + +@register_model +def edgenext_small_rw(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 3, 9, 3), dims=(48, 96, 192, 384), + downsample_block=True, conv_bias=False, stem_type='overlap', **kwargs) + return _create_edgenext('edgenext_small_rw', pretrained=pretrained, **model_kwargs) + diff --git a/custom_timm/models/efficientformer.py b/custom_timm/models/efficientformer.py new file mode 100644 index 0000000000000000000000000000000000000000..0f5c71ab8766892c10d8063df055883484dc04c4 --- /dev/null +++ b/custom_timm/models/efficientformer.py @@ -0,0 +1,551 @@ +""" EfficientFormer + +@article{li2022efficientformer, + title={EfficientFormer: Vision Transformers at MobileNet Speed}, + author={Li, Yanyu and Yuan, Geng and Wen, Yang and Hu, Eric and Evangelidis, Georgios and Tulyakov, + Sergey and Wang, Yanzhi and Ren, Jian}, + journal={arXiv preprint arXiv:2206.01191}, + year={2022} +} + +Based on Apache 2.0 licensed code at https://github.com/snap-research/EfficientFormer, Copyright (c) 2022 Snap Inc. + +Modifications and timm support by / Copyright 2022, Ross Wightman +""" +from typing import Dict + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import DropPath, trunc_normal_, to_2tuple, Mlp +from .registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, 'fixed_input_size': True, + 'crop_pct': .95, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv1', 'classifier': ('head', 'head_dist'), + **kwargs + } + + +default_cfgs = dict( + efficientformer_l1=_cfg( + url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/efficientformer_l1_1000d_224-5b08fab0.pth", + ), + efficientformer_l3=_cfg( + url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/efficientformer_l3_300d_224-6816624f.pth", + ), + efficientformer_l7=_cfg( + url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/efficientformer_l7_300d_224-e957ab75.pth", + ), +) + +EfficientFormer_width = { + 'l1': (48, 96, 224, 448), + 'l3': (64, 128, 320, 512), + 'l7': (96, 192, 384, 768), +} + +EfficientFormer_depth = { + 'l1': (3, 2, 6, 4), + 'l3': (4, 4, 12, 6), + 'l7': (6, 6, 18, 8), +} + + +class Attention(torch.nn.Module): + attention_bias_cache: Dict[str, torch.Tensor] + + def __init__( + self, + dim=384, + key_dim=32, + num_heads=8, + attn_ratio=4, + resolution=7 + ): + super().__init__() + self.num_heads = num_heads + self.scale = key_dim ** -0.5 + self.key_dim = key_dim + self.key_attn_dim = key_dim * num_heads + self.val_dim = int(attn_ratio * key_dim) + self.val_attn_dim = self.val_dim * num_heads + self.attn_ratio = attn_ratio + + self.qkv = nn.Linear(dim, self.key_attn_dim * 2 + self.val_attn_dim) + self.proj = nn.Linear(self.val_attn_dim, dim) + + resolution = to_2tuple(resolution) + pos = torch.stack(torch.meshgrid(torch.arange(resolution[0]), torch.arange(resolution[1]))).flatten(1) + rel_pos = (pos[..., :, None] - pos[..., None, :]).abs() + rel_pos = (rel_pos[0] * resolution[1]) + rel_pos[1] + self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, resolution[0] * resolution[1])) + self.register_buffer('attention_bias_idxs', 
torch.LongTensor(rel_pos)) + self.attention_bias_cache = {} # per-device attention_biases cache (data-parallel compat) + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and self.attention_bias_cache: + self.attention_bias_cache = {} # clear ab cache + + def get_attention_biases(self, device: torch.device) -> torch.Tensor: + if self.training: + return self.attention_biases[:, self.attention_bias_idxs] + else: + device_key = str(device) + if device_key not in self.attention_bias_cache: + self.attention_bias_cache[device_key] = self.attention_biases[:, self.attention_bias_idxs] + return self.attention_bias_cache[device_key] + + def forward(self, x): # x (B,N,C) + B, N, C = x.shape + qkv = self.qkv(x) + qkv = qkv.reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + q, k, v = qkv.split([self.key_dim, self.key_dim, self.val_dim], dim=3) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn + self.get_attention_biases(x.device) + + attn = attn.softmax(dim=-1) + x = (attn @ v).transpose(1, 2).reshape(B, N, self.val_attn_dim) + x = self.proj(x) + return x + + +class Stem4(nn.Sequential): + def __init__(self, in_chs, out_chs, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): + super().__init__() + self.stride = 4 + + self.add_module('conv1', nn.Conv2d(in_chs, out_chs // 2, kernel_size=3, stride=2, padding=1)) + self.add_module('norm1', norm_layer(out_chs // 2)) + self.add_module('act1', act_layer()) + self.add_module('conv2', nn.Conv2d(out_chs // 2, out_chs, kernel_size=3, stride=2, padding=1)) + self.add_module('norm2', norm_layer(out_chs)) + self.add_module('act2', act_layer()) + + +class Downsample(nn.Module): + """ + Downsampling via strided conv w/ norm + Input: tensor in shape [B, C, H, W] + Output: tensor in shape [B, C, H/stride, W/stride] + """ + + def __init__(self, in_chs, out_chs, kernel_size=3, stride=2, padding=None, norm_layer=nn.BatchNorm2d): + super().__init__() + if padding is None: + padding = kernel_size // 2 + self.conv = nn.Conv2d(in_chs, out_chs, kernel_size=kernel_size, stride=stride, padding=padding) + self.norm = norm_layer(out_chs) + + def forward(self, x): + x = self.conv(x) + x = self.norm(x) + return x + + +class Flat(nn.Module): + + def __init__(self, ): + super().__init__() + + def forward(self, x): + x = x.flatten(2).transpose(1, 2) + return x + + +class Pooling(nn.Module): + """ + Implementation of pooling for PoolFormer + --pool_size: pooling size + """ + + def __init__(self, pool_size=3): + super().__init__() + self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False) + + def forward(self, x): + return self.pool(x) - x + + +class ConvMlpWithNorm(nn.Module): + """ + Implementation of MLP with 1*1 convolutions. + Input: tensor with shape [B, C, H, W] + """ + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + norm_layer=nn.BatchNorm2d, + drop=0. 
+ ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, 1) + self.norm1 = norm_layer(hidden_features) if norm_layer is not None else nn.Identity() + self.act = act_layer() + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.norm2 = norm_layer(out_features) if norm_layer is not None else nn.Identity() + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.norm1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.norm2(x) + x = self.drop(x) + return x + + +class LayerScale(nn.Module): + def __init__(self, dim, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x): + return x.mul_(self.gamma) if self.inplace else x * self.gamma + + +class MetaBlock1d(nn.Module): + + def __init__( + self, + dim, + mlp_ratio=4., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + drop=0., + drop_path=0., + layer_scale_init_value=1e-5 + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.token_mixer = Attention(dim) + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.ls1 = LayerScale(dim, layer_scale_init_value) + self.ls2 = LayerScale(dim, layer_scale_init_value) + + def forward(self, x): + x = x + self.drop_path(self.ls1(self.token_mixer(self.norm1(x)))) + x = x + self.drop_path(self.ls2(self.mlp(self.norm2(x)))) + return x + + +class LayerScale2d(nn.Module): + def __init__(self, dim, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x): + gamma = self.gamma.view(1, -1, 1, 1) + return x.mul_(gamma) if self.inplace else x * gamma + + +class MetaBlock2d(nn.Module): + + def __init__( + self, + dim, + pool_size=3, + mlp_ratio=4., + act_layer=nn.GELU, + norm_layer=nn.BatchNorm2d, + drop=0., + drop_path=0., + layer_scale_init_value=1e-5 + ): + super().__init__() + self.token_mixer = Pooling(pool_size=pool_size) + self.mlp = ConvMlpWithNorm( + dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, norm_layer=norm_layer, drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.ls1 = LayerScale2d(dim, layer_scale_init_value) + self.ls2 = LayerScale2d(dim, layer_scale_init_value) + + def forward(self, x): + x = x + self.drop_path(self.ls1(self.token_mixer(x))) + x = x + self.drop_path(self.ls2(self.mlp(x))) + return x + + +class EfficientFormerStage(nn.Module): + + def __init__( + self, + dim, + dim_out, + depth, + downsample=True, + num_vit=1, + pool_size=3, + mlp_ratio=4., + act_layer=nn.GELU, + norm_layer=nn.BatchNorm2d, + norm_layer_cl=nn.LayerNorm, + drop=.0, + drop_path=0., + layer_scale_init_value=1e-5, +): + super().__init__() + self.grad_checkpointing = False + + if downsample: + self.downsample = Downsample(in_chs=dim, out_chs=dim_out, norm_layer=norm_layer) + dim = dim_out + else: + assert dim == dim_out + self.downsample = nn.Identity() + + blocks = [] + if num_vit and num_vit >= depth: + blocks.append(Flat()) + + for block_idx in range(depth): + remain_idx = depth - block_idx - 1 + if num_vit and num_vit > remain_idx: + blocks.append( + MetaBlock1d( + dim, + mlp_ratio=mlp_ratio, + act_layer=act_layer, + norm_layer=norm_layer_cl, + drop=drop, + drop_path=drop_path[block_idx], + layer_scale_init_value=layer_scale_init_value, + )) + else: + blocks.append( + MetaBlock2d( + dim, + pool_size=pool_size, + mlp_ratio=mlp_ratio, + act_layer=act_layer, + norm_layer=norm_layer, + drop=drop, + drop_path=drop_path[block_idx], + layer_scale_init_value=layer_scale_init_value, + )) + if num_vit and num_vit == remain_idx: + blocks.append(Flat()) + + self.blocks = nn.Sequential(*blocks) + + def forward(self, x): + x = self.downsample(x) + x = self.blocks(x) + return x + + +class EfficientFormer(nn.Module): + + def __init__( + self, + depths, + embed_dims=None, + in_chans=3, + num_classes=1000, + global_pool='avg', + downsamples=None, + num_vit=0, + mlp_ratios=4, + pool_size=3, + layer_scale_init_value=1e-5, + act_layer=nn.GELU, + norm_layer=nn.BatchNorm2d, + norm_layer_cl=nn.LayerNorm, + drop_rate=0., + drop_path_rate=0., + **kwargs + ): + super().__init__() + self.num_classes = num_classes + self.global_pool = global_pool + + self.stem = Stem4(in_chans, embed_dims[0], norm_layer=norm_layer) + prev_dim = embed_dims[0] + + # stochastic depth decay rule + dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)] + downsamples = downsamples or (False,) + (True,) * (len(depths) - 1) + stages = [] + for i in range(len(depths)): + stage = EfficientFormerStage( + prev_dim, + embed_dims[i], + depths[i], + downsample=downsamples[i], + num_vit=num_vit if i == 3 else 0, + pool_size=pool_size, + mlp_ratio=mlp_ratios, + act_layer=act_layer, + norm_layer_cl=norm_layer_cl, + norm_layer=norm_layer, + drop=drop_rate, + drop_path=dpr[i], + layer_scale_init_value=layer_scale_init_value, + ) + prev_dim = embed_dims[i] + stages.append(stage) + + self.stages = nn.Sequential(*stages) + + # Classifier head + self.num_features = embed_dims[-1] + self.norm = norm_layer_cl(self.num_features) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + # assuming model is always distilled (valid for current checkpoints, will split def if that changes) + self.head_dist = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity() + self.distilled_training = False # must set this True to train w/ distillation token + + self.apply(self._init_weights) + + # init for classification + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, 
nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + @torch.jit.ignore + def no_weight_decay(self): + return {k for k, _ in self.named_parameters() if 'attention_biases' in k} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^stem', # stem and embed + blocks=[(r'^stages\.(\d+)', None), (r'^norm', (99999,))] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head, self.head_dist + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + self.head_dist = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + @torch.jit.ignore + def set_distilled_training(self, enable=True): + self.distilled_training = enable + + def forward_features(self, x): + x = self.stem(x) + x = self.stages(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool == 'avg': + x = x.mean(dim=1) + if pre_logits: + return x + x, x_dist = self.head(x), self.head_dist(x) + if self.distilled_training and self.training and not torch.jit.is_scripting(): + # only return separate classification predictions when training in distilled mode + return x, x_dist + else: + # during standard train/finetune, inference average the classifier predictions + return (x + x_dist) / 2 + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _checkpoint_filter_fn(state_dict, model): + """ Remap original checkpoints -> timm """ + if 'stem.0.weight' in state_dict: + return state_dict # non-original checkpoint, no remapping needed + + out_dict = {} + import re + stage_idx = 0 + for k, v in state_dict.items(): + if k.startswith('patch_embed'): + k = k.replace('patch_embed.0', 'stem.conv1') + k = k.replace('patch_embed.1', 'stem.norm1') + k = k.replace('patch_embed.3', 'stem.conv2') + k = k.replace('patch_embed.4', 'stem.norm2') + + if re.match(r'network\.(\d+)\.proj\.weight', k): + stage_idx += 1 + k = re.sub(r'network.(\d+).(\d+)', f'stages.{stage_idx}.blocks.\\2', k) + k = re.sub(r'network.(\d+).proj', f'stages.{stage_idx}.downsample.conv', k) + k = re.sub(r'network.(\d+).norm', f'stages.{stage_idx}.downsample.norm', k) + + k = re.sub(r'layer_scale_([0-9])', r'ls\1.gamma', k) + k = k.replace('dist_head', 'head_dist') + out_dict[k] = v + return out_dict + + +def _create_efficientformer(variant, pretrained=False, **kwargs): + model = build_model_with_cfg( + EfficientFormer, variant, pretrained, + pretrained_filter_fn=_checkpoint_filter_fn, + **kwargs) + return model + + +@register_model +def efficientformer_l1(pretrained=False, **kwargs): + model_kwargs = dict( + depths=EfficientFormer_depth['l1'], + embed_dims=EfficientFormer_width['l1'], + num_vit=1, + **kwargs) + return _create_efficientformer('efficientformer_l1', pretrained=pretrained, **model_kwargs) + + +@register_model +def efficientformer_l3(pretrained=False, **kwargs): + model_kwargs = dict( + depths=EfficientFormer_depth['l3'], + embed_dims=EfficientFormer_width['l3'], + num_vit=4, + **kwargs) + return _create_efficientformer('efficientformer_l3', pretrained=pretrained, **model_kwargs) + + +@register_model +def efficientformer_l7(pretrained=False, 
**kwargs): + model_kwargs = dict( + depths=EfficientFormer_depth['l7'], + embed_dims=EfficientFormer_width['l7'], + num_vit=8, + **kwargs) + return _create_efficientformer('efficientformer_l7', pretrained=pretrained, **model_kwargs) + diff --git a/custom_timm/models/efficientnet.py b/custom_timm/models/efficientnet.py new file mode 100644 index 0000000000000000000000000000000000000000..90dd9eb85dfc6ab473e48df9aacdccd73bdff22b --- /dev/null +++ b/custom_timm/models/efficientnet.py @@ -0,0 +1,2403 @@ +""" The EfficientNet Family in PyTorch + +An implementation of EfficientNet that covers a variety of related models with efficient architectures: + +* EfficientNet-V2 + - `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + +* EfficientNet (B0-B8, L2 + TensorFlow pretrained AutoAug/RandAug/AdvProp/NoisyStudent weight ports) + - EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks - https://arxiv.org/abs/1905.11946 + - CondConv: Conditionally Parameterized Convolutions for Efficient Inference - https://arxiv.org/abs/1904.04971 + - Adversarial Examples Improve Image Recognition - https://arxiv.org/abs/1911.09665 + - Self-training with Noisy Student improves ImageNet classification - https://arxiv.org/abs/1911.04252 + +* MixNet (Small, Medium, and Large) + - MixConv: Mixed Depthwise Convolutional Kernels - https://arxiv.org/abs/1907.09595 + +* MNasNet B1, A1 (SE), Small + - MnasNet: Platform-Aware Neural Architecture Search for Mobile - https://arxiv.org/abs/1807.11626 + +* FBNet-C + - FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable NAS - https://arxiv.org/abs/1812.03443 + +* Single-Path NAS Pixel1 + - Single-Path NAS: Designing Hardware-Efficient ConvNets - https://arxiv.org/abs/1904.02877 + +* TinyNet + - Model Rubik's Cube: Twisting Resolution, Depth and Width for TinyNets - https://arxiv.org/abs/2010.14819 + - Definitions & weights borrowed from https://github.com/huawei-noah/CV-Backbones/tree/master/tinynet_pytorch + +* And likely more... + +The majority of the above models (EfficientNet*, MixNet, MnasNet) and original weights were made available +by Mingxing Tan, Quoc Le, and other members of their Google Brain team. Thanks for consistently releasing +the models and weights open source! 
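+
+Example (a minimal usage sketch; assumes the surrounding `custom_timm` package is importable and
+that the usual `efficientnet_b0` entrypoint is registered further down in this file):
+
+    import torch
+    from custom_timm.models.efficientnet import efficientnet_b0
+
+    model = efficientnet_b0(pretrained=False)  # randomly initialized EfficientNet-B0
+    model.eval()
+    with torch.no_grad():
+        probs = model(torch.randn(1, 3, 224, 224)).softmax(dim=-1)  # -> (1, 1000) class probabilities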
+ +Hacked together by / Copyright 2019, Ross Wightman +""" +from functools import partial +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .efficientnet_blocks import SqueezeExcite +from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights,\ + round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT +from .features import FeatureInfo, FeatureHooks +from .helpers import build_model_with_cfg, pretrained_cfg_for_features, checkpoint_seq +from .layers import create_conv2d, create_classifier, get_norm_act_layer, EvoNorm2dS0, GroupNormAct +from .registry import register_model + +__all__ = ['EfficientNet', 'EfficientNetFeatures'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv_stem', 'classifier': 'classifier', + **kwargs + } + + +default_cfgs = { + 'mnasnet_050': _cfg(url=''), + 'mnasnet_075': _cfg(url=''), + 'mnasnet_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_b1-74cb7081.pth'), + 'mnasnet_140': _cfg(url=''), + + 'semnasnet_050': _cfg(url=''), + 'semnasnet_075': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/semnasnet_075-18710866.pth'), + 'semnasnet_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_a1-d9418771.pth'), + 'semnasnet_140': _cfg(url=''), + 'mnasnet_small': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_small_lamb-aff75073.pth'), + + 'mobilenetv2_035': _cfg( + url=''), + 'mobilenetv2_050': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_050-3d30d450.pth', + interpolation='bicubic', + ), + 'mobilenetv2_075': _cfg( + url=''), + 'mobilenetv2_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_100_ra-b33bc2c4.pth'), + 'mobilenetv2_110d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_110d_ra-77090ade.pth'), + 'mobilenetv2_120d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_120d_ra-5987e2ed.pth'), + 'mobilenetv2_140': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_140_ra-21a4e913.pth'), + + 'fbnetc_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetc_100-c345b898.pth', + interpolation='bilinear'), + 'spnasnet_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/spnasnet_100-048bc3f4.pth', + interpolation='bilinear'), + + # NOTE experimenting with alternate attention + 'efficientnet_b0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth'), + 'efficientnet_b1': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b1-533bc792.pth', + test_input_size=(3, 256, 256), crop_pct=1.0), + 'efficientnet_b2': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth', + input_size=(3, 256, 256), pool_size=(8, 8), test_input_size=(3, 288, 288), crop_pct=1.0), + 'efficientnet_b3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b3_ra2-cf984f9c.pth', + input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0), + 'efficientnet_b4': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b4_ra2_320-7eb33cd5.pth', + input_size=(3, 320, 320), pool_size=(10, 10), test_input_size=(3, 384, 384), crop_pct=1.0), + 'efficientnet_b5': _cfg( + url='', input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), + 'efficientnet_b6': _cfg( + url='', input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), + 'efficientnet_b7': _cfg( + url='', input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), + 'efficientnet_b8': _cfg( + url='', input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954), + 'efficientnet_l2': _cfg( + url='', input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.961), + + # FIXME experimental + 'efficientnet_b0_gn': _cfg( + url=''), + 'efficientnet_b0_g8_gn': _cfg( + url=''), + 'efficientnet_b0_g16_evos': _cfg( + url=''), + 'efficientnet_b3_gn': _cfg( + url='', + input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0), + 'efficientnet_b3_g8_gn': _cfg( + url='', + input_size=(3, 288, 288), pool_size=(9, 9), test_input_size=(3, 320, 320), crop_pct=1.0), + + 'efficientnet_es': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth'), + 'efficientnet_em': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_em_ra2-66250f76.pth', + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'efficientnet_el': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_el-3b455510.pth', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + + 'efficientnet_es_pruned': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_pruned75-1b7248cf.pth'), + 'efficientnet_el_pruned': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_el_pruned70-ef2a2ccf.pth', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + + 'efficientnet_cc_b0_4e': _cfg(url=''), + 'efficientnet_cc_b0_8e': _cfg(url=''), + 'efficientnet_cc_b1_8e': _cfg(url='', input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + + 'efficientnet_lite0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_lite0_ra-37913777.pth'), + 'efficientnet_lite1': _cfg( + url='', + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'efficientnet_lite2': _cfg( + url='', + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), + 'efficientnet_lite3': _cfg( + url='', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + 'efficientnet_lite4': _cfg( + url='', input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), + + 'efficientnet_b1_pruned': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/effnetb1_pruned-bea43a3a.pth', + input_size=(3, 240, 240), pool_size=(8, 8), 
crop_pct=0.882, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'efficientnet_b2_pruned': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/effnetb2_pruned-08c1b27c.pth', + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'efficientnet_b3_pruned': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/effnetb3_pruned-59ecf72d.pth', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + + 'efficientnetv2_rw_t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnetv2_t_agc-3620981a.pth', + input_size=(3, 224, 224), test_input_size=(3, 288, 288), pool_size=(7, 7), crop_pct=1.0), + 'gc_efficientnetv2_rw_t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gc_efficientnetv2_rw_t_agc-927a0bde.pth', + input_size=(3, 224, 224), test_input_size=(3, 288, 288), pool_size=(7, 7), crop_pct=1.0), + 'efficientnetv2_rw_s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_v2s_ra2_288-a6477665.pth', + input_size=(3, 288, 288), test_input_size=(3, 384, 384), pool_size=(9, 9), crop_pct=1.0), + 'efficientnetv2_rw_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnetv2_rw_m_agc-3d90cb1e.pth', + input_size=(3, 320, 320), test_input_size=(3, 416, 416), pool_size=(10, 10), crop_pct=1.0), + + 'efficientnetv2_s': _cfg( + url='', + input_size=(3, 288, 288), test_input_size=(3, 384, 384), pool_size=(9, 9), crop_pct=1.0), + 'efficientnetv2_m': _cfg( + url='', + input_size=(3, 320, 320), test_input_size=(3, 416, 416), pool_size=(10, 10), crop_pct=1.0), + 'efficientnetv2_l': _cfg( + url='', + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + 'efficientnetv2_xl': _cfg( + url='', + input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0), + + 'tf_efficientnet_b0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_aa-827b6e33.pth', + input_size=(3, 224, 224)), + 'tf_efficientnet_b1': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_aa-ea7a6ee0.pth', + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'tf_efficientnet_b2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_aa-60c94f97.pth', + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), + 'tf_efficientnet_b3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_aa-84b4657e.pth', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + 'tf_efficientnet_b4': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_aa-818f208c.pth', + input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), + 'tf_efficientnet_b5': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ra-9a3e5369.pth', + input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), + 'tf_efficientnet_b6': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_aa-80ba17e4.pth', + input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), + 'tf_efficientnet_b7': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ra-6c08e654.pth', + input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), + 'tf_efficientnet_b8': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ra-572d5dd9.pth', + input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954), + + 'tf_efficientnet_b0_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, input_size=(3, 224, 224)), + 'tf_efficientnet_b1_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ap-44ef0a3d.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'tf_efficientnet_b2_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ap-2f8e7636.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), + 'tf_efficientnet_b3_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ap-aad25bdd.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + 'tf_efficientnet_b4_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ap-dedb23e6.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), + 'tf_efficientnet_b5_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ap-9e82fae8.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), + 'tf_efficientnet_b6_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ap-4ffb161f.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), + 'tf_efficientnet_b7_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ap-ddb28fec.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), + 'tf_efficientnet_b8_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ap-00e169fa.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954), + + 'tf_efficientnet_b0_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ns-c0e6a31c.pth', + input_size=(3, 224, 224)), + 'tf_efficientnet_b1_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ns-99dd0c41.pth', + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 
'tf_efficientnet_b2_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ns-00306e48.pth', + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), + 'tf_efficientnet_b3_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ns-9d44bf68.pth', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + 'tf_efficientnet_b4_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ns-d6313a46.pth', + input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), + 'tf_efficientnet_b5_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ns-6f26d0cf.pth', + input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), + 'tf_efficientnet_b6_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ns-51548356.pth', + input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), + 'tf_efficientnet_b7_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ns-1dbc32de.pth', + input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), + 'tf_efficientnet_l2_ns_475': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns_475-bebbd00a.pth', + input_size=(3, 475, 475), pool_size=(15, 15), crop_pct=0.936), + 'tf_efficientnet_l2_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns-df73bb44.pth', + input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.96), + + 'tf_efficientnet_es': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 224, 224), ), + 'tf_efficientnet_em': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_em-e78cfe58.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'tf_efficientnet_el': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_el-5143854e.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + + 'tf_efficientnet_cc_b0_4e': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_4e-4362b6b2.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_efficientnet_cc_b0_8e': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_8e-66184a25.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_efficientnet_cc_b1_8e': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b1_8e-f7c79ae1.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + + 'tf_efficientnet_lite0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite0-0aa007d2.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + interpolation='bicubic', # should be bilinear but bicubic better 
match for TF bilinear at low res + ), + 'tf_efficientnet_lite1': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite1-bde8b488.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882, + interpolation='bicubic', # should be bilinear but bicubic better match for TF bilinear at low res + ), + 'tf_efficientnet_lite2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite2-dcccb7df.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890, + interpolation='bicubic', # should be bilinear but bicubic better match for TF bilinear at low res + ), + 'tf_efficientnet_lite3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904, interpolation='bilinear'), + 'tf_efficientnet_lite4': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite4-741542c3.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.920, interpolation='bilinear'), + + 'tf_efficientnetv2_s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s-eb54923e.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 300, 300), test_input_size=(3, 384, 384), pool_size=(10, 10), crop_pct=1.0), + 'tf_efficientnetv2_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m-cc09e0cd.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + 'tf_efficientnetv2_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l-d664b728.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + + 'tf_efficientnetv2_s_in21ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21ft1k-d7dafa41.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 300, 300), test_input_size=(3, 384, 384), pool_size=(10, 10), crop_pct=1.0), + 'tf_efficientnetv2_m_in21ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21ft1k-bf41664a.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + 'tf_efficientnetv2_l_in21ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21ft1k-60127a9d.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + 'tf_efficientnetv2_xl_in21ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21ft1k-06c35c48.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0), + + 
'tf_efficientnetv2_s_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_s_21k-6337ad01.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, + input_size=(3, 300, 300), test_input_size=(3, 384, 384), pool_size=(10, 10), crop_pct=1.0), + 'tf_efficientnetv2_m_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_m_21k-361418a2.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + 'tf_efficientnetv2_l_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_l_21k-91a19ec9.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, + input_size=(3, 384, 384), test_input_size=(3, 480, 480), pool_size=(12, 12), crop_pct=1.0), + 'tf_efficientnetv2_xl_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_xl_in21k-fd7e8abf.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), num_classes=21843, + input_size=(3, 384, 384), test_input_size=(3, 512, 512), pool_size=(12, 12), crop_pct=1.0), + + 'tf_efficientnetv2_b0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b0-c7cc451f.pth', + input_size=(3, 192, 192), test_input_size=(3, 224, 224), pool_size=(6, 6)), + 'tf_efficientnetv2_b1': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b1-be6e41b0.pth', + input_size=(3, 192, 192), test_input_size=(3, 240, 240), pool_size=(6, 6), crop_pct=0.882), + 'tf_efficientnetv2_b2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b2-847de54e.pth', + input_size=(3, 208, 208), test_input_size=(3, 260, 260), pool_size=(7, 7), crop_pct=0.890), + 'tf_efficientnetv2_b3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-effv2-weights/tf_efficientnetv2_b3-57773f13.pth', + input_size=(3, 240, 240), test_input_size=(3, 300, 300), pool_size=(8, 8), crop_pct=0.904), + + 'mixnet_s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_s-a907afbc.pth'), + 'mixnet_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_m-4647fc68.pth'), + 'mixnet_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_l-5a9a2ed8.pth'), + 'mixnet_xl': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_xl_ra-aac3c00c.pth'), + 'mixnet_xxl': _cfg(), + + 'tf_mixnet_s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_s-89d3354b.pth'), + 'tf_mixnet_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_m-0f4d8805.pth'), + 'tf_mixnet_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_l-6c92e0c8.pth'), + + "tinynet_a": _cfg( + input_size=(3, 192, 192), pool_size=(6, 6), # int(224 * 0.86) + url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_a.pth'), + "tinynet_b": _cfg( + input_size=(3, 188, 188), pool_size=(6, 6), # 
int(224 * 0.84) + url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_b.pth'), + "tinynet_c": _cfg( + input_size=(3, 184, 184), pool_size=(6, 6), # int(224 * 0.825) + url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_c.pth'), + "tinynet_d": _cfg( + input_size=(3, 152, 152), pool_size=(5, 5), # int(224 * 0.68) + url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_d.pth'), + "tinynet_e": _cfg( + input_size=(3, 106, 106), pool_size=(4, 4), # int(224 * 0.475) + url='https://github.com/huawei-noah/CV-Backbones/releases/download/v1.2.0/tinynet_e.pth'), +} + + +class EfficientNet(nn.Module): + """ EfficientNet + + A flexible and performant PyTorch implementation of efficient network architectures, including: + * EfficientNet-V2 Small, Medium, Large, XL & B0-B3 + * EfficientNet B0-B8, L2 + * EfficientNet-EdgeTPU + * EfficientNet-CondConv + * MixNet S, M, L, XL + * MnasNet A1, B1, and small + * MobileNet-V2 + * FBNet C + * Single-Path NAS Pixel1 + * TinyNet + """ + + def __init__( + self, block_args, num_classes=1000, num_features=1280, in_chans=3, stem_size=32, fix_stem=False, + output_stride=32, pad_type='', round_chs_fn=round_channels, act_layer=None, norm_layer=None, + se_layer=None, drop_rate=0., drop_path_rate=0., global_pool='avg'): + super(EfficientNet, self).__init__() + act_layer = act_layer or nn.ReLU + norm_layer = norm_layer or nn.BatchNorm2d + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + se_layer = se_layer or SqueezeExcite + self.num_classes = num_classes + self.num_features = num_features + self.drop_rate = drop_rate + self.grad_checkpointing = False + + # Stem + if not fix_stem: + stem_size = round_chs_fn(stem_size) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_act_layer(stem_size, inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn, + act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = builder.features + head_chs = builder.in_chs + + # Head + Pooling + self.conv_head = create_conv2d(head_chs, self.num_features, 1, padding=pad_type) + self.bn2 = norm_act_layer(self.num_features, inplace=True) + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + efficientnet_init_weights(self) + + def as_sequential(self): + layers = [self.conv_stem, self.bn1] + layers.extend(self.blocks) + layers.extend([self.conv_head, self.bn2, self.global_pool]) + layers.extend([nn.Dropout(self.drop_rate), self.classifier]) + return nn.Sequential(*layers) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^conv_stem|bn1', + blocks=[ + (r'^blocks\.(\d+)' if coarse else r'^blocks\.(\d+)\.(\d+)', None), + (r'conv_head|bn2', (99999,)) + ] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.classifier + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.conv_stem(x) + x = self.bn1(x) + if 
self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x, flatten=True) + else: + x = self.blocks(x) + x = self.conv_head(x) + x = self.bn2(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + return x if pre_logits else self.classifier(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +class EfficientNetFeatures(nn.Module): + """ EfficientNet Feature Extractor + + A work-in-progress feature extraction module for EfficientNet, to use as a backbone for segmentation + and object detection models. + """ + + def __init__( + self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', in_chans=3, + stem_size=32, fix_stem=False, output_stride=32, pad_type='', round_chs_fn=round_channels, + act_layer=None, norm_layer=None, se_layer=None, drop_rate=0., drop_path_rate=0.): + super(EfficientNetFeatures, self).__init__() + act_layer = act_layer or nn.ReLU + norm_layer = norm_layer or nn.BatchNorm2d + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + se_layer = se_layer or SqueezeExcite + self.drop_rate = drop_rate + + # Stem + if not fix_stem: + stem_size = round_chs_fn(stem_size) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_act_layer(stem_size, inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn, + act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate, + feature_location=feature_location) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = FeatureInfo(builder.features, out_indices) + self._stage_out_idx = {v['stage']: i for i, v in enumerate(self.feature_info) if i in out_indices} + + efficientnet_init_weights(self) + + # Register feature extraction hooks with FeatureHooks helper + self.feature_hooks = None + if feature_location != 'bottleneck': + hooks = self.feature_info.get_dicts(keys=('module', 'hook_type')) + self.feature_hooks = FeatureHooks(hooks, self.named_modules()) + + def forward(self, x) -> List[torch.Tensor]: + x = self.conv_stem(x) + x = self.bn1(x) + if self.feature_hooks is None: + features = [] + if 0 in self._stage_out_idx: + features.append(x) # add stem out + for i, b in enumerate(self.blocks): + x = b(x) + if i + 1 in self._stage_out_idx: + features.append(x) + return features + else: + self.blocks(x) + out = self.feature_hooks.get_output(x.device) + return list(out.values()) + + +def _create_effnet(variant, pretrained=False, **kwargs): + features_only = False + model_cls = EfficientNet + kwargs_filter = None + if kwargs.pop('features_only', False): + features_only = True + kwargs_filter = ('num_classes', 'num_features', 'head_conv', 'global_pool') + model_cls = EfficientNetFeatures + model = build_model_with_cfg( + model_cls, variant, pretrained, + pretrained_strict=not features_only, + kwargs_filter=kwargs_filter, + **kwargs) + if features_only: + model.default_cfg = pretrained_cfg_for_features(model.default_cfg) + return model + + +def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a mnasnet-a1 model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet + Paper: https://arxiv.org/pdf/1807.11626.pdf. 
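+
+ Block-string notation used by the `arch_def` lists in this file, roughly as
+ parsed by `decode_arch_def`: 'ds' = depthwise-separable block, 'ir' = inverted
+ residual, 'er' = edge residual, 'cn' = conv-bn-act; 'r2' = repeat 2x, 'k3' =
+ 3x3 kernel, 's2' = stride 2, 'e6' = expansion ratio 6, 'c24' = 24 output
+ channels, 'se0.25' = squeeze-excite ratio 0.25, '_noskip' = no residual.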
+ + Args: + channel_multiplier: multiplier to number of channels per layer. + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16_noskip'], + # stage 1, 112x112 in + ['ir_r2_k3_s2_e6_c24'], + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40_se0.25'], + # stage 3, 28x28 in + ['ir_r4_k3_s2_e6_c80'], + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c112_se0.25'], + # stage 5, 14x14in + ['ir_r3_k5_s2_e6_c160_se0.25'], + # stage 6, 7x7 in + ['ir_r1_k3_s1_e6_c320'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=32, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a mnasnet-b1 model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet + Paper: https://arxiv.org/pdf/1807.11626.pdf. + + Args: + channel_multiplier: multiplier to number of channels per layer. + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_c16_noskip'], + # stage 1, 112x112 in + ['ir_r3_k3_s2_e3_c24'], + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40'], + # stage 3, 28x28 in + ['ir_r3_k5_s2_e6_c80'], + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c96'], + # stage 5, 14x14in + ['ir_r4_k5_s2_e6_c192'], + # stage 6, 7x7 in + ['ir_r1_k3_s1_e6_c320_noskip'] + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=32, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a mnasnet-b1 model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet + Paper: https://arxiv.org/pdf/1807.11626.pdf. + + Args: + channel_multiplier: multiplier to number of channels per layer. 
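+
+ Note: despite the summary line above, this builds the MnasNet-Small variant
+ used by `mnasnet_small` below (stem size 8, much narrower stage widths than B1).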
+ """ + arch_def = [ + ['ds_r1_k3_s1_c8'], + ['ir_r1_k3_s2_e3_c16'], + ['ir_r2_k3_s2_e6_c16'], + ['ir_r4_k5_s2_e6_c32_se0.25'], + ['ir_r3_k3_s1_e6_c32_se0.25'], + ['ir_r3_k5_s2_e6_c88_se0.25'], + ['ir_r1_k3_s1_e6_c144'] + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=8, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mobilenet_v2( + variant, channel_multiplier=1.0, depth_multiplier=1.0, fix_stem_head=False, pretrained=False, **kwargs): + """ Generate MobileNet-V2 network + Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py + Paper: https://arxiv.org/abs/1801.04381 + """ + arch_def = [ + ['ds_r1_k3_s1_c16'], + ['ir_r2_k3_s2_e6_c24'], + ['ir_r3_k3_s2_e6_c32'], + ['ir_r4_k3_s2_e6_c64'], + ['ir_r3_k3_s1_e6_c96'], + ['ir_r3_k3_s2_e6_c160'], + ['ir_r1_k3_s1_e6_c320'], + ] + round_chs_fn = partial(round_channels, multiplier=channel_multiplier) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier=depth_multiplier, fix_first_last=fix_stem_head), + num_features=1280 if fix_stem_head else max(1280, round_chs_fn(1280)), + stem_size=32, + fix_stem=fix_stem_head, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'relu6'), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """ FBNet-C + + Paper: https://arxiv.org/abs/1812.03443 + Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py + + NOTE: the impl above does not relate to the 'C' variant here, that was derived from paper, + it was used to confirm some building block details + """ + arch_def = [ + ['ir_r1_k3_s1_e1_c16'], + ['ir_r1_k3_s2_e6_c24', 'ir_r2_k3_s1_e1_c24'], + ['ir_r1_k5_s2_e6_c32', 'ir_r1_k5_s1_e3_c32', 'ir_r1_k5_s1_e6_c32', 'ir_r1_k3_s1_e6_c32'], + ['ir_r1_k5_s2_e6_c64', 'ir_r1_k5_s1_e3_c64', 'ir_r2_k5_s1_e6_c64'], + ['ir_r3_k5_s1_e6_c112', 'ir_r1_k5_s1_e3_c112'], + ['ir_r4_k5_s2_e6_c184'], + ['ir_r1_k3_s1_e6_c352'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=16, + num_features=1984, # paper suggests this, but is not 100% clear + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates the Single-Path NAS model from search targeted for Pixel1 phone. + + Paper: https://arxiv.org/abs/1904.02877 + + Args: + channel_multiplier: multiplier to number of channels per layer. 
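+
+ Note: stages listing two strings, e.g. ['ir_r1_k5_s2_e6_c80', 'ir_r3_k3_s1_e3_c80'],
+ chain block groups within one stage; here the strided first block uses a 5x5
+ kernel and expansion 6 while the following repeats use 3x3 and expansion 3.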
+ """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_c16_noskip'], + # stage 1, 112x112 in + ['ir_r3_k3_s2_e3_c24'], + # stage 2, 56x56 in + ['ir_r1_k5_s2_e6_c40', 'ir_r3_k3_s1_e3_c40'], + # stage 3, 28x28 in + ['ir_r1_k5_s2_e6_c80', 'ir_r3_k3_s1_e3_c80'], + # stage 4, 14x14in + ['ir_r1_k5_s1_e6_c96', 'ir_r3_k5_s1_e3_c96'], + # stage 5, 14x14in + ['ir_r4_k5_s2_e6_c192'], + # stage 6, 7x7 in + ['ir_r1_k3_s1_e6_c320_noskip'] + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=32, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnet( + variant, channel_multiplier=1.0, depth_multiplier=1.0, channel_divisor=8, + group_size=None, pretrained=False, **kwargs): + """Creates an EfficientNet model. + + Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py + Paper: https://arxiv.org/abs/1905.11946 + + EfficientNet params + name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) + 'efficientnet-b0': (1.0, 1.0, 224, 0.2), + 'efficientnet-b1': (1.0, 1.1, 240, 0.2), + 'efficientnet-b2': (1.1, 1.2, 260, 0.3), + 'efficientnet-b3': (1.2, 1.4, 300, 0.3), + 'efficientnet-b4': (1.4, 1.8, 380, 0.4), + 'efficientnet-b5': (1.6, 2.2, 456, 0.4), + 'efficientnet-b6': (1.8, 2.6, 528, 0.5), + 'efficientnet-b7': (2.0, 3.1, 600, 0.5), + 'efficientnet-b8': (2.2, 3.6, 672, 0.5), + 'efficientnet-l2': (4.3, 5.3, 800, 0.5), + + Args: + channel_multiplier: multiplier to number of channels per layer + depth_multiplier: multiplier to number of repeats per stage + + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16_se0.25'], + ['ir_r2_k3_s2_e6_c24_se0.25'], + ['ir_r2_k5_s2_e6_c40_se0.25'], + ['ir_r3_k3_s2_e6_c80_se0.25'], + ['ir_r3_k5_s1_e6_c112_se0.25'], + ['ir_r4_k5_s2_e6_c192_se0.25'], + ['ir_r1_k3_s1_e6_c320_se0.25'], + ] + round_chs_fn = partial(round_channels, multiplier=channel_multiplier, divisor=channel_divisor) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size), + num_features=round_chs_fn(1280), + stem_size=32, + round_chs_fn=round_chs_fn, + act_layer=resolve_act_layer(kwargs, 'swish'), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnet_edge( + variant, channel_multiplier=1.0, depth_multiplier=1.0, group_size=None, pretrained=False, **kwargs): + """ Creates an EfficientNet-EdgeTPU model + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu + """ + + arch_def = [ + # NOTE `fc` is present to override a mismatch between stem channels and in chs not + # present in other models + ['er_r1_k3_s1_e4_c24_fc24_noskip'], + ['er_r2_k3_s2_e8_c32'], + ['er_r4_k3_s2_e8_c48'], + ['ir_r5_k5_s2_e8_c96'], + ['ir_r4_k5_s1_e8_c144'], + ['ir_r2_k5_s2_e8_c192'], + ] + round_chs_fn = partial(round_channels, multiplier=channel_multiplier) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size), + num_features=round_chs_fn(1280), + stem_size=32, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 
'relu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnet_condconv( + variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs): + """Creates an EfficientNet-CondConv model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16_se0.25'], + ['ir_r2_k3_s2_e6_c24_se0.25'], + ['ir_r2_k5_s2_e6_c40_se0.25'], + ['ir_r3_k3_s2_e6_c80_se0.25'], + ['ir_r3_k5_s1_e6_c112_se0.25_cc4'], + ['ir_r4_k5_s2_e6_c192_se0.25_cc4'], + ['ir_r1_k3_s1_e6_c320_se0.25_cc4'], + ] + # NOTE unlike official impl, this one uses `cc` option where x is the base number of experts for each stage and + # the expert_multiplier increases that on a per-model basis as with depth/channel multipliers + round_chs_fn = partial(round_channels, multiplier=channel_multiplier) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier), + num_features=round_chs_fn(1280), + stem_size=32, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'swish'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """Creates an EfficientNet-Lite model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite + Paper: https://arxiv.org/abs/1905.11946 + + EfficientNet params + name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) + 'efficientnet-lite0': (1.0, 1.0, 224, 0.2), + 'efficientnet-lite1': (1.0, 1.1, 240, 0.2), + 'efficientnet-lite2': (1.1, 1.2, 260, 0.3), + 'efficientnet-lite3': (1.2, 1.4, 280, 0.3), + 'efficientnet-lite4': (1.4, 1.8, 300, 0.3), + + Args: + channel_multiplier: multiplier to number of channels per layer + depth_multiplier: multiplier to number of repeats per stage + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16'], + ['ir_r2_k3_s2_e6_c24'], + ['ir_r2_k5_s2_e6_c40'], + ['ir_r3_k3_s2_e6_c80'], + ['ir_r3_k5_s1_e6_c112'], + ['ir_r4_k5_s2_e6_c192'], + ['ir_r1_k3_s1_e6_c320'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, fix_first_last=True), + num_features=1280, + stem_size=32, + fix_stem=True, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + act_layer=resolve_act_layer(kwargs, 'relu6'), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_base( + variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 base model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + """ + arch_def = [ + ['cn_r1_k3_s1_e1_c16_skip'], + ['er_r2_k3_s2_e4_c32'], + ['er_r2_k3_s2_e4_c48'], + ['ir_r3_k3_s2_e4_c96_se0.25'], + ['ir_r5_k3_s1_e6_c112_se0.25'], + ['ir_r8_k3_s2_e6_c192_se0.25'], + ] + round_chs_fn = partial(round_channels, multiplier=channel_multiplier, round_limit=0.) 
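+ # NOTE round_limit=0. turns off the usual ~10% round-down guard in
+ # round_channels (make_divisible under the hood), so scaled widths are rounded
+ # to the nearest multiple of the divisor (8) without being bumped back up.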
+ model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=round_chs_fn(1280), + stem_size=32, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_s( + variant, channel_multiplier=1.0, depth_multiplier=1.0, group_size=None, rw=False, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 Small model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + + NOTE: `rw` flag sets up 'small' variant to behave like my initial v2 small model, + before ref the impl was released. + """ + arch_def = [ + ['cn_r2_k3_s1_e1_c24_skip'], + ['er_r4_k3_s2_e4_c48'], + ['er_r4_k3_s2_e4_c64'], + ['ir_r6_k3_s2_e4_c128_se0.25'], + ['ir_r9_k3_s1_e6_c160_se0.25'], + ['ir_r15_k3_s2_e6_c256_se0.25'], + ] + num_features = 1280 + if rw: + # my original variant, based on paper figure differs from the official release + arch_def[0] = ['er_r2_k3_s1_e1_c24'] + arch_def[-1] = ['ir_r15_k3_s2_e6_c272_se0.25'] + num_features = 1792 + + round_chs_fn = partial(round_channels, multiplier=channel_multiplier) + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, group_size=group_size), + num_features=round_chs_fn(num_features), + stem_size=24, + round_chs_fn=round_chs_fn, + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 Medium model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + """ + + arch_def = [ + ['cn_r3_k3_s1_e1_c24_skip'], + ['er_r5_k3_s2_e4_c48'], + ['er_r5_k3_s2_e4_c80'], + ['ir_r7_k3_s2_e4_c160_se0.25'], + ['ir_r14_k3_s1_e6_c176_se0.25'], + ['ir_r18_k3_s2_e6_c304_se0.25'], + ['ir_r5_k3_s1_e6_c512_se0.25'], + ] + + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=1280, + stem_size=24, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_l(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 Large model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + """ + + arch_def = [ + ['cn_r4_k3_s1_e1_c32_skip'], + ['er_r7_k3_s2_e4_c64'], + ['er_r7_k3_s2_e4_c96'], + ['ir_r10_k3_s2_e4_c192_se0.25'], + ['ir_r19_k3_s1_e6_c224_se0.25'], + ['ir_r25_k3_s2_e6_c384_se0.25'], + ['ir_r7_k3_s1_e6_c640_se0.25'], + ] + + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=1280, + stem_size=32, + 
round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_efficientnetv2_xl(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """ Creates an EfficientNet-V2 Xtra-Large model + + Ref impl: https://github.com/google/automl/tree/master/efficientnetv2 + Paper: `EfficientNetV2: Smaller Models and Faster Training` - https://arxiv.org/abs/2104.00298 + """ + + arch_def = [ + ['cn_r4_k3_s1_e1_c32_skip'], + ['er_r8_k3_s2_e4_c64'], + ['er_r8_k3_s2_e4_c96'], + ['ir_r16_k3_s2_e4_c192_se0.25'], + ['ir_r24_k3_s1_e6_c256_se0.25'], + ['ir_r32_k3_s2_e6_c512_se0.25'], + ['ir_r8_k3_s1_e6_c640_se0.25'], + ] + + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=1280, + stem_size=32, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'silu'), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MixNet Small model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet + Paper: https://arxiv.org/abs/1907.09595 + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16'], # relu + # stage 1, 112x112 in + ['ir_r1_k3_a1.1_p1.1_s2_e6_c24', 'ir_r1_k3_a1.1_p1.1_s1_e3_c24'], # relu + # stage 2, 56x56 in + ['ir_r1_k3.5.7_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish + # stage 3, 28x28 in + ['ir_r1_k3.5.7_p1.1_s2_e6_c80_se0.25_nsw', 'ir_r2_k3.5_p1.1_s1_e6_c80_se0.25_nsw'], # swish + # stage 4, 14x14in + ['ir_r1_k3.5.7_a1.1_p1.1_s1_e6_c120_se0.5_nsw', 'ir_r2_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish + # stage 5, 14x14in + ['ir_r1_k3.5.7.9.11_s2_e6_c200_se0.5_nsw', 'ir_r2_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish + # 7x7 + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + num_features=1536, + stem_size=16, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MixNet Medium-Large model. 
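+
+ Mixed-kernel notation in the `arch_def` below, roughly: 'k3.5.7' splits the
+ depthwise conv channels across 3x3/5x5/7x7 kernels (MixConv), 'a1.1' / 'p1.1'
+ set the expansion and pointwise-linear conv kernel sizes, and a trailing
+ '_nsw' selects the swish non-linearity for that block group.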
+ + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet + Paper: https://arxiv.org/abs/1907.09595 + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c24'], # relu + # stage 1, 112x112 in + ['ir_r1_k3.5.7_a1.1_p1.1_s2_e6_c32', 'ir_r1_k3_a1.1_p1.1_s1_e3_c32'], # relu + # stage 2, 56x56 in + ['ir_r1_k3.5.7.9_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish + # stage 3, 28x28 in + ['ir_r1_k3.5.7_s2_e6_c80_se0.25_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e6_c80_se0.25_nsw'], # swish + # stage 4, 14x14in + ['ir_r1_k3_s1_e6_c120_se0.5_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish + # stage 5, 14x14in + ['ir_r1_k3.5.7.9_s2_e6_c200_se0.5_nsw', 'ir_r3_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish + # 7x7 + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'), + num_features=1536, + stem_size=24, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +def _gen_tinynet( + variant, model_width=1.0, depth_multiplier=1.0, pretrained=False, **kwargs +): + """Creates a TinyNet model. + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16_se0.25'], ['ir_r2_k3_s2_e6_c24_se0.25'], + ['ir_r2_k5_s2_e6_c40_se0.25'], ['ir_r3_k3_s2_e6_c80_se0.25'], + ['ir_r3_k5_s1_e6_c112_se0.25'], ['ir_r4_k5_s2_e6_c192_se0.25'], + ['ir_r1_k3_s1_e6_c320_se0.25'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'), + num_features=max(1280, round_channels(1280, model_width, 8, None)), + stem_size=32, + fix_stem=True, + round_chs_fn=partial(round_channels, multiplier=model_width), + act_layer=resolve_act_layer(kwargs, 'swish'), + norm_layer=kwargs.pop('norm_layer', None) or partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + **kwargs, + ) + model = _create_effnet(variant, pretrained, **model_kwargs) + return model + + +@register_model +def mnasnet_050(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 0.5. """ + model = _gen_mnasnet_b1('mnasnet_050', 0.5, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mnasnet_075(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 0.75. """ + model = _gen_mnasnet_b1('mnasnet_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mnasnet_100(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 1.0. """ + model = _gen_mnasnet_b1('mnasnet_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mnasnet_b1(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 1.0. """ + return mnasnet_100(pretrained, **kwargs) + + +@register_model +def mnasnet_140(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 1.4 """ + model = _gen_mnasnet_b1('mnasnet_140', 1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def semnasnet_050(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 0.5 """ + model = _gen_mnasnet_a1('semnasnet_050', 0.5, pretrained=pretrained, **kwargs) + return model + + +@register_model +def semnasnet_075(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 0.75. 
""" + model = _gen_mnasnet_a1('semnasnet_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def semnasnet_100(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """ + model = _gen_mnasnet_a1('semnasnet_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mnasnet_a1(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """ + return semnasnet_100(pretrained, **kwargs) + + +@register_model +def semnasnet_140(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 1.4. """ + model = _gen_mnasnet_a1('semnasnet_140', 1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mnasnet_small(pretrained=False, **kwargs): + """ MNASNet Small, depth multiplier of 1.0. """ + model = _gen_mnasnet_small('mnasnet_small', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv2_035(pretrained=False, **kwargs): + """ MobileNet V2 w/ 0.35 channel multiplier """ + model = _gen_mobilenet_v2('mobilenetv2_035', 0.35, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv2_050(pretrained=False, **kwargs): + """ MobileNet V2 w/ 0.5 channel multiplier """ + model = _gen_mobilenet_v2('mobilenetv2_050', 0.5, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv2_075(pretrained=False, **kwargs): + """ MobileNet V2 w/ 0.75 channel multiplier """ + model = _gen_mobilenet_v2('mobilenetv2_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv2_100(pretrained=False, **kwargs): + """ MobileNet V2 w/ 1.0 channel multiplier """ + model = _gen_mobilenet_v2('mobilenetv2_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv2_140(pretrained=False, **kwargs): + """ MobileNet V2 w/ 1.4 channel multiplier """ + model = _gen_mobilenet_v2('mobilenetv2_140', 1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv2_110d(pretrained=False, **kwargs): + """ MobileNet V2 w/ 1.1 channel, 1.2 depth multipliers""" + model = _gen_mobilenet_v2( + 'mobilenetv2_110d', 1.1, depth_multiplier=1.2, fix_stem_head=True, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv2_120d(pretrained=False, **kwargs): + """ MobileNet V2 w/ 1.2 channel, 1.4 depth multipliers """ + model = _gen_mobilenet_v2( + 'mobilenetv2_120d', 1.2, depth_multiplier=1.4, fix_stem_head=True, pretrained=pretrained, **kwargs) + return model + + +@register_model +def fbnetc_100(pretrained=False, **kwargs): + """ FBNet-C """ + if pretrained: + # pretrained model trained with non-default BN epsilon + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + model = _gen_fbnetc('fbnetc_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def spnasnet_100(pretrained=False, **kwargs): + """ Single-Path NAS Pixel1""" + model = _gen_spnasnet('spnasnet_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b0(pretrained=False, **kwargs): + """ EfficientNet-B0 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b1(pretrained=False, **kwargs): + """ EfficientNet-B1 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + 
model = _gen_efficientnet( + 'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b2(pretrained=False, **kwargs): + """ EfficientNet-B2 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b2a(pretrained=False, **kwargs): + """ EfficientNet-B2 @ 288x288 w/ 1.0 test crop""" + # WARN this model def is deprecated, different train/test res + test crop handled by default_cfg now + return efficientnet_b2(pretrained=pretrained, **kwargs) + + +@register_model +def efficientnet_b3(pretrained=False, **kwargs): + """ EfficientNet-B3 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b3a(pretrained=False, **kwargs): + """ EfficientNet-B3 @ 320x320 w/ 1.0 test crop-pct """ + # WARN this model def is deprecated, different train/test res + test crop handled by default_cfg now + return efficientnet_b3(pretrained=pretrained, **kwargs) + + +@register_model +def efficientnet_b4(pretrained=False, **kwargs): + """ EfficientNet-B4 """ + # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b5(pretrained=False, **kwargs): + """ EfficientNet-B5 """ + # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b6(pretrained=False, **kwargs): + """ EfficientNet-B6 """ + # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b7(pretrained=False, **kwargs): + """ EfficientNet-B7 """ + # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b8(pretrained=False, **kwargs): + """ EfficientNet-B8 """ + # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_l2(pretrained=False, **kwargs): + """ EfficientNet-L2.""" + # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_l2', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) + return model + + +# FIXME experimental group cong / GroupNorm / EvoNorm experiments +@register_model +def efficientnet_b0_gn(pretrained=False, **kwargs): + """ EfficientNet-B0 + GroupNorm""" + model = _gen_efficientnet( + 'efficientnet_b0_gn', norm_layer=partial(GroupNormAct, group_size=8), pretrained=pretrained, **kwargs) + 
return model + + +@register_model +def efficientnet_b0_g8_gn(pretrained=False, **kwargs): + """ EfficientNet-B0 w/ group conv + GroupNorm""" + model = _gen_efficientnet( + 'efficientnet_b0_g8_gn', group_size=8, norm_layer=partial(GroupNormAct, group_size=8), + pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b0_g16_evos(pretrained=False, **kwargs): + """ EfficientNet-B0 w/ group 16 conv + EvoNorm""" + model = _gen_efficientnet( + 'efficientnet_b0_g16_evos', group_size=16, channel_divisor=16, + pretrained=pretrained, **kwargs) #norm_layer=partial(EvoNorm2dS0, group_size=16), + return model + + +@register_model +def efficientnet_b3_gn(pretrained=False, **kwargs): + """ EfficientNet-B3 w/ GroupNorm """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b3_gn', channel_multiplier=1.2, depth_multiplier=1.4, channel_divisor=16, + norm_layer=partial(GroupNormAct, group_size=16), pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b3_g8_gn(pretrained=False, **kwargs): + """ EfficientNet-B3 w/ grouped conv + BN""" + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b3_g8_gn', channel_multiplier=1.2, depth_multiplier=1.4, group_size=8, channel_divisor=16, + norm_layer=partial(GroupNormAct, group_size=16), pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_es(pretrained=False, **kwargs): + """ EfficientNet-Edge Small. """ + model = _gen_efficientnet_edge( + 'efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_es_pruned(pretrained=False, **kwargs): + """ EfficientNet-Edge Small Pruned. For more info: https://github.com/DeGirum/pruned-models/releases/tag/efficientnet_v1.0""" + model = _gen_efficientnet_edge( + 'efficientnet_es_pruned', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + +@register_model +def efficientnet_em(pretrained=False, **kwargs): + """ EfficientNet-Edge-Medium. """ + model = _gen_efficientnet_edge( + 'efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_el(pretrained=False, **kwargs): + """ EfficientNet-Edge-Large. """ + model = _gen_efficientnet_edge( + 'efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + +@register_model +def efficientnet_el_pruned(pretrained=False, **kwargs): + """ EfficientNet-Edge-Large pruned. 
For more info: https://github.com/DeGirum/pruned-models/releases/tag/efficientnet_v1.0""" + model = _gen_efficientnet_edge( + 'efficientnet_el_pruned', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + +@register_model +def efficientnet_cc_b0_4e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B0 w/ 8 Experts """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_condconv( + 'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_cc_b0_8e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B0 w/ 8 Experts """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_condconv( + 'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_cc_b1_8e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B1 w/ 8 Experts """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_condconv( + 'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_lite0(pretrained=False, **kwargs): + """ EfficientNet-Lite0 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_lite1(pretrained=False, **kwargs): + """ EfficientNet-Lite1 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_lite2(pretrained=False, **kwargs): + """ EfficientNet-Lite2 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_lite3(pretrained=False, **kwargs): + """ EfficientNet-Lite3 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_lite4(pretrained=False, **kwargs): + """ EfficientNet-Lite4 """ + # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b1_pruned(pretrained=False, **kwargs): + """ EfficientNet-B1 Pruned. 
The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + variant = 'efficientnet_b1_pruned' + model = _gen_efficientnet( + variant, channel_multiplier=1.0, depth_multiplier=1.1, pruned=True, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b2_pruned(pretrained=False, **kwargs): + """ EfficientNet-B2 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'efficientnet_b2_pruned', channel_multiplier=1.1, depth_multiplier=1.2, pruned=True, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnet_b3_pruned(pretrained=False, **kwargs): + """ EfficientNet-B3 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'efficientnet_b3_pruned', channel_multiplier=1.2, depth_multiplier=1.4, pruned=True, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_rw_t(pretrained=False, **kwargs): + """ EfficientNet-V2 Tiny (Custom variant, tiny not in paper). """ + model = _gen_efficientnetv2_s( + 'efficientnetv2_rw_t', channel_multiplier=0.8, depth_multiplier=0.9, rw=False, pretrained=pretrained, **kwargs) + return model + + +@register_model +def gc_efficientnetv2_rw_t(pretrained=False, **kwargs): + """ EfficientNet-V2 Tiny w/ Global Context Attn (Custom variant, tiny not in paper). """ + model = _gen_efficientnetv2_s( + 'gc_efficientnetv2_rw_t', channel_multiplier=0.8, depth_multiplier=0.9, + rw=False, se_layer='gc', pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_rw_s(pretrained=False, **kwargs): + """ EfficientNet-V2 Small (RW variant). + NOTE: This is my initial (pre official code release) w/ some differences. + See efficientnetv2_s and tf_efficientnetv2_s for versions that match the official w/ PyTorch vs TF padding + """ + model = _gen_efficientnetv2_s('efficientnetv2_rw_s', rw=True, pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_rw_m(pretrained=False, **kwargs): + """ EfficientNet-V2 Medium (RW variant). + """ + model = _gen_efficientnetv2_s( + 'efficientnetv2_rw_m', channel_multiplier=1.2, depth_multiplier=(1.2,) * 4 + (1.6,) * 2, rw=True, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_s(pretrained=False, **kwargs): + """ EfficientNet-V2 Small. """ + model = _gen_efficientnetv2_s('efficientnetv2_s', pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_m(pretrained=False, **kwargs): + """ EfficientNet-V2 Medium. """ + model = _gen_efficientnetv2_m('efficientnetv2_m', pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_l(pretrained=False, **kwargs): + """ EfficientNet-V2 Large. """ + model = _gen_efficientnetv2_l('efficientnetv2_l', pretrained=pretrained, **kwargs) + return model + + +@register_model +def efficientnetv2_xl(pretrained=False, **kwargs): + """ EfficientNet-V2 Xtra-Large. """ + model = _gen_efficientnetv2_xl('efficientnetv2_xl', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b0(pretrained=False, **kwargs): + """ EfficientNet-B0. 
Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b1(pretrained=False, **kwargs): + """ EfficientNet-B1. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b2(pretrained=False, **kwargs): + """ EfficientNet-B2. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b3(pretrained=False, **kwargs): + """ EfficientNet-B3. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b4(pretrained=False, **kwargs): + """ EfficientNet-B4. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b5(pretrained=False, **kwargs): + """ EfficientNet-B5. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b6(pretrained=False, **kwargs): + """ EfficientNet-B6. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b7(pretrained=False, **kwargs): + """ EfficientNet-B7. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b8(pretrained=False, **kwargs): + """ EfficientNet-B8. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b0_ap(pretrained=False, **kwargs): + """ EfficientNet-B0 AdvProp. 
Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b0_ap', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b1_ap(pretrained=False, **kwargs): + """ EfficientNet-B1 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b1_ap', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b2_ap(pretrained=False, **kwargs): + """ EfficientNet-B2 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b2_ap', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b3_ap(pretrained=False, **kwargs): + """ EfficientNet-B3 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b3_ap', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b4_ap(pretrained=False, **kwargs): + """ EfficientNet-B4 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b4_ap', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b5_ap(pretrained=False, **kwargs): + """ EfficientNet-B5 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b5_ap', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b6_ap(pretrained=False, **kwargs): + """ EfficientNet-B6 AdvProp. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b6_ap', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b7_ap(pretrained=False, **kwargs): + """ EfficientNet-B7 AdvProp. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b7_ap', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b8_ap(pretrained=False, **kwargs): + """ EfficientNet-B8 AdvProp. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b8_ap', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b0_ns(pretrained=False, **kwargs): + """ EfficientNet-B0 NoisyStudent. 
Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b0_ns', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b1_ns(pretrained=False, **kwargs): + """ EfficientNet-B1 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b1_ns', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b2_ns(pretrained=False, **kwargs): + """ EfficientNet-B2 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b2_ns', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b3_ns(pretrained=False, **kwargs): + """ EfficientNet-B3 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b3_ns', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b4_ns(pretrained=False, **kwargs): + """ EfficientNet-B4 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b4_ns', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b5_ns(pretrained=False, **kwargs): + """ EfficientNet-B5 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b5_ns', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b6_ns(pretrained=False, **kwargs): + """ EfficientNet-B6 NoisyStudent. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b6_ns', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_b7_ns(pretrained=False, **kwargs): + """ EfficientNet-B7 NoisyStudent. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b7_ns', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_l2_ns_475(pretrained=False, **kwargs): + """ EfficientNet-L2 NoisyStudent @ 475x475. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_l2_ns_475', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_l2_ns(pretrained=False, **kwargs): + """ EfficientNet-L2 NoisyStudent. 
Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_l2_ns', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_es(pretrained=False, **kwargs): + """ EfficientNet-Edge Small. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_edge( + 'tf_efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_em(pretrained=False, **kwargs): + """ EfficientNet-Edge-Medium. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_edge( + 'tf_efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_el(pretrained=False, **kwargs): + """ EfficientNet-Edge-Large. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_edge( + 'tf_efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_condconv( + 'tf_efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B0 w/ 8 Experts. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_condconv( + 'tf_efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B1 w/ 8 Experts. 
Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_condconv( + 'tf_efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_lite0(pretrained=False, **kwargs): + """ EfficientNet-Lite0 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_lite1(pretrained=False, **kwargs): + """ EfficientNet-Lite1 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_lite2(pretrained=False, **kwargs): + """ EfficientNet-Lite2 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_lite3(pretrained=False, **kwargs): + """ EfficientNet-Lite3 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnet_lite4(pretrained=False, **kwargs): + """ EfficientNet-Lite4 """ + # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + + +@register_model +def tf_efficientnetv2_s(pretrained=False, **kwargs): + """ EfficientNet-V2 Small. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_s('tf_efficientnetv2_s', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_m(pretrained=False, **kwargs): + """ EfficientNet-V2 Medium. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_m('tf_efficientnetv2_m', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_l(pretrained=False, **kwargs): + """ EfficientNet-V2 Large. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_l('tf_efficientnetv2_l', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_s_in21ft1k(pretrained=False, **kwargs): + """ EfficientNet-V2 Small. Pretrained on ImageNet-21k, fine-tuned on 1k. 
Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_s('tf_efficientnetv2_s_in21ft1k', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_m_in21ft1k(pretrained=False, **kwargs): + """ EfficientNet-V2 Medium. Pretrained on ImageNet-21k, fine-tuned on 1k. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_m('tf_efficientnetv2_m_in21ft1k', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_l_in21ft1k(pretrained=False, **kwargs): + """ EfficientNet-V2 Large. Pretrained on ImageNet-21k, fine-tuned on 1k. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_l('tf_efficientnetv2_l_in21ft1k', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_xl_in21ft1k(pretrained=False, **kwargs): + """ EfficientNet-V2 Xtra-Large. Pretrained on ImageNet-21k, fine-tuned on 1k. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_xl('tf_efficientnetv2_xl_in21ft1k', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_s_in21k(pretrained=False, **kwargs): + """ EfficientNet-V2 Small w/ ImageNet-21k pretrained weights. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_s('tf_efficientnetv2_s_in21k', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_m_in21k(pretrained=False, **kwargs): + """ EfficientNet-V2 Medium w/ ImageNet-21k pretrained weights. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_m('tf_efficientnetv2_m_in21k', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_l_in21k(pretrained=False, **kwargs): + """ EfficientNet-V2 Large w/ ImageNet-21k pretrained weights. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_l('tf_efficientnetv2_l_in21k', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_xl_in21k(pretrained=False, **kwargs): + """ EfficientNet-V2 Xtra-Large w/ ImageNet-21k pretrained weights. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_xl('tf_efficientnetv2_xl_in21k', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_b0(pretrained=False, **kwargs): + """ EfficientNet-V2-B0. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_base('tf_efficientnetv2_b0', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_b1(pretrained=False, **kwargs): + """ EfficientNet-V2-B1. 
Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_base( + 'tf_efficientnetv2_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_b2(pretrained=False, **kwargs): + """ EfficientNet-V2-B2. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_base( + 'tf_efficientnetv2_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_efficientnetv2_b3(pretrained=False, **kwargs): + """ EfficientNet-V2-B3. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnetv2_base( + 'tf_efficientnetv2_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mixnet_s(pretrained=False, **kwargs): + """Creates a MixNet Small model. + """ + model = _gen_mixnet_s( + 'mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mixnet_m(pretrained=False, **kwargs): + """Creates a MixNet Medium model. + """ + model = _gen_mixnet_m( + 'mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mixnet_l(pretrained=False, **kwargs): + """Creates a MixNet Large model. + """ + model = _gen_mixnet_m( + 'mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mixnet_xl(pretrained=False, **kwargs): + """Creates a MixNet Extra-Large model. + Not a paper spec, experimental def by RW w/ depth scaling. + """ + model = _gen_mixnet_m( + 'mixnet_xl', channel_multiplier=1.6, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mixnet_xxl(pretrained=False, **kwargs): + """Creates a MixNet Double Extra Large model. + Not a paper spec, experimental def by RW w/ depth scaling. + """ + model = _gen_mixnet_m( + 'mixnet_xxl', channel_multiplier=2.4, depth_multiplier=1.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_mixnet_s(pretrained=False, **kwargs): + """Creates a MixNet Small model. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mixnet_s( + 'tf_mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_mixnet_m(pretrained=False, **kwargs): + """Creates a MixNet Medium model. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mixnet_m( + 'tf_mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_mixnet_l(pretrained=False, **kwargs): + """Creates a MixNet Large model. 
Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mixnet_m( + 'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tinynet_a(pretrained=False, **kwargs): + model = _gen_tinynet('tinynet_a', 1.0, 1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tinynet_b(pretrained=False, **kwargs): + model = _gen_tinynet('tinynet_b', 0.75, 1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tinynet_c(pretrained=False, **kwargs): + model = _gen_tinynet('tinynet_c', 0.54, 0.85, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tinynet_d(pretrained=False, **kwargs): + model = _gen_tinynet('tinynet_d', 0.54, 0.695, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tinynet_e(pretrained=False, **kwargs): + model = _gen_tinynet('tinynet_e', 0.51, 0.6, pretrained=pretrained, **kwargs) + return model diff --git a/custom_timm/models/efficientnet_blocks.py b/custom_timm/models/efficientnet_blocks.py new file mode 100644 index 0000000000000000000000000000000000000000..34a317571c99132cbd6c00561f1eaf9699eabaff --- /dev/null +++ b/custom_timm/models/efficientnet_blocks.py @@ -0,0 +1,281 @@ +""" EfficientNet, MobileNetV3, etc Blocks + +Hacked together by / Copyright 2019, Ross Wightman +""" +import math + +import torch +import torch.nn as nn +from torch.nn import functional as F + +from .layers import create_conv2d, DropPath, make_divisible, create_act_layer, get_norm_act_layer + +__all__ = [ + 'SqueezeExcite', 'ConvBnAct', 'DepthwiseSeparableConv', 'InvertedResidual', 'CondConvResidual', 'EdgeResidual'] + + +def num_groups(group_size, channels): + if not group_size: # 0 or None + return 1 # normal conv with 1 group + else: + # NOTE group_size == 1 -> depthwise conv + assert channels % group_size == 0 + return channels // group_size + + +class SqueezeExcite(nn.Module): + """ Squeeze-and-Excitation w/ specific features for EfficientNet/MobileNet family + + Args: + in_chs (int): input channels to layer + rd_ratio (float): ratio of squeeze reduction + act_layer (nn.Module): activation layer of containing block + gate_layer (Callable): attention gate function + force_act_layer (nn.Module): override block's activation fn if this is set/bound + rd_round_fn (Callable): specify a fn to calculate rounding of reduced chs + """ + + def __init__( + self, in_chs, rd_ratio=0.25, rd_channels=None, act_layer=nn.ReLU, + gate_layer=nn.Sigmoid, force_act_layer=None, rd_round_fn=None): + super(SqueezeExcite, self).__init__() + if rd_channels is None: + rd_round_fn = rd_round_fn or round + rd_channels = rd_round_fn(in_chs * rd_ratio) + act_layer = force_act_layer or act_layer + self.conv_reduce = nn.Conv2d(in_chs, rd_channels, 1, bias=True) + self.act1 = create_act_layer(act_layer, inplace=True) + self.conv_expand = nn.Conv2d(rd_channels, in_chs, 1, bias=True) + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + x_se = x.mean((2, 3), keepdim=True) + x_se = self.conv_reduce(x_se) + x_se = self.act1(x_se) + x_se = self.conv_expand(x_se) + return x * self.gate(x_se) + + +class ConvBnAct(nn.Module): + """ Conv + Norm Layer + Activation w/ optional skip connection + """ + def __init__( + self, in_chs, out_chs, kernel_size, stride=1, dilation=1, group_size=0, pad_type='', + skip=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_path_rate=0.): + super(ConvBnAct, self).__init__() + 
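+        # The norm layer and the block's activation are fused into a single norm-act module
+        # class below; num_groups() maps group_size to a conv groups count (0 -> standard conv,
+        # 1 -> depthwise), and the skip connection is only kept when requested, stride == 1,
+        # and the input/output channel counts match.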
norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + groups = num_groups(group_size, in_chs) + self.has_skip = skip and stride == 1 and in_chs == out_chs + + self.conv = create_conv2d( + in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, groups=groups, padding=pad_type) + self.bn1 = norm_act_layer(out_chs, inplace=True) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() + + def feature_info(self, location): + if location == 'expansion': # output of conv after act, same as block coutput + return dict(module='bn1', hook_type='forward', num_chs=self.conv.out_channels) + else: # location == 'bottleneck', block output + return dict(module='', hook_type='', num_chs=self.conv.out_channels) + + def forward(self, x): + shortcut = x + x = self.conv(x) + x = self.bn1(x) + if self.has_skip: + x = self.drop_path(x) + shortcut + return x + + +class DepthwiseSeparableConv(nn.Module): + """ DepthwiseSeparable block + Used for DS convs in MobileNet-V1 and in the place of IR blocks that have no expansion + (factor of 1.0). This is an alternative to having a IR with an optional first pw conv. + """ + def __init__( + self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='', + noskip=False, pw_kernel_size=1, pw_act=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + se_layer=None, drop_path_rate=0.): + super(DepthwiseSeparableConv, self).__init__() + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + groups = num_groups(group_size, in_chs) + self.has_skip = (stride == 1 and in_chs == out_chs) and not noskip + self.has_pw_act = pw_act # activation after point-wise conv + + self.conv_dw = create_conv2d( + in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, groups=groups) + self.bn1 = norm_act_layer(in_chs, inplace=True) + + # Squeeze-and-excitation + self.se = se_layer(in_chs, act_layer=act_layer) if se_layer else nn.Identity() + + self.conv_pw = create_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type) + self.bn2 = norm_act_layer(out_chs, inplace=True, apply_act=self.has_pw_act) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() + + def feature_info(self, location): + if location == 'expansion': # after SE, input to PW + return dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) + else: # location == 'bottleneck', block output + return dict(module='', hook_type='', num_chs=self.conv_pw.out_channels) + + def forward(self, x): + shortcut = x + x = self.conv_dw(x) + x = self.bn1(x) + x = self.se(x) + x = self.conv_pw(x) + x = self.bn2(x) + if self.has_skip: + x = self.drop_path(x) + shortcut + return x + + +class InvertedResidual(nn.Module): + """ Inverted residual block w/ optional SE + + Originally used in MobileNet-V2 - https://arxiv.org/abs/1801.04381v4, this layer is often + referred to as 'MBConv' for (Mobile inverted bottleneck conv) and is also used in + * MNasNet - https://arxiv.org/abs/1807.11626 + * EfficientNet - https://arxiv.org/abs/1905.11946 + * MobileNet-V3 - https://arxiv.org/abs/1905.02244 + """ + + def __init__( + self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='', + noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, se_layer=None, conv_kwargs=None, drop_path_rate=0.): + super(InvertedResidual, self).__init__() + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + conv_kwargs = conv_kwargs or {} 
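+        # Expanded (mid) channel width for the inverted bottleneck: in_chs scaled by exp_ratio
+        # and rounded via make_divisible(); the depthwise conv groups are derived from this
+        # expanded width, and the residual is only added for stride 1 with matching channels.
+        # Illustrative example: InvertedResidual(32, 16, exp_ratio=6.0) expands 32 -> 192
+        # channels, applies a 3x3 depthwise conv, then projects back down to 16 channels.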
+ mid_chs = make_divisible(in_chs * exp_ratio) + groups = num_groups(group_size, mid_chs) + self.has_skip = (in_chs == out_chs and stride == 1) and not noskip + + # Point-wise expansion + self.conv_pw = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs) + self.bn1 = norm_act_layer(mid_chs, inplace=True) + + # Depth-wise convolution + self.conv_dw = create_conv2d( + mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation, + groups=groups, padding=pad_type, **conv_kwargs) + self.bn2 = norm_act_layer(mid_chs, inplace=True) + + # Squeeze-and-excitation + self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() + + # Point-wise linear projection + self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs) + self.bn3 = norm_act_layer(out_chs, apply_act=False) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() + + def feature_info(self, location): + if location == 'expansion': # after SE, input to PWL + return dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels) + else: # location == 'bottleneck', block output + return dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels) + + def forward(self, x): + shortcut = x + x = self.conv_pw(x) + x = self.bn1(x) + x = self.conv_dw(x) + x = self.bn2(x) + x = self.se(x) + x = self.conv_pwl(x) + x = self.bn3(x) + if self.has_skip: + x = self.drop_path(x) + shortcut + return x + + +class CondConvResidual(InvertedResidual): + """ Inverted residual block w/ CondConv routing""" + + def __init__( + self, in_chs, out_chs, dw_kernel_size=3, stride=1, dilation=1, group_size=1, pad_type='', + noskip=False, exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, se_layer=None, num_experts=0, drop_path_rate=0.): + + self.num_experts = num_experts + conv_kwargs = dict(num_experts=self.num_experts) + + super(CondConvResidual, self).__init__( + in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, dilation=dilation, group_size=group_size, + pad_type=pad_type, act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size, + pw_kernel_size=pw_kernel_size, se_layer=se_layer, norm_layer=norm_layer, conv_kwargs=conv_kwargs, + drop_path_rate=drop_path_rate) + + self.routing_fn = nn.Linear(in_chs, self.num_experts) + + def forward(self, x): + shortcut = x + pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1) # CondConv routing + routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs)) + x = self.conv_pw(x, routing_weights) + x = self.bn1(x) + x = self.conv_dw(x, routing_weights) + x = self.bn2(x) + x = self.se(x) + x = self.conv_pwl(x, routing_weights) + x = self.bn3(x) + if self.has_skip: + x = self.drop_path(x) + shortcut + return x + + +class EdgeResidual(nn.Module): + """ Residual block with expansion convolution followed by pointwise-linear w/ stride + + Originally introduced in `EfficientNet-EdgeTPU: Creating Accelerator-Optimized Neural Networks with AutoML` + - https://ai.googleblog.com/2019/08/efficientnet-edgetpu-creating.html + + This layer is also called FusedMBConv in the MobileDet, EfficientNet-X, and EfficientNet-V2 papers + * MobileDet - https://arxiv.org/abs/2004.14525 + * EfficientNet-X - https://arxiv.org/abs/2102.05610 + * EfficientNet-V2 - https://arxiv.org/abs/2104.00298 + """ + + def __init__( + self, in_chs, out_chs, exp_kernel_size=3, stride=1, dilation=1, group_size=0, pad_type='', + 
force_in_chs=0, noskip=False, exp_ratio=1.0, pw_kernel_size=1, act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, se_layer=None, drop_path_rate=0.): + super(EdgeResidual, self).__init__() + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + if force_in_chs > 0: + mid_chs = make_divisible(force_in_chs * exp_ratio) + else: + mid_chs = make_divisible(in_chs * exp_ratio) + groups = num_groups(group_size, in_chs) + self.has_skip = (in_chs == out_chs and stride == 1) and not noskip + + # Expansion convolution + self.conv_exp = create_conv2d( + in_chs, mid_chs, exp_kernel_size, stride=stride, dilation=dilation, groups=groups, padding=pad_type) + self.bn1 = norm_act_layer(mid_chs, inplace=True) + + # Squeeze-and-excitation + self.se = se_layer(mid_chs, act_layer=act_layer) if se_layer else nn.Identity() + + # Point-wise linear projection + self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type) + self.bn2 = norm_act_layer(out_chs, apply_act=False) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate else nn.Identity() + + def feature_info(self, location): + if location == 'expansion': # after SE, before PWL + return dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels) + else: # location == 'bottleneck', block output + return dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels) + + def forward(self, x): + shortcut = x + x = self.conv_exp(x) + x = self.bn1(x) + x = self.se(x) + x = self.conv_pwl(x) + x = self.bn2(x) + if self.has_skip: + x = self.drop_path(x) + shortcut + return x diff --git a/custom_timm/models/efficientnet_builder.py b/custom_timm/models/efficientnet_builder.py new file mode 100644 index 0000000000000000000000000000000000000000..67d15a8692dc99d735c94b37505f3c01b2c29fea --- /dev/null +++ b/custom_timm/models/efficientnet_builder.py @@ -0,0 +1,477 @@ +""" EfficientNet, MobileNetV3, etc Builder + +Assembles EfficieNet and related network feature blocks from string definitions. +Handles stride, dilation calculations, and selects feature extraction points. + +Hacked together by / Copyright 2019, Ross Wightman +""" + +import logging +import math +import re +from copy import deepcopy +from functools import partial + +import torch.nn as nn + +from .efficientnet_blocks import * +from .layers import CondConv2d, get_condconv_initializer, get_act_layer, get_attn, make_divisible + +__all__ = ["EfficientNetBuilder", "decode_arch_def", "efficientnet_init_weights", + 'resolve_bn_args', 'resolve_act_layer', 'round_channels', 'BN_MOMENTUM_TF_DEFAULT', 'BN_EPS_TF_DEFAULT'] + +_logger = logging.getLogger(__name__) + + +_DEBUG_BUILDER = False + +# Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per +# papers and TF reference implementations. 
PT momentum equiv for TF decay is (1 - TF decay) +# NOTE: momentum varies btw .99 and .9997 depending on source +# .99 in official TF TPU impl +# .9997 (/w .999 in search space) for paper +BN_MOMENTUM_TF_DEFAULT = 1 - 0.99 +BN_EPS_TF_DEFAULT = 1e-3 +_BN_ARGS_TF = dict(momentum=BN_MOMENTUM_TF_DEFAULT, eps=BN_EPS_TF_DEFAULT) + + +def get_bn_args_tf(): + return _BN_ARGS_TF.copy() + + +def resolve_bn_args(kwargs): + bn_args = {} + bn_momentum = kwargs.pop('bn_momentum', None) + if bn_momentum is not None: + bn_args['momentum'] = bn_momentum + bn_eps = kwargs.pop('bn_eps', None) + if bn_eps is not None: + bn_args['eps'] = bn_eps + return bn_args + + +def resolve_act_layer(kwargs, default='relu'): + return get_act_layer(kwargs.pop('act_layer', default)) + + +def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None, round_limit=0.9): + """Round number of filters based on depth multiplier.""" + if not multiplier: + return channels + return make_divisible(channels * multiplier, divisor, channel_min, round_limit=round_limit) + + +def _log_info_if(msg, condition): + if condition: + _logger.info(msg) + + +def _parse_ksize(ss): + if ss.isdigit(): + return int(ss) + else: + return [int(k) for k in ss.split('.')] + + +def _decode_block_str(block_str): + """ Decode block definition string + + Gets a list of block arg (dicts) through a string notation of arguments. + E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip + + All args can exist in any order with the exception of the leading string which + is assumed to indicate the block type. + + leading string - block type ( + ir = InvertedResidual, ds = DepthwiseSep, dsa = DeptwhiseSep with pw act, cn = ConvBnAct) + r - number of repeat blocks, + k - kernel size, + s - strides (1-9), + e - expansion ratio, + c - output channels, + se - squeeze/excitation ratio + n - activation fn ('re', 'r6', 'hs', or 'sw') + Args: + block_str: a string representation of block arguments. 
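+            e.g. 'ir_r2_k3_s2_e6_c32_se0.25' decodes to two InvertedResidual repeats with a
+            3x3 depthwise kernel, stride 2 (applied to the first block of the stack only),
+            expansion ratio 6.0, 32 output channels and a squeeze-excite ratio of 0.25.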
+ Returns: + A list of block args (dicts) + Raises: + ValueError: if the string def not properly specified (TODO) + """ + assert isinstance(block_str, str) + ops = block_str.split('_') + block_type = ops[0] # take the block type off the front + ops = ops[1:] + options = {} + skip = None + for op in ops: + # string options being checked on individual basis, combine if they grow + if op == 'noskip': + skip = False # force no skip connection + elif op == 'skip': + skip = True # force a skip connection + elif op.startswith('n'): + # activation fn + key = op[0] + v = op[1:] + if v == 're': + value = get_act_layer('relu') + elif v == 'r6': + value = get_act_layer('relu6') + elif v == 'hs': + value = get_act_layer('hard_swish') + elif v == 'sw': + value = get_act_layer('swish') # aka SiLU + elif v == 'mi': + value = get_act_layer('mish') + else: + continue + options[key] = value + else: + # all numeric options + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + # if act_layer is None, the model default (passed to model init) will be used + act_layer = options['n'] if 'n' in options else None + exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1 + pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1 + force_in_chs = int(options['fc']) if 'fc' in options else 0 # FIXME hack to deal with in_chs issue in TPU def + num_repeat = int(options['r']) + + # each type of block has different valid arguments, fill accordingly + block_args = dict( + block_type=block_type, + out_chs=int(options['c']), + stride=int(options['s']), + act_layer=act_layer, + ) + if block_type == 'ir': + block_args.update(dict( + dw_kernel_size=_parse_ksize(options['k']), + exp_kernel_size=exp_kernel_size, + pw_kernel_size=pw_kernel_size, + exp_ratio=float(options['e']), + se_ratio=float(options['se']) if 'se' in options else 0., + noskip=skip is False, + )) + if 'cc' in options: + block_args['num_experts'] = int(options['cc']) + elif block_type == 'ds' or block_type == 'dsa': + block_args.update(dict( + dw_kernel_size=_parse_ksize(options['k']), + pw_kernel_size=pw_kernel_size, + se_ratio=float(options['se']) if 'se' in options else 0., + pw_act=block_type == 'dsa', + noskip=block_type == 'dsa' or skip is False, + )) + elif block_type == 'er': + block_args.update(dict( + exp_kernel_size=_parse_ksize(options['k']), + pw_kernel_size=pw_kernel_size, + exp_ratio=float(options['e']), + force_in_chs=force_in_chs, + se_ratio=float(options['se']) if 'se' in options else 0., + noskip=skip is False, + )) + elif block_type == 'cn': + block_args.update(dict( + kernel_size=int(options['k']), + skip=skip is True, + )) + else: + assert False, 'Unknown block type (%s)' % block_type + if 'gs' in options: + block_args['group_size'] = options['gs'] + + return block_args, num_repeat + + +def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'): + """ Per-stage depth scaling + Scales the block repeats in each stage. This depth scaling impl maintains + compatibility with the EfficientNet scaling method, while allowing sensible + scaling for other models that may have multiple block arg definitions in each stage. + """ + + # We scale the total repeat count for each stage, there may be multiple + # block arg defs per stage so we need to sum. + num_repeat = sum(repeats) + if depth_trunc == 'round': + # Truncating to int by rounding allows stages with few repeats to remain + # proportionally smaller for longer. 
This is a good choice when stage definitions + # include single repeat stages that we'd prefer to keep that way as long as possible + num_repeat_scaled = max(1, round(num_repeat * depth_multiplier)) + else: + # The default for EfficientNet truncates repeats to int via 'ceil'. + # Any multiplier > 1.0 will result in an increased depth for every stage. + num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier)) + + # Proportionally distribute repeat count scaling to each block definition in the stage. + # Allocation is done in reverse as it results in the first block being less likely to be scaled. + # The first block makes less sense to repeat in most of the arch definitions. + repeats_scaled = [] + for r in repeats[::-1]: + rs = max(1, round((r / num_repeat * num_repeat_scaled))) + repeats_scaled.append(rs) + num_repeat -= r + num_repeat_scaled -= rs + repeats_scaled = repeats_scaled[::-1] + + # Apply the calculated scaling to each block arg in the stage + sa_scaled = [] + for ba, rep in zip(stack_args, repeats_scaled): + sa_scaled.extend([deepcopy(ba) for _ in range(rep)]) + return sa_scaled + + +def decode_arch_def( + arch_def, + depth_multiplier=1.0, + depth_trunc='ceil', + experts_multiplier=1, + fix_first_last=False, + group_size=None, +): + """ Decode block architecture definition strings -> block kwargs + + Args: + arch_def: architecture definition strings, list of list of strings + depth_multiplier: network depth multiplier + depth_trunc: networ depth truncation mode when applying multiplier + experts_multiplier: CondConv experts multiplier + fix_first_last: fix first and last block depths when multiplier is applied + group_size: group size override for all blocks that weren't explicitly set in arch string + + Returns: + list of list of block kwargs + """ + arch_args = [] + if isinstance(depth_multiplier, tuple): + assert len(depth_multiplier) == len(arch_def) + else: + depth_multiplier = (depth_multiplier,) * len(arch_def) + for stack_idx, (block_strings, multiplier) in enumerate(zip(arch_def, depth_multiplier)): + assert isinstance(block_strings, list) + stack_args = [] + repeats = [] + for block_str in block_strings: + assert isinstance(block_str, str) + ba, rep = _decode_block_str(block_str) + if ba.get('num_experts', 0) > 0 and experts_multiplier > 1: + ba['num_experts'] *= experts_multiplier + if group_size is not None: + ba.setdefault('group_size', group_size) + stack_args.append(ba) + repeats.append(rep) + if fix_first_last and (stack_idx == 0 or stack_idx == len(arch_def) - 1): + arch_args.append(_scale_stage_depth(stack_args, repeats, 1.0, depth_trunc)) + else: + arch_args.append(_scale_stage_depth(stack_args, repeats, multiplier, depth_trunc)) + return arch_args + + +class EfficientNetBuilder: + """ Build Trunk Blocks + + This ended up being somewhat of a cross between + https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py + and + https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py + + """ + def __init__(self, output_stride=32, pad_type='', round_chs_fn=round_channels, se_from_exp=False, + act_layer=None, norm_layer=None, se_layer=None, drop_path_rate=0., feature_location=''): + self.output_stride = output_stride + self.pad_type = pad_type + self.round_chs_fn = round_chs_fn + self.se_from_exp = se_from_exp # calculate se channel reduction from expanded (mid) chs + self.act_layer = act_layer + self.norm_layer = norm_layer + self.se_layer = get_attn(se_layer) + 
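+        # Probe whether the resolved attention (SE) layer accepts an rd_ratio argument, so
+        # that per-block se_ratio values can be forwarded to it when supported.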
try: + self.se_layer(8, rd_ratio=1.0) # test if attn layer accepts rd_ratio arg + self.se_has_ratio = True + except TypeError: + self.se_has_ratio = False + self.drop_path_rate = drop_path_rate + if feature_location == 'depthwise': + # old 'depthwise' mode renamed 'expansion' to match TF impl, old expansion mode didn't make sense + _logger.warning("feature_location=='depthwise' is deprecated, using 'expansion'") + feature_location = 'expansion' + self.feature_location = feature_location + assert feature_location in ('bottleneck', 'expansion', '') + self.verbose = _DEBUG_BUILDER + + # state updated during build, consumed by model + self.in_chs = None + self.features = [] + + def _make_block(self, ba, block_idx, block_count): + drop_path_rate = self.drop_path_rate * block_idx / block_count + bt = ba.pop('block_type') + ba['in_chs'] = self.in_chs + ba['out_chs'] = self.round_chs_fn(ba['out_chs']) + if 'force_in_chs' in ba and ba['force_in_chs']: + # NOTE this is a hack to work around mismatch in TF EdgeEffNet impl + ba['force_in_chs'] = self.round_chs_fn(ba['force_in_chs']) + ba['pad_type'] = self.pad_type + # block act fn overrides the model default + ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer + assert ba['act_layer'] is not None + ba['norm_layer'] = self.norm_layer + ba['drop_path_rate'] = drop_path_rate + if bt != 'cn': + se_ratio = ba.pop('se_ratio') + if se_ratio and self.se_layer is not None: + if not self.se_from_exp: + # adjust se_ratio by expansion ratio if calculating se channels from block input + se_ratio /= ba.get('exp_ratio', 1.0) + if self.se_has_ratio: + ba['se_layer'] = partial(self.se_layer, rd_ratio=se_ratio) + else: + ba['se_layer'] = self.se_layer + + if bt == 'ir': + _log_info_if(' InvertedResidual {}, Args: {}'.format(block_idx, str(ba)), self.verbose) + block = CondConvResidual(**ba) if ba.get('num_experts', 0) else InvertedResidual(**ba) + elif bt == 'ds' or bt == 'dsa': + _log_info_if(' DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)), self.verbose) + block = DepthwiseSeparableConv(**ba) + elif bt == 'er': + _log_info_if(' EdgeResidual {}, Args: {}'.format(block_idx, str(ba)), self.verbose) + block = EdgeResidual(**ba) + elif bt == 'cn': + _log_info_if(' ConvBnAct {}, Args: {}'.format(block_idx, str(ba)), self.verbose) + block = ConvBnAct(**ba) + else: + assert False, 'Uknkown block type (%s) while building model.' % bt + + self.in_chs = ba['out_chs'] # update in_chs for arg of next block + return block + + def __call__(self, in_chs, model_block_args): + """ Build the blocks + Args: + in_chs: Number of input-channels passed to first block + model_block_args: A list of lists, outer list defines stages, inner + list contains strings defining block configuration(s) + Return: + List of block stacks (each stack wrapped in nn.Sequential) + """ + _log_info_if('Building model trunk with %d stages...' 
% len(model_block_args), self.verbose) + self.in_chs = in_chs + total_block_count = sum([len(x) for x in model_block_args]) + total_block_idx = 0 + current_stride = 2 + current_dilation = 1 + stages = [] + if model_block_args[0][0]['stride'] > 1: + # if the first block starts with a stride, we need to extract first level feat from stem + feature_info = dict( + module='act1', num_chs=in_chs, stage=0, reduction=current_stride, + hook_type='forward' if self.feature_location != 'bottleneck' else '') + self.features.append(feature_info) + + # outer list of block_args defines the stacks + for stack_idx, stack_args in enumerate(model_block_args): + last_stack = stack_idx + 1 == len(model_block_args) + _log_info_if('Stack: {}'.format(stack_idx), self.verbose) + assert isinstance(stack_args, list) + + blocks = [] + # each stack (stage of blocks) contains a list of block arguments + for block_idx, block_args in enumerate(stack_args): + last_block = block_idx + 1 == len(stack_args) + _log_info_if(' Block: {}'.format(block_idx), self.verbose) + + assert block_args['stride'] in (1, 2) + if block_idx >= 1: # only the first block in any stack can have a stride > 1 + block_args['stride'] = 1 + + extract_features = False + if last_block: + next_stack_idx = stack_idx + 1 + extract_features = next_stack_idx >= len(model_block_args) or \ + model_block_args[next_stack_idx][0]['stride'] > 1 + + next_dilation = current_dilation + if block_args['stride'] > 1: + next_output_stride = current_stride * block_args['stride'] + if next_output_stride > self.output_stride: + next_dilation = current_dilation * block_args['stride'] + block_args['stride'] = 1 + _log_info_if(' Converting stride to dilation to maintain output_stride=={}'.format( + self.output_stride), self.verbose) + else: + current_stride = next_output_stride + block_args['dilation'] = current_dilation + if next_dilation != current_dilation: + current_dilation = next_dilation + + # create the block + block = self._make_block(block_args, total_block_idx, total_block_count) + blocks.append(block) + + # stash feature module name and channel info for model feature extraction + if extract_features: + feature_info = dict( + stage=stack_idx + 1, reduction=current_stride, **block.feature_info(self.feature_location)) + module_name = f'blocks.{stack_idx}.{block_idx}' + leaf_name = feature_info.get('module', '') + feature_info['module'] = '.'.join([module_name, leaf_name]) if leaf_name else module_name + self.features.append(feature_info) + + total_block_idx += 1 # incr global block idx (across all stacks) + stages.append(nn.Sequential(*blocks)) + return stages + + +def _init_weight_goog(m, n='', fix_group_fanout=True): + """ Weight initialization as per Tensorflow official implementations. 
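+    Conv weights are drawn from N(0, sqrt(2 / fan_out)); with fix_group_fanout enabled the
+    fan-out is divided by the group count to match the TF TPU reference initializers.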
+ + Args: + m (nn.Module): module to init + n (str): module name + fix_group_fanout (bool): enable correct (matching Tensorflow TPU impl) fanout calculation w/ group convs + + Handles layers in EfficientNet, EfficientNet-CondConv, MixNet, MnasNet, MobileNetV3, etc: + * https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py + * https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py + """ + if isinstance(m, CondConv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + init_weight_fn = get_condconv_initializer( + lambda w: nn.init.normal_(w, 0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) + init_weight_fn(m.weight) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + nn.init.normal_(m.weight, 0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + fan_out = m.weight.size(0) # fan-out + fan_in = 0 + if 'routing_fn' in n: + fan_in = m.weight.size(1) + init_range = 1.0 / math.sqrt(fan_in + fan_out) + nn.init.uniform_(m.weight, -init_range, init_range) + nn.init.zeros_(m.bias) + + +def efficientnet_init_weights(model: nn.Module, init_fn=None): + init_fn = init_fn or _init_weight_goog + for n, m in model.named_modules(): + init_fn(m, n) + diff --git a/custom_timm/models/factory.py b/custom_timm/models/factory.py new file mode 100644 index 0000000000000000000000000000000000000000..f7a8fd9cddf04633d6f5160dd1e2e96bab4737ad --- /dev/null +++ b/custom_timm/models/factory.py @@ -0,0 +1,76 @@ +from urllib.parse import urlsplit, urlunsplit +import os + +from .registry import is_model, is_model_in_modules, model_entrypoint +from .helpers import load_checkpoint +from .layers import set_layer_config +from .hub import load_model_config_from_hf + + +def parse_model_name(model_name): + model_name = model_name.replace('hf_hub', 'hf-hub') # NOTE for backwards compat, to deprecate hf_hub use + parsed = urlsplit(model_name) + assert parsed.scheme in ('', 'timm', 'hf-hub') + if parsed.scheme == 'hf-hub': + # FIXME may use fragment as revision, currently `@` in URI path + return parsed.scheme, parsed.path + else: + model_name = os.path.split(parsed.path)[-1] + return 'timm', model_name + + +def safe_model_name(model_name, remove_source=True): + def make_safe(name): + return ''.join(c if c.isalnum() else '_' for c in name).rstrip('_') + if remove_source: + model_name = parse_model_name(model_name)[-1] + return make_safe(model_name) + + +def create_model( + model_name, + pretrained=False, + pretrained_cfg=None, + checkpoint_path='', + scriptable=None, + exportable=None, + no_jit=None, + **kwargs): + """Create a model + + Args: + model_name (str): name of model to instantiate + pretrained (bool): load pretrained ImageNet-1k weights if true + checkpoint_path (str): path of checkpoint to load after model is initialized + scriptable (bool): set layer config so that model is jit scriptable (not working for all models yet) + exportable (bool): set layer config so that model is traceable / ONNX exportable (not fully impl/obeyed yet) + no_jit (bool): set layer config so that model doesn't utilize jit scripted layers (so far activations only) + + Keyword Args: + drop_rate (float): dropout rate 
for training (default: 0.0) + global_pool (str): global pool type (default: 'avg') + **: other kwargs are model specific + """ + # Parameters that aren't supported by all models or are intended to only override model defaults if set + # should default to None in command line args/cfg. Remove them if they are present and not set so that + # non-supporting models don't break and default args remain in effect. + kwargs = {k: v for k, v in kwargs.items() if v is not None} + + model_source, model_name = parse_model_name(model_name) + if model_source == 'hf-hub': + # FIXME hf-hub source overrides any passed in pretrained_cfg, warn? + # For model names specified in the form `hf-hub:path/architecture_name@revision`, + # load model weights + pretrained_cfg from Hugging Face hub. + pretrained_cfg, model_name = load_model_config_from_hf(model_name) + + if not is_model(model_name): + raise RuntimeError('Unknown model (%s)' % model_name) + + create_fn = model_entrypoint(model_name) + with set_layer_config(scriptable=scriptable, exportable=exportable, no_jit=no_jit): + model = create_fn(pretrained=pretrained, pretrained_cfg=pretrained_cfg, **kwargs) + + if checkpoint_path: + load_checkpoint(model, checkpoint_path) + + return model diff --git a/custom_timm/models/features.py b/custom_timm/models/features.py new file mode 100644 index 0000000000000000000000000000000000000000..0bc46419d16f9759221a39061f4eb34e76aa6efd --- /dev/null +++ b/custom_timm/models/features.py @@ -0,0 +1,284 @@ +""" PyTorch Feature Extraction Helpers + +A collection of classes, functions, modules to help extract features from models +and provide a common interface for describing them. + +The return_layers, module re-writing idea inspired by torchvision IntermediateLayerGetter +https://github.com/pytorch/vision/blob/d88d8961ae51507d0cb680329d985b1488b1b76b/torchvision/models/_utils.py + +Hacked together by / Copyright 2020 Ross Wightman +""" +from collections import OrderedDict, defaultdict +from copy import deepcopy +from functools import partial +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn + + +class FeatureInfo: + + def __init__(self, feature_info: List[Dict], out_indices: Tuple[int]): + prev_reduction = 1 + for fi in feature_info: + # sanity check the mandatory fields, there may be additional fields depending on the model + assert 'num_chs' in fi and fi['num_chs'] > 0 + assert 'reduction' in fi and fi['reduction'] >= prev_reduction + prev_reduction = fi['reduction'] + assert 'module' in fi + self.out_indices = out_indices + self.info = feature_info + + def from_other(self, out_indices: Tuple[int]): + return FeatureInfo(deepcopy(self.info), out_indices) + + def get(self, key, idx=None): + """ Get value by key at specified index (indices) + if idx == None, returns value for key at each output index + if idx is an integer, return value for that feature module index (ignoring output indices) + if idx is a list/tupple, return value for each module index (ignoring output indices) + """ + if idx is None: + return [self.info[i][key] for i in self.out_indices] + if isinstance(idx, (tuple, list)): + return [self.info[i][key] for i in idx] + else: + return self.info[idx][key] + + def get_dicts(self, keys=None, idx=None): + """ return info dicts for specified keys (or all if None) at specified indices (or out_indices if None) + """ + if idx is None: + if keys is None: + return [self.info[i] for i in self.out_indices] + else: + return [{k: self.info[i][k] for k in keys} for i in self.out_indices] + if 
isinstance(idx, (tuple, list)): + return [self.info[i] if keys is None else {k: self.info[i][k] for k in keys} for i in idx] + else: + return self.info[idx] if keys is None else {k: self.info[idx][k] for k in keys} + + def channels(self, idx=None): + """ feature channels accessor + """ + return self.get('num_chs', idx) + + def reduction(self, idx=None): + """ feature reduction (output stride) accessor + """ + return self.get('reduction', idx) + + def module_name(self, idx=None): + """ feature module name accessor + """ + return self.get('module', idx) + + def __getitem__(self, item): + return self.info[item] + + def __len__(self): + return len(self.info) + + +class FeatureHooks: + """ Feature Hook Helper + + This module helps with the setup and extraction of hooks for extracting features from + internal nodes in a model by node name. This works quite well in eager Python but needs + redesign for torchscript. + """ + + def __init__(self, hooks, named_modules, out_map=None, default_hook_type='forward'): + # setup feature hooks + modules = {k: v for k, v in named_modules} + for i, h in enumerate(hooks): + hook_name = h['module'] + m = modules[hook_name] + hook_id = out_map[i] if out_map else hook_name + hook_fn = partial(self._collect_output_hook, hook_id) + hook_type = h.get('hook_type', default_hook_type) + if hook_type == 'forward_pre': + m.register_forward_pre_hook(hook_fn) + elif hook_type == 'forward': + m.register_forward_hook(hook_fn) + else: + assert False, "Unsupported hook type" + self._feature_outputs = defaultdict(OrderedDict) + + def _collect_output_hook(self, hook_id, *args): + x = args[-1] # tensor we want is last argument, output for fwd, input for fwd_pre + if isinstance(x, tuple): + x = x[0] # unwrap input tuple + self._feature_outputs[x.device][hook_id] = x + + def get_output(self, device) -> Dict[str, torch.tensor]: + output = self._feature_outputs[device] + self._feature_outputs[device] = OrderedDict() # clear after reading + return output + + +def _module_list(module, flatten_sequential=False): + # a yield/iter would be better for this but wouldn't be compatible with torchscript + ml = [] + for name, module in module.named_children(): + if flatten_sequential and isinstance(module, nn.Sequential): + # first level of Sequential containers is flattened into containing model + for child_name, child_module in module.named_children(): + combined = [name, child_name] + ml.append(('_'.join(combined), '.'.join(combined), child_module)) + else: + ml.append((name, name, module)) + return ml + + +def _get_feature_info(net, out_indices): + feature_info = getattr(net, 'feature_info') + if isinstance(feature_info, FeatureInfo): + return feature_info.from_other(out_indices) + elif isinstance(feature_info, (list, tuple)): + return FeatureInfo(net.feature_info, out_indices) + else: + assert False, "Provided feature_info is not valid" + + +def _get_return_layers(feature_info, out_map): + module_names = feature_info.module_name() + return_layers = {} + for i, name in enumerate(module_names): + return_layers[name] = out_map[i] if out_map is not None else feature_info.out_indices[i] + return return_layers + + +class FeatureDictNet(nn.ModuleDict): + """ Feature extractor with OrderedDict return + + Wrap a model and extract features as specified by the out indices, the network is + partially re-built from contained modules. + + There is a strong assumption that the modules have been registered into the model in the same + order as they are used. 
There should be no reuse of the same nn.Module more than once, including + trivial modules like `self.relu = nn.ReLU`. + + Only submodules that are directly assigned to the model class (`model.feature1`) or at most + one Sequential container deep (`model.features.1`, with flatten_sequent=True) can be captured. + All Sequential containers that are directly assigned to the original model will have their + modules assigned to this module with the name `model.features.1` being changed to `model.features_1` + + Arguments: + model (nn.Module): model from which we will extract the features + out_indices (tuple[int]): model output indices to extract features for + out_map (sequence): list or tuple specifying desired return id for each out index, + otherwise str(index) is used + feature_concat (bool): whether to concatenate intermediate features that are lists or tuples + vs select element [0] + flatten_sequential (bool): whether to flatten sequential modules assigned to model + """ + def __init__( + self, model, + out_indices=(0, 1, 2, 3, 4), out_map=None, feature_concat=False, flatten_sequential=False): + super(FeatureDictNet, self).__init__() + self.feature_info = _get_feature_info(model, out_indices) + self.concat = feature_concat + self.return_layers = {} + return_layers = _get_return_layers(self.feature_info, out_map) + modules = _module_list(model, flatten_sequential=flatten_sequential) + remaining = set(return_layers.keys()) + layers = OrderedDict() + for new_name, old_name, module in modules: + layers[new_name] = module + if old_name in remaining: + # return id has to be consistently str type for torchscript + self.return_layers[new_name] = str(return_layers[old_name]) + remaining.remove(old_name) + if not remaining: + break + assert not remaining and len(self.return_layers) == len(return_layers), \ + f'Return layers ({remaining}) are not present in model' + self.update(layers) + + def _collect(self, x) -> (Dict[str, torch.Tensor]): + out = OrderedDict() + for name, module in self.items(): + x = module(x) + if name in self.return_layers: + out_id = self.return_layers[name] + if isinstance(x, (tuple, list)): + # If model tap is a tuple or list, concat or select first element + # FIXME this may need to be more generic / flexible for some nets + out[out_id] = torch.cat(x, 1) if self.concat else x[0] + else: + out[out_id] = x + return out + + def forward(self, x) -> Dict[str, torch.Tensor]: + return self._collect(x) + + +class FeatureListNet(FeatureDictNet): + """ Feature extractor with list return + + See docstring for FeatureDictNet above, this class exists only to appease Torchscript typing constraints. + In eager Python we could have returned List[Tensor] vs Dict[id, Tensor] based on a member bool. + """ + def __init__( + self, model, + out_indices=(0, 1, 2, 3, 4), out_map=None, feature_concat=False, flatten_sequential=False): + super(FeatureListNet, self).__init__( + model, out_indices=out_indices, out_map=out_map, feature_concat=feature_concat, + flatten_sequential=flatten_sequential) + + def forward(self, x) -> (List[torch.Tensor]): + return list(self._collect(x).values()) + + +class FeatureHookNet(nn.ModuleDict): + """ FeatureHookNet + + Wrap a model and extract features specified by the out indices using forward/forward-pre hooks. + + If `no_rewrite` is True, features are extracted via hooks without modifying the underlying + network in any way. 
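+
+ Hedged usage sketch (added; not part of the original docstring):
+ `FeatureHookNet(backbone, out_indices=(1, 2, 3), no_rewrite=True)` wraps any `backbone`
+ that exposes `feature_info`, and its forward() returns the hooked intermediate tensors
+ as a list, or as a dict when `out_as_dict=True`.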
+ + If `no_rewrite` is False, the model will be re-written as in the + FeatureList/FeatureDict case by folding first to second (Sequential only) level modules into this one. + + FIXME this does not currently work with Torchscript, see FeatureHooks class + """ + def __init__( + self, model, + out_indices=(0, 1, 2, 3, 4), out_map=None, out_as_dict=False, no_rewrite=False, + feature_concat=False, flatten_sequential=False, default_hook_type='forward'): + super(FeatureHookNet, self).__init__() + assert not torch.jit.is_scripting() + self.feature_info = _get_feature_info(model, out_indices) + self.out_as_dict = out_as_dict + layers = OrderedDict() + hooks = [] + if no_rewrite: + assert not flatten_sequential + if hasattr(model, 'reset_classifier'): # make sure classifier is removed? + model.reset_classifier(0) + layers['body'] = model + hooks.extend(self.feature_info.get_dicts()) + else: + modules = _module_list(model, flatten_sequential=flatten_sequential) + remaining = {f['module']: f['hook_type'] if 'hook_type' in f else default_hook_type + for f in self.feature_info.get_dicts()} + for new_name, old_name, module in modules: + layers[new_name] = module + for fn, fm in module.named_modules(prefix=old_name): + if fn in remaining: + hooks.append(dict(module=fn, hook_type=remaining[fn])) + del remaining[fn] + if not remaining: + break + assert not remaining, f'Return layers ({remaining}) are not present in model' + self.update(layers) + self.hooks = FeatureHooks(hooks, model.named_modules(), out_map=out_map) + + def forward(self, x): + for name, module in self.items(): + x = module(x) + out = self.hooks.get_output(x.device) + return out if self.out_as_dict else list(out.values()) diff --git a/custom_timm/models/fx_features.py b/custom_timm/models/fx_features.py new file mode 100644 index 0000000000000000000000000000000000000000..4fadcbf2ed9447496c744db95af84e697e527a4b --- /dev/null +++ b/custom_timm/models/fx_features.py @@ -0,0 +1,106 @@ +""" PyTorch FX Based Feature Extraction Helpers +Using https://pytorch.org/vision/stable/feature_extraction.html +""" +from typing import Callable, List, Dict, Union, Type + +import torch +from torch import nn + +from .features import _get_feature_info + +try: + from torchvision.models.feature_extraction import create_feature_extractor as _create_feature_extractor + has_fx_feature_extraction = True +except ImportError: + has_fx_feature_extraction = False + +# Layers we went to treat as leaf modules +from .layers import Conv2dSame, ScaledStdConv2dSame, CondConv2d, StdConv2dSame +from .layers.non_local_attn import BilinearAttnTransform +from .layers.pool2d_same import MaxPool2dSame, AvgPool2dSame + +# NOTE: By default, any modules from custom_timm.models.layers that we want to treat as leaf modules go here +# BUT modules from custom_timm.models should use the registration mechanism below +_leaf_modules = { + BilinearAttnTransform, # reason: flow control t <= 1 + # Reason: get_same_padding has a max which raises a control flow error + Conv2dSame, MaxPool2dSame, ScaledStdConv2dSame, StdConv2dSame, AvgPool2dSame, + CondConv2d, # reason: TypeError: F.conv2d received Proxy in groups=self.groups * B (because B = x.shape[0]) +} + +try: + from .layers import InplaceAbn + _leaf_modules.add(InplaceAbn) +except ImportError: + pass + + +def register_notrace_module(module: Type[nn.Module]): + """ + Any module not under timm.models.layers should get this decorator if we don't want to trace through it. 
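+
+ Example (added sketch; `MyControlFlowBlock` is a hypothetical module):
+
+ @register_notrace_module
+ class MyControlFlowBlock(nn.Module):
+ ...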
+ """ + _leaf_modules.add(module) + return module + + +# Functions we want to autowrap (treat them as leaves) +_autowrap_functions = set() + + +def register_notrace_function(func: Callable): + """ + Decorator for functions which ought not to be traced through + """ + _autowrap_functions.add(func) + return func + + +def create_feature_extractor(model: nn.Module, return_nodes: Union[Dict[str, str], List[str]]): + assert has_fx_feature_extraction, 'Please update to PyTorch 1.10+, torchvision 0.11+ for FX feature extraction' + return _create_feature_extractor( + model, return_nodes, + tracer_kwargs={'leaf_modules': list(_leaf_modules), 'autowrap_functions': list(_autowrap_functions)} + ) + + +class FeatureGraphNet(nn.Module): + """ A FX Graph based feature extractor that works with the model feature_info metadata + """ + def __init__(self, model, out_indices, out_map=None): + super().__init__() + assert has_fx_feature_extraction, 'Please update to PyTorch 1.10+, torchvision 0.11+ for FX feature extraction' + self.feature_info = _get_feature_info(model, out_indices) + if out_map is not None: + assert len(out_map) == len(out_indices) + return_nodes = { + info['module']: out_map[i] if out_map is not None else info['module'] + for i, info in enumerate(self.feature_info) if i in out_indices} + self.graph_module = create_feature_extractor(model, return_nodes) + + def forward(self, x): + return list(self.graph_module(x).values()) + + +class GraphExtractNet(nn.Module): + """ A standalone feature extraction wrapper that maps dict -> list or single tensor + NOTE: + * one can use feature_extractor directly if dictionary output is desired + * unlike FeatureGraphNet, this is intended to be used standalone and not with model feature_info + metadata for builtin feature extraction mode + * create_feature_extractor can be used directly if dictionary output is acceptable + + Args: + model: model to extract features from + return_nodes: node names to return features from (dict or list) + squeeze_out: if only one output, and output in list format, flatten to single tensor + """ + def __init__(self, model, return_nodes: Union[Dict[str, str], List[str]], squeeze_out: bool = True): + super().__init__() + self.squeeze_out = squeeze_out + self.graph_module = create_feature_extractor(model, return_nodes) + + def forward(self, x) -> Union[List[torch.Tensor], torch.Tensor]: + out = list(self.graph_module(x).values()) + if self.squeeze_out and len(out) == 1: + return out[0] + return out diff --git a/custom_timm/models/gcvit.py b/custom_timm/models/gcvit.py new file mode 100644 index 0000000000000000000000000000000000000000..e8984dfe2b60b1e574ed42458bce292ce8bf1fe2 --- /dev/null +++ b/custom_timm/models/gcvit.py @@ -0,0 +1,592 @@ +""" Global Context ViT + +From scratch implementation of GCViT in the style of timm swin_transformer_v2_cr.py + +Global Context Vision Transformers -https://arxiv.org/abs/2206.09959 + +@article{hatamizadeh2022global, + title={Global Context Vision Transformers}, + author={Hatamizadeh, Ali and Yin, Hongxu and Kautz, Jan and Molchanov, Pavlo}, + journal={arXiv preprint arXiv:2206.09959}, + year={2022} +} + +Free of any code related to NVIDIA GCVit impl at https://github.com/NVlabs/GCVit. +The license for this code release is Apache 2.0 with no commercial restrictions. + +However, weight files adapted from NVIDIA GCVit impl ARE under a non-commercial share-alike license +(https://creativecommons.org/licenses/by-nc-sa/4.0/) until I have a chance to train new ones... 
+ +Hacked together by / Copyright 2022, Ross Wightman +""" +import math +from functools import partial +from typing import Callable, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .fx_features import register_notrace_function +from .helpers import build_model_with_cfg, named_apply +from .layers import DropPath, to_2tuple, to_ntuple, Mlp, ClassifierHead, LayerNorm2d,\ + get_attn, get_act_layer, get_norm_layer, _assert +from .registry import register_model +from .vision_transformer_relpos import RelPosMlp, RelPosBias # FIXME move to common location + +__all__ = ['GlobalContextVit'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv1', 'classifier': 'head.fc', + 'fixed_input_size': True, + **kwargs + } + + +default_cfgs = { + 'gcvit_xxtiny': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_xxtiny_224_nvidia-d1d86009.pth'), + 'gcvit_xtiny': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_xtiny_224_nvidia-274b92b7.pth'), + 'gcvit_tiny': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_tiny_224_nvidia-ac783954.pth'), + 'gcvit_small': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_small_224_nvidia-4e98afa2.pth'), + 'gcvit_base': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-morevit/gcvit_base_224_nvidia-f009139b.pth'), +} + + +class MbConvBlock(nn.Module): + """ A depthwise separable / fused mbconv style residual block with SE, `no norm. 
+ """ + def __init__( + self, + in_chs, + out_chs=None, + expand_ratio=1.0, + attn_layer='se', + bias=False, + act_layer=nn.GELU, + ): + super().__init__() + attn_kwargs = dict(act_layer=act_layer) + if isinstance(attn_layer, str) and attn_layer == 'se' or attn_layer == 'eca': + attn_kwargs['rd_ratio'] = 0.25 + attn_kwargs['bias'] = False + attn_layer = get_attn(attn_layer) + out_chs = out_chs or in_chs + mid_chs = int(expand_ratio * in_chs) + + self.conv_dw = nn.Conv2d(in_chs, mid_chs, 3, 1, 1, groups=in_chs, bias=bias) + self.act = act_layer() + self.se = attn_layer(mid_chs, **attn_kwargs) + self.conv_pw = nn.Conv2d(mid_chs, out_chs, 1, 1, 0, bias=bias) + + def forward(self, x): + shortcut = x + x = self.conv_dw(x) + x = self.act(x) + x = self.se(x) + x = self.conv_pw(x) + x = x + shortcut + return x + + +class Downsample2d(nn.Module): + def __init__( + self, + dim, + dim_out=None, + reduction='conv', + act_layer=nn.GELU, + norm_layer=LayerNorm2d, # NOTE in NCHW + ): + super().__init__() + dim_out = dim_out or dim + + self.norm1 = norm_layer(dim) if norm_layer is not None else nn.Identity() + self.conv_block = MbConvBlock(dim, act_layer=act_layer) + assert reduction in ('conv', 'max', 'avg') + if reduction == 'conv': + self.reduction = nn.Conv2d(dim, dim_out, 3, 2, 1, bias=False) + elif reduction == 'max': + assert dim == dim_out + self.reduction = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + else: + assert dim == dim_out + self.reduction = nn.AvgPool2d(kernel_size=2) + self.norm2 = norm_layer(dim_out) if norm_layer is not None else nn.Identity() + + def forward(self, x): + x = self.norm1(x) + x = self.conv_block(x) + x = self.reduction(x) + x = self.norm2(x) + return x + + +class FeatureBlock(nn.Module): + def __init__( + self, + dim, + levels=0, + reduction='max', + act_layer=nn.GELU, + ): + super().__init__() + reductions = levels + levels = max(1, levels) + if reduction == 'avg': + pool_fn = partial(nn.AvgPool2d, kernel_size=2) + else: + pool_fn = partial(nn.MaxPool2d, kernel_size=3, stride=2, padding=1) + self.blocks = nn.Sequential() + for i in range(levels): + self.blocks.add_module(f'conv{i+1}', MbConvBlock(dim, act_layer=act_layer)) + if reductions: + self.blocks.add_module(f'pool{i+1}', pool_fn()) + reductions -= 1 + + def forward(self, x): + return self.blocks(x) + + +class Stem(nn.Module): + def __init__( + self, + in_chs: int = 3, + out_chs: int = 96, + act_layer: Callable = nn.GELU, + norm_layer: Callable = LayerNorm2d, # NOTE stem in NCHW + ): + super().__init__() + self.conv1 = nn.Conv2d(in_chs, out_chs, kernel_size=3, stride=2, padding=1) + self.down = Downsample2d(out_chs, act_layer=act_layer, norm_layer=norm_layer) + + def forward(self, x): + x = self.conv1(x) + x = self.down(x) + return x + + +class WindowAttentionGlobal(nn.Module): + + def __init__( + self, + dim: int, + num_heads: int, + window_size: Tuple[int, int], + use_global: bool = True, + qkv_bias: bool = True, + attn_drop: float = 0., + proj_drop: float = 0., + ): + super().__init__() + window_size = to_2tuple(window_size) + self.window_size = window_size + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim ** -0.5 + self.use_global = use_global + + self.rel_pos = RelPosBias(window_size=window_size, num_heads=num_heads) + if self.use_global: + self.qkv = nn.Linear(dim, dim * 2, bias=qkv_bias) + else: + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = 
nn.Dropout(proj_drop) + + def forward(self, x, q_global: Optional[torch.Tensor] = None): + B, N, C = x.shape + if self.use_global and q_global is not None: + _assert(x.shape[-1] == q_global.shape[-1], 'x and q_global seq lengths should be equal') + + kv = self.qkv(x) + kv = kv.reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + k, v = kv.unbind(0) + + q = q_global.repeat(B // q_global.shape[0], 1, 1, 1) + q = q.reshape(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3) + else: + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + q = q * self.scale + + attn = (q @ k.transpose(-2, -1)) + attn = self.rel_pos(attn) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +def window_partition(x, window_size: Tuple[int, int]): + B, H, W, C = x.shape + x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0], window_size[1], C) + return windows + + +@register_notrace_function # reason: int argument is a Proxy +def window_reverse(windows, window_size: Tuple[int, int], img_size: Tuple[int, int]): + H, W = img_size + B = int(windows.shape[0] / (H * W / window_size[0] / window_size[1])) + x = windows.view(B, H // window_size[0], W // window_size[1], window_size[0], window_size[1], -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class LayerScale(nn.Module): + def __init__(self, dim, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x): + return x.mul_(self.gamma) if self.inplace else x * self.gamma + + +class GlobalContextVitBlock(nn.Module): + def __init__( + self, + dim: int, + feat_size: Tuple[int, int], + num_heads: int, + window_size: int = 7, + mlp_ratio: float = 4., + use_global: bool = True, + qkv_bias: bool = True, + layer_scale: Optional[float] = None, + proj_drop: float = 0., + attn_drop: float = 0., + drop_path: float = 0., + attn_layer: Callable = WindowAttentionGlobal, + act_layer: Callable = nn.GELU, + norm_layer: Callable = nn.LayerNorm, + ): + super().__init__() + feat_size = to_2tuple(feat_size) + window_size = to_2tuple(window_size) + self.window_size = window_size + self.num_windows = int((feat_size[0] // window_size[0]) * (feat_size[1] // window_size[1])) + + self.norm1 = norm_layer(dim) + self.attn = attn_layer( + dim, + num_heads=num_heads, + window_size=window_size, + use_global=use_global, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=proj_drop, + ) + self.ls1 = LayerScale(dim, layer_scale) if layer_scale is not None else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=proj_drop) + self.ls2 = LayerScale(dim, layer_scale) if layer_scale is not None else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def _window_attn(self, x, q_global: Optional[torch.Tensor] = None): + B, H, W, C = x.shape + x_win = window_partition(x, self.window_size) + x_win = x_win.view(-1, self.window_size[0] * self.window_size[1], C) + attn_win = self.attn(x_win, q_global) + x = window_reverse(attn_win, self.window_size, (H, W)) + return x + + def forward(self, x, q_global: Optional[torch.Tensor] = None): + x = x + self.drop_path1(self.ls1(self._window_attn(self.norm1(x), q_global))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + + +class GlobalContextVitStage(nn.Module): + def __init__( + self, + dim, + depth: int, + num_heads: int, + feat_size: Tuple[int, int], + window_size: Tuple[int, int], + downsample: bool = True, + global_norm: bool = False, + stage_norm: bool = False, + mlp_ratio: float = 4., + qkv_bias: bool = True, + layer_scale: Optional[float] = None, + proj_drop: float = 0., + attn_drop: float = 0., + drop_path: Union[List[float], float] = 0.0, + act_layer: Callable = nn.GELU, + norm_layer: Callable = nn.LayerNorm, + norm_layer_cl: Callable = LayerNorm2d, + ): + super().__init__() + if downsample: + self.downsample = Downsample2d( + dim=dim, + dim_out=dim * 2, + norm_layer=norm_layer, + ) + dim = dim * 2 + feat_size = (feat_size[0] // 2, feat_size[1] // 2) + else: + self.downsample = nn.Identity() + self.feat_size = feat_size + window_size = to_2tuple(window_size) + + feat_levels = int(math.log2(min(feat_size) / min(window_size))) + self.global_block = FeatureBlock(dim, feat_levels) + self.global_norm = norm_layer_cl(dim) if global_norm else nn.Identity() + + self.blocks = nn.ModuleList([ + GlobalContextVitBlock( + dim=dim, + num_heads=num_heads, + feat_size=feat_size, + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + use_global=(i % 2 != 0), + layer_scale=layer_scale, + proj_drop=proj_drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + act_layer=act_layer, + norm_layer=norm_layer_cl, + ) + for i in range(depth) + ]) + self.norm = norm_layer_cl(dim) if stage_norm else nn.Identity() + self.dim = dim + self.feat_size = feat_size + self.grad_checkpointing = False + + def forward(self, x): + # input NCHW, downsample & global block are 2d conv + pooling + x = self.downsample(x) + global_query = self.global_block(x) + + # reshape NCHW --> NHWC for transformer blocks + x = x.permute(0, 2, 3, 1) + global_query = self.global_norm(global_query.permute(0, 2, 3, 1)) + for blk in self.blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x, global_query) + x = self.norm(x) + x = x.permute(0, 3, 1, 2).contiguous() # back to NCHW + return x + + +class GlobalContextVit(nn.Module): + def __init__( + self, + in_chans: int = 3, + num_classes: int = 1000, + global_pool: str = 'avg', + img_size: Tuple[int, int] = 224, + window_ratio: Tuple[int, ...] = (32, 32, 16, 32), + window_size: Tuple[int, ...] = None, + embed_dim: int = 64, + depths: Tuple[int, ...] = (3, 4, 19, 5), + num_heads: Tuple[int, ...] 
= (2, 4, 8, 16), + mlp_ratio: float = 3.0, + qkv_bias: bool = True, + layer_scale: Optional[float] = None, + drop_rate: float = 0., + proj_drop_rate: float = 0., + attn_drop_rate: float = 0., + drop_path_rate: float = 0., + weight_init='', + act_layer: str = 'gelu', + norm_layer: str = 'layernorm2d', + norm_layer_cl: str = 'layernorm', + norm_eps: float = 1e-5, + ): + super().__init__() + act_layer = get_act_layer(act_layer) + norm_layer = partial(get_norm_layer(norm_layer), eps=norm_eps) + norm_layer_cl = partial(get_norm_layer(norm_layer_cl), eps=norm_eps) + + img_size = to_2tuple(img_size) + feat_size = tuple(d // 4 for d in img_size) # stem reduction by 4 + self.global_pool = global_pool + self.num_classes = num_classes + self.drop_rate = drop_rate + num_stages = len(depths) + self.num_features = int(embed_dim * 2 ** (num_stages - 1)) + if window_size is not None: + window_size = to_ntuple(num_stages)(window_size) + else: + assert window_ratio is not None + window_size = tuple([(img_size[0] // r, img_size[1] // r) for r in to_ntuple(num_stages)(window_ratio)]) + + self.stem = Stem( + in_chs=in_chans, + out_chs=embed_dim, + act_layer=act_layer, + norm_layer=norm_layer + ) + + dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)] + stages = [] + for i in range(num_stages): + last_stage = i == num_stages - 1 + stage_scale = 2 ** max(i - 1, 0) + stages.append(GlobalContextVitStage( + dim=embed_dim * stage_scale, + depth=depths[i], + num_heads=num_heads[i], + feat_size=(feat_size[0] // stage_scale, feat_size[1] // stage_scale), + window_size=window_size[i], + downsample=i != 0, + stage_norm=last_stage, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + layer_scale=layer_scale, + proj_drop=proj_drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + act_layer=act_layer, + norm_layer=norm_layer, + norm_layer_cl=norm_layer_cl, + )) + self.stages = nn.Sequential(*stages) + + # Classifier head + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) + + if weight_init: + named_apply(partial(self._init_weights, scheme=weight_init), self) + + def _init_weights(self, module, name, scheme='vit'): + # note Conv2d left as default init + if scheme == 'vit': + if isinstance(module, nn.Linear): + nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + if 'mlp' in name: + nn.init.normal_(module.bias, std=1e-6) + else: + nn.init.zeros_(module.bias) + else: + if isinstance(module, nn.Linear): + nn.init.normal_(module.weight, std=.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + + @torch.jit.ignore + def no_weight_decay(self): + return { + k for k, _ in self.named_parameters() + if any(n in k for n in ["relative_position_bias_table", "rel_pos.mlp"])} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^stem', # stem and embed + blocks=r'^stages\.(\d+)' + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is None: + global_pool = self.head.global_pool.pool_type + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x: torch.Tensor) -> torch.Tensor: + x = self.stem(x) + x = self.stages(x) + return x + 
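+ # Added shape note (derived from the defaults above, not in the original source): with
+ # img_size=224, embed_dim=64, depths=(3, 4, 19, 5) and window_ratio=(32, 32, 16, 32),
+ # the stem maps 3x224x224 -> 64x56x56 and the four stages run at 56/28/14/7 resolution
+ # with 64/128/256/512 channels and 7x7 / 7x7 / 14x14 / 7x7 windows; forward_head then
+ # pools the final 512-channel map for classification.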
+ def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_gcvit(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + model = build_model_with_cfg(GlobalContextVit, variant, pretrained, **kwargs) + return model + + +@register_model +def gcvit_xxtiny(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(2, 2, 6, 2), + num_heads=(2, 4, 8, 16), + **kwargs) + return _create_gcvit('gcvit_xxtiny', pretrained=pretrained, **model_kwargs) + + +@register_model +def gcvit_xtiny(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 4, 6, 5), + num_heads=(2, 4, 8, 16), + **kwargs) + return _create_gcvit('gcvit_xtiny', pretrained=pretrained, **model_kwargs) + + +@register_model +def gcvit_tiny(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 4, 19, 5), + num_heads=(2, 4, 8, 16), + **kwargs) + return _create_gcvit('gcvit_tiny', pretrained=pretrained, **model_kwargs) + + +@register_model +def gcvit_small(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 4, 19, 5), + num_heads=(3, 6, 12, 24), + embed_dim=96, + mlp_ratio=2, + layer_scale=1e-5, + **kwargs) + return _create_gcvit('gcvit_small', pretrained=pretrained, **model_kwargs) + + +@register_model +def gcvit_base(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 4, 19, 5), + num_heads=(4, 8, 16, 32), + embed_dim=128, + mlp_ratio=2, + layer_scale=1e-5, + **kwargs) + return _create_gcvit('gcvit_base', pretrained=pretrained, **model_kwargs) diff --git a/custom_timm/models/ghostnet.py b/custom_timm/models/ghostnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f31127dd86409b5fe2e9b54036e72a0a938da09f --- /dev/null +++ b/custom_timm/models/ghostnet.py @@ -0,0 +1,302 @@ +""" +An implementation of GhostNet Model as defined in: +GhostNet: More Features from Cheap Operations. 
https://arxiv.org/abs/1911.11907 +The train script of the model is similar to that of MobileNetV3 +Original model: https://github.com/huawei-noah/CV-backbones/tree/master/ghostnet_pytorch +""" +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .layers import SelectAdaptivePool2d, Linear, make_divisible +from .efficientnet_blocks import SqueezeExcite, ConvBnAct +from .helpers import build_model_with_cfg, checkpoint_seq +from .registry import register_model + + +__all__ = ['GhostNet'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv_stem', 'classifier': 'classifier', + **kwargs + } + + +default_cfgs = { + 'ghostnet_050': _cfg(url=''), + 'ghostnet_100': _cfg( + url='https://github.com/huawei-noah/CV-backbones/releases/download/ghostnet_pth/ghostnet_1x.pth'), + 'ghostnet_130': _cfg(url=''), +} + + +_SE_LAYER = partial(SqueezeExcite, gate_layer='hard_sigmoid', rd_round_fn=partial(make_divisible, divisor=4)) + + +class GhostModule(nn.Module): + def __init__(self, inp, oup, kernel_size=1, ratio=2, dw_size=3, stride=1, relu=True): + super(GhostModule, self).__init__() + self.oup = oup + init_channels = math.ceil(oup / ratio) + new_channels = init_channels * (ratio - 1) + + self.primary_conv = nn.Sequential( + nn.Conv2d(inp, init_channels, kernel_size, stride, kernel_size//2, bias=False), + nn.BatchNorm2d(init_channels), + nn.ReLU(inplace=True) if relu else nn.Sequential(), + ) + + self.cheap_operation = nn.Sequential( + nn.Conv2d(init_channels, new_channels, dw_size, 1, dw_size//2, groups=init_channels, bias=False), + nn.BatchNorm2d(new_channels), + nn.ReLU(inplace=True) if relu else nn.Sequential(), + ) + + def forward(self, x): + x1 = self.primary_conv(x) + x2 = self.cheap_operation(x1) + out = torch.cat([x1, x2], dim=1) + return out[:, :self.oup, :, :] + + +class GhostBottleneck(nn.Module): + """ Ghost bottleneck w/ optional SE""" + + def __init__(self, in_chs, mid_chs, out_chs, dw_kernel_size=3, + stride=1, act_layer=nn.ReLU, se_ratio=0.): + super(GhostBottleneck, self).__init__() + has_se = se_ratio is not None and se_ratio > 0. 
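+ # Added note (not in the original source): the GhostModule layers built below split their
+ # output channels into ceil(oup / ratio) "primary" channels from a 1x1 conv and the rest
+ # from a cheap 3x3 depthwise conv (ratio=2 by default, i.e. roughly half and half), then
+ # concatenate the two and slice back to exactly oup channels.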
+ self.stride = stride + + # Point-wise expansion + self.ghost1 = GhostModule(in_chs, mid_chs, relu=True) + + # Depth-wise convolution + if self.stride > 1: + self.conv_dw = nn.Conv2d( + mid_chs, mid_chs, dw_kernel_size, stride=stride, + padding=(dw_kernel_size-1)//2, groups=mid_chs, bias=False) + self.bn_dw = nn.BatchNorm2d(mid_chs) + else: + self.conv_dw = None + self.bn_dw = None + + # Squeeze-and-excitation + self.se = _SE_LAYER(mid_chs, rd_ratio=se_ratio) if has_se else None + + # Point-wise linear projection + self.ghost2 = GhostModule(mid_chs, out_chs, relu=False) + + # shortcut + if in_chs == out_chs and self.stride == 1: + self.shortcut = nn.Sequential() + else: + self.shortcut = nn.Sequential( + nn.Conv2d( + in_chs, in_chs, dw_kernel_size, stride=stride, + padding=(dw_kernel_size-1)//2, groups=in_chs, bias=False), + nn.BatchNorm2d(in_chs), + nn.Conv2d(in_chs, out_chs, 1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(out_chs), + ) + + def forward(self, x): + shortcut = x + + # 1st ghost bottleneck + x = self.ghost1(x) + + # Depth-wise convolution + if self.conv_dw is not None: + x = self.conv_dw(x) + x = self.bn_dw(x) + + # Squeeze-and-excitation + if self.se is not None: + x = self.se(x) + + # 2nd ghost bottleneck + x = self.ghost2(x) + + x += self.shortcut(shortcut) + return x + + +class GhostNet(nn.Module): + def __init__( + self, cfgs, num_classes=1000, width=1.0, in_chans=3, output_stride=32, global_pool='avg', drop_rate=0.2): + super(GhostNet, self).__init__() + # setting of inverted residual blocks + assert output_stride == 32, 'only output_stride==32 is valid, dilation not supported' + self.cfgs = cfgs + self.num_classes = num_classes + self.drop_rate = drop_rate + self.grad_checkpointing = False + self.feature_info = [] + + # building first layer + stem_chs = make_divisible(16 * width, 4) + self.conv_stem = nn.Conv2d(in_chans, stem_chs, 3, 2, 1, bias=False) + self.feature_info.append(dict(num_chs=stem_chs, reduction=2, module=f'conv_stem')) + self.bn1 = nn.BatchNorm2d(stem_chs) + self.act1 = nn.ReLU(inplace=True) + prev_chs = stem_chs + + # building inverted residual blocks + stages = nn.ModuleList([]) + block = GhostBottleneck + stage_idx = 0 + net_stride = 2 + for cfg in self.cfgs: + layers = [] + s = 1 + for k, exp_size, c, se_ratio, s in cfg: + out_chs = make_divisible(c * width, 4) + mid_chs = make_divisible(exp_size * width, 4) + layers.append(block(prev_chs, mid_chs, out_chs, k, s, se_ratio=se_ratio)) + prev_chs = out_chs + if s > 1: + net_stride *= 2 + self.feature_info.append(dict( + num_chs=prev_chs, reduction=net_stride, module=f'blocks.{stage_idx}')) + stages.append(nn.Sequential(*layers)) + stage_idx += 1 + + out_chs = make_divisible(exp_size * width, 4) + stages.append(nn.Sequential(ConvBnAct(prev_chs, out_chs, 1))) + self.pool_dim = prev_chs = out_chs + + self.blocks = nn.Sequential(*stages) + + # building last several layers + self.num_features = out_chs = 1280 + self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) + self.conv_head = nn.Conv2d(prev_chs, out_chs, 1, 1, 0, bias=True) + self.act2 = nn.ReLU(inplace=True) + self.flatten = nn.Flatten(1) if global_pool else nn.Identity() # don't flatten if pooling disabled + self.classifier = Linear(out_chs, num_classes) if num_classes > 0 else nn.Identity() + + # FIXME init + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^conv_stem|bn1', + blocks=[ + (r'^blocks\.(\d+)' if coarse else r'^blocks\.(\d+)\.(\d+)', None), + (r'conv_head', (99999,)) + ] + ) + 
return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.classifier + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + # cannot meaningfully change pooling of efficient head after creation + self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) + self.flatten = nn.Flatten(1) if global_pool else nn.Identity() # don't flatten if pooling disabled + self.classifier = Linear(self.pool_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.conv_stem(x) + x = self.bn1(x) + x = self.act1(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x, flatten=True) + else: + x = self.blocks(x) + return x + + def forward_head(self, x): + x = self.global_pool(x) + x = self.conv_head(x) + x = self.act2(x) + x = self.flatten(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + x = self.classifier(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_ghostnet(variant, width=1.0, pretrained=False, **kwargs): + """ + Constructs a GhostNet model + """ + cfgs = [ + # k, t, c, SE, s + # stage1 + [[3, 16, 16, 0, 1]], + # stage2 + [[3, 48, 24, 0, 2]], + [[3, 72, 24, 0, 1]], + # stage3 + [[5, 72, 40, 0.25, 2]], + [[5, 120, 40, 0.25, 1]], + # stage4 + [[3, 240, 80, 0, 2]], + [[3, 200, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 184, 80, 0, 1], + [3, 480, 112, 0.25, 1], + [3, 672, 112, 0.25, 1] + ], + # stage5 + [[5, 672, 160, 0.25, 2]], + [[5, 960, 160, 0, 1], + [5, 960, 160, 0.25, 1], + [5, 960, 160, 0, 1], + [5, 960, 160, 0.25, 1] + ] + ] + model_kwargs = dict( + cfgs=cfgs, + width=width, + **kwargs, + ) + return build_model_with_cfg( + GhostNet, variant, pretrained, + feature_cfg=dict(flatten_sequential=True), + **model_kwargs) + + +@register_model +def ghostnet_050(pretrained=False, **kwargs): + """ GhostNet-0.5x """ + model = _create_ghostnet('ghostnet_050', width=0.5, pretrained=pretrained, **kwargs) + return model + + +@register_model +def ghostnet_100(pretrained=False, **kwargs): + """ GhostNet-1.0x """ + model = _create_ghostnet('ghostnet_100', width=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def ghostnet_130(pretrained=False, **kwargs): + """ GhostNet-1.3x """ + model = _create_ghostnet('ghostnet_130', width=1.3, pretrained=pretrained, **kwargs) + return model diff --git a/custom_timm/models/gluon_resnet.py b/custom_timm/models/gluon_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..f24eb3e682bc09df9434ba3bdf0248f303095f6f --- /dev/null +++ b/custom_timm/models/gluon_resnet.py @@ -0,0 +1,245 @@ +"""Pytorch impl of MxNet Gluon ResNet/(SE)ResNeXt variants +This file evolved from https://github.com/pytorch/vision 'resnet.py' with (SE)-ResNeXt additions +and ports of Gluon variations (https://github.com/dmlc/gluon-cv/blob/master/gluoncv/model_zoo/resnet.py) +by Ross Wightman +""" + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import SEModule +from .registry import register_model +from .resnet import ResNet, Bottleneck, BasicBlock + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 
'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv1', 'classifier': 'fc', + **kwargs + } + + +default_cfgs = { + 'gluon_resnet18_v1b': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet18_v1b-0757602b.pth'), + 'gluon_resnet34_v1b': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet34_v1b-c6d82d59.pth'), + 'gluon_resnet50_v1b': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet50_v1b-0ebe02e2.pth'), + 'gluon_resnet101_v1b': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet101_v1b-3b017079.pth'), + 'gluon_resnet152_v1b': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet152_v1b-c1edb0dd.pth'), + 'gluon_resnet50_v1c': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet50_v1c-48092f55.pth', + first_conv='conv1.0'), + 'gluon_resnet101_v1c': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet101_v1c-1f26822a.pth', + first_conv='conv1.0'), + 'gluon_resnet152_v1c': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet152_v1c-a3bb0b98.pth', + first_conv='conv1.0'), + 'gluon_resnet50_v1d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet50_v1d-818a1b1b.pth', + first_conv='conv1.0'), + 'gluon_resnet101_v1d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet101_v1d-0f9c8644.pth', + first_conv='conv1.0'), + 'gluon_resnet152_v1d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet152_v1d-bd354e12.pth', + first_conv='conv1.0'), + 'gluon_resnet50_v1s': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet50_v1s-1762acc0.pth', + first_conv='conv1.0'), + 'gluon_resnet101_v1s': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet101_v1s-60fe0cc1.pth', + first_conv='conv1.0'), + 'gluon_resnet152_v1s': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnet152_v1s-dcc41b81.pth', + first_conv='conv1.0'), + 'gluon_resnext50_32x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnext50_32x4d-e6a097c1.pth'), + 'gluon_resnext101_32x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnext101_32x4d-b253c8c4.pth'), + 'gluon_resnext101_64x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_resnext101_64x4d-f9a8e184.pth'), + 'gluon_seresnext50_32x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_seresnext50_32x4d-90cf2d6e.pth'), + 'gluon_seresnext101_32x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_seresnext101_32x4d-cf52900d.pth'), + 'gluon_seresnext101_64x4d': _cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_seresnext101_64x4d-f9926f93.pth'), + 'gluon_senet154': 
_cfg(url='https://github.com/rwightman/pytorch-pretrained-gluonresnet/releases/download/v0.1/gluon_senet154-70a1a3c0.pth', + first_conv='conv1.0'), +} + + +def _create_resnet(variant, pretrained=False, **kwargs): + return build_model_with_cfg(ResNet, variant, pretrained, **kwargs) + + +@register_model +def gluon_resnet18_v1b(pretrained=False, **kwargs): + """Constructs a ResNet-18 model. + """ + model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs) + return _create_resnet('gluon_resnet18_v1b', pretrained, **model_args) + + +@register_model +def gluon_resnet34_v1b(pretrained=False, **kwargs): + """Constructs a ResNet-34 model. + """ + model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], **kwargs) + return _create_resnet('gluon_resnet34_v1b', pretrained, **model_args) + + +@register_model +def gluon_resnet50_v1b(pretrained=False, **kwargs): + """Constructs a ResNet-50 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) + return _create_resnet('gluon_resnet50_v1b', pretrained, **model_args) + + +@register_model +def gluon_resnet101_v1b(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], **kwargs) + return _create_resnet('gluon_resnet101_v1b', pretrained, **model_args) + + +@register_model +def gluon_resnet152_v1b(pretrained=False, **kwargs): + """Constructs a ResNet-152 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], **kwargs) + return _create_resnet('gluon_resnet152_v1b', pretrained, **model_args) + + +@register_model +def gluon_resnet50_v1c(pretrained=False, **kwargs): + """Constructs a ResNet-50 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', **kwargs) + return _create_resnet('gluon_resnet50_v1c', pretrained, **model_args) + + +@register_model +def gluon_resnet101_v1c(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', **kwargs) + return _create_resnet('gluon_resnet101_v1c', pretrained, **model_args) + + +@register_model +def gluon_resnet152_v1c(pretrained=False, **kwargs): + """Constructs a ResNet-152 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', **kwargs) + return _create_resnet('gluon_resnet152_v1c', pretrained, **model_args) + + +@register_model +def gluon_resnet50_v1d(pretrained=False, **kwargs): + """Constructs a ResNet-50 model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('gluon_resnet50_v1d', pretrained, **model_args) + + +@register_model +def gluon_resnet101_v1d(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('gluon_resnet101_v1d', pretrained, **model_args) + + +@register_model +def gluon_resnet152_v1d(pretrained=False, **kwargs): + """Constructs a ResNet-152 model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('gluon_resnet152_v1d', pretrained, **model_args) + + +@register_model +def gluon_resnet50_v1s(pretrained=False, **kwargs): + """Constructs a ResNet-50 model. 
+ """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=64, stem_type='deep', **kwargs) + return _create_resnet('gluon_resnet50_v1s', pretrained, **model_args) + + + +@register_model +def gluon_resnet101_v1s(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], stem_width=64, stem_type='deep', **kwargs) + return _create_resnet('gluon_resnet101_v1s', pretrained, **model_args) + + +@register_model +def gluon_resnet152_v1s(pretrained=False, **kwargs): + """Constructs a ResNet-152 model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 8, 36, 3], stem_width=64, stem_type='deep', **kwargs) + return _create_resnet('gluon_resnet152_v1s', pretrained, **model_args) + + + +@register_model +def gluon_resnext50_32x4d(pretrained=False, **kwargs): + """Constructs a ResNeXt50-32x4d model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) + return _create_resnet('gluon_resnext50_32x4d', pretrained, **model_args) + + +@register_model +def gluon_resnext101_32x4d(pretrained=False, **kwargs): + """Constructs a ResNeXt-101 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs) + return _create_resnet('gluon_resnext101_32x4d', pretrained, **model_args) + + +@register_model +def gluon_resnext101_64x4d(pretrained=False, **kwargs): + """Constructs a ResNeXt-101 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=64, base_width=4, **kwargs) + return _create_resnet('gluon_resnext101_64x4d', pretrained, **model_args) + + +@register_model +def gluon_seresnext50_32x4d(pretrained=False, **kwargs): + """Constructs a SEResNeXt50-32x4d model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, + block_args=dict(attn_layer=SEModule), **kwargs) + return _create_resnet('gluon_seresnext50_32x4d', pretrained, **model_args) + + +@register_model +def gluon_seresnext101_32x4d(pretrained=False, **kwargs): + """Constructs a SEResNeXt-101-32x4d model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, + block_args=dict(attn_layer=SEModule), **kwargs) + return _create_resnet('gluon_seresnext101_32x4d', pretrained, **model_args) + + +@register_model +def gluon_seresnext101_64x4d(pretrained=False, **kwargs): + """Constructs a SEResNeXt-101-64x4d model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], cardinality=64, base_width=4, + block_args=dict(attn_layer=SEModule), **kwargs) + return _create_resnet('gluon_seresnext101_64x4d', pretrained, **model_args) + + +@register_model +def gluon_senet154(pretrained=False, **kwargs): + """Constructs an SENet-154 model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep', + down_kernel_size=3, block_reduce_first=2, block_args=dict(attn_layer=SEModule), **kwargs) + return _create_resnet('gluon_senet154', pretrained, **model_args) diff --git a/custom_timm/models/gluon_xception.py b/custom_timm/models/gluon_xception.py new file mode 100644 index 0000000000000000000000000000000000000000..809251b28dbecf867169010ac962a5fb5ca09e8d --- /dev/null +++ b/custom_timm/models/gluon_xception.py @@ -0,0 +1,267 @@ +"""Pytorch impl of Gluon Xception +This is a port of the Gluon Xception code and weights, itself ported from a PyTorch DeepLab impl. 
+ +Gluon model: (https://gluon-cv.mxnet.io/_modules/gluoncv/model_zoo/xception.html) +Original PyTorch DeepLab impl: https://github.com/jfzhang95/pytorch-deeplab-xception + +Hacked together by / Copyright 2020 Ross Wightman +""" +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import create_classifier, get_padding +from .registry import register_model + +__all__ = ['Xception65'] + +default_cfgs = { + 'gluon_xception65': { + 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_xception-7015a15c.pth', + 'input_size': (3, 299, 299), + 'crop_pct': 0.903, + 'pool_size': (10, 10), + 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, + 'std': IMAGENET_DEFAULT_STD, + 'num_classes': 1000, + 'first_conv': 'conv1', + 'classifier': 'fc' + # The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299 + }, +} + +""" PADDING NOTES +The original PyTorch and Gluon impl of these models dutifully reproduced the +aligned padding added to Tensorflow models for Deeplab. This padding was compensating +for Tensorflow 'SAME' padding. PyTorch symmetric padding behaves the way we'd want it to. +""" + + +class SeparableConv2d(nn.Module): + def __init__(self, inplanes, planes, kernel_size=3, stride=1, dilation=1, bias=False, norm_layer=None): + super(SeparableConv2d, self).__init__() + self.kernel_size = kernel_size + self.dilation = dilation + + # depthwise convolution + padding = get_padding(kernel_size, stride, dilation) + self.conv_dw = nn.Conv2d( + inplanes, inplanes, kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=inplanes, bias=bias) + self.bn = norm_layer(num_features=inplanes) + # pointwise convolution + self.conv_pw = nn.Conv2d(inplanes, planes, kernel_size=1, bias=bias) + + def forward(self, x): + x = self.conv_dw(x) + x = self.bn(x) + x = self.conv_pw(x) + return x + + +class Block(nn.Module): + def __init__(self, inplanes, planes, stride=1, dilation=1, start_with_relu=True, norm_layer=None): + super(Block, self).__init__() + if isinstance(planes, (list, tuple)): + assert len(planes) == 3 + else: + planes = (planes,) * 3 + outplanes = planes[-1] + + if outplanes != inplanes or stride != 1: + self.skip = nn.Sequential() + self.skip.add_module('conv1', nn.Conv2d( + inplanes, outplanes, 1, stride=stride, bias=False)), + self.skip.add_module('bn1', norm_layer(num_features=outplanes)) + else: + self.skip = None + + rep = OrderedDict() + for i in range(3): + rep['act%d' % (i + 1)] = nn.ReLU(inplace=True) + rep['conv%d' % (i + 1)] = SeparableConv2d( + inplanes, planes[i], 3, stride=stride if i == 2 else 1, dilation=dilation, norm_layer=norm_layer) + rep['bn%d' % (i + 1)] = norm_layer(planes[i]) + inplanes = planes[i] + + if not start_with_relu: + del rep['act1'] + else: + rep['act1'] = nn.ReLU(inplace=False) + self.rep = nn.Sequential(rep) + + def forward(self, x): + skip = x + if self.skip is not None: + skip = self.skip(skip) + x = self.rep(x) + skip + return x + + +class Xception65(nn.Module): + """Modified Aligned Xception. 
+ + NOTE: only the 65 layer version is included here, the 71 layer variant + was not correct and had no pretrained weights + """ + + def __init__(self, num_classes=1000, in_chans=3, output_stride=32, norm_layer=nn.BatchNorm2d, + drop_rate=0., global_pool='avg'): + super(Xception65, self).__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + if output_stride == 32: + entry_block3_stride = 2 + exit_block20_stride = 2 + middle_dilation = 1 + exit_dilation = (1, 1) + elif output_stride == 16: + entry_block3_stride = 2 + exit_block20_stride = 1 + middle_dilation = 1 + exit_dilation = (1, 2) + elif output_stride == 8: + entry_block3_stride = 1 + exit_block20_stride = 1 + middle_dilation = 2 + exit_dilation = (2, 4) + else: + raise NotImplementedError + + # Entry flow + self.conv1 = nn.Conv2d(in_chans, 32, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = norm_layer(num_features=32) + self.act1 = nn.ReLU(inplace=True) + + self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1, bias=False) + self.bn2 = norm_layer(num_features=64) + self.act2 = nn.ReLU(inplace=True) + + self.block1 = Block(64, 128, stride=2, start_with_relu=False, norm_layer=norm_layer) + self.block1_act = nn.ReLU(inplace=True) + self.block2 = Block(128, 256, stride=2, start_with_relu=False, norm_layer=norm_layer) + self.block3 = Block(256, 728, stride=entry_block3_stride, norm_layer=norm_layer) + + # Middle flow + self.mid = nn.Sequential(OrderedDict([('block%d' % i, Block( + 728, 728, stride=1, dilation=middle_dilation, norm_layer=norm_layer)) for i in range(4, 20)])) + + # Exit flow + self.block20 = Block( + 728, (728, 1024, 1024), stride=exit_block20_stride, dilation=exit_dilation[0], norm_layer=norm_layer) + self.block20_act = nn.ReLU(inplace=True) + + self.conv3 = SeparableConv2d(1024, 1536, 3, stride=1, dilation=exit_dilation[1], norm_layer=norm_layer) + self.bn3 = norm_layer(num_features=1536) + self.act3 = nn.ReLU(inplace=True) + + self.conv4 = SeparableConv2d(1536, 1536, 3, stride=1, dilation=exit_dilation[1], norm_layer=norm_layer) + self.bn4 = norm_layer(num_features=1536) + self.act4 = nn.ReLU(inplace=True) + + self.num_features = 2048 + self.conv5 = SeparableConv2d( + 1536, self.num_features, 3, stride=1, dilation=exit_dilation[1], norm_layer=norm_layer) + self.bn5 = norm_layer(num_features=self.num_features) + self.act5 = nn.ReLU(inplace=True) + self.feature_info = [ + dict(num_chs=64, reduction=2, module='act2'), + dict(num_chs=128, reduction=4, module='block1_act'), + dict(num_chs=256, reduction=8, module='block3.rep.act1'), + dict(num_chs=728, reduction=16, module='block20.rep.act1'), + dict(num_chs=2048, reduction=32, module='act5'), + ] + + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^conv[12]|bn[12]', + blocks=[ + (r'^mid\.block(\d+)', None), + (r'^block(\d+)', None), + (r'^conv[345]|bn[345]', (99,)), + ], + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, "gradient checkpointing not supported" + + @torch.jit.ignore + def get_classifier(self): + return self.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + # Entry flow + x = self.conv1(x) + x = self.bn1(x) + x = 
self.act1(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.act2(x) + + x = self.block1(x) + x = self.block1_act(x) + # c1 = x + x = self.block2(x) + # c2 = x + x = self.block3(x) + + # Middle flow + x = self.mid(x) + # c3 = x + + # Exit flow + x = self.block20(x) + x = self.block20_act(x) + x = self.conv3(x) + x = self.bn3(x) + x = self.act3(x) + + x = self.conv4(x) + x = self.bn4(x) + x = self.act4(x) + + x = self.conv5(x) + x = self.bn5(x) + x = self.act5(x) + return x + + def forward_head(self, x): + x = self.global_pool(x) + if self.drop_rate: + F.dropout(x, self.drop_rate, training=self.training) + x = self.fc(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_gluon_xception(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + Xception65, variant, pretrained, + feature_cfg=dict(feature_cls='hook'), + **kwargs) + + +@register_model +def gluon_xception65(pretrained=False, **kwargs): + """ Modified Aligned Xception-65 + """ + return _create_gluon_xception('gluon_xception65', pretrained, **kwargs) diff --git a/custom_timm/models/hardcorenas.py b/custom_timm/models/hardcorenas.py new file mode 100644 index 0000000000000000000000000000000000000000..e53134b3235feffe24fedbe451e1680cbcfed27e --- /dev/null +++ b/custom_timm/models/hardcorenas.py @@ -0,0 +1,151 @@ +from functools import partial + +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .efficientnet_blocks import SqueezeExcite +from .efficientnet_builder import decode_arch_def, resolve_act_layer, resolve_bn_args, round_channels +from .helpers import build_model_with_cfg, pretrained_cfg_for_features +from .layers import get_act_fn +from .mobilenetv3 import MobileNetV3, MobileNetV3Features +from .registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv_stem', 'classifier': 'classifier', + **kwargs + } + + +default_cfgs = { + 'hardcorenas_a': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/hardcorenas_a_green_38ms_75_9-31dc7186.pth'), + 'hardcorenas_b': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/hardcorenas_b_green_40ms_76_5-32d91ff2.pth'), + 'hardcorenas_c': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/hardcorenas_c_green_44ms_77_1-631a0983.pth'), + 'hardcorenas_d': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/hardcorenas_d_green_50ms_77_4-998d9d7a.pth'), + 'hardcorenas_e': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/hardcorenas_e_green_55ms_77_9-482886a3.pth'), + 'hardcorenas_f': _cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/hardcorenas_f_green_60ms_78_1-14b9e780.pth'), +} + + +def _gen_hardcorenas(pretrained, variant, arch_def, **kwargs): + """Creates a hardcorenas model + + Ref impl: https://github.com/Alibaba-MIIL/HardCoReNAS + Paper: https://arxiv.org/abs/2102.11646 + + """ + num_features = 1280 + se_layer = partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU, rd_round_fn=round_channels) + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + 
num_features=num_features, + stem_size=32, + norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'hard_swish'), + se_layer=se_layer, + **kwargs, + ) + + features_only = False + model_cls = MobileNetV3 + kwargs_filter = None + if model_kwargs.pop('features_only', False): + features_only = True + kwargs_filter = ('num_classes', 'num_features', 'global_pool', 'head_conv', 'head_bias', 'global_pool') + model_cls = MobileNetV3Features + model = build_model_with_cfg( + model_cls, variant, pretrained, + pretrained_strict=not features_only, + kwargs_filter=kwargs_filter, + **model_kwargs) + if features_only: + model.default_cfg = pretrained_cfg_for_features(model.default_cfg) + return model + + +@register_model +def hardcorenas_a(pretrained=False, **kwargs): + """ hardcorenas_A """ + arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], + ['ir_r1_k5_s2_e3_c40_nre', 'ir_r1_k5_s1_e6_c40_nre_se0.25'], + ['ir_r1_k5_s2_e6_c80_se0.25', 'ir_r1_k5_s1_e6_c80_se0.25'], + ['ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25'], + ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25'], ['cn_r1_k1_s1_c960']] + model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_a', arch_def=arch_def, **kwargs) + return model + + +@register_model +def hardcorenas_b(pretrained=False, **kwargs): + """ hardcorenas_B """ + arch_def = [['ds_r1_k3_s1_e1_c16_nre'], + ['ir_r1_k5_s2_e3_c24_nre', 'ir_r1_k5_s1_e3_c24_nre_se0.25', 'ir_r1_k3_s1_e3_c24_nre'], + ['ir_r1_k5_s2_e3_c40_nre', 'ir_r1_k5_s1_e3_c40_nre', 'ir_r1_k5_s1_e3_c40_nre'], + ['ir_r1_k5_s2_e3_c80', 'ir_r1_k5_s1_e3_c80', 'ir_r1_k3_s1_e3_c80', 'ir_r1_k3_s1_e3_c80'], + ['ir_r1_k5_s1_e3_c112', 'ir_r1_k3_s1_e3_c112', 'ir_r1_k3_s1_e3_c112', 'ir_r1_k3_s1_e3_c112'], + ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k3_s1_e3_c192_se0.25'], + ['cn_r1_k1_s1_c960']] + model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_b', arch_def=arch_def, **kwargs) + return model + + +@register_model +def hardcorenas_c(pretrained=False, **kwargs): + """ hardcorenas_C """ + arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], + ['ir_r1_k5_s2_e3_c40_nre', 'ir_r1_k5_s1_e3_c40_nre', 'ir_r1_k5_s1_e3_c40_nre', + 'ir_r1_k5_s1_e3_c40_nre'], + ['ir_r1_k5_s2_e4_c80', 'ir_r1_k5_s1_e6_c80_se0.25', 'ir_r1_k3_s1_e3_c80', 'ir_r1_k3_s1_e3_c80'], + ['ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k3_s1_e3_c112', 'ir_r1_k3_s1_e3_c112', 'ir_r1_k3_s1_e3_c112'], + ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k3_s1_e3_c192_se0.25'], + ['cn_r1_k1_s1_c960']] + model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_c', arch_def=arch_def, **kwargs) + return model + + +@register_model +def hardcorenas_d(pretrained=False, **kwargs): + """ hardcorenas_D """ + arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre_se0.25', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], + ['ir_r1_k5_s2_e3_c40_nre_se0.25', 'ir_r1_k5_s1_e4_c40_nre_se0.25', 'ir_r1_k3_s1_e3_c40_nre_se0.25'], + ['ir_r1_k5_s2_e4_c80_se0.25', 'ir_r1_k3_s1_e3_c80_se0.25', 'ir_r1_k3_s1_e3_c80_se0.25', + 'ir_r1_k3_s1_e3_c80_se0.25'], + ['ir_r1_k3_s1_e4_c112_se0.25', 'ir_r1_k5_s1_e4_c112_se0.25', 'ir_r1_k3_s1_e3_c112_se0.25', + 'ir_r1_k5_s1_e3_c112_se0.25'], + ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', + 'ir_r1_k3_s1_e6_c192_se0.25'], ['cn_r1_k1_s1_c960']] + model = 
_gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_d', arch_def=arch_def, **kwargs) + return model + + +@register_model +def hardcorenas_e(pretrained=False, **kwargs): + """ hardcorenas_E """ + arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre_se0.25', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], + ['ir_r1_k5_s2_e6_c40_nre_se0.25', 'ir_r1_k5_s1_e4_c40_nre_se0.25', 'ir_r1_k5_s1_e4_c40_nre_se0.25', + 'ir_r1_k3_s1_e3_c40_nre_se0.25'], ['ir_r1_k5_s2_e4_c80_se0.25', 'ir_r1_k3_s1_e6_c80_se0.25'], + ['ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25', + 'ir_r1_k5_s1_e3_c112_se0.25'], + ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', + 'ir_r1_k3_s1_e6_c192_se0.25'], ['cn_r1_k1_s1_c960']] + model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_e', arch_def=arch_def, **kwargs) + return model + + +@register_model +def hardcorenas_f(pretrained=False, **kwargs): + """ hardcorenas_F """ + arch_def = [['ds_r1_k3_s1_e1_c16_nre'], ['ir_r1_k5_s2_e3_c24_nre_se0.25', 'ir_r1_k5_s1_e3_c24_nre_se0.25'], + ['ir_r1_k5_s2_e6_c40_nre_se0.25', 'ir_r1_k5_s1_e6_c40_nre_se0.25'], + ['ir_r1_k5_s2_e6_c80_se0.25', 'ir_r1_k5_s1_e6_c80_se0.25', 'ir_r1_k3_s1_e3_c80_se0.25', + 'ir_r1_k3_s1_e3_c80_se0.25'], + ['ir_r1_k3_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25', 'ir_r1_k5_s1_e6_c112_se0.25', + 'ir_r1_k3_s1_e3_c112_se0.25'], + ['ir_r1_k5_s2_e6_c192_se0.25', 'ir_r1_k5_s1_e6_c192_se0.25', 'ir_r1_k3_s1_e6_c192_se0.25', + 'ir_r1_k3_s1_e6_c192_se0.25'], ['cn_r1_k1_s1_c960']] + model = _gen_hardcorenas(pretrained=pretrained, variant='hardcorenas_f', arch_def=arch_def, **kwargs) + return model diff --git a/custom_timm/models/helpers.py b/custom_timm/models/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..d68c7e6541ae5f39af0d962ff3b453e4b0c266c4 --- /dev/null +++ b/custom_timm/models/helpers.py @@ -0,0 +1,796 @@ +""" Model creation / weight loading / state_dict helpers + +Hacked together by / Copyright 2020 Ross Wightman +""" +import collections.abc +import logging +import math +import os +import re +from collections import OrderedDict, defaultdict +from copy import deepcopy +from itertools import chain +from typing import Any, Callable, Optional, Tuple, Dict, Union + +import torch +import torch.nn as nn +from torch.hub import load_state_dict_from_url +from torch.utils.checkpoint import checkpoint + +from .features import FeatureListNet, FeatureDictNet, FeatureHookNet +from .fx_features import FeatureGraphNet +from .hub import has_hf_hub, download_cached_file, load_state_dict_from_hf +from .layers import Conv2dSame, Linear, BatchNormAct2d +from .registry import get_pretrained_cfg + + +_logger = logging.getLogger(__name__) + + +# Global variables for rarely used pretrained checkpoint download progress and hash check. +# Use set_pretrained_download_progress / set_pretrained_check_hash functions to toggle. 
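+#
+# Illustrative usage only (example calls, not executed here):
+#
+#     set_pretrained_download_progress(True)   # show a progress bar for weight downloads
+#     set_pretrained_check_hash(True)          # verify checkpoint hashes after download
+#
+# Both flags are read by load_pretrained() / load_custom_pretrained() defined further below.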
+_DOWNLOAD_PROGRESS = False +_CHECK_HASH = False + + +def clean_state_dict(state_dict): + # 'clean' checkpoint by removing .module prefix from state dict if it exists from parallel training + cleaned_state_dict = OrderedDict() + for k, v in state_dict.items(): + name = k[7:] if k.startswith('module.') else k + cleaned_state_dict[name] = v + return cleaned_state_dict + + +def load_state_dict(checkpoint_path, use_ema=True): + if checkpoint_path and os.path.isfile(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location='cpu') + state_dict_key = '' + if isinstance(checkpoint, dict): + if use_ema and checkpoint.get('state_dict_ema', None) is not None: + state_dict_key = 'state_dict_ema' + elif use_ema and checkpoint.get('model_ema', None) is not None: + state_dict_key = 'model_ema' + elif 'state_dict' in checkpoint: + state_dict_key = 'state_dict' + elif 'model' in checkpoint: + state_dict_key = 'model' + state_dict = clean_state_dict(checkpoint[state_dict_key] if state_dict_key else checkpoint) + _logger.info("Loaded {} from checkpoint '{}'".format(state_dict_key, checkpoint_path)) + return state_dict + else: + _logger.error("No checkpoint found at '{}'".format(checkpoint_path)) + raise FileNotFoundError() + + +def load_checkpoint(model, checkpoint_path, use_ema=True, strict=True): + if os.path.splitext(checkpoint_path)[-1].lower() in ('.npz', '.npy'): + # numpy checkpoint, try to load via model specific load_pretrained fn + if hasattr(model, 'load_pretrained'): + model.load_pretrained(checkpoint_path) + else: + raise NotImplementedError('Model cannot load numpy checkpoint') + return + state_dict = load_state_dict(checkpoint_path, use_ema) + incompatible_keys = model.load_state_dict(state_dict, strict=strict) + return incompatible_keys + + +def resume_checkpoint(model, checkpoint_path, optimizer=None, loss_scaler=None, log_info=True): + resume_epoch = None + if os.path.isfile(checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location='cpu') + if isinstance(checkpoint, dict) and 'state_dict' in checkpoint: + if log_info: + _logger.info('Restoring model state from checkpoint...') + state_dict = clean_state_dict(checkpoint['state_dict']) + model.load_state_dict(state_dict) + + if optimizer is not None and 'optimizer' in checkpoint: + if log_info: + _logger.info('Restoring optimizer state from checkpoint...') + optimizer.load_state_dict(checkpoint['optimizer']) + + if loss_scaler is not None and loss_scaler.state_dict_key in checkpoint: + if log_info: + _logger.info('Restoring AMP loss scaler state from checkpoint...') + loss_scaler.load_state_dict(checkpoint[loss_scaler.state_dict_key]) + + if 'epoch' in checkpoint: + resume_epoch = checkpoint['epoch'] + if 'version' in checkpoint and checkpoint['version'] > 1: + resume_epoch += 1 # start at the next epoch, old checkpoints incremented before save + + if log_info: + _logger.info("Loaded checkpoint '{}' (epoch {})".format(checkpoint_path, checkpoint['epoch'])) + else: + model.load_state_dict(checkpoint) + if log_info: + _logger.info("Loaded checkpoint '{}'".format(checkpoint_path)) + return resume_epoch + else: + _logger.error("No checkpoint found at '{}'".format(checkpoint_path)) + raise FileNotFoundError() + + +def _resolve_pretrained_source(pretrained_cfg): + cfg_source = pretrained_cfg.get('source', '') + pretrained_url = pretrained_cfg.get('url', None) + pretrained_file = pretrained_cfg.get('file', None) + hf_hub_id = pretrained_cfg.get('hf_hub_id', None) + # resolve where to load pretrained weights from + 
load_from = '' + pretrained_loc = '' + if cfg_source == 'hf-hub' and has_hf_hub(necessary=True): + # hf-hub specified as source via model identifier + load_from = 'hf-hub' + assert hf_hub_id + pretrained_loc = hf_hub_id + else: + # default source == timm or unspecified + if pretrained_file: + load_from = 'file' + pretrained_loc = pretrained_file + elif pretrained_url: + load_from = 'url' + pretrained_loc = pretrained_url + elif hf_hub_id and has_hf_hub(necessary=True): + # hf-hub available as alternate weight source in default_cfg + load_from = 'hf-hub' + pretrained_loc = hf_hub_id + if load_from == 'hf-hub' and 'hf_hub_filename' in pretrained_cfg: + # if a filename override is set, return tuple for location w/ (hub_id, filename) + pretrained_loc = pretrained_loc, pretrained_cfg['hf_hub_filename'] + return load_from, pretrained_loc + + +def set_pretrained_download_progress(enable=True): + """ Set download progress for pretrained weights on/off (globally). """ + global _DOWNLOAD_PROGRESS + _DOWNLOAD_PROGRESS = enable + + +def set_pretrained_check_hash(enable=True): + """ Set hash checking for pretrained weights on/off (globally). """ + global _CHECK_HASH + _CHECK_HASH = enable + + +def load_custom_pretrained( + model: nn.Module, + pretrained_cfg: Optional[Dict] = None, + load_fn: Optional[Callable] = None, +): + r"""Loads a custom (read non .pth) weight file + + Downloads checkpoint file into cache-dir like torch.hub based loaders, but calls + a passed in custom load fun, or the `load_pretrained` model member fn. + + If the object is already present in `model_dir`, it's deserialized and returned. + The default value of `model_dir` is ``/checkpoints`` where + `hub_dir` is the directory returned by :func:`~torch.hub.get_dir`. + + Args: + model: The instantiated model to load weights into + pretrained_cfg (dict): Default pretrained model cfg + load_fn: An external stand alone fn that loads weights into provided model, otherwise a fn named + 'laod_pretrained' on the model will be called if it exists + """ + pretrained_cfg = pretrained_cfg or getattr(model, 'pretrained_cfg', None) or {} + load_from, pretrained_loc = _resolve_pretrained_source(pretrained_cfg) + if not load_from: + _logger.warning("No pretrained weights exist for this model. 
Using random initialization.") + return + if load_from == 'hf-hub': # FIXME + _logger.warning("Hugging Face hub not currently supported for custom load pretrained models.") + elif load_from == 'url': + pretrained_loc = download_cached_file(pretrained_loc, check_hash=_CHECK_HASH, progress=_DOWNLOAD_PROGRESS) + + if load_fn is not None: + load_fn(model, pretrained_loc) + elif hasattr(model, 'load_pretrained'): + model.load_pretrained(pretrained_loc) + else: + _logger.warning("Valid function to load pretrained weights is not available, using random initialization.") + + +def adapt_input_conv(in_chans, conv_weight): + conv_type = conv_weight.dtype + conv_weight = conv_weight.float() # Some weights are in torch.half, ensure it's float for sum on CPU + O, I, J, K = conv_weight.shape + if in_chans == 1: + if I > 3: + assert conv_weight.shape[1] % 3 == 0 + # For models with space2depth stems + conv_weight = conv_weight.reshape(O, I // 3, 3, J, K) + conv_weight = conv_weight.sum(dim=2, keepdim=False) + else: + conv_weight = conv_weight.sum(dim=1, keepdim=True) + elif in_chans != 3: + if I != 3: + raise NotImplementedError('Weight format not supported by conversion.') + else: + # NOTE this strategy should be better than random init, but there could be other combinations of + # the original RGB input layer weights that'd work better for specific cases. + repeat = int(math.ceil(in_chans / 3)) + conv_weight = conv_weight.repeat(1, repeat, 1, 1)[:, :in_chans, :, :] + conv_weight *= (3 / float(in_chans)) + conv_weight = conv_weight.to(conv_type) + return conv_weight + + +def load_pretrained( + model: nn.Module, + pretrained_cfg: Optional[Dict] = None, + num_classes: int = 1000, + in_chans: int = 3, + filter_fn: Optional[Callable] = None, + strict: bool = True, +): + """ Load pretrained checkpoint + + Args: + model (nn.Module) : PyTorch model module + pretrained_cfg (Optional[Dict]): configuration for pretrained weights / target dataset + num_classes (int): num_classes for model + in_chans (int): in_chans for model + filter_fn (Optional[Callable]): state_dict filter fn for load (takes state_dict, model as args) + strict (bool): strict load of checkpoint + + """ + pretrained_cfg = pretrained_cfg or getattr(model, 'pretrained_cfg', None) or {} + load_from, pretrained_loc = _resolve_pretrained_source(pretrained_cfg) + if load_from == 'file': + _logger.info(f'Loading pretrained weights from file ({pretrained_loc})') + state_dict = load_state_dict(pretrained_loc) + elif load_from == 'url': + _logger.info(f'Loading pretrained weights from url ({pretrained_loc})') + state_dict = load_state_dict_from_url( + pretrained_loc, map_location='cpu', progress=_DOWNLOAD_PROGRESS, check_hash=_CHECK_HASH) + elif load_from == 'hf-hub': + _logger.info(f'Loading pretrained weights from Hugging Face hub ({pretrained_loc})') + if isinstance(pretrained_loc, (list, tuple)): + state_dict = load_state_dict_from_hf(*pretrained_loc) + else: + state_dict = load_state_dict_from_hf(pretrained_loc) + else: + _logger.warning("No pretrained weights exist or were found for this model. 
Using random initialization.") + return + + if filter_fn is not None: + # for backwards compat with filter fn that take one arg, try one first, the two + try: + state_dict = filter_fn(state_dict) + except TypeError: + state_dict = filter_fn(state_dict, model) + + input_convs = pretrained_cfg.get('first_conv', None) + if input_convs is not None and in_chans != 3: + if isinstance(input_convs, str): + input_convs = (input_convs,) + for input_conv_name in input_convs: + weight_name = input_conv_name + '.weight' + try: + state_dict[weight_name] = adapt_input_conv(in_chans, state_dict[weight_name]) + _logger.info( + f'Converted input conv {input_conv_name} pretrained weights from 3 to {in_chans} channel(s)') + except NotImplementedError as e: + del state_dict[weight_name] + strict = False + _logger.warning( + f'Unable to convert pretrained {input_conv_name} weights, using random init for this layer.') + + classifiers = pretrained_cfg.get('classifier', None) + label_offset = pretrained_cfg.get('label_offset', 0) + if classifiers is not None: + if isinstance(classifiers, str): + classifiers = (classifiers,) + if num_classes != pretrained_cfg['num_classes']: + for classifier_name in classifiers: + # completely discard fully connected if model num_classes doesn't match pretrained weights + state_dict.pop(classifier_name + '.weight', None) + state_dict.pop(classifier_name + '.bias', None) + strict = False + elif label_offset > 0: + for classifier_name in classifiers: + # special case for pretrained weights with an extra background class in pretrained weights + classifier_weight = state_dict[classifier_name + '.weight'] + state_dict[classifier_name + '.weight'] = classifier_weight[label_offset:] + classifier_bias = state_dict[classifier_name + '.bias'] + state_dict[classifier_name + '.bias'] = classifier_bias[label_offset:] + + model.load_state_dict(state_dict, strict=strict) + + +def extract_layer(model, layer): + layer = layer.split('.') + module = model + if hasattr(model, 'module') and layer[0] != 'module': + module = model.module + if not hasattr(model, 'module') and layer[0] == 'module': + layer = layer[1:] + for l in layer: + if hasattr(module, l): + if not l.isdigit(): + module = getattr(module, l) + else: + module = module[int(l)] + else: + return module + return module + + +def set_layer(model, layer, val): + layer = layer.split('.') + module = model + if hasattr(model, 'module') and layer[0] != 'module': + module = model.module + lst_index = 0 + module2 = module + for l in layer: + if hasattr(module2, l): + if not l.isdigit(): + module2 = getattr(module2, l) + else: + module2 = module2[int(l)] + lst_index += 1 + lst_index -= 1 + for l in layer[:lst_index]: + if not l.isdigit(): + module = getattr(module, l) + else: + module = module[int(l)] + l = layer[lst_index] + setattr(module, l, val) + + +def adapt_model_from_string(parent_module, model_string): + separator = '***' + state_dict = {} + lst_shape = model_string.split(separator) + for k in lst_shape: + k = k.split(':') + key = k[0] + shape = k[1][1:-1].split(',') + if shape[0] != '': + state_dict[key] = [int(i) for i in shape] + + new_module = deepcopy(parent_module) + for n, m in parent_module.named_modules(): + old_module = extract_layer(parent_module, n) + if isinstance(old_module, nn.Conv2d) or isinstance(old_module, Conv2dSame): + if isinstance(old_module, Conv2dSame): + conv = Conv2dSame + else: + conv = nn.Conv2d + s = state_dict[n + '.weight'] + in_channels = s[1] + out_channels = s[0] + g = 1 + if old_module.groups > 1: + 
in_channels = out_channels + g = in_channels + new_conv = conv( + in_channels=in_channels, out_channels=out_channels, kernel_size=old_module.kernel_size, + bias=old_module.bias is not None, padding=old_module.padding, dilation=old_module.dilation, + groups=g, stride=old_module.stride) + set_layer(new_module, n, new_conv) + elif isinstance(old_module, BatchNormAct2d): + new_bn = BatchNormAct2d( + state_dict[n + '.weight'][0], eps=old_module.eps, momentum=old_module.momentum, + affine=old_module.affine, track_running_stats=True) + new_bn.drop = old_module.drop + new_bn.act = old_module.act + set_layer(new_module, n, new_bn) + elif isinstance(old_module, nn.BatchNorm2d): + new_bn = nn.BatchNorm2d( + num_features=state_dict[n + '.weight'][0], eps=old_module.eps, momentum=old_module.momentum, + affine=old_module.affine, track_running_stats=True) + set_layer(new_module, n, new_bn) + elif isinstance(old_module, nn.Linear): + # FIXME extra checks to ensure this is actually the FC classifier layer and not a diff Linear layer? + num_features = state_dict[n + '.weight'][1] + new_fc = Linear( + in_features=num_features, out_features=old_module.out_features, bias=old_module.bias is not None) + set_layer(new_module, n, new_fc) + if hasattr(new_module, 'num_features'): + new_module.num_features = num_features + new_module.eval() + parent_module.eval() + + return new_module + + +def adapt_model_from_file(parent_module, model_variant): + adapt_file = os.path.join(os.path.dirname(__file__), 'pruned', model_variant + '.txt') + with open(adapt_file, 'r') as f: + return adapt_model_from_string(parent_module, f.read().strip()) + + +def pretrained_cfg_for_features(pretrained_cfg): + pretrained_cfg = deepcopy(pretrained_cfg) + # remove default pretrained cfg fields that don't have much relevance for feature backbone + to_remove = ('num_classes', 'crop_pct', 'classifier', 'global_pool') # add default final pool size? 
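+    # e.g. a classification cfg like {'url': ..., 'num_classes': 1000, 'classifier': 'fc', ...} keeps
+    # only the entries still meaningful for a features-only backbone (url, input_size, mean/std, ...)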
+ for tr in to_remove: + pretrained_cfg.pop(tr, None) + return pretrained_cfg + + +def set_default_kwargs(kwargs, names, pretrained_cfg): + for n in names: + # for legacy reasons, model __init__args uses img_size + in_chans as separate args while + # pretrained_cfg has one input_size=(C, H ,W) entry + if n == 'img_size': + input_size = pretrained_cfg.get('input_size', None) + if input_size is not None: + assert len(input_size) == 3 + kwargs.setdefault(n, input_size[-2:]) + elif n == 'in_chans': + input_size = pretrained_cfg.get('input_size', None) + if input_size is not None: + assert len(input_size) == 3 + kwargs.setdefault(n, input_size[0]) + else: + default_val = pretrained_cfg.get(n, None) + if default_val is not None: + kwargs.setdefault(n, pretrained_cfg[n]) + + +def filter_kwargs(kwargs, names): + if not kwargs or not names: + return + for n in names: + kwargs.pop(n, None) + + +def update_pretrained_cfg_and_kwargs(pretrained_cfg, kwargs, kwargs_filter): + """ Update the default_cfg and kwargs before passing to model + + Args: + pretrained_cfg: input pretrained cfg (updated in-place) + kwargs: keyword args passed to model build fn (updated in-place) + kwargs_filter: keyword arg keys that must be removed before model __init__ + """ + # Set model __init__ args that can be determined by default_cfg (if not already passed as kwargs) + default_kwarg_names = ('num_classes', 'global_pool', 'in_chans') + if pretrained_cfg.get('fixed_input_size', False): + # if fixed_input_size exists and is True, model takes an img_size arg that fixes its input size + default_kwarg_names += ('img_size',) + set_default_kwargs(kwargs, names=default_kwarg_names, pretrained_cfg=pretrained_cfg) + # Filter keyword args for task specific model variants (some 'features only' models, etc.) + filter_kwargs(kwargs, names=kwargs_filter) + + +def resolve_pretrained_cfg(variant: str, pretrained_cfg=None): + if pretrained_cfg and isinstance(pretrained_cfg, dict): + # highest priority, pretrained_cfg available and passed as arg + return deepcopy(pretrained_cfg) + # fallback to looking up pretrained cfg in model registry by variant identifier + pretrained_cfg = get_pretrained_cfg(variant) + if not pretrained_cfg: + _logger.warning( + f"No pretrained configuration specified for {variant} model. Using a default." 
+ f" Please add a config to the model pretrained_cfg registry or pass explicitly.") + pretrained_cfg = dict( + url='', + num_classes=1000, + input_size=(3, 224, 224), + pool_size=None, + crop_pct=.9, + interpolation='bicubic', + first_conv='', + classifier='', + ) + return pretrained_cfg + + +def build_model_with_cfg( + model_cls: Callable, + variant: str, + pretrained: bool, + pretrained_cfg: Optional[Dict] = None, + model_cfg: Optional[Any] = None, + feature_cfg: Optional[Dict] = None, + pretrained_strict: bool = True, + pretrained_filter_fn: Optional[Callable] = None, + pretrained_custom_load: bool = False, + kwargs_filter: Optional[Tuple[str]] = None, + **kwargs): + """ Build model with specified default_cfg and optional model_cfg + + This helper fn aids in the construction of a model including: + * handling default_cfg and associated pretrained weight loading + * passing through optional model_cfg for models with config based arch spec + * features_only model adaptation + * pruning config / model adaptation + + Args: + model_cls (nn.Module): model class + variant (str): model variant name + pretrained (bool): load pretrained weights + pretrained_cfg (dict): model's pretrained weight/task config + model_cfg (Optional[Dict]): model's architecture config + feature_cfg (Optional[Dict]: feature extraction adapter config + pretrained_strict (bool): load pretrained weights strictly + pretrained_filter_fn (Optional[Callable]): filter callable for pretrained weights + pretrained_custom_load (bool): use custom load fn, to load numpy or other non PyTorch weights + kwargs_filter (Optional[Tuple]): kwargs to filter before passing to model + **kwargs: model args passed through to model __init__ + """ + pruned = kwargs.pop('pruned', False) + features = False + feature_cfg = feature_cfg or {} + + # resolve and update model pretrained config and model kwargs + pretrained_cfg = resolve_pretrained_cfg(variant, pretrained_cfg=pretrained_cfg) + update_pretrained_cfg_and_kwargs(pretrained_cfg, kwargs, kwargs_filter) + pretrained_cfg.setdefault('architecture', variant) + + # Setup for feature extraction wrapper done at end of this fn + if kwargs.pop('features_only', False): + features = True + feature_cfg.setdefault('out_indices', (0, 1, 2, 3, 4)) + if 'out_indices' in kwargs: + feature_cfg['out_indices'] = kwargs.pop('out_indices') + + # Build the model + model = model_cls(**kwargs) if model_cfg is None else model_cls(cfg=model_cfg, **kwargs) + model.pretrained_cfg = pretrained_cfg + model.default_cfg = model.pretrained_cfg # alias for backwards compat + + if pruned: + model = adapt_model_from_file(model, variant) + + # For classification models, check class attr, then kwargs, then default to 1k, otherwise 0 for feats + num_classes_pretrained = 0 if features else getattr(model, 'num_classes', kwargs.get('num_classes', 1000)) + if pretrained: + if pretrained_custom_load: + # FIXME improve custom load trigger + load_custom_pretrained(model, pretrained_cfg=pretrained_cfg) + else: + load_pretrained( + model, + pretrained_cfg=pretrained_cfg, + num_classes=num_classes_pretrained, + in_chans=kwargs.get('in_chans', 3), + filter_fn=pretrained_filter_fn, + strict=pretrained_strict) + + # Wrap the model in a feature extraction module if enabled + if features: + feature_cls = FeatureListNet + if 'feature_cls' in feature_cfg: + feature_cls = feature_cfg.pop('feature_cls') + if isinstance(feature_cls, str): + feature_cls = feature_cls.lower() + if 'hook' in feature_cls: + feature_cls = FeatureHookNet + elif feature_cls 
== 'fx': + feature_cls = FeatureGraphNet + else: + assert False, f'Unknown feature class {feature_cls}' + model = feature_cls(model, **feature_cfg) + model.pretrained_cfg = pretrained_cfg_for_features(pretrained_cfg) # add back default_cfg + model.default_cfg = model.pretrained_cfg # alias for backwards compat + + return model + + +def model_parameters(model, exclude_head=False): + if exclude_head: + # FIXME this a bit of a quick and dirty hack to skip classifier head params based on ordering + return [p for p in model.parameters()][:-2] + else: + return model.parameters() + + +def named_apply(fn: Callable, module: nn.Module, name='', depth_first=True, include_root=False) -> nn.Module: + if not depth_first and include_root: + fn(module=module, name=name) + for child_name, child_module in module.named_children(): + child_name = '.'.join((name, child_name)) if name else child_name + named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + fn(module=module, name=name) + return module + + +def named_modules(module: nn.Module, name='', depth_first=True, include_root=False): + if not depth_first and include_root: + yield name, module + for child_name, child_module in module.named_children(): + child_name = '.'.join((name, child_name)) if name else child_name + yield from named_modules( + module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if depth_first and include_root: + yield name, module + + +def named_modules_with_params(module: nn.Module, name='', depth_first=True, include_root=False): + if module._parameters and not depth_first and include_root: + yield name, module + for child_name, child_module in module.named_children(): + child_name = '.'.join((name, child_name)) if name else child_name + yield from named_modules_with_params( + module=child_module, name=child_name, depth_first=depth_first, include_root=True) + if module._parameters and depth_first and include_root: + yield name, module + + +MATCH_PREV_GROUP = (99999,) + + +def group_with_matcher( + named_objects, + group_matcher: Union[Dict, Callable], + output_values: bool = False, + reverse: bool = False +): + if isinstance(group_matcher, dict): + # dictionary matcher contains a dict of raw-string regex expr that must be compiled + compiled = [] + for group_ordinal, (group_name, mspec) in enumerate(group_matcher.items()): + if mspec is None: + continue + # map all matching specifications into 3-tuple (compiled re, prefix, suffix) + if isinstance(mspec, (tuple, list)): + # multi-entry match specifications require each sub-spec to be a 2-tuple (re, suffix) + for sspec in mspec: + compiled += [(re.compile(sspec[0]), (group_ordinal,), sspec[1])] + else: + compiled += [(re.compile(mspec), (group_ordinal,), None)] + group_matcher = compiled + + def _get_grouping(name): + if isinstance(group_matcher, (list, tuple)): + for match_fn, prefix, suffix in group_matcher: + r = match_fn.match(name) + if r: + parts = (prefix, r.groups(), suffix) + # map all tuple elem to int for numeric sort, filter out None entries + return tuple(map(float, chain.from_iterable(filter(None, parts)))) + return float('inf'), # un-matched layers (neck, head) mapped to largest ordinal + else: + ord = group_matcher(name) + if not isinstance(ord, collections.abc.Iterable): + return ord, + return tuple(ord) + + # map layers into groups via ordinals (ints or tuples of ints) from matcher + grouping = defaultdict(list) + for k, v in named_objects: + 
grouping[_get_grouping(k)].append(v if output_values else k) + + # remap to integers + layer_id_to_param = defaultdict(list) + lid = -1 + for k in sorted(filter(lambda x: x is not None, grouping.keys())): + if lid < 0 or k[-1] != MATCH_PREV_GROUP[0]: + lid += 1 + layer_id_to_param[lid].extend(grouping[k]) + + if reverse: + assert not output_values, "reverse mapping only sensible for name output" + # output reverse mapping + param_to_layer_id = {} + for lid, lm in layer_id_to_param.items(): + for n in lm: + param_to_layer_id[n] = lid + return param_to_layer_id + + return layer_id_to_param + + +def group_parameters( + module: nn.Module, + group_matcher, + output_values=False, + reverse=False, +): + return group_with_matcher( + module.named_parameters(), group_matcher, output_values=output_values, reverse=reverse) + + +def group_modules( + module: nn.Module, + group_matcher, + output_values=False, + reverse=False, +): + return group_with_matcher( + named_modules_with_params(module), group_matcher, output_values=output_values, reverse=reverse) + + +def checkpoint_seq( + functions, + x, + every=1, + flatten=False, + skip_last=False, + preserve_rng_state=True +): + r"""A helper function for checkpointing sequential models. + + Sequential models execute a list of modules/functions in order + (sequentially). Therefore, we can divide such a sequence into segments + and checkpoint each segment. All segments except run in :func:`torch.no_grad` + manner, i.e., not storing the intermediate activations. The inputs of each + checkpointed segment will be saved for re-running the segment in the backward pass. + + See :func:`~torch.utils.checkpoint.checkpoint` on how checkpointing works. + + .. warning:: + Checkpointing currently only supports :func:`torch.autograd.backward` + and only if its `inputs` argument is not passed. :func:`torch.autograd.grad` + is not supported. + + .. warning: + At least one of the inputs needs to have :code:`requires_grad=True` if + grads are needed for model inputs, otherwise the checkpointed part of the + model won't have gradients. + + Args: + functions: A :class:`torch.nn.Sequential` or the list of modules or functions to run sequentially. + x: A Tensor that is input to :attr:`functions` + every: checkpoint every-n functions (default: 1) + flatten (bool): flatten nn.Sequential of nn.Sequentials + skip_last (bool): skip checkpointing the last function in the sequence if True + preserve_rng_state (bool, optional, default=True): Omit stashing and restoring + the RNG state during each checkpoint. + + Returns: + Output of running :attr:`functions` sequentially on :attr:`*inputs` + + Example: + >>> model = nn.Sequential(...) 
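+        >>> input_var = torch.randn(1, 3, 224, 224, requires_grad=True)  # placeholder input; real shape depends on the model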
+ >>> input_var = checkpoint_seq(model, input_var, every=2) + """ + def run_function(start, end, functions): + def forward(_x): + for j in range(start, end + 1): + _x = functions[j](_x) + return _x + return forward + + if isinstance(functions, torch.nn.Sequential): + functions = functions.children() + if flatten: + functions = chain.from_iterable(functions) + if not isinstance(functions, (tuple, list)): + functions = tuple(functions) + + num_checkpointed = len(functions) + if skip_last: + num_checkpointed -= 1 + end = -1 + for start in range(0, num_checkpointed, every): + end = min(start + every - 1, num_checkpointed - 1) + x = checkpoint(run_function(start, end, functions), x, preserve_rng_state=preserve_rng_state) + if skip_last: + return run_function(end + 1, len(functions) - 1, functions)(x) + return x + + +def flatten_modules(named_modules, depth=1, prefix='', module_types='sequential'): + prefix_is_tuple = isinstance(prefix, tuple) + if isinstance(module_types, str): + if module_types == 'container': + module_types = (nn.Sequential, nn.ModuleList, nn.ModuleDict) + else: + module_types = (nn.Sequential,) + for name, module in named_modules: + if depth and isinstance(module, module_types): + yield from flatten_modules( + module.named_children(), + depth - 1, + prefix=(name,) if prefix_is_tuple else name, + module_types=module_types, + ) + else: + if prefix_is_tuple: + name = prefix + (name,) + yield name, module + else: + if prefix: + name = '.'.join([prefix, name]) + yield name, module diff --git a/custom_timm/models/hrnet.py b/custom_timm/models/hrnet.py new file mode 100644 index 0000000000000000000000000000000000000000..08405e8793f4600a40bcea0cb6d5855e1d2f34b0 --- /dev/null +++ b/custom_timm/models/hrnet.py @@ -0,0 +1,858 @@ +""" HRNet + +Copied from https://github.com/HRNet/HRNet-Image-Classification + +Original header: + Copyright (c) Microsoft + Licensed under the MIT License. 
+ Written by Bin Xiao (Bin.Xiao@microsoft.com) + Modified by Ke Sun (sunk@mail.ustc.edu.cn) +""" +import logging +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .features import FeatureInfo +from .helpers import build_model_with_cfg, pretrained_cfg_for_features +from .layers import create_classifier +from .registry import register_model +from .resnet import BasicBlock, Bottleneck # leveraging ResNet blocks w/ additional features like SE + +_BN_MOMENTUM = 0.1 +_logger = logging.getLogger(__name__) + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv1', 'classifier': 'classifier', + **kwargs + } + + +default_cfgs = { + 'hrnet_w18_small': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnet_w18_small_v1-f460c6bc.pth'), + 'hrnet_w18_small_v2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnet_w18_small_v2-4c50a8cb.pth'), + 'hrnet_w18': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w18-8cb57bb9.pth'), + 'hrnet_w30': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w30-8d7f8dab.pth'), + 'hrnet_w32': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w32-90d8c5fb.pth'), + 'hrnet_w40': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w40-7cd397a4.pth'), + 'hrnet_w44': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w44-c9ac8c18.pth'), + 'hrnet_w48': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w48-abd2e6ab.pth'), + 'hrnet_w64': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-hrnet/hrnetv2_w64-b47cc881.pth'), +} + +cfg_cls = dict( + hrnet_w18_small=dict( + STEM_WIDTH=64, + STAGE1=dict( + NUM_MODULES=1, + NUM_BRANCHES=1, + BLOCK='BOTTLENECK', + NUM_BLOCKS=(1,), + NUM_CHANNELS=(32,), + FUSE_METHOD='SUM', + ), + STAGE2=dict( + NUM_MODULES=1, + NUM_BRANCHES=2, + BLOCK='BASIC', + NUM_BLOCKS=(2, 2), + NUM_CHANNELS=(16, 32), + FUSE_METHOD='SUM' + ), + STAGE3=dict( + NUM_MODULES=1, + NUM_BRANCHES=3, + BLOCK='BASIC', + NUM_BLOCKS=(2, 2, 2), + NUM_CHANNELS=(16, 32, 64), + FUSE_METHOD='SUM' + ), + STAGE4=dict( + NUM_MODULES=1, + NUM_BRANCHES=4, + BLOCK='BASIC', + NUM_BLOCKS=(2, 2, 2, 2), + NUM_CHANNELS=(16, 32, 64, 128), + FUSE_METHOD='SUM', + ), + ), + + hrnet_w18_small_v2=dict( + STEM_WIDTH=64, + STAGE1=dict( + NUM_MODULES=1, + NUM_BRANCHES=1, + BLOCK='BOTTLENECK', + NUM_BLOCKS=(2,), + NUM_CHANNELS=(64,), + FUSE_METHOD='SUM', + ), + STAGE2=dict( + NUM_MODULES=1, + NUM_BRANCHES=2, + BLOCK='BASIC', + NUM_BLOCKS=(2, 2), + NUM_CHANNELS=(18, 36), + FUSE_METHOD='SUM' + ), + STAGE3=dict( + NUM_MODULES=3, + NUM_BRANCHES=3, + BLOCK='BASIC', + NUM_BLOCKS=(2, 2, 2), + NUM_CHANNELS=(18, 36, 72), + FUSE_METHOD='SUM' + ), + STAGE4=dict( + NUM_MODULES=2, + NUM_BRANCHES=4, + BLOCK='BASIC', + NUM_BLOCKS=(2, 2, 2, 2), + NUM_CHANNELS=(18, 36, 72, 144), + FUSE_METHOD='SUM', + ), + ), + + hrnet_w18=dict( + STEM_WIDTH=64, + STAGE1=dict( + NUM_MODULES=1, + 
NUM_BRANCHES=1, + BLOCK='BOTTLENECK', + NUM_BLOCKS=(4,), + NUM_CHANNELS=(64,), + FUSE_METHOD='SUM', + ), + STAGE2=dict( + NUM_MODULES=1, + NUM_BRANCHES=2, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4), + NUM_CHANNELS=(18, 36), + FUSE_METHOD='SUM' + ), + STAGE3=dict( + NUM_MODULES=4, + NUM_BRANCHES=3, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4), + NUM_CHANNELS=(18, 36, 72), + FUSE_METHOD='SUM' + ), + STAGE4=dict( + NUM_MODULES=3, + NUM_BRANCHES=4, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4, 4), + NUM_CHANNELS=(18, 36, 72, 144), + FUSE_METHOD='SUM', + ), + ), + + hrnet_w30=dict( + STEM_WIDTH=64, + STAGE1=dict( + NUM_MODULES=1, + NUM_BRANCHES=1, + BLOCK='BOTTLENECK', + NUM_BLOCKS=(4,), + NUM_CHANNELS=(64,), + FUSE_METHOD='SUM', + ), + STAGE2=dict( + NUM_MODULES=1, + NUM_BRANCHES=2, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4), + NUM_CHANNELS=(30, 60), + FUSE_METHOD='SUM' + ), + STAGE3=dict( + NUM_MODULES=4, + NUM_BRANCHES=3, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4), + NUM_CHANNELS=(30, 60, 120), + FUSE_METHOD='SUM' + ), + STAGE4=dict( + NUM_MODULES=3, + NUM_BRANCHES=4, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4, 4), + NUM_CHANNELS=(30, 60, 120, 240), + FUSE_METHOD='SUM', + ), + ), + + hrnet_w32=dict( + STEM_WIDTH=64, + STAGE1=dict( + NUM_MODULES=1, + NUM_BRANCHES=1, + BLOCK='BOTTLENECK', + NUM_BLOCKS=(4,), + NUM_CHANNELS=(64,), + FUSE_METHOD='SUM', + ), + STAGE2=dict( + NUM_MODULES=1, + NUM_BRANCHES=2, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4), + NUM_CHANNELS=(32, 64), + FUSE_METHOD='SUM' + ), + STAGE3=dict( + NUM_MODULES=4, + NUM_BRANCHES=3, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4), + NUM_CHANNELS=(32, 64, 128), + FUSE_METHOD='SUM' + ), + STAGE4=dict( + NUM_MODULES=3, + NUM_BRANCHES=4, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4, 4), + NUM_CHANNELS=(32, 64, 128, 256), + FUSE_METHOD='SUM', + ), + ), + + hrnet_w40=dict( + STEM_WIDTH=64, + STAGE1=dict( + NUM_MODULES=1, + NUM_BRANCHES=1, + BLOCK='BOTTLENECK', + NUM_BLOCKS=(4,), + NUM_CHANNELS=(64,), + FUSE_METHOD='SUM', + ), + STAGE2=dict( + NUM_MODULES=1, + NUM_BRANCHES=2, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4), + NUM_CHANNELS=(40, 80), + FUSE_METHOD='SUM' + ), + STAGE3=dict( + NUM_MODULES=4, + NUM_BRANCHES=3, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4), + NUM_CHANNELS=(40, 80, 160), + FUSE_METHOD='SUM' + ), + STAGE4=dict( + NUM_MODULES=3, + NUM_BRANCHES=4, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4, 4), + NUM_CHANNELS=(40, 80, 160, 320), + FUSE_METHOD='SUM', + ), + ), + + hrnet_w44=dict( + STEM_WIDTH=64, + STAGE1=dict( + NUM_MODULES=1, + NUM_BRANCHES=1, + BLOCK='BOTTLENECK', + NUM_BLOCKS=(4,), + NUM_CHANNELS=(64,), + FUSE_METHOD='SUM', + ), + STAGE2=dict( + NUM_MODULES=1, + NUM_BRANCHES=2, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4), + NUM_CHANNELS=(44, 88), + FUSE_METHOD='SUM' + ), + STAGE3=dict( + NUM_MODULES=4, + NUM_BRANCHES=3, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4), + NUM_CHANNELS=(44, 88, 176), + FUSE_METHOD='SUM' + ), + STAGE4=dict( + NUM_MODULES=3, + NUM_BRANCHES=4, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4, 4), + NUM_CHANNELS=(44, 88, 176, 352), + FUSE_METHOD='SUM', + ), + ), + + hrnet_w48=dict( + STEM_WIDTH=64, + STAGE1=dict( + NUM_MODULES=1, + NUM_BRANCHES=1, + BLOCK='BOTTLENECK', + NUM_BLOCKS=(4,), + NUM_CHANNELS=(64,), + FUSE_METHOD='SUM', + ), + STAGE2=dict( + NUM_MODULES=1, + NUM_BRANCHES=2, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4), + NUM_CHANNELS=(48, 96), + FUSE_METHOD='SUM' + ), + STAGE3=dict( + NUM_MODULES=4, + NUM_BRANCHES=3, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4), + NUM_CHANNELS=(48, 96, 192), + FUSE_METHOD='SUM' + ), + STAGE4=dict( + NUM_MODULES=3, + NUM_BRANCHES=4, + BLOCK='BASIC', 
+ NUM_BLOCKS=(4, 4, 4, 4), + NUM_CHANNELS=(48, 96, 192, 384), + FUSE_METHOD='SUM', + ), + ), + + hrnet_w64=dict( + STEM_WIDTH=64, + STAGE1=dict( + NUM_MODULES=1, + NUM_BRANCHES=1, + BLOCK='BOTTLENECK', + NUM_BLOCKS=(4,), + NUM_CHANNELS=(64,), + FUSE_METHOD='SUM', + ), + STAGE2=dict( + NUM_MODULES=1, + NUM_BRANCHES=2, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4), + NUM_CHANNELS=(64, 128), + FUSE_METHOD='SUM' + ), + STAGE3=dict( + NUM_MODULES=4, + NUM_BRANCHES=3, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4), + NUM_CHANNELS=(64, 128, 256), + FUSE_METHOD='SUM' + ), + STAGE4=dict( + NUM_MODULES=3, + NUM_BRANCHES=4, + BLOCK='BASIC', + NUM_BLOCKS=(4, 4, 4, 4), + NUM_CHANNELS=(64, 128, 256, 512), + FUSE_METHOD='SUM', + ), + ) +) + + +class HighResolutionModule(nn.Module): + def __init__(self, num_branches, blocks, num_blocks, num_in_chs, + num_channels, fuse_method, multi_scale_output=True): + super(HighResolutionModule, self).__init__() + self._check_branches( + num_branches, blocks, num_blocks, num_in_chs, num_channels) + + self.num_in_chs = num_in_chs + self.fuse_method = fuse_method + self.num_branches = num_branches + + self.multi_scale_output = multi_scale_output + + self.branches = self._make_branches( + num_branches, blocks, num_blocks, num_channels) + self.fuse_layers = self._make_fuse_layers() + self.fuse_act = nn.ReLU(False) + + def _check_branches(self, num_branches, blocks, num_blocks, num_in_chs, num_channels): + error_msg = '' + if num_branches != len(num_blocks): + error_msg = 'NUM_BRANCHES({}) <> NUM_BLOCKS({})'.format(num_branches, len(num_blocks)) + elif num_branches != len(num_channels): + error_msg = 'NUM_BRANCHES({}) <> NUM_CHANNELS({})'.format(num_branches, len(num_channels)) + elif num_branches != len(num_in_chs): + error_msg = 'NUM_BRANCHES({}) <> num_in_chs({})'.format(num_branches, len(num_in_chs)) + if error_msg: + _logger.error(error_msg) + raise ValueError(error_msg) + + def _make_one_branch(self, branch_index, block, num_blocks, num_channels, stride=1): + downsample = None + if stride != 1 or self.num_in_chs[branch_index] != num_channels[branch_index] * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.num_in_chs[branch_index], num_channels[branch_index] * block.expansion, + kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(num_channels[branch_index] * block.expansion, momentum=_BN_MOMENTUM), + ) + + layers = [block(self.num_in_chs[branch_index], num_channels[branch_index], stride, downsample)] + self.num_in_chs[branch_index] = num_channels[branch_index] * block.expansion + for i in range(1, num_blocks[branch_index]): + layers.append(block(self.num_in_chs[branch_index], num_channels[branch_index])) + + return nn.Sequential(*layers) + + def _make_branches(self, num_branches, block, num_blocks, num_channels): + branches = [] + for i in range(num_branches): + branches.append(self._make_one_branch(i, block, num_blocks, num_channels)) + + return nn.ModuleList(branches) + + def _make_fuse_layers(self): + if self.num_branches == 1: + return nn.Identity() + + num_branches = self.num_branches + num_in_chs = self.num_in_chs + fuse_layers = [] + for i in range(num_branches if self.multi_scale_output else 1): + fuse_layer = [] + for j in range(num_branches): + if j > i: + fuse_layer.append(nn.Sequential( + nn.Conv2d(num_in_chs[j], num_in_chs[i], 1, 1, 0, bias=False), + nn.BatchNorm2d(num_in_chs[i], momentum=_BN_MOMENTUM), + nn.Upsample(scale_factor=2 ** (j - i), mode='nearest'))) + elif j == i: + fuse_layer.append(nn.Identity()) + else: + conv3x3s = [] + for k in 
range(i - j): + if k == i - j - 1: + num_outchannels_conv3x3 = num_in_chs[i] + conv3x3s.append(nn.Sequential( + nn.Conv2d(num_in_chs[j], num_outchannels_conv3x3, 3, 2, 1, bias=False), + nn.BatchNorm2d(num_outchannels_conv3x3, momentum=_BN_MOMENTUM))) + else: + num_outchannels_conv3x3 = num_in_chs[j] + conv3x3s.append(nn.Sequential( + nn.Conv2d(num_in_chs[j], num_outchannels_conv3x3, 3, 2, 1, bias=False), + nn.BatchNorm2d(num_outchannels_conv3x3, momentum=_BN_MOMENTUM), + nn.ReLU(False))) + fuse_layer.append(nn.Sequential(*conv3x3s)) + fuse_layers.append(nn.ModuleList(fuse_layer)) + + return nn.ModuleList(fuse_layers) + + def get_num_in_chs(self): + return self.num_in_chs + + def forward(self, x: List[torch.Tensor]): + if self.num_branches == 1: + return [self.branches[0](x[0])] + + for i, branch in enumerate(self.branches): + x[i] = branch(x[i]) + + x_fuse = [] + for i, fuse_outer in enumerate(self.fuse_layers): + y = x[0] if i == 0 else fuse_outer[0](x[0]) + for j in range(1, self.num_branches): + if i == j: + y = y + x[j] + else: + y = y + fuse_outer[j](x[j]) + x_fuse.append(self.fuse_act(y)) + + return x_fuse + + +blocks_dict = { + 'BASIC': BasicBlock, + 'BOTTLENECK': Bottleneck +} + + +class HighResolutionNet(nn.Module): + + def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0.0, head='classification'): + super(HighResolutionNet, self).__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + + stem_width = cfg['STEM_WIDTH'] + self.conv1 = nn.Conv2d(in_chans, stem_width, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(stem_width, momentum=_BN_MOMENTUM) + self.act1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(stem_width, 64, kernel_size=3, stride=2, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(64, momentum=_BN_MOMENTUM) + self.act2 = nn.ReLU(inplace=True) + + self.stage1_cfg = cfg['STAGE1'] + num_channels = self.stage1_cfg['NUM_CHANNELS'][0] + block = blocks_dict[self.stage1_cfg['BLOCK']] + num_blocks = self.stage1_cfg['NUM_BLOCKS'][0] + self.layer1 = self._make_layer(block, 64, num_channels, num_blocks) + stage1_out_channel = block.expansion * num_channels + + self.stage2_cfg = cfg['STAGE2'] + num_channels = self.stage2_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage2_cfg['BLOCK']] + num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))] + self.transition1 = self._make_transition_layer([stage1_out_channel], num_channels) + self.stage2, pre_stage_channels = self._make_stage(self.stage2_cfg, num_channels) + + self.stage3_cfg = cfg['STAGE3'] + num_channels = self.stage3_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage3_cfg['BLOCK']] + num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))] + self.transition2 = self._make_transition_layer(pre_stage_channels, num_channels) + self.stage3, pre_stage_channels = self._make_stage(self.stage3_cfg, num_channels) + + self.stage4_cfg = cfg['STAGE4'] + num_channels = self.stage4_cfg['NUM_CHANNELS'] + block = blocks_dict[self.stage4_cfg['BLOCK']] + num_channels = [num_channels[i] * block.expansion for i in range(len(num_channels))] + self.transition3 = self._make_transition_layer(pre_stage_channels, num_channels) + self.stage4, pre_stage_channels = self._make_stage(self.stage4_cfg, num_channels, multi_scale_output=True) + + self.head = head + self.head_channels = None # set if _make_head called + if head == 'classification': + # Classification Head + self.num_features = 2048 + self.incre_modules, 
self.downsamp_modules, self.final_layer = self._make_head(pre_stage_channels) + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + elif head == 'incre': + self.num_features = 2048 + self.incre_modules, _, _ = self._make_head(pre_stage_channels, True) + else: + self.incre_modules = None + self.num_features = 256 + + curr_stride = 2 + # module names aren't actually valid here, hook or FeatureNet based extraction would not work + self.feature_info = [dict(num_chs=64, reduction=curr_stride, module='stem')] + for i, c in enumerate(self.head_channels if self.head_channels else num_channels): + curr_stride *= 2 + c = c * 4 if self.head_channels else c # head block expansion factor of 4 + self.feature_info += [dict(num_chs=c, reduction=curr_stride, module=f'stage{i + 1}')] + + self.init_weights() + + def _make_head(self, pre_stage_channels, incre_only=False): + head_block = Bottleneck + self.head_channels = [32, 64, 128, 256] + + # Increasing the #channels on each resolution + # from C, 2C, 4C, 8C to 128, 256, 512, 1024 + incre_modules = [] + for i, channels in enumerate(pre_stage_channels): + incre_modules.append(self._make_layer(head_block, channels, self.head_channels[i], 1, stride=1)) + incre_modules = nn.ModuleList(incre_modules) + if incre_only: + return incre_modules, None, None + + # downsampling modules + downsamp_modules = [] + for i in range(len(pre_stage_channels) - 1): + in_channels = self.head_channels[i] * head_block.expansion + out_channels = self.head_channels[i + 1] * head_block.expansion + downsamp_module = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, out_channels=out_channels, kernel_size=3, stride=2, padding=1), + nn.BatchNorm2d(out_channels, momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True) + ) + downsamp_modules.append(downsamp_module) + downsamp_modules = nn.ModuleList(downsamp_modules) + + final_layer = nn.Sequential( + nn.Conv2d( + in_channels=self.head_channels[3] * head_block.expansion, + out_channels=self.num_features, kernel_size=1, stride=1, padding=0 + ), + nn.BatchNorm2d(self.num_features, momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True) + ) + + return incre_modules, downsamp_modules, final_layer + + def _make_transition_layer(self, num_channels_pre_layer, num_channels_cur_layer): + num_branches_cur = len(num_channels_cur_layer) + num_branches_pre = len(num_channels_pre_layer) + + transition_layers = [] + for i in range(num_branches_cur): + if i < num_branches_pre: + if num_channels_cur_layer[i] != num_channels_pre_layer[i]: + transition_layers.append(nn.Sequential( + nn.Conv2d(num_channels_pre_layer[i], num_channels_cur_layer[i], 3, 1, 1, bias=False), + nn.BatchNorm2d(num_channels_cur_layer[i], momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True))) + else: + transition_layers.append(nn.Identity()) + else: + conv3x3s = [] + for j in range(i + 1 - num_branches_pre): + inchannels = num_channels_pre_layer[-1] + outchannels = num_channels_cur_layer[i] if j == i - num_branches_pre else inchannels + conv3x3s.append(nn.Sequential( + nn.Conv2d(inchannels, outchannels, 3, 2, 1, bias=False), + nn.BatchNorm2d(outchannels, momentum=_BN_MOMENTUM), + nn.ReLU(inplace=True))) + transition_layers.append(nn.Sequential(*conv3x3s)) + + return nn.ModuleList(transition_layers) + + def _make_layer(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d(inplanes, planes * block.expansion, kernel_size=1, 
stride=stride, bias=False), + nn.BatchNorm2d(planes * block.expansion, momentum=_BN_MOMENTUM), + ) + + layers = [block(inplanes, planes, stride, downsample)] + inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_stage(self, layer_config, num_in_chs, multi_scale_output=True): + num_modules = layer_config['NUM_MODULES'] + num_branches = layer_config['NUM_BRANCHES'] + num_blocks = layer_config['NUM_BLOCKS'] + num_channels = layer_config['NUM_CHANNELS'] + block = blocks_dict[layer_config['BLOCK']] + fuse_method = layer_config['FUSE_METHOD'] + + modules = [] + for i in range(num_modules): + # multi_scale_output is only used last module + reset_multi_scale_output = multi_scale_output or i < num_modules - 1 + modules.append(HighResolutionModule( + num_branches, block, num_blocks, num_in_chs, num_channels, fuse_method, reset_multi_scale_output) + ) + num_in_chs = modules[-1].get_num_in_chs() + + return nn.Sequential(*modules), num_in_chs + + @torch.jit.ignore + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^conv[12]|bn[12]', + blocks=r'^(?:layer|stage|transition)(\d+)' if coarse else [ + (r'^layer(\d+)\.(\d+)', None), + (r'^stage(\d+)\.(\d+)', None), + (r'^transition(\d+)', (99999,)), + ], + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, "gradient checkpointing not supported" + + @torch.jit.ignore + def get_classifier(self): + return self.classifier + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + def stages(self, x) -> List[torch.Tensor]: + x = self.layer1(x) + + xl = [t(x) for i, t in enumerate(self.transition1)] + yl = self.stage2(xl) + + xl = [t(yl[-1]) if not isinstance(t, nn.Identity) else yl[i] for i, t in enumerate(self.transition2)] + yl = self.stage3(xl) + + xl = [t(yl[-1]) if not isinstance(t, nn.Identity) else yl[i] for i, t in enumerate(self.transition3)] + yl = self.stage4(xl) + return yl + + def forward_features(self, x): + # Stem + x = self.conv1(x) + x = self.bn1(x) + x = self.act1(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.act2(x) + + # Stages + yl = self.stages(x) + if self.incre_modules is None or self.downsamp_modules is None: + return yl + y = self.incre_modules[0](yl[0]) + for i, down in enumerate(self.downsamp_modules): + y = self.incre_modules[i + 1](yl[i + 1]) + down(y) + y = self.final_layer(y) + return y + + def forward_head(self, x, pre_logits: bool = False): + # Classification Head + x = self.global_pool(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + return x if pre_logits else self.classifier(x) + + def forward(self, x): + y = self.forward_features(x) + x = self.forward_head(y) + return x + + +class HighResolutionNetFeatures(HighResolutionNet): + """HighResolutionNet feature extraction + + The design of HRNet makes it easy to grab feature maps, this class provides a simple wrapper to do so. + It would be more complicated to use the FeatureNet helpers. 
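+
+    Added note (not in the upstream docstring): assuming the standard timm `create_model` factory is
+    present in this vendored copy, a feature extractor is normally obtained via
+    create_model('hrnet_w18', features_only=True) rather than by constructing this class directly;
+    that call routes through `_create_hrnet` below, and the resulting model's forward returns the
+    maps described by `feature_info`.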
+ + The `feature_location=incre` allows grabbing increased channel count features using part of the + classification head. If `feature_location=''` the default HRNet features are returned. First stem + conv is used for stride 2 features. + """ + + def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0.0, + feature_location='incre', out_indices=(0, 1, 2, 3, 4)): + assert feature_location in ('incre', '') + super(HighResolutionNetFeatures, self).__init__( + cfg, in_chans=in_chans, num_classes=num_classes, global_pool=global_pool, + drop_rate=drop_rate, head=feature_location) + self.feature_info = FeatureInfo(self.feature_info, out_indices) + self._out_idx = {i for i in out_indices} + + def forward_features(self, x): + assert False, 'Not supported' + + def forward(self, x) -> List[torch.tensor]: + out = [] + x = self.conv1(x) + x = self.bn1(x) + x = self.act1(x) + if 0 in self._out_idx: + out.append(x) + x = self.conv2(x) + x = self.bn2(x) + x = self.act2(x) + x = self.stages(x) + if self.incre_modules is not None: + x = [incre(f) for f, incre in zip(x, self.incre_modules)] + for i, f in enumerate(x): + if i + 1 in self._out_idx: + out.append(f) + return out + + +def _create_hrnet(variant, pretrained, **model_kwargs): + model_cls = HighResolutionNet + features_only = False + kwargs_filter = None + if model_kwargs.pop('features_only', False): + model_cls = HighResolutionNetFeatures + kwargs_filter = ('num_classes', 'global_pool') + features_only = True + model = build_model_with_cfg( + model_cls, variant, pretrained, + model_cfg=cfg_cls[variant], + pretrained_strict=not features_only, + kwargs_filter=kwargs_filter, + **model_kwargs) + if features_only: + model.pretrained_cfg = pretrained_cfg_for_features(model.default_cfg) + model.default_cfg = model.pretrained_cfg # backwards compat + return model + + +@register_model +def hrnet_w18_small(pretrained=False, **kwargs): + return _create_hrnet('hrnet_w18_small', pretrained, **kwargs) + + +@register_model +def hrnet_w18_small_v2(pretrained=False, **kwargs): + return _create_hrnet('hrnet_w18_small_v2', pretrained, **kwargs) + + +@register_model +def hrnet_w18(pretrained=False, **kwargs): + return _create_hrnet('hrnet_w18', pretrained, **kwargs) + + +@register_model +def hrnet_w30(pretrained=False, **kwargs): + return _create_hrnet('hrnet_w30', pretrained, **kwargs) + + +@register_model +def hrnet_w32(pretrained=False, **kwargs): + return _create_hrnet('hrnet_w32', pretrained, **kwargs) + + +@register_model +def hrnet_w40(pretrained=False, **kwargs): + return _create_hrnet('hrnet_w40', pretrained, **kwargs) + + +@register_model +def hrnet_w44(pretrained=False, **kwargs): + return _create_hrnet('hrnet_w44', pretrained, **kwargs) + + +@register_model +def hrnet_w48(pretrained=False, **kwargs): + return _create_hrnet('hrnet_w48', pretrained, **kwargs) + + +@register_model +def hrnet_w64(pretrained=False, **kwargs): + return _create_hrnet('hrnet_w64', pretrained, **kwargs) diff --git a/custom_timm/models/hub.py b/custom_timm/models/hub.py new file mode 100644 index 0000000000000000000000000000000000000000..2c1a6e5df0279d99b2a57f0762f5214de13dad94 --- /dev/null +++ b/custom_timm/models/hub.py @@ -0,0 +1,170 @@ +import json +import logging +import os +from functools import partial +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Optional, Union + +import torch +from torch.hub import HASH_REGEX, download_url_to_file, urlparse + +try: + from torch.hub import get_dir +except ImportError: + 
from torch.hub import _get_torch_home as get_dir + +from custom_timm import __version__ + +try: + from huggingface_hub import (create_repo, get_hf_file_metadata, + hf_hub_download, hf_hub_url, + repo_type_and_id_from_hf_id, upload_folder) + from huggingface_hub.utils import EntryNotFoundError + hf_hub_download = partial(hf_hub_download, library_name="timm", library_version=__version__) + _has_hf_hub = True +except ImportError: + hf_hub_download = None + _has_hf_hub = False + +_logger = logging.getLogger(__name__) + + +def get_cache_dir(child_dir=''): + """ + Returns the location of the directory where models are cached (and creates it if necessary). + """ + # Issue warning to move data if old env is set + if os.getenv('TORCH_MODEL_ZOO'): + _logger.warning('TORCH_MODEL_ZOO is deprecated, please use env TORCH_HOME instead') + + hub_dir = get_dir() + child_dir = () if not child_dir else (child_dir,) + model_dir = os.path.join(hub_dir, 'checkpoints', *child_dir) + os.makedirs(model_dir, exist_ok=True) + return model_dir + + +def download_cached_file(url, check_hash=True, progress=False): + parts = urlparse(url) + filename = os.path.basename(parts.path) + cached_file = os.path.join(get_cache_dir(), filename) + if not os.path.exists(cached_file): + _logger.info('Downloading: "{}" to {}\n'.format(url, cached_file)) + hash_prefix = None + if check_hash: + r = HASH_REGEX.search(filename) # r is Optional[Match[str]] + hash_prefix = r.group(1) if r else None + download_url_to_file(url, cached_file, hash_prefix, progress=progress) + return cached_file + + +def has_hf_hub(necessary=False): + if not _has_hf_hub and necessary: + # if no HF Hub module installed, and it is necessary to continue, raise error + raise RuntimeError( + 'Hugging Face hub model specified but package not installed. Run `pip install huggingface_hub`.') + return _has_hf_hub + + +def hf_split(hf_id): + # FIXME I may change @ -> # and be parsed as fragment in a URI model name scheme + rev_split = hf_id.split('@') + assert 0 < len(rev_split) <= 2, 'hf_hub id should only contain one @ character to identify revision.' 
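+    # Added note: ids may carry an optional revision after '@' (illustrative examples):
+    #   'org/model'      -> ('org/model', None)
+    #   'org/model@main' -> ('org/model', 'main')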
+ hf_model_id = rev_split[0] + hf_revision = rev_split[-1] if len(rev_split) > 1 else None + return hf_model_id, hf_revision + + +def load_cfg_from_json(json_file: Union[str, os.PathLike]): + with open(json_file, "r", encoding="utf-8") as reader: + text = reader.read() + return json.loads(text) + + +def _download_from_hf(model_id: str, filename: str): + hf_model_id, hf_revision = hf_split(model_id) + return hf_hub_download(hf_model_id, filename, revision=hf_revision) + + +def load_model_config_from_hf(model_id: str): + assert has_hf_hub(True) + cached_file = _download_from_hf(model_id, 'config.json') + pretrained_cfg = load_cfg_from_json(cached_file) + pretrained_cfg['hf_hub_id'] = model_id # insert hf_hub id for pretrained weight load during model creation + pretrained_cfg['source'] = 'hf-hub' + model_name = pretrained_cfg.get('architecture') + return pretrained_cfg, model_name + + +def load_state_dict_from_hf(model_id: str, filename: str = 'pytorch_model.bin'): + assert has_hf_hub(True) + cached_file = _download_from_hf(model_id, filename) + state_dict = torch.load(cached_file, map_location='cpu') + return state_dict + + +def save_for_hf(model, save_directory, model_config=None): + assert has_hf_hub(True) + model_config = model_config or {} + save_directory = Path(save_directory) + save_directory.mkdir(exist_ok=True, parents=True) + + weights_path = save_directory / 'pytorch_model.bin' + torch.save(model.state_dict(), weights_path) + + config_path = save_directory / 'config.json' + hf_config = model.pretrained_cfg + hf_config['num_classes'] = model_config.pop('num_classes', model.num_classes) + hf_config['num_features'] = model_config.pop('num_features', model.num_features) + hf_config['labels'] = model_config.pop('labels', [f"LABEL_{i}" for i in range(hf_config['num_classes'])]) + hf_config.update(model_config) + + with config_path.open('w') as f: + json.dump(hf_config, f, indent=2) + + +def push_to_hf_hub( + model, + repo_id: str, + commit_message: str ='Add model', + token: Optional[str] = None, + revision: Optional[str] = None, + private: bool = False, + create_pr: bool = False, + model_config: Optional[dict] = None, +): + # Create repo if doesn't exist yet + repo_url = create_repo(repo_id, token=token, private=private, exist_ok=True) + + # Infer complete repo_id from repo_url + # Can be different from the input `repo_id` if repo_owner was implicit + _, repo_owner, repo_name = repo_type_and_id_from_hf_id(repo_url) + repo_id = f"{repo_owner}/{repo_name}" + + # Check if README file already exist in repo + try: + get_hf_file_metadata(hf_hub_url(repo_id=repo_id, filename="README.md", revision=revision)) + has_readme = True + except EntryNotFoundError: + has_readme = False + + # Dump model and push to Hub + with TemporaryDirectory() as tmpdir: + # Save model weights and config. 
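+            # Added note: save_for_hf (defined above) writes 'pytorch_model.bin' and 'config.json'
+            # into tmpdir; the whole folder is then pushed in one commit by upload_folder below.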
+ save_for_hf(model, tmpdir, model_config=model_config) + + # Add readme if does not exist + if not has_readme: + readme_path = Path(tmpdir) / "README.md" + readme_text = f'---\ntags:\n- image-classification\n- timm\nlibrary_tag: timm\n---\n# Model card for {repo_id}' + readme_path.write_text(readme_text) + + # Upload model and return + return upload_folder( + repo_id=repo_id, + folder_path=tmpdir, + revision=revision, + create_pr=create_pr, + commit_message=commit_message, + ) diff --git a/custom_timm/models/inception_resnet_v2.py b/custom_timm/models/inception_resnet_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ae932786961457dd149817dd58e7d50ba2345b6c --- /dev/null +++ b/custom_timm/models/inception_resnet_v2.py @@ -0,0 +1,382 @@ +""" Pytorch Inception-Resnet-V2 implementation +Sourced from https://github.com/Cadene/tensorflow-model-zoo.torch (MIT License) which is +based upon Google's Tensorflow implementation and pretrained weights (Apache 2.0 License) +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .helpers import build_model_with_cfg, flatten_modules +from .layers import create_classifier +from .registry import register_model + +__all__ = ['InceptionResnetV2'] + +default_cfgs = { + # ported from http://download.tensorflow.org/models/inception_resnet_v2_2016_08_30.tar.gz + 'inception_resnet_v2': { + 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/inception_resnet_v2-940b1cd6.pth', + 'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (8, 8), + 'crop_pct': 0.8975, 'interpolation': 'bicubic', + 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, + 'first_conv': 'conv2d_1a.conv', 'classifier': 'classif', + 'label_offset': 1, # 1001 classes in pretrained weights + }, + # ported from http://download.tensorflow.org/models/ens_adv_inception_resnet_v2_2017_08_18.tar.gz + 'ens_adv_inception_resnet_v2': { + 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ens_adv_inception_resnet_v2-2592a550.pth', + 'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (8, 8), + 'crop_pct': 0.8975, 'interpolation': 'bicubic', + 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, + 'first_conv': 'conv2d_1a.conv', 'classifier': 'classif', + 'label_offset': 1, # 1001 classes in pretrained weights + } +} + + +class BasicConv2d(nn.Module): + def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0): + super(BasicConv2d, self).__init__() + self.conv = nn.Conv2d( + in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, bias=False) + self.bn = nn.BatchNorm2d(out_planes, eps=.001) + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class Mixed_5b(nn.Module): + def __init__(self): + super(Mixed_5b, self).__init__() + + self.branch0 = BasicConv2d(192, 96, kernel_size=1, stride=1) + + self.branch1 = nn.Sequential( + BasicConv2d(192, 48, kernel_size=1, stride=1), + BasicConv2d(48, 64, kernel_size=5, stride=1, padding=2) + ) + + self.branch2 = nn.Sequential( + BasicConv2d(192, 64, kernel_size=1, stride=1), + BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1), + BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1) + ) + + self.branch3 = nn.Sequential( + nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False), + BasicConv2d(192, 
64, kernel_size=1, stride=1) + ) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + x3 = self.branch3(x) + out = torch.cat((x0, x1, x2, x3), 1) + return out + + +class Block35(nn.Module): + def __init__(self, scale=1.0): + super(Block35, self).__init__() + + self.scale = scale + + self.branch0 = BasicConv2d(320, 32, kernel_size=1, stride=1) + + self.branch1 = nn.Sequential( + BasicConv2d(320, 32, kernel_size=1, stride=1), + BasicConv2d(32, 32, kernel_size=3, stride=1, padding=1) + ) + + self.branch2 = nn.Sequential( + BasicConv2d(320, 32, kernel_size=1, stride=1), + BasicConv2d(32, 48, kernel_size=3, stride=1, padding=1), + BasicConv2d(48, 64, kernel_size=3, stride=1, padding=1) + ) + + self.conv2d = nn.Conv2d(128, 320, kernel_size=1, stride=1) + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + out = torch.cat((x0, x1, x2), 1) + out = self.conv2d(out) + out = out * self.scale + x + out = self.relu(out) + return out + + +class Mixed_6a(nn.Module): + def __init__(self): + super(Mixed_6a, self).__init__() + + self.branch0 = BasicConv2d(320, 384, kernel_size=3, stride=2) + + self.branch1 = nn.Sequential( + BasicConv2d(320, 256, kernel_size=1, stride=1), + BasicConv2d(256, 256, kernel_size=3, stride=1, padding=1), + BasicConv2d(256, 384, kernel_size=3, stride=2) + ) + + self.branch2 = nn.MaxPool2d(3, stride=2) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + out = torch.cat((x0, x1, x2), 1) + return out + + +class Block17(nn.Module): + def __init__(self, scale=1.0): + super(Block17, self).__init__() + + self.scale = scale + + self.branch0 = BasicConv2d(1088, 192, kernel_size=1, stride=1) + + self.branch1 = nn.Sequential( + BasicConv2d(1088, 128, kernel_size=1, stride=1), + BasicConv2d(128, 160, kernel_size=(1, 7), stride=1, padding=(0, 3)), + BasicConv2d(160, 192, kernel_size=(7, 1), stride=1, padding=(3, 0)) + ) + + self.conv2d = nn.Conv2d(384, 1088, kernel_size=1, stride=1) + self.relu = nn.ReLU(inplace=False) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + out = torch.cat((x0, x1), 1) + out = self.conv2d(out) + out = out * self.scale + x + out = self.relu(out) + return out + + +class Mixed_7a(nn.Module): + def __init__(self): + super(Mixed_7a, self).__init__() + + self.branch0 = nn.Sequential( + BasicConv2d(1088, 256, kernel_size=1, stride=1), + BasicConv2d(256, 384, kernel_size=3, stride=2) + ) + + self.branch1 = nn.Sequential( + BasicConv2d(1088, 256, kernel_size=1, stride=1), + BasicConv2d(256, 288, kernel_size=3, stride=2) + ) + + self.branch2 = nn.Sequential( + BasicConv2d(1088, 256, kernel_size=1, stride=1), + BasicConv2d(256, 288, kernel_size=3, stride=1, padding=1), + BasicConv2d(288, 320, kernel_size=3, stride=2) + ) + + self.branch3 = nn.MaxPool2d(3, stride=2) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + x3 = self.branch3(x) + out = torch.cat((x0, x1, x2, x3), 1) + return out + + +class Block8(nn.Module): + + def __init__(self, scale=1.0, no_relu=False): + super(Block8, self).__init__() + + self.scale = scale + + self.branch0 = BasicConv2d(2080, 192, kernel_size=1, stride=1) + + self.branch1 = nn.Sequential( + BasicConv2d(2080, 192, kernel_size=1, stride=1), + BasicConv2d(192, 224, kernel_size=(1, 3), stride=1, padding=(0, 1)), + BasicConv2d(224, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)) + ) + + self.conv2d = 
nn.Conv2d(448, 2080, kernel_size=1, stride=1) + self.relu = None if no_relu else nn.ReLU(inplace=False) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + out = torch.cat((x0, x1), 1) + out = self.conv2d(out) + out = out * self.scale + x + if self.relu is not None: + out = self.relu(out) + return out + + +class InceptionResnetV2(nn.Module): + def __init__(self, num_classes=1000, in_chans=3, drop_rate=0., output_stride=32, global_pool='avg'): + super(InceptionResnetV2, self).__init__() + self.drop_rate = drop_rate + self.num_classes = num_classes + self.num_features = 1536 + assert output_stride == 32 + + self.conv2d_1a = BasicConv2d(in_chans, 32, kernel_size=3, stride=2) + self.conv2d_2a = BasicConv2d(32, 32, kernel_size=3, stride=1) + self.conv2d_2b = BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1) + self.feature_info = [dict(num_chs=64, reduction=2, module='conv2d_2b')] + + self.maxpool_3a = nn.MaxPool2d(3, stride=2) + self.conv2d_3b = BasicConv2d(64, 80, kernel_size=1, stride=1) + self.conv2d_4a = BasicConv2d(80, 192, kernel_size=3, stride=1) + self.feature_info += [dict(num_chs=192, reduction=4, module='conv2d_4a')] + + self.maxpool_5a = nn.MaxPool2d(3, stride=2) + self.mixed_5b = Mixed_5b() + self.repeat = nn.Sequential( + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17), + Block35(scale=0.17) + ) + self.feature_info += [dict(num_chs=320, reduction=8, module='repeat')] + + self.mixed_6a = Mixed_6a() + self.repeat_1 = nn.Sequential( + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10), + Block17(scale=0.10) + ) + self.feature_info += [dict(num_chs=1088, reduction=16, module='repeat_1')] + + self.mixed_7a = Mixed_7a() + self.repeat_2 = nn.Sequential( + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20), + Block8(scale=0.20) + ) + self.block8 = Block8(no_relu=True) + self.conv2d_7b = BasicConv2d(2080, self.num_features, kernel_size=1, stride=1) + self.feature_info += [dict(num_chs=self.num_features, reduction=32, module='conv2d_7b')] + + self.global_pool, self.classif = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + module_map = {k: i for i, (k, _) in enumerate(flatten_modules(self.named_children(), prefix=()))} + module_map.pop(('classif',)) + + def _matcher(name): + if any([name.startswith(n) for n in ('conv2d_1', 'conv2d_2')]): + return 0 + elif any([name.startswith(n) for n in ('conv2d_3', 'conv2d_4')]): + return 1 + elif any([name.startswith(n) for n in ('block8', 'conv2d_7')]): + return len(module_map) + 1 + else: + for k in module_map.keys(): + if k == tuple(name.split('.')[:len(k)]): + return module_map[k] + return float('inf') + return _matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, "checkpointing not supported" + + @torch.jit.ignore + def 
get_classifier(self): + return self.classif + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.classif = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.conv2d_1a(x) + x = self.conv2d_2a(x) + x = self.conv2d_2b(x) + x = self.maxpool_3a(x) + x = self.conv2d_3b(x) + x = self.conv2d_4a(x) + x = self.maxpool_5a(x) + x = self.mixed_5b(x) + x = self.repeat(x) + x = self.mixed_6a(x) + x = self.repeat_1(x) + x = self.mixed_7a(x) + x = self.repeat_2(x) + x = self.block8(x) + x = self.conv2d_7b(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0: + x = F.dropout(x, p=self.drop_rate, training=self.training) + return x if pre_logits else self.classif(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_inception_resnet_v2(variant, pretrained=False, **kwargs): + return build_model_with_cfg(InceptionResnetV2, variant, pretrained, **kwargs) + + +@register_model +def inception_resnet_v2(pretrained=False, **kwargs): + r"""InceptionResnetV2 model architecture from the + `"InceptionV4, Inception-ResNet..." ` paper. + """ + return _create_inception_resnet_v2('inception_resnet_v2', pretrained=pretrained, **kwargs) + + +@register_model +def ens_adv_inception_resnet_v2(pretrained=False, **kwargs): + r""" Ensemble Adversarially trained InceptionResnetV2 model architecture + As per https://arxiv.org/abs/1705.07204 and + https://github.com/tensorflow/models/tree/master/research/adv_imagenet_models. + """ + return _create_inception_resnet_v2('ens_adv_inception_resnet_v2', pretrained=pretrained, **kwargs) diff --git a/custom_timm/models/inception_v3.py b/custom_timm/models/inception_v3.py new file mode 100644 index 0000000000000000000000000000000000000000..1e03afd9af9fbd463c17a9f0c961f73026c779e2 --- /dev/null +++ b/custom_timm/models/inception_v3.py @@ -0,0 +1,475 @@ +""" Inception-V3 + +Originally from torchvision Inception3 model +Licensed BSD-Clause 3 https://github.com/pytorch/vision/blob/master/LICENSE +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_STD, IMAGENET_DEFAULT_MEAN, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .helpers import build_model_with_cfg, resolve_pretrained_cfg, flatten_modules +from .registry import register_model +from .layers import trunc_normal_, create_classifier, Linear + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (8, 8), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, + 'first_conv': 'Conv2d_1a_3x3.conv', 'classifier': 'fc', + **kwargs + } + + +default_cfgs = { + # original PyTorch weights, ported from Tensorflow but modified + 'inception_v3': _cfg( + url='https://download.pytorch.org/models/inception_v3_google-1a9a5a14.pth', + has_aux=True), # checkpoint has aux logit layer weights + # my port of Tensorflow SLIM weights (http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz) + 'tf_inception_v3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_inception_v3-e0069de4.pth', + num_classes=1000, has_aux=False, label_offset=1), + # my port of Tensorflow adversarially trained Inception V3 from + # 
http://download.tensorflow.org/models/adv_inception_v3_2017_08_18.tar.gz + 'adv_inception_v3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/adv_inception_v3-9e27bd63.pth', + num_classes=1000, has_aux=False, label_offset=1), + # from gluon pretrained models, best performing in terms of accuracy/loss metrics + # https://gluon-cv.mxnet.io/model_zoo/classification.html + 'gluon_inception_v3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_inception_v3-9f746940.pth', + mean=IMAGENET_DEFAULT_MEAN, # also works well with inception defaults + std=IMAGENET_DEFAULT_STD, # also works well with inception defaults + has_aux=False, + ) +} + + +class InceptionA(nn.Module): + + def __init__(self, in_channels, pool_features, conv_block=None): + super(InceptionA, self).__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1x1 = conv_block(in_channels, 64, kernel_size=1) + + self.branch5x5_1 = conv_block(in_channels, 48, kernel_size=1) + self.branch5x5_2 = conv_block(48, 64, kernel_size=5, padding=2) + + self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) + self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) + self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, padding=1) + + self.branch_pool = conv_block(in_channels, pool_features, kernel_size=1) + + def _forward(self, x): + branch1x1 = self.branch1x1(x) + + branch5x5 = self.branch5x5_1(x) + branch5x5 = self.branch5x5_2(branch5x5) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool] + return outputs + + def forward(self, x): + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionB(nn.Module): + + def __init__(self, in_channels, conv_block=None): + super(InceptionB, self).__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch3x3 = conv_block(in_channels, 384, kernel_size=3, stride=2) + + self.branch3x3dbl_1 = conv_block(in_channels, 64, kernel_size=1) + self.branch3x3dbl_2 = conv_block(64, 96, kernel_size=3, padding=1) + self.branch3x3dbl_3 = conv_block(96, 96, kernel_size=3, stride=2) + + def _forward(self, x): + branch3x3 = self.branch3x3(x) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl) + + branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) + + outputs = [branch3x3, branch3x3dbl, branch_pool] + return outputs + + def forward(self, x): + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionC(nn.Module): + + def __init__(self, in_channels, channels_7x7, conv_block=None): + super(InceptionC, self).__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1x1 = conv_block(in_channels, 192, kernel_size=1) + + c7 = channels_7x7 + self.branch7x7_1 = conv_block(in_channels, c7, kernel_size=1) + self.branch7x7_2 = conv_block(c7, c7, kernel_size=(1, 7), padding=(0, 3)) + self.branch7x7_3 = conv_block(c7, 192, kernel_size=(7, 1), padding=(3, 0)) + + self.branch7x7dbl_1 = conv_block(in_channels, c7, kernel_size=1) + self.branch7x7dbl_2 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) + self.branch7x7dbl_3 = conv_block(c7, c7, kernel_size=(1, 7), 
padding=(0, 3)) + self.branch7x7dbl_4 = conv_block(c7, c7, kernel_size=(7, 1), padding=(3, 0)) + self.branch7x7dbl_5 = conv_block(c7, 192, kernel_size=(1, 7), padding=(0, 3)) + + self.branch_pool = conv_block(in_channels, 192, kernel_size=1) + + def _forward(self, x): + branch1x1 = self.branch1x1(x) + + branch7x7 = self.branch7x7_1(x) + branch7x7 = self.branch7x7_2(branch7x7) + branch7x7 = self.branch7x7_3(branch7x7) + + branch7x7dbl = self.branch7x7dbl_1(x) + branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl) + branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl) + + branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool] + return outputs + + def forward(self, x): + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionD(nn.Module): + + def __init__(self, in_channels, conv_block=None): + super(InceptionD, self).__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch3x3_1 = conv_block(in_channels, 192, kernel_size=1) + self.branch3x3_2 = conv_block(192, 320, kernel_size=3, stride=2) + + self.branch7x7x3_1 = conv_block(in_channels, 192, kernel_size=1) + self.branch7x7x3_2 = conv_block(192, 192, kernel_size=(1, 7), padding=(0, 3)) + self.branch7x7x3_3 = conv_block(192, 192, kernel_size=(7, 1), padding=(3, 0)) + self.branch7x7x3_4 = conv_block(192, 192, kernel_size=3, stride=2) + + def _forward(self, x): + branch3x3 = self.branch3x3_1(x) + branch3x3 = self.branch3x3_2(branch3x3) + + branch7x7x3 = self.branch7x7x3_1(x) + branch7x7x3 = self.branch7x7x3_2(branch7x7x3) + branch7x7x3 = self.branch7x7x3_3(branch7x7x3) + branch7x7x3 = self.branch7x7x3_4(branch7x7x3) + + branch_pool = F.max_pool2d(x, kernel_size=3, stride=2) + outputs = [branch3x3, branch7x7x3, branch_pool] + return outputs + + def forward(self, x): + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionE(nn.Module): + + def __init__(self, in_channels, conv_block=None): + super(InceptionE, self).__init__() + if conv_block is None: + conv_block = BasicConv2d + self.branch1x1 = conv_block(in_channels, 320, kernel_size=1) + + self.branch3x3_1 = conv_block(in_channels, 384, kernel_size=1) + self.branch3x3_2a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) + self.branch3x3_2b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) + + self.branch3x3dbl_1 = conv_block(in_channels, 448, kernel_size=1) + self.branch3x3dbl_2 = conv_block(448, 384, kernel_size=3, padding=1) + self.branch3x3dbl_3a = conv_block(384, 384, kernel_size=(1, 3), padding=(0, 1)) + self.branch3x3dbl_3b = conv_block(384, 384, kernel_size=(3, 1), padding=(1, 0)) + + self.branch_pool = conv_block(in_channels, 192, kernel_size=1) + + def _forward(self, x): + branch1x1 = self.branch1x1(x) + + branch3x3 = self.branch3x3_1(x) + branch3x3 = [ + self.branch3x3_2a(branch3x3), + self.branch3x3_2b(branch3x3), + ] + branch3x3 = torch.cat(branch3x3, 1) + + branch3x3dbl = self.branch3x3dbl_1(x) + branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl) + branch3x3dbl = [ + self.branch3x3dbl_3a(branch3x3dbl), + self.branch3x3dbl_3b(branch3x3dbl), + ] + branch3x3dbl = torch.cat(branch3x3dbl, 1) + + branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1) + branch_pool = self.branch_pool(branch_pool) + + outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool] + return outputs + + def 
forward(self, x): + outputs = self._forward(x) + return torch.cat(outputs, 1) + + +class InceptionAux(nn.Module): + + def __init__(self, in_channels, num_classes, conv_block=None): + super(InceptionAux, self).__init__() + if conv_block is None: + conv_block = BasicConv2d + self.conv0 = conv_block(in_channels, 128, kernel_size=1) + self.conv1 = conv_block(128, 768, kernel_size=5) + self.conv1.stddev = 0.01 + self.fc = Linear(768, num_classes) + self.fc.stddev = 0.001 + + def forward(self, x): + # N x 768 x 17 x 17 + x = F.avg_pool2d(x, kernel_size=5, stride=3) + # N x 768 x 5 x 5 + x = self.conv0(x) + # N x 128 x 5 x 5 + x = self.conv1(x) + # N x 768 x 1 x 1 + # Adaptive average pooling + x = F.adaptive_avg_pool2d(x, (1, 1)) + # N x 768 x 1 x 1 + x = torch.flatten(x, 1) + # N x 768 + x = self.fc(x) + # N x 1000 + return x + + +class BasicConv2d(nn.Module): + + def __init__(self, in_channels, out_channels, **kwargs): + super(BasicConv2d, self).__init__() + self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs) + self.bn = nn.BatchNorm2d(out_channels, eps=0.001) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return F.relu(x, inplace=True) + + +class InceptionV3(nn.Module): + """Inception-V3 with no AuxLogits + FIXME two class defs are redundant, but less screwing around with torchsript fussyness and inconsistent returns + """ + + def __init__(self, num_classes=1000, in_chans=3, drop_rate=0., global_pool='avg', aux_logits=False): + super(InceptionV3, self).__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + self.aux_logits = aux_logits + + self.Conv2d_1a_3x3 = BasicConv2d(in_chans, 32, kernel_size=3, stride=2) + self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3) + self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1) + self.Pool1 = nn.MaxPool2d(kernel_size=3, stride=2) + self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1) + self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3) + self.Pool2 = nn.MaxPool2d(kernel_size=3, stride=2) + self.Mixed_5b = InceptionA(192, pool_features=32) + self.Mixed_5c = InceptionA(256, pool_features=64) + self.Mixed_5d = InceptionA(288, pool_features=64) + self.Mixed_6a = InceptionB(288) + self.Mixed_6b = InceptionC(768, channels_7x7=128) + self.Mixed_6c = InceptionC(768, channels_7x7=160) + self.Mixed_6d = InceptionC(768, channels_7x7=160) + self.Mixed_6e = InceptionC(768, channels_7x7=192) + if aux_logits: + self.AuxLogits = InceptionAux(768, num_classes) + else: + self.AuxLogits = None + self.Mixed_7a = InceptionD(768) + self.Mixed_7b = InceptionE(1280) + self.Mixed_7c = InceptionE(2048) + self.feature_info = [ + dict(num_chs=64, reduction=2, module='Conv2d_2b_3x3'), + dict(num_chs=192, reduction=4, module='Conv2d_4a_3x3'), + dict(num_chs=288, reduction=8, module='Mixed_5d'), + dict(num_chs=768, reduction=16, module='Mixed_6e'), + dict(num_chs=2048, reduction=32, module='Mixed_7c'), + ] + + self.num_features = 2048 + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + for m in self.modules(): + if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear): + stddev = m.stddev if hasattr(m, 'stddev') else 0.1 + trunc_normal_(m.weight, std=stddev) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + module_map = {k: i for i, (k, _) in enumerate(flatten_modules(self.named_children(), prefix=()))} + module_map.pop(('fc',)) + 
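+        # Added note: popping ('fc',) means classifier parameters match no key in _matcher below and
+        # fall through to the float('inf') return, sorting them after every numbered block group.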
+ def _matcher(name): + if any([name.startswith(n) for n in ('Conv2d_1', 'Conv2d_2')]): + return 0 + elif any([name.startswith(n) for n in ('Conv2d_3', 'Conv2d_4')]): + return 1 + else: + for k in module_map.keys(): + if k == tuple(name.split('.')[:len(k)]): + return module_map[k] + return float('inf') + return _matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + def forward_preaux(self, x): + x = self.Conv2d_1a_3x3(x) # N x 32 x 149 x 149 + x = self.Conv2d_2a_3x3(x) # N x 32 x 147 x 147 + x = self.Conv2d_2b_3x3(x) # N x 64 x 147 x 147 + x = self.Pool1(x) # N x 64 x 73 x 73 + x = self.Conv2d_3b_1x1(x) # N x 80 x 73 x 73 + x = self.Conv2d_4a_3x3(x) # N x 192 x 71 x 71 + x = self.Pool2(x) # N x 192 x 35 x 35 + x = self.Mixed_5b(x) # N x 256 x 35 x 35 + x = self.Mixed_5c(x) # N x 288 x 35 x 35 + x = self.Mixed_5d(x) # N x 288 x 35 x 35 + x = self.Mixed_6a(x) # N x 768 x 17 x 17 + x = self.Mixed_6b(x) # N x 768 x 17 x 17 + x = self.Mixed_6c(x) # N x 768 x 17 x 17 + x = self.Mixed_6d(x) # N x 768 x 17 x 17 + x = self.Mixed_6e(x) # N x 768 x 17 x 17 + return x + + def forward_postaux(self, x): + x = self.Mixed_7a(x) # N x 1280 x 8 x 8 + x = self.Mixed_7b(x) # N x 2048 x 8 x 8 + x = self.Mixed_7c(x) # N x 2048 x 8 x 8 + return x + + def forward_features(self, x): + x = self.forward_preaux(x) + x = self.forward_postaux(x) + return x + + def forward_head(self, x): + x = self.global_pool(x) + if self.drop_rate > 0: + x = F.dropout(x, p=self.drop_rate, training=self.training) + x = self.fc(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +class InceptionV3Aux(InceptionV3): + """InceptionV3 with AuxLogits + """ + + def __init__(self, num_classes=1000, in_chans=3, drop_rate=0., global_pool='avg', aux_logits=True): + super(InceptionV3Aux, self).__init__( + num_classes, in_chans, drop_rate, global_pool, aux_logits) + + def forward_features(self, x): + x = self.forward_preaux(x) + aux = self.AuxLogits(x) if self.training else None + x = self.forward_postaux(x) + return x, aux + + def forward(self, x): + x, aux = self.forward_features(x) + x = self.forward_head(x) + return x, aux + + +def _create_inception_v3(variant, pretrained=False, **kwargs): + pretrained_cfg = resolve_pretrained_cfg(variant, pretrained_cfg=kwargs.pop('pretrained_cfg', None)) + aux_logits = kwargs.pop('aux_logits', False) + if aux_logits: + assert not kwargs.pop('features_only', False) + model_cls = InceptionV3Aux + load_strict = pretrained_cfg['has_aux'] + else: + model_cls = InceptionV3 + load_strict = not pretrained_cfg['has_aux'] + + return build_model_with_cfg( + model_cls, variant, pretrained, + pretrained_cfg=pretrained_cfg, + pretrained_strict=load_strict, + **kwargs) + + +@register_model +def inception_v3(pretrained=False, **kwargs): + # original PyTorch weights, ported from Tensorflow but modified + model = _create_inception_v3('inception_v3', pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_inception_v3(pretrained=False, **kwargs): + # my port of Tensorflow SLIM weights (http://download.tensorflow.org/models/inception_v3_2016_08_28.tar.gz) + model = 
_create_inception_v3('tf_inception_v3', pretrained=pretrained, **kwargs) + return model + + +@register_model +def adv_inception_v3(pretrained=False, **kwargs): + # my port of Tensorflow adversarially trained Inception V3 from + # http://download.tensorflow.org/models/adv_inception_v3_2017_08_18.tar.gz + model = _create_inception_v3('adv_inception_v3', pretrained=pretrained, **kwargs) + return model + + +@register_model +def gluon_inception_v3(pretrained=False, **kwargs): + # from gluon pretrained models, best performing in terms of accuracy/loss metrics + # https://gluon-cv.mxnet.io/model_zoo/classification.html + model = _create_inception_v3('gluon_inception_v3', pretrained=pretrained, **kwargs) + return model diff --git a/custom_timm/models/inception_v4.py b/custom_timm/models/inception_v4.py new file mode 100644 index 0000000000000000000000000000000000000000..02d7128221c521c245d3c8832923392c43255180 --- /dev/null +++ b/custom_timm/models/inception_v4.py @@ -0,0 +1,330 @@ +""" Pytorch Inception-V4 implementation +Sourced from https://github.com/Cadene/tensorflow-model-zoo.torch (MIT License) which is +based upon Google's Tensorflow implementation and pretrained weights (Apache 2.0 License) +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .helpers import build_model_with_cfg +from .layers import create_classifier +from .registry import register_model + +__all__ = ['InceptionV4'] + +default_cfgs = { + 'inception_v4': { + 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/inceptionv4-8e4777a0.pth', + 'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (8, 8), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, + 'first_conv': 'features.0.conv', 'classifier': 'last_linear', + 'label_offset': 1, # 1001 classes in pretrained weights + } +} + + +class BasicConv2d(nn.Module): + def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0): + super(BasicConv2d, self).__init__() + self.conv = nn.Conv2d( + in_planes, out_planes, kernel_size=kernel_size, stride=stride, padding=padding, bias=False) + self.bn = nn.BatchNorm2d(out_planes, eps=0.001) + self.relu = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.relu(x) + return x + + +class Mixed3a(nn.Module): + def __init__(self): + super(Mixed3a, self).__init__() + self.maxpool = nn.MaxPool2d(3, stride=2) + self.conv = BasicConv2d(64, 96, kernel_size=3, stride=2) + + def forward(self, x): + x0 = self.maxpool(x) + x1 = self.conv(x) + out = torch.cat((x0, x1), 1) + return out + + +class Mixed4a(nn.Module): + def __init__(self): + super(Mixed4a, self).__init__() + + self.branch0 = nn.Sequential( + BasicConv2d(160, 64, kernel_size=1, stride=1), + BasicConv2d(64, 96, kernel_size=3, stride=1) + ) + + self.branch1 = nn.Sequential( + BasicConv2d(160, 64, kernel_size=1, stride=1), + BasicConv2d(64, 64, kernel_size=(1, 7), stride=1, padding=(0, 3)), + BasicConv2d(64, 64, kernel_size=(7, 1), stride=1, padding=(3, 0)), + BasicConv2d(64, 96, kernel_size=(3, 3), stride=1) + ) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + out = torch.cat((x0, x1), 1) + return out + + +class Mixed5a(nn.Module): + def __init__(self): + super(Mixed5a, self).__init__() + self.conv = BasicConv2d(192, 192, kernel_size=3, stride=2) + self.maxpool = nn.MaxPool2d(3, stride=2) + + def 
forward(self, x): + x0 = self.conv(x) + x1 = self.maxpool(x) + out = torch.cat((x0, x1), 1) + return out + + +class InceptionA(nn.Module): + def __init__(self): + super(InceptionA, self).__init__() + self.branch0 = BasicConv2d(384, 96, kernel_size=1, stride=1) + + self.branch1 = nn.Sequential( + BasicConv2d(384, 64, kernel_size=1, stride=1), + BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1) + ) + + self.branch2 = nn.Sequential( + BasicConv2d(384, 64, kernel_size=1, stride=1), + BasicConv2d(64, 96, kernel_size=3, stride=1, padding=1), + BasicConv2d(96, 96, kernel_size=3, stride=1, padding=1) + ) + + self.branch3 = nn.Sequential( + nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False), + BasicConv2d(384, 96, kernel_size=1, stride=1) + ) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + x3 = self.branch3(x) + out = torch.cat((x0, x1, x2, x3), 1) + return out + + +class ReductionA(nn.Module): + def __init__(self): + super(ReductionA, self).__init__() + self.branch0 = BasicConv2d(384, 384, kernel_size=3, stride=2) + + self.branch1 = nn.Sequential( + BasicConv2d(384, 192, kernel_size=1, stride=1), + BasicConv2d(192, 224, kernel_size=3, stride=1, padding=1), + BasicConv2d(224, 256, kernel_size=3, stride=2) + ) + + self.branch2 = nn.MaxPool2d(3, stride=2) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + out = torch.cat((x0, x1, x2), 1) + return out + + +class InceptionB(nn.Module): + def __init__(self): + super(InceptionB, self).__init__() + self.branch0 = BasicConv2d(1024, 384, kernel_size=1, stride=1) + + self.branch1 = nn.Sequential( + BasicConv2d(1024, 192, kernel_size=1, stride=1), + BasicConv2d(192, 224, kernel_size=(1, 7), stride=1, padding=(0, 3)), + BasicConv2d(224, 256, kernel_size=(7, 1), stride=1, padding=(3, 0)) + ) + + self.branch2 = nn.Sequential( + BasicConv2d(1024, 192, kernel_size=1, stride=1), + BasicConv2d(192, 192, kernel_size=(7, 1), stride=1, padding=(3, 0)), + BasicConv2d(192, 224, kernel_size=(1, 7), stride=1, padding=(0, 3)), + BasicConv2d(224, 224, kernel_size=(7, 1), stride=1, padding=(3, 0)), + BasicConv2d(224, 256, kernel_size=(1, 7), stride=1, padding=(0, 3)) + ) + + self.branch3 = nn.Sequential( + nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False), + BasicConv2d(1024, 128, kernel_size=1, stride=1) + ) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + x3 = self.branch3(x) + out = torch.cat((x0, x1, x2, x3), 1) + return out + + +class ReductionB(nn.Module): + def __init__(self): + super(ReductionB, self).__init__() + + self.branch0 = nn.Sequential( + BasicConv2d(1024, 192, kernel_size=1, stride=1), + BasicConv2d(192, 192, kernel_size=3, stride=2) + ) + + self.branch1 = nn.Sequential( + BasicConv2d(1024, 256, kernel_size=1, stride=1), + BasicConv2d(256, 256, kernel_size=(1, 7), stride=1, padding=(0, 3)), + BasicConv2d(256, 320, kernel_size=(7, 1), stride=1, padding=(3, 0)), + BasicConv2d(320, 320, kernel_size=3, stride=2) + ) + + self.branch2 = nn.MaxPool2d(3, stride=2) + + def forward(self, x): + x0 = self.branch0(x) + x1 = self.branch1(x) + x2 = self.branch2(x) + out = torch.cat((x0, x1, x2), 1) + return out + + +class InceptionC(nn.Module): + def __init__(self): + super(InceptionC, self).__init__() + + self.branch0 = BasicConv2d(1536, 256, kernel_size=1, stride=1) + + self.branch1_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1) + self.branch1_1a = BasicConv2d(384, 256, kernel_size=(1, 3), 
stride=1, padding=(0, 1)) + self.branch1_1b = BasicConv2d(384, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)) + + self.branch2_0 = BasicConv2d(1536, 384, kernel_size=1, stride=1) + self.branch2_1 = BasicConv2d(384, 448, kernel_size=(3, 1), stride=1, padding=(1, 0)) + self.branch2_2 = BasicConv2d(448, 512, kernel_size=(1, 3), stride=1, padding=(0, 1)) + self.branch2_3a = BasicConv2d(512, 256, kernel_size=(1, 3), stride=1, padding=(0, 1)) + self.branch2_3b = BasicConv2d(512, 256, kernel_size=(3, 1), stride=1, padding=(1, 0)) + + self.branch3 = nn.Sequential( + nn.AvgPool2d(3, stride=1, padding=1, count_include_pad=False), + BasicConv2d(1536, 256, kernel_size=1, stride=1) + ) + + def forward(self, x): + x0 = self.branch0(x) + + x1_0 = self.branch1_0(x) + x1_1a = self.branch1_1a(x1_0) + x1_1b = self.branch1_1b(x1_0) + x1 = torch.cat((x1_1a, x1_1b), 1) + + x2_0 = self.branch2_0(x) + x2_1 = self.branch2_1(x2_0) + x2_2 = self.branch2_2(x2_1) + x2_3a = self.branch2_3a(x2_2) + x2_3b = self.branch2_3b(x2_2) + x2 = torch.cat((x2_3a, x2_3b), 1) + + x3 = self.branch3(x) + + out = torch.cat((x0, x1, x2, x3), 1) + return out + + +class InceptionV4(nn.Module): + def __init__(self, num_classes=1000, in_chans=3, output_stride=32, drop_rate=0., global_pool='avg'): + super(InceptionV4, self).__init__() + assert output_stride == 32 + self.drop_rate = drop_rate + self.num_classes = num_classes + self.num_features = 1536 + + self.features = nn.Sequential( + BasicConv2d(in_chans, 32, kernel_size=3, stride=2), + BasicConv2d(32, 32, kernel_size=3, stride=1), + BasicConv2d(32, 64, kernel_size=3, stride=1, padding=1), + Mixed3a(), + Mixed4a(), + Mixed5a(), + InceptionA(), + InceptionA(), + InceptionA(), + InceptionA(), + ReductionA(), # Mixed6a + InceptionB(), + InceptionB(), + InceptionB(), + InceptionB(), + InceptionB(), + InceptionB(), + InceptionB(), + ReductionB(), # Mixed7a + InceptionC(), + InceptionC(), + InceptionC(), + ) + self.feature_info = [ + dict(num_chs=64, reduction=2, module='features.2'), + dict(num_chs=160, reduction=4, module='features.3'), + dict(num_chs=384, reduction=8, module='features.9'), + dict(num_chs=1024, reduction=16, module='features.17'), + dict(num_chs=1536, reduction=32, module='features.21'), + ] + self.global_pool, self.last_linear = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^features\.[012]\.', + blocks=r'^features\.(\d+)' + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.last_linear + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.last_linear = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + return self.features(x) + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0: + x = F.dropout(x, p=self.drop_rate, training=self.training) + return x if pre_logits else self.last_linear(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_inception_v4(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + InceptionV4, variant, pretrained, + feature_cfg=dict(flatten_sequential=True), + **kwargs) + + +@register_model +def 
inception_v4(pretrained=False, **kwargs): + return _create_inception_v4('inception_v4', pretrained, **kwargs) diff --git a/custom_timm/models/layers/__init__.py b/custom_timm/models/layers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..21c641b6c3a1b5c02e0f5213fe8de3437eb1eb96 --- /dev/null +++ b/custom_timm/models/layers/__init__.py @@ -0,0 +1,44 @@ +from .activations import * +from .adaptive_avgmax_pool import \ + adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d +from .blur_pool import BlurPool2d +from .classifier import ClassifierHead, create_classifier +from .cond_conv2d import CondConv2d, get_condconv_initializer +from .config import is_exportable, is_scriptable, is_no_jit, set_exportable, set_scriptable, set_no_jit,\ + set_layer_config +from .conv2d_same import Conv2dSame, conv2d_same +from .conv_bn_act import ConvNormAct, ConvNormActAa, ConvBnAct +from .create_act import create_act_layer, get_act_layer, get_act_fn +from .create_attn import get_attn, create_attn +from .create_conv2d import create_conv2d +from .create_norm import get_norm_layer, create_norm_layer +from .create_norm_act import get_norm_act_layer, create_norm_act_layer, get_norm_act_layer +from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path +from .eca import EcaModule, CecaModule, EfficientChannelAttn, CircularEfficientChannelAttn +from .evo_norm import EvoNorm2dB0, EvoNorm2dB1, EvoNorm2dB2,\ + EvoNorm2dS0, EvoNorm2dS0a, EvoNorm2dS1, EvoNorm2dS1a, EvoNorm2dS2, EvoNorm2dS2a +from .fast_norm import is_fast_norm, set_fast_norm, fast_group_norm, fast_layer_norm +from .filter_response_norm import FilterResponseNormTlu2d, FilterResponseNormAct2d +from .gather_excite import GatherExcite +from .global_context import GlobalContext +from .helpers import to_ntuple, to_2tuple, to_3tuple, to_4tuple, make_divisible, extend_tuple +from .inplace_abn import InplaceAbn +from .linear import Linear +from .mixed_conv2d import MixedConv2d +from .mlp import Mlp, GluMlp, GatedMlp, ConvMlp +from .non_local_attn import NonLocalAttn, BatNonLocalAttn +from .norm import GroupNorm, GroupNorm1, LayerNorm, LayerNorm2d +from .norm_act import BatchNormAct2d, GroupNormAct, convert_sync_batchnorm +from .padding import get_padding, get_same_padding, pad_same +from .patch_embed import PatchEmbed +from .pool2d_same import AvgPool2dSame, create_pool2d +from .squeeze_excite import SEModule, SqueezeExcite, EffectiveSEModule, EffectiveSqueezeExcite +from .selective_kernel import SelectiveKernel +from .separable_conv import SeparableConv2d, SeparableConvNormAct +from .space_to_depth import SpaceToDepthModule +from .split_attn import SplitAttn +from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model +from .std_conv import StdConv2d, StdConv2dSame, ScaledStdConv2d, ScaledStdConv2dSame +from .test_time_pool import TestTimePoolHead, apply_test_time_pool +from .trace_utils import _assert, _float_to_int +from .weight_init import trunc_normal_, trunc_normal_tf_, variance_scaling_, lecun_normal_ diff --git a/custom_timm/models/layers/activations.py b/custom_timm/models/layers/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..e16b3bd3a1898365530c1ffc5154a0a4746a136e --- /dev/null +++ b/custom_timm/models/layers/activations.py @@ -0,0 +1,145 @@ +""" Activations + +A collection of activations fn and modules with a common interface so that they can +easily be swapped. All have an `inplace` arg even if not used. 
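+
+(Added note: JIT-scripted and memory-efficient autograd variants of these activations are provided
+alongside this file in activations_jit.py and activations_me.py.)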
+ +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +from torch import nn as nn +from torch.nn import functional as F + + +def swish(x, inplace: bool = False): + """Swish - Described in: https://arxiv.org/abs/1710.05941 + """ + return x.mul_(x.sigmoid()) if inplace else x.mul(x.sigmoid()) + + +class Swish(nn.Module): + def __init__(self, inplace: bool = False): + super(Swish, self).__init__() + self.inplace = inplace + + def forward(self, x): + return swish(x, self.inplace) + + +def mish(x, inplace: bool = False): + """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 + NOTE: I don't have a working inplace variant + """ + return x.mul(F.softplus(x).tanh()) + + +class Mish(nn.Module): + """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 + """ + def __init__(self, inplace: bool = False): + super(Mish, self).__init__() + + def forward(self, x): + return mish(x) + + +def sigmoid(x, inplace: bool = False): + return x.sigmoid_() if inplace else x.sigmoid() + + +# PyTorch has this, but not with a consistent inplace argmument interface +class Sigmoid(nn.Module): + def __init__(self, inplace: bool = False): + super(Sigmoid, self).__init__() + self.inplace = inplace + + def forward(self, x): + return x.sigmoid_() if self.inplace else x.sigmoid() + + +def tanh(x, inplace: bool = False): + return x.tanh_() if inplace else x.tanh() + + +# PyTorch has this, but not with a consistent inplace argmument interface +class Tanh(nn.Module): + def __init__(self, inplace: bool = False): + super(Tanh, self).__init__() + self.inplace = inplace + + def forward(self, x): + return x.tanh_() if self.inplace else x.tanh() + + +def hard_swish(x, inplace: bool = False): + inner = F.relu6(x + 3.).div_(6.) + return x.mul_(inner) if inplace else x.mul(inner) + + +class HardSwish(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSwish, self).__init__() + self.inplace = inplace + + def forward(self, x): + return hard_swish(x, self.inplace) + + +def hard_sigmoid(x, inplace: bool = False): + if inplace: + return x.add_(3.).clamp_(0., 6.).div_(6.) + else: + return F.relu6(x + 3.) / 6. 
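+
+# Added note: with inplace=True the swish / hard_swish / hard_sigmoid helpers above mutate their
+# input tensor (mul_/add_), so callers must not reuse the original values of x afterwards.
+# Illustrative sanity check: hard_sigmoid(torch.zeros(1)) -> tensor([0.5000]),
+# hard_swish(torch.zeros(1)) -> tensor([0.]).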
+ + +class HardSigmoid(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSigmoid, self).__init__() + self.inplace = inplace + + def forward(self, x): + return hard_sigmoid(x, self.inplace) + + +def hard_mish(x, inplace: bool = False): + """ Hard Mish + Experimental, based on notes by Mish author Diganta Misra at + https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md + """ + if inplace: + return x.mul_(0.5 * (x + 2).clamp(min=0, max=2)) + else: + return 0.5 * x * (x + 2).clamp(min=0, max=2) + + +class HardMish(nn.Module): + def __init__(self, inplace: bool = False): + super(HardMish, self).__init__() + self.inplace = inplace + + def forward(self, x): + return hard_mish(x, self.inplace) + + +class PReLU(nn.PReLU): + """Applies PReLU (w/ dummy inplace arg) + """ + def __init__(self, num_parameters: int = 1, init: float = 0.25, inplace: bool = False) -> None: + super(PReLU, self).__init__(num_parameters=num_parameters, init=init) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.prelu(input, self.weight) + + +def gelu(x: torch.Tensor, inplace: bool = False) -> torch.Tensor: + return F.gelu(x) + + +class GELU(nn.Module): + """Applies the Gaussian Error Linear Units function (w/ dummy inplace arg) + """ + def __init__(self, inplace: bool = False): + super(GELU, self).__init__() + + def forward(self, input: torch.Tensor) -> torch.Tensor: + return F.gelu(input) diff --git a/custom_timm/models/layers/activations_jit.py b/custom_timm/models/layers/activations_jit.py new file mode 100644 index 0000000000000000000000000000000000000000..b4a516530ad0abf41f720ac83d02791179bb7b67 --- /dev/null +++ b/custom_timm/models/layers/activations_jit.py @@ -0,0 +1,90 @@ +""" Activations + +A collection of jit-scripted activations fn and modules with a common interface so that they can +easily be swapped. All have an `inplace` arg even if not used. + +All jit scripted activations are lacking in-place variations on purpose, scripted kernel fusion does not +currently work across in-place op boundaries, thus performance is equal to or less than the non-scripted +versions if they contain in-place ops. + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +from torch import nn as nn +from torch.nn import functional as F + + +@torch.jit.script +def swish_jit(x, inplace: bool = False): + """Swish - Described in: https://arxiv.org/abs/1710.05941 + """ + return x.mul(x.sigmoid()) + + +@torch.jit.script +def mish_jit(x, _inplace: bool = False): + """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 + """ + return x.mul(F.softplus(x).tanh()) + + +class SwishJit(nn.Module): + def __init__(self, inplace: bool = False): + super(SwishJit, self).__init__() + + def forward(self, x): + return swish_jit(x) + + +class MishJit(nn.Module): + def __init__(self, inplace: bool = False): + super(MishJit, self).__init__() + + def forward(self, x): + return mish_jit(x) + + +@torch.jit.script +def hard_sigmoid_jit(x, inplace: bool = False): + # return F.relu6(x + 3.) / 6. + return (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster? + + +class HardSigmoidJit(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSigmoidJit, self).__init__() + + def forward(self, x): + return hard_sigmoid_jit(x) + + +@torch.jit.script +def hard_swish_jit(x, inplace: bool = False): + # return x * (F.relu6(x + 3.) / 6) + return x * (x + 3).clamp(min=0, max=6).div(6.) 
# clamp seems ever so slightly faster? + + +class HardSwishJit(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSwishJit, self).__init__() + + def forward(self, x): + return hard_swish_jit(x) + + +@torch.jit.script +def hard_mish_jit(x, inplace: bool = False): + """ Hard Mish + Experimental, based on notes by Mish author Diganta Misra at + https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md + """ + return 0.5 * x * (x + 2).clamp(min=0, max=2) + + +class HardMishJit(nn.Module): + def __init__(self, inplace: bool = False): + super(HardMishJit, self).__init__() + + def forward(self, x): + return hard_mish_jit(x) diff --git a/custom_timm/models/layers/activations_me.py b/custom_timm/models/layers/activations_me.py new file mode 100644 index 0000000000000000000000000000000000000000..9a12bb7ebbfef02c508801742d38da6b48dd1bb6 --- /dev/null +++ b/custom_timm/models/layers/activations_me.py @@ -0,0 +1,218 @@ +""" Activations (memory-efficient w/ custom autograd) + +A collection of activations fn and modules with a common interface so that they can +easily be swapped. All have an `inplace` arg even if not used. + +These activations are not compatible with jit scripting or ONNX export of the model, please use either +the JIT or basic versions of the activations. + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +from torch import nn as nn +from torch.nn import functional as F + + +@torch.jit.script +def swish_jit_fwd(x): + return x.mul(torch.sigmoid(x)) + + +@torch.jit.script +def swish_jit_bwd(x, grad_output): + x_sigmoid = torch.sigmoid(x) + return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid))) + + +class SwishJitAutoFn(torch.autograd.Function): + """ torch.jit.script optimised Swish w/ memory-efficient checkpoint + Inspired by conversation btw Jeremy Howard & Adam Pazske + https://twitter.com/jeremyphoward/status/1188251041835315200 + """ + @staticmethod + def symbolic(g, x): + return g.op("Mul", x, g.op("Sigmoid", x)) + + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return swish_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return swish_jit_bwd(x, grad_output) + + +def swish_me(x, inplace=False): + return SwishJitAutoFn.apply(x) + + +class SwishMe(nn.Module): + def __init__(self, inplace: bool = False): + super(SwishMe, self).__init__() + + def forward(self, x): + return SwishJitAutoFn.apply(x) + + +@torch.jit.script +def mish_jit_fwd(x): + return x.mul(torch.tanh(F.softplus(x))) + + +@torch.jit.script +def mish_jit_bwd(x, grad_output): + x_sigmoid = torch.sigmoid(x) + x_tanh_sp = F.softplus(x).tanh() + return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp)) + + +class MishJitAutoFn(torch.autograd.Function): + """ Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 + A memory efficient, jit scripted variant of Mish + """ + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return mish_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return mish_jit_bwd(x, grad_output) + + +def mish_me(x, inplace=False): + return MishJitAutoFn.apply(x) + + +class MishMe(nn.Module): + def __init__(self, inplace: bool = False): + super(MishMe, self).__init__() + + def forward(self, x): + return MishJitAutoFn.apply(x) + + +@torch.jit.script +def hard_sigmoid_jit_fwd(x, inplace: bool = False): + return (x + 3).clamp(min=0, 
max=6).div(6.) + + +@torch.jit.script +def hard_sigmoid_jit_bwd(x, grad_output): + m = torch.ones_like(x) * ((x >= -3.) & (x <= 3.)) / 6. + return grad_output * m + + +class HardSigmoidJitAutoFn(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return hard_sigmoid_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return hard_sigmoid_jit_bwd(x, grad_output) + + +def hard_sigmoid_me(x, inplace: bool = False): + return HardSigmoidJitAutoFn.apply(x) + + +class HardSigmoidMe(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSigmoidMe, self).__init__() + + def forward(self, x): + return HardSigmoidJitAutoFn.apply(x) + + +@torch.jit.script +def hard_swish_jit_fwd(x): + return x * (x + 3).clamp(min=0, max=6).div(6.) + + +@torch.jit.script +def hard_swish_jit_bwd(x, grad_output): + m = torch.ones_like(x) * (x >= 3.) + m = torch.where((x >= -3.) & (x <= 3.), x / 3. + .5, m) + return grad_output * m + + +class HardSwishJitAutoFn(torch.autograd.Function): + """A memory efficient, jit-scripted HardSwish activation""" + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return hard_swish_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return hard_swish_jit_bwd(x, grad_output) + + @staticmethod + def symbolic(g, self): + input = g.op("Add", self, g.op('Constant', value_t=torch.tensor(3, dtype=torch.float))) + hardtanh_ = g.op("Clip", input, g.op('Constant', value_t=torch.tensor(0, dtype=torch.float)), g.op('Constant', value_t=torch.tensor(6, dtype=torch.float))) + hardtanh_ = g.op("Div", hardtanh_, g.op('Constant', value_t=torch.tensor(6, dtype=torch.float))) + return g.op("Mul", self, hardtanh_) + + +def hard_swish_me(x, inplace=False): + return HardSwishJitAutoFn.apply(x) + + +class HardSwishMe(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSwishMe, self).__init__() + + def forward(self, x): + return HardSwishJitAutoFn.apply(x) + + +@torch.jit.script +def hard_mish_jit_fwd(x): + return 0.5 * x * (x + 2).clamp(min=0, max=2) + + +@torch.jit.script +def hard_mish_jit_bwd(x, grad_output): + m = torch.ones_like(x) * (x >= -2.) + m = torch.where((x >= -2.) 
& (x <= 0.), x + 1., m) + return grad_output * m + + +class HardMishJitAutoFn(torch.autograd.Function): + """ A memory efficient, jit scripted variant of Hard Mish + Experimental, based on notes by Mish author Diganta Misra at + https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md + """ + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return hard_mish_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return hard_mish_jit_bwd(x, grad_output) + + +def hard_mish_me(x, inplace: bool = False): + return HardMishJitAutoFn.apply(x) + + +class HardMishMe(nn.Module): + def __init__(self, inplace: bool = False): + super(HardMishMe, self).__init__() + + def forward(self, x): + return HardMishJitAutoFn.apply(x) + + + diff --git a/custom_timm/models/layers/adaptive_avgmax_pool.py b/custom_timm/models/layers/adaptive_avgmax_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..ebc6ada8c5b28c7eac5785b0cc2933eb01a15d46 --- /dev/null +++ b/custom_timm/models/layers/adaptive_avgmax_pool.py @@ -0,0 +1,118 @@ +""" PyTorch selectable adaptive pooling +Adaptive pooling with the ability to select the type of pooling from: + * 'avg' - Average pooling + * 'max' - Max pooling + * 'avgmax' - Sum of average and max pooling re-scaled by 0.5 + * 'avgmaxc' - Concatenation of average and max pooling along feature dim, doubles feature dim + +Both a functional and a nn.Module version of the pooling is provided. + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def adaptive_pool_feat_mult(pool_type='avg'): + if pool_type == 'catavgmax': + return 2 + else: + return 1 + + +def adaptive_avgmax_pool2d(x, output_size=1): + x_avg = F.adaptive_avg_pool2d(x, output_size) + x_max = F.adaptive_max_pool2d(x, output_size) + return 0.5 * (x_avg + x_max) + + +def adaptive_catavgmax_pool2d(x, output_size=1): + x_avg = F.adaptive_avg_pool2d(x, output_size) + x_max = F.adaptive_max_pool2d(x, output_size) + return torch.cat((x_avg, x_max), 1) + + +def select_adaptive_pool2d(x, pool_type='avg', output_size=1): + """Selectable global pooling function with dynamic input kernel size + """ + if pool_type == 'avg': + x = F.adaptive_avg_pool2d(x, output_size) + elif pool_type == 'avgmax': + x = adaptive_avgmax_pool2d(x, output_size) + elif pool_type == 'catavgmax': + x = adaptive_catavgmax_pool2d(x, output_size) + elif pool_type == 'max': + x = F.adaptive_max_pool2d(x, output_size) + else: + assert False, 'Invalid pool type: %s' % pool_type + return x + + +class FastAdaptiveAvgPool2d(nn.Module): + def __init__(self, flatten=False): + super(FastAdaptiveAvgPool2d, self).__init__() + self.flatten = flatten + + def forward(self, x): + return x.mean((2, 3), keepdim=not self.flatten) + + +class AdaptiveAvgMaxPool2d(nn.Module): + def __init__(self, output_size=1): + super(AdaptiveAvgMaxPool2d, self).__init__() + self.output_size = output_size + + def forward(self, x): + return adaptive_avgmax_pool2d(x, self.output_size) + + +class AdaptiveCatAvgMaxPool2d(nn.Module): + def __init__(self, output_size=1): + super(AdaptiveCatAvgMaxPool2d, self).__init__() + self.output_size = output_size + + def forward(self, x): + return adaptive_catavgmax_pool2d(x, self.output_size) + + +class SelectAdaptivePool2d(nn.Module): + """Selectable global pooling layer with dynamic input kernel size + """ + def __init__(self, output_size=1, pool_type='fast', flatten=False): 
+ super(SelectAdaptivePool2d, self).__init__() + self.pool_type = pool_type or '' # convert other falsy values to empty string for consistent TS typing + self.flatten = nn.Flatten(1) if flatten else nn.Identity() + if pool_type == '': + self.pool = nn.Identity() # pass through + elif pool_type == 'fast': + assert output_size == 1 + self.pool = FastAdaptiveAvgPool2d(flatten) + self.flatten = nn.Identity() + elif pool_type == 'avg': + self.pool = nn.AdaptiveAvgPool2d(output_size) + elif pool_type == 'avgmax': + self.pool = AdaptiveAvgMaxPool2d(output_size) + elif pool_type == 'catavgmax': + self.pool = AdaptiveCatAvgMaxPool2d(output_size) + elif pool_type == 'max': + self.pool = nn.AdaptiveMaxPool2d(output_size) + else: + assert False, 'Invalid pool type: %s' % pool_type + + def is_identity(self): + return not self.pool_type + + def forward(self, x): + x = self.pool(x) + x = self.flatten(x) + return x + + def feat_mult(self): + return adaptive_pool_feat_mult(self.pool_type) + + def __repr__(self): + return self.__class__.__name__ + ' (' \ + + 'pool_type=' + self.pool_type \ + + ', flatten=' + str(self.flatten) + ')' + diff --git a/custom_timm/models/layers/attention_pool2d.py b/custom_timm/models/layers/attention_pool2d.py new file mode 100644 index 0000000000000000000000000000000000000000..a13a6881feb4c7d31f6caa5ccc0d95288e322a83 --- /dev/null +++ b/custom_timm/models/layers/attention_pool2d.py @@ -0,0 +1,131 @@ +""" Attention Pool 2D + +Implementations of 2D spatial feature pooling using multi-head attention instead of average pool. + +Based on idea in CLIP by OpenAI, licensed Apache 2.0 +https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py + +Hacked together by / Copyright 2021 Ross Wightman +""" +from typing import Union, Tuple + +import torch +import torch.nn as nn + +from .helpers import to_2tuple +from .pos_embed import apply_rot_embed, RotaryEmbedding +from .weight_init import trunc_normal_ + + +class RotAttentionPool2d(nn.Module): + """ Attention based 2D feature pooling w/ rotary (relative) pos embedding. + This is a multi-head attention based replacement for (spatial) average pooling in NN architectures. + + Adapted from the AttentionPool2d in CLIP w/ rotary embedding instead of learned embed. + https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py + + NOTE: While this impl does not require a fixed feature size, performance at differeing resolutions from + train varies widely and falls off dramatically. I'm not sure if there is a way around this... 
-RW + """ + def __init__( + self, + in_features: int, + out_features: int = None, + embed_dim: int = None, + num_heads: int = 4, + qkv_bias: bool = True, + ): + super().__init__() + embed_dim = embed_dim or in_features + out_features = out_features or in_features + self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias) + self.proj = nn.Linear(embed_dim, out_features) + self.num_heads = num_heads + assert embed_dim % num_heads == 0 + self.head_dim = embed_dim // num_heads + self.scale = self.head_dim ** -0.5 + self.pos_embed = RotaryEmbedding(self.head_dim) + + trunc_normal_(self.qkv.weight, std=in_features ** -0.5) + nn.init.zeros_(self.qkv.bias) + + def forward(self, x): + B, _, H, W = x.shape + N = H * W + x = x.reshape(B, -1, N).permute(0, 2, 1) + + x = torch.cat([x.mean(1, keepdim=True), x], dim=1) + + x = self.qkv(x).reshape(B, N + 1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + q, k, v = x[0], x[1], x[2] + + qc, q = q[:, :, :1], q[:, :, 1:] + sin_emb, cos_emb = self.pos_embed.get_embed((H, W)) + q = apply_rot_embed(q, sin_emb, cos_emb) + q = torch.cat([qc, q], dim=2) + + kc, k = k[:, :, :1], k[:, :, 1:] + k = apply_rot_embed(k, sin_emb, cos_emb) + k = torch.cat([kc, k], dim=2) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + + x = (attn @ v).transpose(1, 2).reshape(B, N + 1, -1) + x = self.proj(x) + return x[:, 0] + + +class AttentionPool2d(nn.Module): + """ Attention based 2D feature pooling w/ learned (absolute) pos embedding. + This is a multi-head attention based replacement for (spatial) average pooling in NN architectures. + + It was based on impl in CLIP by OpenAI + https://github.com/openai/CLIP/blob/3b473b0e682c091a9e53623eebc1ca1657385717/clip/model.py + + NOTE: This requires feature size upon construction and well prevent adaptive sizing of the network. 
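+
+    Example (illustrative; the input spatial size must match feat_size)::
+
+        pool = AttentionPool2d(in_features=256, feat_size=7, num_heads=4)
+        out = pool(torch.randn(2, 256, 7, 7))   # -> shape (2, 256)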
+ """ + def __init__( + self, + in_features: int, + feat_size: Union[int, Tuple[int, int]], + out_features: int = None, + embed_dim: int = None, + num_heads: int = 4, + qkv_bias: bool = True, + ): + super().__init__() + + embed_dim = embed_dim or in_features + out_features = out_features or in_features + assert embed_dim % num_heads == 0 + self.feat_size = to_2tuple(feat_size) + self.qkv = nn.Linear(in_features, embed_dim * 3, bias=qkv_bias) + self.proj = nn.Linear(embed_dim, out_features) + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.scale = self.head_dim ** -0.5 + + spatial_dim = self.feat_size[0] * self.feat_size[1] + self.pos_embed = nn.Parameter(torch.zeros(spatial_dim + 1, in_features)) + trunc_normal_(self.pos_embed, std=in_features ** -0.5) + trunc_normal_(self.qkv.weight, std=in_features ** -0.5) + nn.init.zeros_(self.qkv.bias) + + def forward(self, x): + B, _, H, W = x.shape + N = H * W + assert self.feat_size[0] == H + assert self.feat_size[1] == W + x = x.reshape(B, -1, N).permute(0, 2, 1) + x = torch.cat([x.mean(1, keepdim=True), x], dim=1) + x = x + self.pos_embed.unsqueeze(0).to(x.dtype) + + x = self.qkv(x).reshape(B, N + 1, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + q, k, v = x[0], x[1], x[2] + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + + x = (attn @ v).transpose(1, 2).reshape(B, N + 1, -1) + x = self.proj(x) + return x[:, 0] diff --git a/custom_timm/models/layers/blur_pool.py b/custom_timm/models/layers/blur_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..e73d886367c995ee44f0ed5f6eb5b5a287aa4935 --- /dev/null +++ b/custom_timm/models/layers/blur_pool.py @@ -0,0 +1,42 @@ +""" +BlurPool layer inspired by + - Kornia's Max_BlurPool2d + - Making Convolutional Networks Shift-Invariant Again :cite:`zhang2019shiftinvar` + +Hacked together by Chris Ha and Ross Wightman +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from .padding import get_padding + + +class BlurPool2d(nn.Module): + r"""Creates a module that computes blurs and downsample a given feature map. + See :cite:`zhang2019shiftinvar` for more details. + Corresponds to the Downsample class, which does blurring and subsampling + + Args: + channels = Number of input channels + filt_size (int): binomial filter size for blurring. currently supports 3 (default) and 5. + stride (int): downsampling filter stride + + Returns: + torch.Tensor: the transformed tensor. 
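+
+    Example (illustrative)::
+
+        blur_pool = BlurPool2d(channels=64, filt_size=3, stride=2)
+        out = blur_pool(torch.randn(1, 64, 56, 56))   # -> shape (1, 64, 28, 28)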
+ """ + def __init__(self, channels, filt_size=3, stride=2) -> None: + super(BlurPool2d, self).__init__() + assert filt_size > 1 + self.channels = channels + self.filt_size = filt_size + self.stride = stride + self.padding = [get_padding(filt_size, stride, dilation=1)] * 4 + coeffs = torch.tensor((np.poly1d((0.5, 0.5)) ** (self.filt_size - 1)).coeffs.astype(np.float32)) + blur_filter = (coeffs[:, None] * coeffs[None, :])[None, None, :, :].repeat(self.channels, 1, 1, 1) + self.register_buffer('filt', blur_filter, persistent=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = F.pad(x, self.padding, 'reflect') + return F.conv2d(x, self.filt, stride=self.stride, groups=self.channels) diff --git a/custom_timm/models/layers/bottleneck_attn.py b/custom_timm/models/layers/bottleneck_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..c3db464e5ab4f2d3478293034e90a0939dadb628 --- /dev/null +++ b/custom_timm/models/layers/bottleneck_attn.py @@ -0,0 +1,157 @@ +""" Bottleneck Self Attention (Bottleneck Transformers) + +Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605 + +@misc{2101.11605, +Author = {Aravind Srinivas and Tsung-Yi Lin and Niki Parmar and Jonathon Shlens and Pieter Abbeel and Ashish Vaswani}, +Title = {Bottleneck Transformers for Visual Recognition}, +Year = {2021}, +} + +Based on ref gist at: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 + +This impl is a WIP but given that it is based on the ref gist likely not too far off. + +Hacked together by / Copyright 2021 Ross Wightman +""" +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .helpers import to_2tuple, make_divisible +from .weight_init import trunc_normal_ +from .trace_utils import _assert + + +def rel_logits_1d(q, rel_k, permute_mask: List[int]): + """ Compute relative logits along one dimension + + As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 + Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 + + Args: + q: (batch, heads, height, width, dim) + rel_k: (2 * width - 1, dim) + permute_mask: permute output dim according to this + """ + B, H, W, dim = q.shape + x = (q @ rel_k.transpose(-1, -2)) + x = x.reshape(-1, W, 2 * W -1) + + # pad to shift from relative to absolute indexing + x_pad = F.pad(x, [0, 1]).flatten(1) + x_pad = F.pad(x_pad, [0, W - 1]) + + # reshape and slice out the padded elements + x_pad = x_pad.reshape(-1, W + 1, 2 * W - 1) + x = x_pad[:, :W, W - 1:] + + # reshape and tile + x = x.reshape(B, H, 1, W, W).expand(-1, -1, H, -1, -1) + return x.permute(permute_mask) + + +class PosEmbedRel(nn.Module): + """ Relative Position Embedding + As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 + Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 + """ + def __init__(self, feat_size, dim_head, scale): + super().__init__() + self.height, self.width = to_2tuple(feat_size) + self.dim_head = dim_head + self.height_rel = nn.Parameter(torch.randn(self.height * 2 - 1, dim_head) * scale) + self.width_rel = nn.Parameter(torch.randn(self.width * 2 - 1, dim_head) * scale) + + def forward(self, q): + B, HW, _ = q.shape + + # relative logits in width dimension. 
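+        # (Illustrative shape note: q arrives flattened as (B, self.height * self.width, dim_head);
+        #  the reshape below restores the spatial grid so rel_logits_1d can apply rel_k along the
+        #  width axis. The combined logits are reshaped back to (B, HW, HW) before returning.)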
+ q = q.reshape(B, self.height, self.width, -1) + rel_logits_w = rel_logits_1d(q, self.width_rel, permute_mask=(0, 1, 3, 2, 4)) + + # relative logits in height dimension. + q = q.transpose(1, 2) + rel_logits_h = rel_logits_1d(q, self.height_rel, permute_mask=(0, 3, 1, 4, 2)) + + rel_logits = rel_logits_h + rel_logits_w + rel_logits = rel_logits.reshape(B, HW, HW) + return rel_logits + + +class BottleneckAttn(nn.Module): + """ Bottleneck Attention + Paper: `Bottleneck Transformers for Visual Recognition` - https://arxiv.org/abs/2101.11605 + + The internal dimensions of the attention module are controlled by the interaction of several arguments. + * the output dimension of the module is specified by dim_out, which falls back to input dim if not set + * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim + * the query and key (qk) dimensions are determined by + * num_heads * dim_head if dim_head is not None + * num_heads * (dim_out * attn_ratio // num_heads) if dim_head is None + * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not used + + Args: + dim (int): input dimension to the module + dim_out (int): output dimension of the module, same as dim if not set + stride (int): output stride of the module, avg pool used if stride == 2 (default: 1). + num_heads (int): parallel attention heads (default: 4) + dim_head (int): dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set + qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0) + qkv_bias (bool): add bias to q, k, and v projections + scale_pos_embed (bool): scale the position embedding as well as Q @ K + """ + def __init__( + self, dim, dim_out=None, feat_size=None, stride=1, num_heads=4, dim_head=None, + qk_ratio=1.0, qkv_bias=False, scale_pos_embed=False): + super().__init__() + assert feat_size is not None, 'A concrete feature size matching expected input (H, W) is required' + dim_out = dim_out or dim + assert dim_out % num_heads == 0 + self.num_heads = num_heads + self.dim_head_qk = dim_head or make_divisible(dim_out * qk_ratio, divisor=8) // num_heads + self.dim_head_v = dim_out // self.num_heads + self.dim_out_qk = num_heads * self.dim_head_qk + self.dim_out_v = num_heads * self.dim_head_v + self.scale = self.dim_head_qk ** -0.5 + self.scale_pos_embed = scale_pos_embed + + self.qkv = nn.Conv2d(dim, self.dim_out_qk * 2 + self.dim_out_v, 1, bias=qkv_bias) + + # NOTE I'm only supporting relative pos embedding for now + self.pos_embed = PosEmbedRel(feat_size, dim_head=self.dim_head_qk, scale=self.scale) + + self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity() + + self.reset_parameters() + + def reset_parameters(self): + trunc_normal_(self.qkv.weight, std=self.qkv.weight.shape[1] ** -0.5) # fan-in + trunc_normal_(self.pos_embed.height_rel, std=self.scale) + trunc_normal_(self.pos_embed.width_rel, std=self.scale) + + def forward(self, x): + B, C, H, W = x.shape + _assert(H == self.pos_embed.height, '') + _assert(W == self.pos_embed.width, '') + + x = self.qkv(x) # B, (2 * dim_head_qk + dim_head_v) * num_heads, H, W + + # NOTE head vs channel split ordering in qkv projection was decided before I allowed qk to differ from v + # So, this is more verbose than if heads were before qkv splits, but throughput is not impacted. 
+ q, k, v = torch.split(x, [self.dim_out_qk, self.dim_out_qk, self.dim_out_v], dim=1) + q = q.reshape(B * self.num_heads, self.dim_head_qk, -1).transpose(-1, -2) + k = k.reshape(B * self.num_heads, self.dim_head_qk, -1) # no transpose, for q @ k + v = v.reshape(B * self.num_heads, self.dim_head_v, -1).transpose(-1, -2) + + if self.scale_pos_embed: + attn = (q @ k + self.pos_embed(q)) * self.scale # B * num_heads, H * W, H * W + else: + attn = (q @ k) * self.scale + self.pos_embed(q) + attn = attn.softmax(dim=-1) + + out = (attn @ v).transpose(-1, -2).reshape(B, self.dim_out_v, H, W) # B, dim_out, H, W + out = self.pool(out) + return out diff --git a/custom_timm/models/layers/cbam.py b/custom_timm/models/layers/cbam.py new file mode 100644 index 0000000000000000000000000000000000000000..576a8306d979c3d93215253eba3affd7efd87bfe --- /dev/null +++ b/custom_timm/models/layers/cbam.py @@ -0,0 +1,112 @@ +""" CBAM (sort-of) Attention + +Experimental impl of CBAM: Convolutional Block Attention Module: https://arxiv.org/abs/1807.06521 + +WARNING: Results with these attention layers have been mixed. They can significantly reduce performance on +some tasks, especially fine-grained it seems. I may end up removing this impl. + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +from torch import nn as nn +import torch.nn.functional as F + +from .conv_bn_act import ConvNormAct +from .create_act import create_act_layer, get_act_layer +from .helpers import make_divisible + + +class ChannelAttn(nn.Module): + """ Original CBAM channel attention module, currently avg + max pool variant only. + """ + def __init__( + self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, + act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): + super(ChannelAttn, self).__init__() + if not rd_channels: + rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) + self.fc1 = nn.Conv2d(channels, rd_channels, 1, bias=mlp_bias) + self.act = act_layer(inplace=True) + self.fc2 = nn.Conv2d(rd_channels, channels, 1, bias=mlp_bias) + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + x_avg = self.fc2(self.act(self.fc1(x.mean((2, 3), keepdim=True)))) + x_max = self.fc2(self.act(self.fc1(x.amax((2, 3), keepdim=True)))) + return x * self.gate(x_avg + x_max) + + +class LightChannelAttn(ChannelAttn): + """An experimental 'lightweight' that sums avg + max pool first + """ + def __init__( + self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, + act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): + super(LightChannelAttn, self).__init__( + channels, rd_ratio, rd_channels, rd_divisor, act_layer, gate_layer, mlp_bias) + + def forward(self, x): + x_pool = 0.5 * x.mean((2, 3), keepdim=True) + 0.5 * x.amax((2, 3), keepdim=True) + x_attn = self.fc2(self.act(self.fc1(x_pool))) + return x * F.sigmoid(x_attn) + + +class SpatialAttn(nn.Module): + """ Original CBAM spatial attention module + """ + def __init__(self, kernel_size=7, gate_layer='sigmoid'): + super(SpatialAttn, self).__init__() + self.conv = ConvNormAct(2, 1, kernel_size, apply_act=False) + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + x_attn = torch.cat([x.mean(dim=1, keepdim=True), x.amax(dim=1, keepdim=True)], dim=1) + x_attn = self.conv(x_attn) + return x * self.gate(x_attn) + + +class LightSpatialAttn(nn.Module): + """An experimental 'lightweight' variant that sums avg_pool and max_pool results. 
+ """ + def __init__(self, kernel_size=7, gate_layer='sigmoid'): + super(LightSpatialAttn, self).__init__() + self.conv = ConvNormAct(1, 1, kernel_size, apply_act=False) + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + x_attn = 0.5 * x.mean(dim=1, keepdim=True) + 0.5 * x.amax(dim=1, keepdim=True) + x_attn = self.conv(x_attn) + return x * self.gate(x_attn) + + +class CbamModule(nn.Module): + def __init__( + self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, + spatial_kernel_size=7, act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): + super(CbamModule, self).__init__() + self.channel = ChannelAttn( + channels, rd_ratio=rd_ratio, rd_channels=rd_channels, + rd_divisor=rd_divisor, act_layer=act_layer, gate_layer=gate_layer, mlp_bias=mlp_bias) + self.spatial = SpatialAttn(spatial_kernel_size, gate_layer=gate_layer) + + def forward(self, x): + x = self.channel(x) + x = self.spatial(x) + return x + + +class LightCbamModule(nn.Module): + def __init__( + self, channels, rd_ratio=1./16, rd_channels=None, rd_divisor=1, + spatial_kernel_size=7, act_layer=nn.ReLU, gate_layer='sigmoid', mlp_bias=False): + super(LightCbamModule, self).__init__() + self.channel = LightChannelAttn( + channels, rd_ratio=rd_ratio, rd_channels=rd_channels, + rd_divisor=rd_divisor, act_layer=act_layer, gate_layer=gate_layer, mlp_bias=mlp_bias) + self.spatial = LightSpatialAttn(spatial_kernel_size) + + def forward(self, x): + x = self.channel(x) + x = self.spatial(x) + return x + diff --git a/custom_timm/models/layers/classifier.py b/custom_timm/models/layers/classifier.py new file mode 100644 index 0000000000000000000000000000000000000000..3ac3338782e4c6e4c29cb9dc6a3da5a9331340c7 --- /dev/null +++ b/custom_timm/models/layers/classifier.py @@ -0,0 +1,56 @@ +""" Classifier head and layer factory + +Hacked together by / Copyright 2020 Ross Wightman +""" +from torch import nn as nn +from torch.nn import functional as F + +from .adaptive_avgmax_pool import SelectAdaptivePool2d + + +def _create_pool(num_features, num_classes, pool_type='avg', use_conv=False): + flatten_in_pool = not use_conv # flatten when we use a Linear layer after pooling + if not pool_type: + assert num_classes == 0 or use_conv,\ + 'Pooling can only be disabled if classifier is also removed or conv classifier is used' + flatten_in_pool = False # disable flattening if pooling is pass-through (no pooling) + global_pool = SelectAdaptivePool2d(pool_type=pool_type, flatten=flatten_in_pool) + num_pooled_features = num_features * global_pool.feat_mult() + return global_pool, num_pooled_features + + +def _create_fc(num_features, num_classes, use_conv=False): + if num_classes <= 0: + fc = nn.Identity() # pass-through (no classifier) + elif use_conv: + fc = nn.Conv2d(num_features, num_classes, 1, bias=True) + else: + fc = nn.Linear(num_features, num_classes, bias=True) + return fc + + +def create_classifier(num_features, num_classes, pool_type='avg', use_conv=False): + global_pool, num_pooled_features = _create_pool(num_features, num_classes, pool_type, use_conv=use_conv) + fc = _create_fc(num_pooled_features, num_classes, use_conv=use_conv) + return global_pool, fc + + +class ClassifierHead(nn.Module): + """Classifier head w/ configurable global pooling and dropout.""" + + def __init__(self, in_chs, num_classes, pool_type='avg', drop_rate=0., use_conv=False): + super(ClassifierHead, self).__init__() + self.drop_rate = drop_rate + self.global_pool, num_pooled_features = _create_pool(in_chs, num_classes, pool_type, 
use_conv=use_conv) + self.fc = _create_fc(num_pooled_features, num_classes, use_conv=use_conv) + self.flatten = nn.Flatten(1) if use_conv and pool_type else nn.Identity() + + def forward(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate: + x = F.dropout(x, p=float(self.drop_rate), training=self.training) + if pre_logits: + return x.flatten(1) + else: + x = self.fc(x) + return self.flatten(x) diff --git a/custom_timm/models/layers/cond_conv2d.py b/custom_timm/models/layers/cond_conv2d.py new file mode 100644 index 0000000000000000000000000000000000000000..43654c5972167cab0224bfe720d78bae1227eb7d --- /dev/null +++ b/custom_timm/models/layers/cond_conv2d.py @@ -0,0 +1,123 @@ +""" PyTorch Conditionally Parameterized Convolution (CondConv) + +Paper: CondConv: Conditionally Parameterized Convolutions for Efficient Inference +(https://arxiv.org/abs/1904.04971) + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import math +from functools import partial +import numpy as np +import torch +from torch import nn as nn +from torch.nn import functional as F + +from .helpers import to_2tuple +from .conv2d_same import conv2d_same +from .padding import get_padding_value + + +def get_condconv_initializer(initializer, num_experts, expert_shape): + def condconv_initializer(weight): + """CondConv initializer function.""" + num_params = np.prod(expert_shape) + if (len(weight.shape) != 2 or weight.shape[0] != num_experts or + weight.shape[1] != num_params): + raise (ValueError( + 'CondConv variables must have shape [num_experts, num_params]')) + for i in range(num_experts): + initializer(weight[i].view(expert_shape)) + return condconv_initializer + + +class CondConv2d(nn.Module): + """ Conditionally Parameterized Convolution + Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py + + Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion: + https://github.com/pytorch/pytorch/issues/17983 + """ + __constants__ = ['in_channels', 'out_channels', 'dynamic_padding'] + + def __init__(self, in_channels, out_channels, kernel_size=3, + stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4): + super(CondConv2d, self).__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = to_2tuple(kernel_size) + self.stride = to_2tuple(stride) + padding_val, is_padding_dynamic = get_padding_value( + padding, kernel_size, stride=stride, dilation=dilation) + self.dynamic_padding = is_padding_dynamic # if in forward to work with torchscript + self.padding = to_2tuple(padding_val) + self.dilation = to_2tuple(dilation) + self.groups = groups + self.num_experts = num_experts + + self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size + weight_num_param = 1 + for wd in self.weight_shape: + weight_num_param *= wd + self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param)) + + if bias: + self.bias_shape = (self.out_channels,) + self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels)) + else: + self.register_parameter('bias', None) + + self.reset_parameters() + + def reset_parameters(self): + init_weight = get_condconv_initializer( + partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape) + init_weight(self.weight) + if self.bias is not None: + fan_in = np.prod(self.weight_shape[1:]) + bound = 1 / 
math.sqrt(fan_in) + init_bias = get_condconv_initializer( + partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape) + init_bias(self.bias) + + def forward(self, x, routing_weights): + B, C, H, W = x.shape + weight = torch.matmul(routing_weights, self.weight) + new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size + weight = weight.view(new_weight_shape) + bias = None + if self.bias is not None: + bias = torch.matmul(routing_weights, self.bias) + bias = bias.view(B * self.out_channels) + # move batch elements with channels so each batch element can be efficiently convolved with separate kernel + # reshape instead of view to work with channels_last input + x = x.reshape(1, B * C, H, W) + if self.dynamic_padding: + out = conv2d_same( + x, weight, bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups * B) + else: + out = F.conv2d( + x, weight, bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups * B) + out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1]) + + # Literal port (from TF definition) + # x = torch.split(x, 1, 0) + # weight = torch.split(weight, 1, 0) + # if self.bias is not None: + # bias = torch.matmul(routing_weights, self.bias) + # bias = torch.split(bias, 1, 0) + # else: + # bias = [None] * B + # out = [] + # for xi, wi, bi in zip(x, weight, bias): + # wi = wi.view(*self.weight_shape) + # if bi is not None: + # bi = bi.view(*self.bias_shape) + # out.append(self.conv_fn( + # xi, wi, bi, stride=self.stride, padding=self.padding, + # dilation=self.dilation, groups=self.groups)) + # out = torch.cat(out, 0) + return out diff --git a/custom_timm/models/layers/config.py b/custom_timm/models/layers/config.py new file mode 100644 index 0000000000000000000000000000000000000000..f07b9d782ba0597c174dee81097c28280335fdba --- /dev/null +++ b/custom_timm/models/layers/config.py @@ -0,0 +1,115 @@ +""" Model / Layer Config singleton state +""" +from typing import Any, Optional + +__all__ = [ + 'is_exportable', 'is_scriptable', 'is_no_jit', + 'set_exportable', 'set_scriptable', 'set_no_jit', 'set_layer_config' +] + +# Set to True if prefer to have layers with no jit optimization (includes activations) +_NO_JIT = False + +# Set to True if prefer to have activation layers with no jit optimization +# NOTE not currently used as no difference between no_jit and no_activation jit as only layers obeying +# the jit flags so far are activations. This will change as more layers are updated and/or added. 
+_NO_ACTIVATION_JIT = False + +# Set to True if exporting a model with Same padding via ONNX +_EXPORTABLE = False + +# Set to True if wanting to use torch.jit.script on a model +_SCRIPTABLE = False + + +def is_no_jit(): + return _NO_JIT + + +class set_no_jit: + def __init__(self, mode: bool) -> None: + global _NO_JIT + self.prev = _NO_JIT + _NO_JIT = mode + + def __enter__(self) -> None: + pass + + def __exit__(self, *args: Any) -> bool: + global _NO_JIT + _NO_JIT = self.prev + return False + + +def is_exportable(): + return _EXPORTABLE + + +class set_exportable: + def __init__(self, mode: bool) -> None: + global _EXPORTABLE + self.prev = _EXPORTABLE + _EXPORTABLE = mode + + def __enter__(self) -> None: + pass + + def __exit__(self, *args: Any) -> bool: + global _EXPORTABLE + _EXPORTABLE = self.prev + return False + + +def is_scriptable(): + return _SCRIPTABLE + + +class set_scriptable: + def __init__(self, mode: bool) -> None: + global _SCRIPTABLE + self.prev = _SCRIPTABLE + _SCRIPTABLE = mode + + def __enter__(self) -> None: + pass + + def __exit__(self, *args: Any) -> bool: + global _SCRIPTABLE + _SCRIPTABLE = self.prev + return False + + +class set_layer_config: + """ Layer config context manager that allows setting all layer config flags at once. + If a flag arg is None, it will not change the current value. + """ + def __init__( + self, + scriptable: Optional[bool] = None, + exportable: Optional[bool] = None, + no_jit: Optional[bool] = None, + no_activation_jit: Optional[bool] = None): + global _SCRIPTABLE + global _EXPORTABLE + global _NO_JIT + global _NO_ACTIVATION_JIT + self.prev = _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT + if scriptable is not None: + _SCRIPTABLE = scriptable + if exportable is not None: + _EXPORTABLE = exportable + if no_jit is not None: + _NO_JIT = no_jit + if no_activation_jit is not None: + _NO_ACTIVATION_JIT = no_activation_jit + + def __enter__(self) -> None: + pass + + def __exit__(self, *args: Any) -> bool: + global _SCRIPTABLE + global _EXPORTABLE + global _NO_JIT + global _NO_ACTIVATION_JIT + _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT = self.prev + return False diff --git a/custom_timm/models/layers/conv2d_same.py b/custom_timm/models/layers/conv2d_same.py new file mode 100644 index 0000000000000000000000000000000000000000..75f0f98d4ec1e3f4a0dc004b977815afaa25e7fc --- /dev/null +++ b/custom_timm/models/layers/conv2d_same.py @@ -0,0 +1,42 @@ +""" Conv2d w/ Same Padding + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple, Optional + +from .padding import pad_same, get_padding_value + + +def conv2d_same( + x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1), + padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1): + x = pad_same(x, weight.shape[-2:], stride, dilation) + return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups) + + +class Conv2dSame(nn.Conv2d): + """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions + """ + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, dilation=1, groups=1, bias=True): + super(Conv2dSame, self).__init__( + in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) + + def forward(self, x): + return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) + + +def create_conv2d_pad(in_chs, out_chs, kernel_size, 
**kwargs): + padding = kwargs.pop('padding', '') + kwargs.setdefault('bias', False) + padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs) + if is_dynamic: + return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs) + else: + return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs) + + diff --git a/custom_timm/models/layers/conv_bn_act.py b/custom_timm/models/layers/conv_bn_act.py new file mode 100644 index 0000000000000000000000000000000000000000..9e7c64b85805b25c861d09116f9590dbebafce7e --- /dev/null +++ b/custom_timm/models/layers/conv_bn_act.py @@ -0,0 +1,88 @@ +""" Conv2d + BN + Act + +Hacked together by / Copyright 2020 Ross Wightman +""" +import functools +from torch import nn as nn + +from .create_conv2d import create_conv2d +from .create_norm_act import get_norm_act_layer + + +class ConvNormAct(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1, + bias=False, apply_act=True, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, drop_layer=None): + super(ConvNormAct, self).__init__() + self.conv = create_conv2d( + in_channels, out_channels, kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=bias) + + # NOTE for backwards compatibility with models that use separate norm and act layer definitions + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + # NOTE for backwards (weight) compatibility, norm layer name remains `.bn` + norm_kwargs = dict(drop_layer=drop_layer) if drop_layer is not None else {} + self.bn = norm_act_layer(out_channels, apply_act=apply_act, **norm_kwargs) + + @property + def in_channels(self): + return self.conv.in_channels + + @property + def out_channels(self): + return self.conv.out_channels + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + return x + + +ConvBnAct = ConvNormAct + + +def create_aa(aa_layer, channels, stride=2, enable=True): + if not aa_layer or not enable: + return nn.Identity() + if isinstance(aa_layer, functools.partial): + if issubclass(aa_layer.func, nn.AvgPool2d): + return aa_layer() + else: + return aa_layer(channels) + elif issubclass(aa_layer, nn.AvgPool2d): + return aa_layer(stride) + else: + return aa_layer(channels=channels, stride=stride) + + +class ConvNormActAa(nn.Module): + def __init__( + self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1, + bias=False, apply_act=True, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, aa_layer=None, drop_layer=None): + super(ConvNormActAa, self).__init__() + use_aa = aa_layer is not None and stride == 2 + + self.conv = create_conv2d( + in_channels, out_channels, kernel_size, stride=1 if use_aa else stride, + padding=padding, dilation=dilation, groups=groups, bias=bias) + + # NOTE for backwards compatibility with models that use separate norm and act layer definitions + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + # NOTE for backwards (weight) compatibility, norm layer name remains `.bn` + norm_kwargs = dict(drop_layer=drop_layer) if drop_layer is not None else {} + self.bn = norm_act_layer(out_channels, apply_act=apply_act, **norm_kwargs) + self.aa = create_aa(aa_layer, out_channels, stride=stride, enable=use_aa) + + @property + def in_channels(self): + return self.conv.in_channels + + @property + def out_channels(self): + return self.conv.out_channels + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + x = self.aa(x) + return x diff --git a/custom_timm/models/layers/create_act.py 
b/custom_timm/models/layers/create_act.py new file mode 100644 index 0000000000000000000000000000000000000000..a3044a3d20b56c7739e810f91deb533c23e161e1 --- /dev/null +++ b/custom_timm/models/layers/create_act.py @@ -0,0 +1,154 @@ +""" Activation Factory +Hacked together by / Copyright 2020 Ross Wightman +""" +from typing import Union, Callable, Type + +from .activations import * +from .activations_jit import * +from .activations_me import * +from .config import is_exportable, is_scriptable, is_no_jit + +# PyTorch has an optimized, native 'silu' (aka 'swish') operator as of PyTorch 1.7. +# Also hardsigmoid, hardswish, and soon mish. This code will use native version if present. +# Eventually, the custom SiLU, Mish, Hard*, layers will be removed and only native variants will be used. +_has_silu = 'silu' in dir(torch.nn.functional) +_has_hardswish = 'hardswish' in dir(torch.nn.functional) +_has_hardsigmoid = 'hardsigmoid' in dir(torch.nn.functional) +_has_mish = 'mish' in dir(torch.nn.functional) + + +_ACT_FN_DEFAULT = dict( + silu=F.silu if _has_silu else swish, + swish=F.silu if _has_silu else swish, + mish=F.mish if _has_mish else mish, + relu=F.relu, + relu6=F.relu6, + leaky_relu=F.leaky_relu, + elu=F.elu, + celu=F.celu, + selu=F.selu, + gelu=gelu, + sigmoid=sigmoid, + tanh=tanh, + hard_sigmoid=F.hardsigmoid if _has_hardsigmoid else hard_sigmoid, + hard_swish=F.hardswish if _has_hardswish else hard_swish, + hard_mish=hard_mish, +) + +_ACT_FN_JIT = dict( + silu=F.silu if _has_silu else swish_jit, + swish=F.silu if _has_silu else swish_jit, + mish=F.mish if _has_mish else mish_jit, + hard_sigmoid=F.hardsigmoid if _has_hardsigmoid else hard_sigmoid_jit, + hard_swish=F.hardswish if _has_hardswish else hard_swish_jit, + hard_mish=hard_mish_jit +) + +_ACT_FN_ME = dict( + silu=F.silu if _has_silu else swish_me, + swish=F.silu if _has_silu else swish_me, + mish=F.mish if _has_mish else mish_me, + hard_sigmoid=F.hardsigmoid if _has_hardsigmoid else hard_sigmoid_me, + hard_swish=F.hardswish if _has_hardswish else hard_swish_me, + hard_mish=hard_mish_me, +) + +_ACT_FNS = (_ACT_FN_ME, _ACT_FN_JIT, _ACT_FN_DEFAULT) +for a in _ACT_FNS: + a.setdefault('hardsigmoid', a.get('hard_sigmoid')) + a.setdefault('hardswish', a.get('hard_swish')) + + +_ACT_LAYER_DEFAULT = dict( + silu=nn.SiLU if _has_silu else Swish, + swish=nn.SiLU if _has_silu else Swish, + mish=nn.Mish if _has_mish else Mish, + relu=nn.ReLU, + relu6=nn.ReLU6, + leaky_relu=nn.LeakyReLU, + elu=nn.ELU, + prelu=PReLU, + celu=nn.CELU, + selu=nn.SELU, + gelu=GELU, + sigmoid=Sigmoid, + tanh=Tanh, + hard_sigmoid=nn.Hardsigmoid if _has_hardsigmoid else HardSigmoid, + hard_swish=nn.Hardswish if _has_hardswish else HardSwish, + hard_mish=HardMish, +) + +_ACT_LAYER_JIT = dict( + silu=nn.SiLU if _has_silu else SwishJit, + swish=nn.SiLU if _has_silu else SwishJit, + mish=nn.Mish if _has_mish else MishJit, + hard_sigmoid=nn.Hardsigmoid if _has_hardsigmoid else HardSigmoidJit, + hard_swish=nn.Hardswish if _has_hardswish else HardSwishJit, + hard_mish=HardMishJit +) + +_ACT_LAYER_ME = dict( + silu=nn.SiLU if _has_silu else SwishMe, + swish=nn.SiLU if _has_silu else SwishMe, + mish=nn.Mish if _has_mish else MishMe, + hard_sigmoid=nn.Hardsigmoid if _has_hardsigmoid else HardSigmoidMe, + hard_swish=nn.Hardswish if _has_hardswish else HardSwishMe, + hard_mish=HardMishMe, +) + +_ACT_LAYERS = (_ACT_LAYER_ME, _ACT_LAYER_JIT, _ACT_LAYER_DEFAULT) +for a in _ACT_LAYERS: + a.setdefault('hardsigmoid', a.get('hard_sigmoid')) + a.setdefault('hardswish', a.get('hard_swish')) 
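+
+# Usage sketch (illustrative): the factory functions below resolve an activation by name,
+# preferring the memory-efficient (_ME) variants, then the jit-scripted (_JIT) ones, then
+# the plain implementations, depending on the is_no_jit/is_exportable/is_scriptable flags:
+#
+#   act_layer = get_act_layer('hard_swish')              # e.g. nn.Hardswish or HardSwishMe
+#   act = create_act_layer('hard_swish', inplace=True)   # instance; `inplace` dropped if unsupported
+#   act_fn = get_act_fn('swish')                         # functional form
+#
+# Unknown names fall through to a KeyError from the default lookup tables above.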
+ + +def get_act_fn(name: Union[Callable, str] = 'relu'): + """ Activation Function Factory + Fetching activation fns by name with this function allows export or torch script friendly + functions to be returned dynamically based on current config. + """ + if not name: + return None + if isinstance(name, Callable): + return name + if not (is_no_jit() or is_exportable() or is_scriptable()): + # If not exporting or scripting the model, first look for a memory-efficient version with + # custom autograd, then fallback + if name in _ACT_FN_ME: + return _ACT_FN_ME[name] + if not (is_no_jit() or is_exportable()): + if name in _ACT_FN_JIT: + return _ACT_FN_JIT[name] + return _ACT_FN_DEFAULT[name] + + +def get_act_layer(name: Union[Type[nn.Module], str] = 'relu'): + """ Activation Layer Factory + Fetching activation layers by name with this function allows export or torch script friendly + functions to be returned dynamically based on current config. + """ + if not name: + return None + if not isinstance(name, str): + # callable, module, etc + return name + if not (is_no_jit() or is_exportable() or is_scriptable()): + if name in _ACT_LAYER_ME: + return _ACT_LAYER_ME[name] + if not (is_no_jit() or is_exportable()): + if name in _ACT_LAYER_JIT: + return _ACT_LAYER_JIT[name] + return _ACT_LAYER_DEFAULT[name] + + +def create_act_layer(name: Union[nn.Module, str], inplace=None, **kwargs): + act_layer = get_act_layer(name) + if act_layer is None: + return None + if inplace is None: + return act_layer(**kwargs) + try: + return act_layer(inplace=inplace, **kwargs) + except TypeError: + # recover if act layer doesn't have inplace arg + return act_layer(**kwargs) diff --git a/custom_timm/models/layers/create_attn.py b/custom_timm/models/layers/create_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..cc7e91ea9af2e853fd659973c72ba5e86025a1b3 --- /dev/null +++ b/custom_timm/models/layers/create_attn.py @@ -0,0 +1,89 @@ +""" Attention Factory + +Hacked together by / Copyright 2021 Ross Wightman +""" +import torch +from functools import partial + +from .bottleneck_attn import BottleneckAttn +from .cbam import CbamModule, LightCbamModule +from .eca import EcaModule, CecaModule +from .gather_excite import GatherExcite +from .global_context import GlobalContext +from .halo_attn import HaloAttn +from .lambda_layer import LambdaLayer +from .non_local_attn import NonLocalAttn, BatNonLocalAttn +from .selective_kernel import SelectiveKernel +from .split_attn import SplitAttn +from .squeeze_excite import SEModule, EffectiveSEModule + + +def get_attn(attn_type): + if isinstance(attn_type, torch.nn.Module): + return attn_type + module_cls = None + if attn_type: + if isinstance(attn_type, str): + attn_type = attn_type.lower() + # Lightweight attention modules (channel and/or coarse spatial). + # Typically added to existing network architecture blocks in addition to existing convolutions. 
+ if attn_type == 'se': + module_cls = SEModule + elif attn_type == 'ese': + module_cls = EffectiveSEModule + elif attn_type == 'eca': + module_cls = EcaModule + elif attn_type == 'ecam': + module_cls = partial(EcaModule, use_mlp=True) + elif attn_type == 'ceca': + module_cls = CecaModule + elif attn_type == 'ge': + module_cls = GatherExcite + elif attn_type == 'gc': + module_cls = GlobalContext + elif attn_type == 'gca': + module_cls = partial(GlobalContext, fuse_add=True, fuse_scale=False) + elif attn_type == 'cbam': + module_cls = CbamModule + elif attn_type == 'lcbam': + module_cls = LightCbamModule + + # Attention / attention-like modules w/ significant params + # Typically replace some of the existing workhorse convs in a network architecture. + # All of these accept a stride argument and can spatially downsample the input. + elif attn_type == 'sk': + module_cls = SelectiveKernel + elif attn_type == 'splat': + module_cls = SplitAttn + + # Self-attention / attention-like modules w/ significant compute and/or params + # Typically replace some of the existing workhorse convs in a network architecture. + # All of these accept a stride argument and can spatially downsample the input. + elif attn_type == 'lambda': + return LambdaLayer + elif attn_type == 'bottleneck': + return BottleneckAttn + elif attn_type == 'halo': + return HaloAttn + elif attn_type == 'nl': + module_cls = NonLocalAttn + elif attn_type == 'bat': + module_cls = BatNonLocalAttn + + # Woops! + else: + assert False, "Invalid attn module (%s)" % attn_type + elif isinstance(attn_type, bool): + if attn_type: + module_cls = SEModule + else: + module_cls = attn_type + return module_cls + + +def create_attn(attn_type, channels, **kwargs): + module_cls = get_attn(attn_type) + if module_cls is not None: + # NOTE: it's expected the first (positional) argument of all attention layers is the # input channels + return module_cls(channels, **kwargs) + return None diff --git a/custom_timm/models/layers/create_conv2d.py b/custom_timm/models/layers/create_conv2d.py new file mode 100644 index 0000000000000000000000000000000000000000..ac9489ce492d0f768c1ae8892163fa986bac8fd8 --- /dev/null +++ b/custom_timm/models/layers/create_conv2d.py @@ -0,0 +1,36 @@ +""" Create Conv2d Factory Method + +Hacked together by / Copyright 2020 Ross Wightman +""" + +from .mixed_conv2d import MixedConv2d +from .cond_conv2d import CondConv2d +from .conv2d_same import create_conv2d_pad + + +def create_conv2d(in_channels, out_channels, kernel_size, **kwargs): + """ Select a 2d convolution implementation based on arguments + Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d. + + Used extensively by EfficientNet, MobileNetv3 and related networks. + """ + if isinstance(kernel_size, list): + assert 'num_experts' not in kwargs # MixNet + CondConv combo not supported currently + if 'groups' in kwargs: + groups = kwargs.pop('groups') + if groups == in_channels: + kwargs['depthwise'] = True + else: + assert groups == 1 + # We're going to use only lists for defining the MixedConv2d kernel groups, + # ints, tuples, other iterables will continue to pass to normal conv and specify h, w. 
+ m = MixedConv2d(in_channels, out_channels, kernel_size, **kwargs) + else: + depthwise = kwargs.pop('depthwise', False) + # for DW out_channels must be multiple of in_channels as must have out_channels % groups == 0 + groups = in_channels if depthwise else kwargs.pop('groups', 1) + if 'num_experts' in kwargs and kwargs['num_experts'] > 0: + m = CondConv2d(in_channels, out_channels, kernel_size, groups=groups, **kwargs) + else: + m = create_conv2d_pad(in_channels, out_channels, kernel_size, groups=groups, **kwargs) + return m diff --git a/custom_timm/models/layers/create_norm.py b/custom_timm/models/layers/create_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..b9efae8c8c34d8ebdb5e80921768b898e10ccc7e --- /dev/null +++ b/custom_timm/models/layers/create_norm.py @@ -0,0 +1,56 @@ +""" Norm Layer Factory + +Create norm modules by string (to mirror create_act and creat_norm-act fns) + +Copyright 2022 Ross Wightman +""" +import types +import functools + +import torch.nn as nn + +from .norm import GroupNorm, GroupNorm1, LayerNorm, LayerNorm2d + +_NORM_MAP = dict( + batchnorm=nn.BatchNorm2d, + batchnorm2d=nn.BatchNorm2d, + batchnorm1d=nn.BatchNorm1d, + groupnorm=GroupNorm, + groupnorm1=GroupNorm1, + layernorm=LayerNorm, + layernorm2d=LayerNorm2d, +) +_NORM_TYPES = {m for n, m in _NORM_MAP.items()} + + +def create_norm_layer(layer_name, num_features, act_layer=None, apply_act=True, **kwargs): + layer = get_norm_layer(layer_name, act_layer=act_layer) + layer_instance = layer(num_features, apply_act=apply_act, **kwargs) + return layer_instance + + +def get_norm_layer(norm_layer): + assert isinstance(norm_layer, (type, str, types.FunctionType, functools.partial)) + norm_kwargs = {} + + # unbind partial fn, so args can be rebound later + if isinstance(norm_layer, functools.partial): + norm_kwargs.update(norm_layer.keywords) + norm_layer = norm_layer.func + + if isinstance(norm_layer, str): + layer_name = norm_layer.replace('_', '') + norm_layer = _NORM_MAP.get(layer_name, None) + elif norm_layer in _NORM_TYPES: + norm_layer = norm_layer + elif isinstance(norm_layer, types.FunctionType): + # if function type, assume it is a lambda/fn that creates a norm layer + norm_layer = norm_layer + else: + type_name = norm_layer.__name__.lower().replace('_', '') + norm_layer = _NORM_MAP.get(type_name, None) + assert norm_layer is not None, f"No equivalent norm layer for {type_name}" + + if norm_kwargs: + norm_layer = functools.partial(norm_layer, **norm_kwargs) # bind/rebind args + return norm_layer diff --git a/custom_timm/models/layers/create_norm_act.py b/custom_timm/models/layers/create_norm_act.py new file mode 100644 index 0000000000000000000000000000000000000000..78dd9a51d9fdc932e50ade613d3e8a925ae2f317 --- /dev/null +++ b/custom_timm/models/layers/create_norm_act.py @@ -0,0 +1,91 @@ +""" NormAct (Normalizaiton + Activation Layer) Factory + +Create norm + act combo modules that attempt to be backwards compatible with separate norm + act +isntances in models. Where these are used it will be possible to swap separate BN + act layers with +combined modules like IABN or EvoNorms. 
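+
+For example (an illustrative sketch)::
+
+    norm_act_layer = get_norm_act_layer('batchnorm')                   # resolves to BatchNormAct2d
+    layer = create_norm_act_layer('groupnorm', 64, act_layer=nn.ReLU)  # GroupNormAct instance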
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +import types +import functools + +from .evo_norm import * +from .filter_response_norm import FilterResponseNormAct2d, FilterResponseNormTlu2d +from .norm_act import BatchNormAct2d, GroupNormAct, LayerNormAct, LayerNormAct2d +from .inplace_abn import InplaceAbn + +_NORM_ACT_MAP = dict( + batchnorm=BatchNormAct2d, + batchnorm2d=BatchNormAct2d, + groupnorm=GroupNormAct, + groupnorm1=functools.partial(GroupNormAct, num_groups=1), + layernorm=LayerNormAct, + layernorm2d=LayerNormAct2d, + evonormb0=EvoNorm2dB0, + evonormb1=EvoNorm2dB1, + evonormb2=EvoNorm2dB2, + evonorms0=EvoNorm2dS0, + evonorms0a=EvoNorm2dS0a, + evonorms1=EvoNorm2dS1, + evonorms1a=EvoNorm2dS1a, + evonorms2=EvoNorm2dS2, + evonorms2a=EvoNorm2dS2a, + frn=FilterResponseNormAct2d, + frntlu=FilterResponseNormTlu2d, + inplaceabn=InplaceAbn, + iabn=InplaceAbn, +) +_NORM_ACT_TYPES = {m for n, m in _NORM_ACT_MAP.items()} +# has act_layer arg to define act type +_NORM_ACT_REQUIRES_ARG = { + BatchNormAct2d, GroupNormAct, LayerNormAct, LayerNormAct2d, FilterResponseNormAct2d, InplaceAbn} + + +def create_norm_act_layer(layer_name, num_features, act_layer=None, apply_act=True, jit=False, **kwargs): + layer = get_norm_act_layer(layer_name, act_layer=act_layer) + layer_instance = layer(num_features, apply_act=apply_act, **kwargs) + if jit: + layer_instance = torch.jit.script(layer_instance) + return layer_instance + + +def get_norm_act_layer(norm_layer, act_layer=None): + assert isinstance(norm_layer, (type, str, types.FunctionType, functools.partial)) + assert act_layer is None or isinstance(act_layer, (type, str, types.FunctionType, functools.partial)) + norm_act_kwargs = {} + + # unbind partial fn, so args can be rebound later + if isinstance(norm_layer, functools.partial): + norm_act_kwargs.update(norm_layer.keywords) + norm_layer = norm_layer.func + + if isinstance(norm_layer, str): + layer_name = norm_layer.replace('_', '').lower().split('-')[0] + norm_act_layer = _NORM_ACT_MAP.get(layer_name, None) + elif norm_layer in _NORM_ACT_TYPES: + norm_act_layer = norm_layer + elif isinstance(norm_layer, types.FunctionType): + # if function type, must be a lambda/fn that creates a norm_act layer + norm_act_layer = norm_layer + else: + type_name = norm_layer.__name__.lower() + if type_name.startswith('batchnorm'): + norm_act_layer = BatchNormAct2d + elif type_name.startswith('groupnorm'): + norm_act_layer = GroupNormAct + elif type_name.startswith('groupnorm1'): + norm_act_layer = functools.partial(GroupNormAct, num_groups=1) + elif type_name.startswith('layernorm2d'): + norm_act_layer = LayerNormAct2d + elif type_name.startswith('layernorm'): + norm_act_layer = LayerNormAct + else: + assert False, f"No equivalent norm_act layer for {type_name}" + + if norm_act_layer in _NORM_ACT_REQUIRES_ARG: + # pass `act_layer` through for backwards compat where `act_layer=None` implies no activation. 
+ # In the future, may force use of `apply_act` with `act_layer` arg bound to relevant NormAct types + norm_act_kwargs.setdefault('act_layer', act_layer) + if norm_act_kwargs: + norm_act_layer = functools.partial(norm_act_layer, **norm_act_kwargs) # bind/rebind args + return norm_act_layer diff --git a/custom_timm/models/layers/drop.py b/custom_timm/models/layers/drop.py new file mode 100644 index 0000000000000000000000000000000000000000..1ab1c8f5ba12bd0db2f802c6b5a5dd0296dec7af --- /dev/null +++ b/custom_timm/models/layers/drop.py @@ -0,0 +1,169 @@ +""" DropBlock, DropPath + +PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers. + +Papers: +DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890) + +Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382) + +Code: +DropBlock impl inspired by two Tensorflow impl that I liked: + - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74 + - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def drop_block_2d( + x, drop_prob: float = 0.1, block_size: int = 7, gamma_scale: float = 1.0, + with_noise: bool = False, inplace: bool = False, batchwise: bool = False): + """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf + + DropBlock with an experimental gaussian noise option. This layer has been tested on a few training + runs with success, but needs further validation and possibly optimization for lower runtime impact. + """ + B, C, H, W = x.shape + total_size = W * H + clipped_block_size = min(block_size, min(W, H)) + # seed_drop_rate, the gamma parameter + gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( + (W - block_size + 1) * (H - block_size + 1)) + + # Forces the block to be inside the feature map. + w_i, h_i = torch.meshgrid(torch.arange(W).to(x.device), torch.arange(H).to(x.device)) + valid_block = ((w_i >= clipped_block_size // 2) & (w_i < W - (clipped_block_size - 1) // 2)) & \ + ((h_i >= clipped_block_size // 2) & (h_i < H - (clipped_block_size - 1) // 2)) + valid_block = torch.reshape(valid_block, (1, 1, H, W)).to(dtype=x.dtype) + + if batchwise: + # one mask for whole batch, quite a bit faster + uniform_noise = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device) + else: + uniform_noise = torch.rand_like(x) + block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype) + block_mask = -F.max_pool2d( + -block_mask, + kernel_size=clipped_block_size, # block_size, + stride=1, + padding=clipped_block_size // 2) + + if with_noise: + normal_noise = torch.randn((1, C, H, W), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x) + if inplace: + x.mul_(block_mask).add_(normal_noise * (1 - block_mask)) + else: + x = x * block_mask + normal_noise * (1 - block_mask) + else: + normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(x.dtype) + if inplace: + x.mul_(block_mask * normalize_scale) + else: + x = x * block_mask * normalize_scale + return x + + +def drop_block_fast_2d( + x: torch.Tensor, drop_prob: float = 0.1, block_size: int = 7, + gamma_scale: float = 1.0, with_noise: bool = False, inplace: bool = False): + """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf + + DropBlock with an experimental gaussian noise option. 
Simplied from above without concern for valid + block mask at edges. + """ + B, C, H, W = x.shape + total_size = W * H + clipped_block_size = min(block_size, min(W, H)) + gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( + (W - block_size + 1) * (H - block_size + 1)) + + block_mask = torch.empty_like(x).bernoulli_(gamma) + block_mask = F.max_pool2d( + block_mask.to(x.dtype), kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2) + + if with_noise: + normal_noise = torch.empty_like(x).normal_() + if inplace: + x.mul_(1. - block_mask).add_(normal_noise * block_mask) + else: + x = x * (1. - block_mask) + normal_noise * block_mask + else: + block_mask = 1 - block_mask + normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-6)).to(dtype=x.dtype) + if inplace: + x.mul_(block_mask * normalize_scale) + else: + x = x * block_mask * normalize_scale + return x + + +class DropBlock2d(nn.Module): + """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf + """ + + def __init__( + self, + drop_prob: float = 0.1, + block_size: int = 7, + gamma_scale: float = 1.0, + with_noise: bool = False, + inplace: bool = False, + batchwise: bool = False, + fast: bool = True): + super(DropBlock2d, self).__init__() + self.drop_prob = drop_prob + self.gamma_scale = gamma_scale + self.block_size = block_size + self.with_noise = with_noise + self.inplace = inplace + self.batchwise = batchwise + self.fast = fast # FIXME finish comparisons of fast vs not + + def forward(self, x): + if not self.training or not self.drop_prob: + return x + if self.fast: + return drop_block_fast_2d( + x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace) + else: + return drop_block_2d( + x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace, self.batchwise) + + +def drop_path(x, drop_prob: float = 0., training: bool = False, scale_by_keep: bool = True): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
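+
+    A minimal usage sketch (names are illustrative, not from the original docs):
+
+        drop_path = DropPath(drop_prob=0.1)
+        x = x + drop_path(residual_branch(x))  # the whole branch is dropped per sample during training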
+ """ + def __init__(self, drop_prob: float = 0., scale_by_keep: bool = True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f'drop_prob={round(self.drop_prob,3):0.3f}' diff --git a/custom_timm/models/layers/eca.py b/custom_timm/models/layers/eca.py new file mode 100644 index 0000000000000000000000000000000000000000..e29be6ac3c95bb61229cdcdd659ec89d541f1a53 --- /dev/null +++ b/custom_timm/models/layers/eca.py @@ -0,0 +1,145 @@ +""" +ECA module from ECAnet + +paper: ECA-Net: Efficient Channel Attention for Deep Convolutional Neural Networks +https://arxiv.org/abs/1910.03151 + +Original ECA model borrowed from https://github.com/BangguWu/ECANet + +Modified circular ECA implementation and adaption for use in timm package +by Chris Ha https://github.com/VRandme + +Original License: + +MIT License + +Copyright (c) 2019 BangguWu, Qilong Wang + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +""" +import math +from torch import nn +import torch.nn.functional as F + + +from .create_act import create_act_layer +from .helpers import make_divisible + + +class EcaModule(nn.Module): + """Constructs an ECA module. + + Args: + channels: Number of channels of the input feature map for use in adaptive kernel sizes + for actual calculations according to channel. + gamma, beta: when channel is given parameters of mapping function + refer to original paper https://arxiv.org/pdf/1910.03151.pdf + (default=None. if channel size not given, use k_size given for kernel size.) 
+ kernel_size: Adaptive selection of kernel size (default=3) + gamm: used in kernel_size calc, see above + beta: used in kernel_size calc, see above + act_layer: optional non-linearity after conv, enables conv bias, this is an experiment + gate_layer: gating non-linearity to use + """ + def __init__( + self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid', + rd_ratio=1/8, rd_channels=None, rd_divisor=8, use_mlp=False): + super(EcaModule, self).__init__() + if channels is not None: + t = int(abs(math.log(channels, 2) + beta) / gamma) + kernel_size = max(t if t % 2 else t + 1, 3) + assert kernel_size % 2 == 1 + padding = (kernel_size - 1) // 2 + if use_mlp: + # NOTE 'mlp' mode is a timm experiment, not in paper + assert channels is not None + if rd_channels is None: + rd_channels = make_divisible(channels * rd_ratio, divisor=rd_divisor) + act_layer = act_layer or nn.ReLU + self.conv = nn.Conv1d(1, rd_channels, kernel_size=1, padding=0, bias=True) + self.act = create_act_layer(act_layer) + self.conv2 = nn.Conv1d(rd_channels, 1, kernel_size=kernel_size, padding=padding, bias=True) + else: + self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=padding, bias=False) + self.act = None + self.conv2 = None + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + y = x.mean((2, 3)).view(x.shape[0], 1, -1) # view for 1d conv + y = self.conv(y) + if self.conv2 is not None: + y = self.act(y) + y = self.conv2(y) + y = self.gate(y).view(x.shape[0], -1, 1, 1) + return x * y.expand_as(x) + + +EfficientChannelAttn = EcaModule # alias + + +class CecaModule(nn.Module): + """Constructs a circular ECA module. + + ECA module where the conv uses circular padding rather than zero padding. + Unlike the spatial dimension, the channels do not have inherent ordering nor + locality. Although this module in essence, applies such an assumption, it is unnecessary + to limit the channels on either "edge" from being circularly adapted to each other. + This will fundamentally increase connectivity and possibly increase performance metrics + (accuracy, robustness), without significantly impacting resource metrics + (parameter size, throughput,latency, etc) + + Args: + channels: Number of channels of the input feature map for use in adaptive kernel sizes + for actual calculations according to channel. + gamma, beta: when channel is given parameters of mapping function + refer to original paper https://arxiv.org/pdf/1910.03151.pdf + (default=None. if channel size not given, use k_size given for kernel size.) 
+ kernel_size: Adaptive selection of kernel size (default=3) + gamm: used in kernel_size calc, see above + beta: used in kernel_size calc, see above + act_layer: optional non-linearity after conv, enables conv bias, this is an experiment + gate_layer: gating non-linearity to use + """ + + def __init__(self, channels=None, kernel_size=3, gamma=2, beta=1, act_layer=None, gate_layer='sigmoid'): + super(CecaModule, self).__init__() + if channels is not None: + t = int(abs(math.log(channels, 2) + beta) / gamma) + kernel_size = max(t if t % 2 else t + 1, 3) + has_act = act_layer is not None + assert kernel_size % 2 == 1 + + # PyTorch circular padding mode is buggy as of pytorch 1.4 + # see https://github.com/pytorch/pytorch/pull/17240 + # implement manual circular padding + self.padding = (kernel_size - 1) // 2 + self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=0, bias=has_act) + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + y = x.mean((2, 3)).view(x.shape[0], 1, -1) + # Manually implement circular padding, F.pad does not seemed to be bugged + y = F.pad(y, (self.padding, self.padding), mode='circular') + y = self.conv(y) + y = self.gate(y).view(x.shape[0], -1, 1, 1) + return x * y.expand_as(x) + + +CircularEfficientChannelAttn = CecaModule diff --git a/custom_timm/models/layers/evo_norm.py b/custom_timm/models/layers/evo_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..ea77620712c80a54d943ef0b920556cbafc1f9f6 --- /dev/null +++ b/custom_timm/models/layers/evo_norm.py @@ -0,0 +1,352 @@ +""" EvoNorm in PyTorch + +Based on `Evolving Normalization-Activation Layers` - https://arxiv.org/abs/2004.02967 +@inproceedings{NEURIPS2020, + author = {Liu, Hanxiao and Brock, Andy and Simonyan, Karen and Le, Quoc}, + booktitle = {Advances in Neural Information Processing Systems}, + editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, + pages = {13539--13550}, + publisher = {Curran Associates, Inc.}, + title = {Evolving Normalization-Activation Layers}, + url = {https://proceedings.neurips.cc/paper/2020/file/9d4c03631b8b0c85ae08bf05eda37d0f-Paper.pdf}, + volume = {33}, + year = {2020} +} + +An attempt at getting decent performing EvoNorms running in PyTorch. +While faster than other PyTorch impl, still quite a ways off the built-in BatchNorm +in terms of memory usage and throughput on GPUs. + +I'm testing these modules on TPU w/ PyTorch XLA. Promising start but +currently working around some issues with builtin torch/tensor.var/std. Unlike +GPU, similar train speeds for EvoNormS variants and BatchNorm. 
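+
+Illustrative usage (a sketch, not from the original docs; the classes are defined below):
+
+    norm = EvoNorm2dS0(64, groups=32)   # S0 variant: grouped std in the denominator, sigmoid-style gating
+    y = norm(torch.randn(2, 64, 8, 8))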
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +from typing import Sequence, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .create_act import create_act_layer +from .trace_utils import _assert + + +def instance_std(x, eps: float = 1e-5): + std = x.float().var(dim=(2, 3), unbiased=False, keepdim=True).add(eps).sqrt().to(x.dtype) + return std.expand(x.shape) + + +def instance_std_tpu(x, eps: float = 1e-5): + std = manual_var(x, dim=(2, 3)).add(eps).sqrt() + return std.expand(x.shape) +# instance_std = instance_std_tpu + + +def instance_rms(x, eps: float = 1e-5): + rms = x.float().square().mean(dim=(2, 3), keepdim=True).add(eps).sqrt().to(x.dtype) + return rms.expand(x.shape) + + +def manual_var(x, dim: Union[int, Sequence[int]], diff_sqm: bool = False): + xm = x.mean(dim=dim, keepdim=True) + if diff_sqm: + # difference of squared mean and mean squared, faster on TPU can be less stable + var = ((x * x).mean(dim=dim, keepdim=True) - (xm * xm)).clamp(0) + else: + var = ((x - xm) * (x - xm)).mean(dim=dim, keepdim=True) + return var + + +def group_std(x, groups: int = 32, eps: float = 1e-5, flatten: bool = False): + B, C, H, W = x.shape + x_dtype = x.dtype + _assert(C % groups == 0, '') + if flatten: + x = x.reshape(B, groups, -1) # FIXME simpler shape causing TPU / XLA issues + std = x.float().var(dim=2, unbiased=False, keepdim=True).add(eps).sqrt().to(x_dtype) + else: + x = x.reshape(B, groups, C // groups, H, W) + std = x.float().var(dim=(2, 3, 4), unbiased=False, keepdim=True).add(eps).sqrt().to(x_dtype) + return std.expand(x.shape).reshape(B, C, H, W) + + +def group_std_tpu(x, groups: int = 32, eps: float = 1e-5, diff_sqm: bool = False, flatten: bool = False): + # This is a workaround for some stability / odd behaviour of .var and .std + # running on PyTorch XLA w/ TPUs. 
These manual var impl are producing much better results + B, C, H, W = x.shape + _assert(C % groups == 0, '') + if flatten: + x = x.reshape(B, groups, -1) # FIXME simpler shape causing TPU / XLA issues + var = manual_var(x, dim=-1, diff_sqm=diff_sqm) + else: + x = x.reshape(B, groups, C // groups, H, W) + var = manual_var(x, dim=(2, 3, 4), diff_sqm=diff_sqm) + return var.add(eps).sqrt().expand(x.shape).reshape(B, C, H, W) +#group_std = group_std_tpu # FIXME TPU temporary + + +def group_rms(x, groups: int = 32, eps: float = 1e-5): + B, C, H, W = x.shape + _assert(C % groups == 0, '') + x_dtype = x.dtype + x = x.reshape(B, groups, C // groups, H, W) + rms = x.float().square().mean(dim=(2, 3, 4), keepdim=True).add(eps).sqrt_().to(x_dtype) + return rms.expand(x.shape).reshape(B, C, H, W) + + +class EvoNorm2dB0(nn.Module): + def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-3, **_): + super().__init__() + self.apply_act = apply_act # apply activation (non-linearity) + self.momentum = momentum + self.eps = eps + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + self.v = nn.Parameter(torch.ones(num_features)) if apply_act else None + self.register_buffer('running_var', torch.ones(num_features)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + if self.v is not None: + nn.init.ones_(self.v) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + if self.v is not None: + if self.training: + var = x.float().var(dim=(0, 2, 3), unbiased=False) + # var = manual_var(x, dim=(0, 2, 3)).squeeze() + n = x.numel() / x.shape[1] + self.running_var.copy_( + self.running_var * (1 - self.momentum) + + var.detach() * self.momentum * (n / (n - 1))) + else: + var = self.running_var + left = var.add(self.eps).sqrt_().to(x_dtype).view(v_shape).expand_as(x) + v = self.v.to(x_dtype).view(v_shape) + right = x * v + instance_std(x, self.eps) + x = x / left.max(right) + return x * self.weight.to(x_dtype).view(v_shape) + self.bias.to(x_dtype).view(v_shape) + + +class EvoNorm2dB1(nn.Module): + def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-5, **_): + super().__init__() + self.apply_act = apply_act # apply activation (non-linearity) + self.momentum = momentum + self.eps = eps + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + self.register_buffer('running_var', torch.ones(num_features)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + if self.apply_act: + if self.training: + var = x.float().var(dim=(0, 2, 3), unbiased=False) + n = x.numel() / x.shape[1] + self.running_var.copy_( + self.running_var * (1 - self.momentum) + + var.detach().to(self.running_var.dtype) * self.momentum * (n / (n - 1))) + else: + var = self.running_var + var = var.to(x_dtype).view(v_shape) + left = var.add(self.eps).sqrt_() + right = (x + 1) * instance_rms(x, self.eps) + x = x / left.max(right) + return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) + + +class EvoNorm2dB2(nn.Module): + def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-5, **_): + super().__init__() + self.apply_act = apply_act # apply activation (non-linearity) + 
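+        # B2 differs from B1 only in the right branch of the max() in forward:
+        # instance_rms(x) - x rather than (x + 1) * instance_rms(x)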
self.momentum = momentum + self.eps = eps + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + self.register_buffer('running_var', torch.ones(num_features)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + if self.apply_act: + if self.training: + var = x.float().var(dim=(0, 2, 3), unbiased=False) + n = x.numel() / x.shape[1] + self.running_var.copy_( + self.running_var * (1 - self.momentum) + + var.detach().to(self.running_var.dtype) * self.momentum * (n / (n - 1))) + else: + var = self.running_var + var = var.to(x_dtype).view(v_shape) + left = var.add(self.eps).sqrt_() + right = instance_rms(x, self.eps) - x + x = x / left.max(right) + return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) + + +class EvoNorm2dS0(nn.Module): + def __init__(self, num_features, groups=32, group_size=None, apply_act=True, eps=1e-5, **_): + super().__init__() + self.apply_act = apply_act # apply activation (non-linearity) + if group_size: + assert num_features % group_size == 0 + self.groups = num_features // group_size + else: + self.groups = groups + self.eps = eps + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + self.v = nn.Parameter(torch.ones(num_features)) if apply_act else None + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + if self.v is not None: + nn.init.ones_(self.v) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + if self.v is not None: + v = self.v.view(v_shape).to(x_dtype) + x = x * (x * v).sigmoid() / group_std(x, self.groups, self.eps) + return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) + + +class EvoNorm2dS0a(EvoNorm2dS0): + def __init__(self, num_features, groups=32, group_size=None, apply_act=True, eps=1e-3, **_): + super().__init__( + num_features, groups=groups, group_size=group_size, apply_act=apply_act, eps=eps) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + d = group_std(x, self.groups, self.eps) + if self.v is not None: + v = self.v.view(v_shape).to(x_dtype) + x = x * (x * v).sigmoid() + x = x / d + return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) + + +class EvoNorm2dS1(nn.Module): + def __init__( + self, num_features, groups=32, group_size=None, + apply_act=True, act_layer=None, eps=1e-5, **_): + super().__init__() + act_layer = act_layer or nn.SiLU + self.apply_act = apply_act # apply activation (non-linearity) + if act_layer is not None and apply_act: + self.act = create_act_layer(act_layer) + else: + self.act = nn.Identity() + if group_size: + assert num_features % group_size == 0 + self.groups = num_features // group_size + else: + self.groups = groups + self.eps = eps + self.pre_act_norm = False + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + if self.apply_act: + x = self.act(x) 
/ group_std(x, self.groups, self.eps) + return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) + + +class EvoNorm2dS1a(EvoNorm2dS1): + def __init__( + self, num_features, groups=32, group_size=None, + apply_act=True, act_layer=None, eps=1e-3, **_): + super().__init__( + num_features, groups=groups, group_size=group_size, apply_act=apply_act, act_layer=act_layer, eps=eps) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + x = self.act(x) / group_std(x, self.groups, self.eps) + return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) + + +class EvoNorm2dS2(nn.Module): + def __init__( + self, num_features, groups=32, group_size=None, + apply_act=True, act_layer=None, eps=1e-5, **_): + super().__init__() + act_layer = act_layer or nn.SiLU + self.apply_act = apply_act # apply activation (non-linearity) + if act_layer is not None and apply_act: + self.act = create_act_layer(act_layer) + else: + self.act = nn.Identity() + if group_size: + assert num_features % group_size == 0 + self.groups = num_features // group_size + else: + self.groups = groups + self.eps = eps + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + if self.apply_act: + x = self.act(x) / group_rms(x, self.groups, self.eps) + return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) + + +class EvoNorm2dS2a(EvoNorm2dS2): + def __init__( + self, num_features, groups=32, group_size=None, + apply_act=True, act_layer=None, eps=1e-3, **_): + super().__init__( + num_features, groups=groups, group_size=group_size, apply_act=apply_act, act_layer=act_layer, eps=eps) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + x = self.act(x) / group_rms(x, self.groups, self.eps) + return x * self.weight.view(v_shape).to(x_dtype) + self.bias.view(v_shape).to(x_dtype) diff --git a/custom_timm/models/layers/fast_norm.py b/custom_timm/models/layers/fast_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..fb35e47df6798175945d1e0bda4c7792345100c2 --- /dev/null +++ b/custom_timm/models/layers/fast_norm.py @@ -0,0 +1,78 @@ +""" 'Fast' Normalization Functions + +For GroupNorm and LayerNorm these functions bypass typical AMP upcast to float32. 
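+
+Illustrative usage (sketch; the helpers below control the behaviour):
+
+    set_fast_norm(True)   # opt in to the lower-precision (no-upcast) path
+    assert is_fast_norm()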
+ +Additionally, for LayerNorm, the APEX fused LN is used if available (which also does not upcast) + +Hacked together by / Copyright 2022 Ross Wightman +""" +from typing import List, Optional + +import torch +from torch.nn import functional as F + +try: + from apex.normalization.fused_layer_norm import fused_layer_norm_affine + has_apex = True +except ImportError: + has_apex = False + + +# fast (ie lower precision LN) can be disabled with this flag if issues crop up +_USE_FAST_NORM = False # defaulting to False for now + + +def is_fast_norm(): + return _USE_FAST_NORM + + +def set_fast_norm(enable=True): + global _USE_FAST_NORM + _USE_FAST_NORM = enable + + +def fast_group_norm( + x: torch.Tensor, + num_groups: int, + weight: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + eps: float = 1e-5 +) -> torch.Tensor: + if torch.jit.is_scripting(): + # currently cannot use is_autocast_enabled within torchscript + return F.group_norm(x, num_groups, weight, bias, eps) + + if torch.is_autocast_enabled(): + # normally native AMP casts GN inputs to float32 + # here we use the low precision autocast dtype + # FIXME what to do re CPU autocast? + dt = torch.get_autocast_gpu_dtype() + x, weight, bias = x.to(dt), weight.to(dt), bias.to(dt) + + with torch.cuda.amp.autocast(enabled=False): + return F.group_norm(x, num_groups, weight, bias, eps) + + +def fast_layer_norm( + x: torch.Tensor, + normalized_shape: List[int], + weight: Optional[torch.Tensor] = None, + bias: Optional[torch.Tensor] = None, + eps: float = 1e-5 +) -> torch.Tensor: + if torch.jit.is_scripting(): + # currently cannot use is_autocast_enabled within torchscript + return F.layer_norm(x, normalized_shape, weight, bias, eps) + + if has_apex: + return fused_layer_norm_affine(x, weight, bias, normalized_shape, eps) + + if torch.is_autocast_enabled(): + # normally native AMP casts LN inputs to float32 + # apex LN does not, this is behaving like Apex + dt = torch.get_autocast_gpu_dtype() + # FIXME what to do re CPU autocast? 
+ x, weight, bias = x.to(dt), weight.to(dt), bias.to(dt) + + with torch.cuda.amp.autocast(enabled=False): + return F.layer_norm(x, normalized_shape, weight, bias, eps) diff --git a/custom_timm/models/layers/filter_response_norm.py b/custom_timm/models/layers/filter_response_norm.py new file mode 100644 index 0000000000000000000000000000000000000000..a66a1cd493e4cecec27419925a6a2045bb05f25f --- /dev/null +++ b/custom_timm/models/layers/filter_response_norm.py @@ -0,0 +1,68 @@ +""" Filter Response Norm in PyTorch + +Based on `Filter Response Normalization Layer` - https://arxiv.org/abs/1911.09737 + +Hacked together by / Copyright 2021 Ross Wightman +""" +import torch +import torch.nn as nn + +from .create_act import create_act_layer +from .trace_utils import _assert + + +def inv_instance_rms(x, eps: float = 1e-5): + rms = x.square().float().mean(dim=(2, 3), keepdim=True).add(eps).rsqrt().to(x.dtype) + return rms.expand(x.shape) + + +class FilterResponseNormTlu2d(nn.Module): + def __init__(self, num_features, apply_act=True, eps=1e-5, rms=True, **_): + super(FilterResponseNormTlu2d, self).__init__() + self.apply_act = apply_act # apply activation (non-linearity) + self.rms = rms + self.eps = eps + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + self.tau = nn.Parameter(torch.zeros(num_features)) if apply_act else None + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + if self.tau is not None: + nn.init.zeros_(self.tau) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + x = x * inv_instance_rms(x, self.eps) + x = x * self.weight.view(v_shape).to(dtype=x_dtype) + self.bias.view(v_shape).to(dtype=x_dtype) + return torch.maximum(x, self.tau.reshape(v_shape).to(dtype=x_dtype)) if self.tau is not None else x + + +class FilterResponseNormAct2d(nn.Module): + def __init__(self, num_features, apply_act=True, act_layer=nn.ReLU, inplace=None, rms=True, eps=1e-5, **_): + super(FilterResponseNormAct2d, self).__init__() + if act_layer is not None and apply_act: + self.act = create_act_layer(act_layer, inplace=inplace) + else: + self.act = nn.Identity() + self.rms = rms + self.eps = eps + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + + def forward(self, x): + _assert(x.dim() == 4, 'expected 4D input') + x_dtype = x.dtype + v_shape = (1, -1, 1, 1) + x = x * inv_instance_rms(x, self.eps) + x = x * self.weight.view(v_shape).to(dtype=x_dtype) + self.bias.view(v_shape).to(dtype=x_dtype) + return self.act(x) diff --git a/custom_timm/models/layers/gather_excite.py b/custom_timm/models/layers/gather_excite.py new file mode 100644 index 0000000000000000000000000000000000000000..2d60dc961e2b5e135d38e290b8fa5820ef0fe18f --- /dev/null +++ b/custom_timm/models/layers/gather_excite.py @@ -0,0 +1,90 @@ +""" Gather-Excite Attention Block + +Paper: `Gather-Excite: Exploiting Feature Context in CNNs` - https://arxiv.org/abs/1810.12348 + +Official code here, but it's only partial impl in Caffe: https://github.com/hujie-frank/GENet + +I've tried to support all of the extent both w/ and w/o params. I don't believe I've seen another +impl that covers all of the cases. 
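+
+Illustrative usage (a sketch, not from the original docs; assumes `torch` is imported):
+
+    ge = GatherExcite(64)            # parameter-free global gather (SE-like)
+    ge = GatherExcite(64, extent=4)  # avg-pool gather over 7x7 windows with stride 4
+    y = ge(torch.randn(2, 64, 32, 32))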
+ +NOTE: extent=0 + extra_params=False is equivalent to Squeeze-and-Excitation + +Hacked together by / Copyright 2021 Ross Wightman +""" +import math + +from torch import nn as nn +import torch.nn.functional as F + +from .create_act import create_act_layer, get_act_layer +from .create_conv2d import create_conv2d +from .helpers import make_divisible +from .mlp import ConvMlp + + +class GatherExcite(nn.Module): + """ Gather-Excite Attention Module + """ + def __init__( + self, channels, feat_size=None, extra_params=False, extent=0, use_mlp=True, + rd_ratio=1./16, rd_channels=None, rd_divisor=1, add_maxpool=False, + act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, gate_layer='sigmoid'): + super(GatherExcite, self).__init__() + self.add_maxpool = add_maxpool + act_layer = get_act_layer(act_layer) + self.extent = extent + if extra_params: + self.gather = nn.Sequential() + if extent == 0: + assert feat_size is not None, 'spatial feature size must be specified for global extent w/ params' + self.gather.add_module( + 'conv1', create_conv2d(channels, channels, kernel_size=feat_size, stride=1, depthwise=True)) + if norm_layer: + self.gather.add_module(f'norm1', nn.BatchNorm2d(channels)) + else: + assert extent % 2 == 0 + num_conv = int(math.log2(extent)) + for i in range(num_conv): + self.gather.add_module( + f'conv{i + 1}', + create_conv2d(channels, channels, kernel_size=3, stride=2, depthwise=True)) + if norm_layer: + self.gather.add_module(f'norm{i + 1}', nn.BatchNorm2d(channels)) + if i != num_conv - 1: + self.gather.add_module(f'act{i + 1}', act_layer(inplace=True)) + else: + self.gather = None + if self.extent == 0: + self.gk = 0 + self.gs = 0 + else: + assert extent % 2 == 0 + self.gk = self.extent * 2 - 1 + self.gs = self.extent + + if not rd_channels: + rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
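+        # 'excite' step: optional channel MLP followed by a gating non-linearity on the gathered context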
+ self.mlp = ConvMlp(channels, rd_channels, act_layer=act_layer) if use_mlp else nn.Identity() + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + size = x.shape[-2:] + if self.gather is not None: + x_ge = self.gather(x) + else: + if self.extent == 0: + # global extent + x_ge = x.mean(dim=(2, 3), keepdims=True) + if self.add_maxpool: + # experimental codepath, may remove or change + x_ge = 0.5 * x_ge + 0.5 * x.amax((2, 3), keepdim=True) + else: + x_ge = F.avg_pool2d( + x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2, count_include_pad=False) + if self.add_maxpool: + # experimental codepath, may remove or change + x_ge = 0.5 * x_ge + 0.5 * F.max_pool2d(x, kernel_size=self.gk, stride=self.gs, padding=self.gk // 2) + x_ge = self.mlp(x_ge) + if x_ge.shape[-1] != 1 or x_ge.shape[-2] != 1: + x_ge = F.interpolate(x_ge, size=size) + return x * self.gate(x_ge) diff --git a/custom_timm/models/layers/global_context.py b/custom_timm/models/layers/global_context.py new file mode 100644 index 0000000000000000000000000000000000000000..de7fb5c15f08a5c2fe42cb7c174fff92d6b0d3bf --- /dev/null +++ b/custom_timm/models/layers/global_context.py @@ -0,0 +1,67 @@ +""" Global Context Attention Block + +Paper: `GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond` + - https://arxiv.org/abs/1904.11492 + +Official code consulted as reference: https://github.com/xvjiarui/GCNet + +Hacked together by / Copyright 2021 Ross Wightman +""" +from torch import nn as nn +import torch.nn.functional as F + +from .create_act import create_act_layer, get_act_layer +from .helpers import make_divisible +from .mlp import ConvMlp +from .norm import LayerNorm2d + + +class GlobalContext(nn.Module): + + def __init__(self, channels, use_attn=True, fuse_add=False, fuse_scale=True, init_last_zero=False, + rd_ratio=1./8, rd_channels=None, rd_divisor=1, act_layer=nn.ReLU, gate_layer='sigmoid'): + super(GlobalContext, self).__init__() + act_layer = get_act_layer(act_layer) + + self.conv_attn = nn.Conv2d(channels, 1, kernel_size=1, bias=True) if use_attn else None + + if rd_channels is None: + rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
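+        # the context is fused back additively (mlp_add) and/or as a gated scale (mlp_scale), see forward()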
+ if fuse_add: + self.mlp_add = ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d) + else: + self.mlp_add = None + if fuse_scale: + self.mlp_scale = ConvMlp(channels, rd_channels, act_layer=act_layer, norm_layer=LayerNorm2d) + else: + self.mlp_scale = None + + self.gate = create_act_layer(gate_layer) + self.init_last_zero = init_last_zero + self.reset_parameters() + + def reset_parameters(self): + if self.conv_attn is not None: + nn.init.kaiming_normal_(self.conv_attn.weight, mode='fan_in', nonlinearity='relu') + if self.mlp_add is not None: + nn.init.zeros_(self.mlp_add.fc2.weight) + + def forward(self, x): + B, C, H, W = x.shape + + if self.conv_attn is not None: + attn = self.conv_attn(x).reshape(B, 1, H * W) # (B, 1, H * W) + attn = F.softmax(attn, dim=-1).unsqueeze(3) # (B, 1, H * W, 1) + context = x.reshape(B, C, H * W).unsqueeze(1) @ attn + context = context.view(B, C, 1, 1) + else: + context = x.mean(dim=(2, 3), keepdim=True) + + if self.mlp_scale is not None: + mlp_x = self.mlp_scale(context) + x = x * self.gate(mlp_x) + if self.mlp_add is not None: + mlp_x = self.mlp_add(context) + x = x + mlp_x + + return x diff --git a/custom_timm/models/layers/halo_attn.py b/custom_timm/models/layers/halo_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..f2ac64f85e08a24646434fc0a995afa0fd9b9ee7 --- /dev/null +++ b/custom_timm/models/layers/halo_attn.py @@ -0,0 +1,233 @@ +""" Halo Self Attention + +Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones` + - https://arxiv.org/abs/2103.12731 + +@misc{2103.12731, +Author = {Ashish Vaswani and Prajit Ramachandran and Aravind Srinivas and Niki Parmar and Blake Hechtman and + Jonathon Shlens}, +Title = {Scaling Local Self-Attention for Parameter Efficient Visual Backbones}, +Year = {2021}, +} + +Status: +This impl is a WIP, there is no official ref impl and some details in paper weren't clear to me. +The attention mechanism works but it's slow as implemented. 
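+
+Shape intuition (added note, derived from the defaults below): with block_size=8 and halo_size=3,
+each 8x8 query block attends to a (8 + 2*3) x (8 + 2*3) = 14x14 key/value window.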
+ +Hacked together by / Copyright 2021 Ross Wightman +""" +from typing import List + +import torch +from torch import nn +import torch.nn.functional as F + +from .helpers import make_divisible +from .weight_init import trunc_normal_ +from .trace_utils import _assert + + +def rel_logits_1d(q, rel_k, permute_mask: List[int]): + """ Compute relative logits along one dimension + + As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 + Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 + + Args: + q: (batch, height, width, dim) + rel_k: (2 * window - 1, dim) + permute_mask: permute output dim according to this + """ + B, H, W, dim = q.shape + rel_size = rel_k.shape[0] + win_size = (rel_size + 1) // 2 + + x = (q @ rel_k.transpose(-1, -2)) + x = x.reshape(-1, W, rel_size) + + # pad to shift from relative to absolute indexing + x_pad = F.pad(x, [0, 1]).flatten(1) + x_pad = F.pad(x_pad, [0, rel_size - W]) + + # reshape and slice out the padded elements + x_pad = x_pad.reshape(-1, W + 1, rel_size) + x = x_pad[:, :W, win_size - 1:] + + # reshape and tile + x = x.reshape(B, H, 1, W, win_size).expand(-1, -1, win_size, -1, -1) + return x.permute(permute_mask) + + +class PosEmbedRel(nn.Module): + """ Relative Position Embedding + As per: https://gist.github.com/aravindsrinivas/56359b79f0ce4449bcb04ab4b56a57a2 + Originally from: `Attention Augmented Convolutional Networks` - https://arxiv.org/abs/1904.09925 + + """ + def __init__(self, block_size, win_size, dim_head, scale): + """ + Args: + block_size (int): block size + win_size (int): neighbourhood window size + dim_head (int): attention head dim + scale (float): scale factor (for init) + """ + super().__init__() + self.block_size = block_size + self.dim_head = dim_head + self.height_rel = nn.Parameter(torch.randn(win_size * 2 - 1, dim_head) * scale) + self.width_rel = nn.Parameter(torch.randn(win_size * 2 - 1, dim_head) * scale) + + def forward(self, q): + B, BB, HW, _ = q.shape + + # relative logits in width dimension. + q = q.reshape(-1, self.block_size, self.block_size, self.dim_head) + rel_logits_w = rel_logits_1d(q, self.width_rel, permute_mask=(0, 1, 3, 2, 4)) + + # relative logits in height dimension. + q = q.transpose(1, 2) + rel_logits_h = rel_logits_1d(q, self.height_rel, permute_mask=(0, 3, 1, 4, 2)) + + rel_logits = rel_logits_h + rel_logits_w + rel_logits = rel_logits.reshape(B, BB, HW, -1) + return rel_logits + + +class HaloAttn(nn.Module): + """ Halo Attention + + Paper: `Scaling Local Self-Attention for Parameter Efficient Visual Backbones` + - https://arxiv.org/abs/2103.12731 + + The internal dimensions of the attention module are controlled by the interaction of several arguments. + * the output dimension of the module is specified by dim_out, which falls back to input dim if not set + * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim + * the query and key (qk) dimensions are determined by + * num_heads * dim_head if dim_head is not None + * num_heads * (dim_out * attn_ratio // num_heads) if dim_head is None + * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not used + + Args: + dim (int): input dimension to the module + dim_out (int): output dimension of the module, same as dim if not set + feat_size (Tuple[int, int]): size of input feature_map (not used, for arg compat with bottle/lambda) + stride: output stride of the module, query downscaled if > 1 (default: 1). 
+ num_heads: parallel attention heads (default: 8). + dim_head: dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set + block_size (int): size of blocks. (default: 8) + halo_size (int): size of halo overlap. (default: 3) + qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. (default: 1.0) + qkv_bias (bool) : add bias to q, k, and v projections + avg_down (bool): use average pool downsample instead of strided query blocks + scale_pos_embed (bool): scale the position embedding as well as Q @ K + """ + def __init__( + self, dim, dim_out=None, feat_size=None, stride=1, num_heads=8, dim_head=None, block_size=8, halo_size=3, + qk_ratio=1.0, qkv_bias=False, avg_down=False, scale_pos_embed=False): + super().__init__() + dim_out = dim_out or dim + assert dim_out % num_heads == 0 + assert stride in (1, 2) + self.num_heads = num_heads + self.dim_head_qk = dim_head or make_divisible(dim_out * qk_ratio, divisor=8) // num_heads + self.dim_head_v = dim_out // self.num_heads + self.dim_out_qk = num_heads * self.dim_head_qk + self.dim_out_v = num_heads * self.dim_head_v + self.scale = self.dim_head_qk ** -0.5 + self.scale_pos_embed = scale_pos_embed + self.block_size = self.block_size_ds = block_size + self.halo_size = halo_size + self.win_size = block_size + halo_size * 2 # neighbourhood window size + self.block_stride = 1 + use_avg_pool = False + if stride > 1: + use_avg_pool = avg_down or block_size % stride != 0 + self.block_stride = 1 if use_avg_pool else stride + self.block_size_ds = self.block_size // self.block_stride + + # FIXME not clear if this stride behaviour is what the paper intended + # Also, the paper mentions using a 3D conv for dealing with the blocking/gather, and leaving + # data in unfolded block form. I haven't wrapped my head around how that'd look. + self.q = nn.Conv2d(dim, self.dim_out_qk, 1, stride=self.block_stride, bias=qkv_bias) + self.kv = nn.Conv2d(dim, self.dim_out_qk + self.dim_out_v, 1, bias=qkv_bias) + + self.pos_embed = PosEmbedRel( + block_size=self.block_size_ds, win_size=self.win_size, dim_head=self.dim_head_qk, scale=self.scale) + + self.pool = nn.AvgPool2d(2, 2) if use_avg_pool else nn.Identity() + + self.reset_parameters() + + def reset_parameters(self): + std = self.q.weight.shape[1] ** -0.5 # fan-in + trunc_normal_(self.q.weight, std=std) + trunc_normal_(self.kv.weight, std=std) + trunc_normal_(self.pos_embed.height_rel, std=self.scale) + trunc_normal_(self.pos_embed.width_rel, std=self.scale) + + def forward(self, x): + B, C, H, W = x.shape + _assert(H % self.block_size == 0, '') + _assert(W % self.block_size == 0, '') + num_h_blocks = H // self.block_size + num_w_blocks = W // self.block_size + num_blocks = num_h_blocks * num_w_blocks + + q = self.q(x) + # unfold + q = q.reshape( + -1, self.dim_head_qk, + num_h_blocks, self.block_size_ds, num_w_blocks, self.block_size_ds).permute(0, 1, 3, 5, 2, 4) + # B, num_heads * dim_head * block_size ** 2, num_blocks + q = q.reshape(B * self.num_heads, self.dim_head_qk, -1, num_blocks).transpose(1, 3) + # B * num_heads, num_blocks, block_size ** 2, dim_head + + kv = self.kv(x) + # Generate overlapping windows for kv. This approach is good for GPU and CPU. However, unfold() is not + # lowered for PyTorch XLA so it will be very slow. See code at bottom of file for XLA friendly approach. + # FIXME figure out how to switch impl between this and conv2d if XLA being used. 
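+        # pad H and W by halo_size on each side so every block can gather the halo around it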
+ kv = F.pad(kv, [self.halo_size, self.halo_size, self.halo_size, self.halo_size]) + kv = kv.unfold(2, self.win_size, self.block_size).unfold(3, self.win_size, self.block_size).reshape( + B * self.num_heads, self.dim_head_qk + self.dim_head_v, num_blocks, -1).permute(0, 2, 3, 1) + k, v = torch.split(kv, [self.dim_head_qk, self.dim_head_v], dim=-1) + # B * num_heads, num_blocks, win_size ** 2, dim_head_qk or dim_head_v + + if self.scale_pos_embed: + attn = (q @ k.transpose(-1, -2) + self.pos_embed(q)) * self.scale + else: + attn = (q @ k.transpose(-1, -2)) * self.scale + self.pos_embed(q) + # B * num_heads, num_blocks, block_size ** 2, win_size ** 2 + attn = attn.softmax(dim=-1) + + out = (attn @ v).transpose(1, 3) # B * num_heads, dim_head_v, block_size ** 2, num_blocks + # fold + out = out.reshape(-1, self.block_size_ds, self.block_size_ds, num_h_blocks, num_w_blocks) + out = out.permute(0, 3, 1, 4, 2).contiguous().view( + B, self.dim_out_v, H // self.block_stride, W // self.block_stride) + # B, dim_out, H // block_stride, W // block_stride + out = self.pool(out) + return out + + +""" Three alternatives for overlapping windows. + +`.unfold().unfold()` is same speed as stride tricks with similar clarity as F.unfold() + + if is_xla: + # This code achieves haloing on PyTorch XLA with reasonable runtime trade-off, it is + # EXTREMELY slow for backward on a GPU though so I need a way of selecting based on environment. + WW = self.win_size ** 2 + pw = torch.eye(WW, dtype=x.dtype, device=x.device).reshape(WW, 1, self.win_size, self.win_size) + kv = F.conv2d(kv.reshape(-1, 1, H, W), pw, stride=self.block_size, padding=self.halo_size) + elif self.stride_tricks: + kv = F.pad(kv, [self.halo_size, self.halo_size, self.halo_size, self.halo_size]).contiguous() + kv = kv.as_strided(( + B, self.dim_out_qk + self.dim_out_v, self.win_size, self.win_size, num_h_blocks, num_w_blocks), + stride=(kv.stride(0), kv.stride(1), kv.shape[-1], 1, self.block_size * kv.shape[-1], self.block_size)) + else: + kv = F.unfold(kv, kernel_size=self.win_size, stride=self.block_size, padding=self.halo_size) + + kv = kv.reshape( + B * self.num_heads, self.dim_head_qk + self.dim_head_v, -1, num_blocks).transpose(1, 3) +""" diff --git a/custom_timm/models/layers/helpers.py b/custom_timm/models/layers/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..2fa296bcdd110dabe6c442238224132b6d91c79a --- /dev/null +++ b/custom_timm/models/layers/helpers.py @@ -0,0 +1,43 @@ +""" Layer/Module Helpers + +Hacked together by / Copyright 2020 Ross Wightman +""" +from itertools import repeat +import collections.abc + + +# From PyTorch internals +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return x + return tuple(repeat(x, n)) + return parse + + +to_1tuple = _ntuple(1) +to_2tuple = _ntuple(2) +to_3tuple = _ntuple(3) +to_4tuple = _ntuple(4) +to_ntuple = _ntuple + + +def make_divisible(v, divisor=8, min_value=None, round_limit=.9): + min_value = min_value or divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
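+    # e.g. make_divisible(30, 8) -> 32 and make_divisible(22, 8) -> 24 (illustrative values)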
+ if new_v < round_limit * v: + new_v += divisor + return new_v + + +def extend_tuple(x, n): + # pdas a tuple to specified n by padding with last value + if not isinstance(x, (tuple, list)): + x = (x,) + else: + x = tuple(x) + pad_n = n - len(x) + if pad_n <= 0: + return x[:n] + return x + (x[-1],) * pad_n diff --git a/custom_timm/models/layers/inplace_abn.py b/custom_timm/models/layers/inplace_abn.py new file mode 100644 index 0000000000000000000000000000000000000000..a80889339ebb992c11f84a286c3fd7a627776faa --- /dev/null +++ b/custom_timm/models/layers/inplace_abn.py @@ -0,0 +1,87 @@ +import torch +from torch import nn as nn + +try: + from inplace_abn.functions import inplace_abn, inplace_abn_sync + has_iabn = True +except ImportError: + has_iabn = False + + def inplace_abn(x, weight, bias, running_mean, running_var, + training=True, momentum=0.1, eps=1e-05, activation="leaky_relu", activation_param=0.01): + raise ImportError( + "Please install InplaceABN:'pip install git+https://github.com/mapillary/inplace_abn.git@v1.0.12'") + + def inplace_abn_sync(**kwargs): + inplace_abn(**kwargs) + + +class InplaceAbn(nn.Module): + """Activated Batch Normalization + + This gathers a BatchNorm and an activation function in a single module + + Parameters + ---------- + num_features : int + Number of feature channels in the input and output. + eps : float + Small constant to prevent numerical issues. + momentum : float + Momentum factor applied to compute running statistics. + affine : bool + If `True` apply learned scale and shift transformation after normalization. + act_layer : str or nn.Module type + Name or type of the activation functions, one of: `leaky_relu`, `elu` + act_param : float + Negative slope for the `leaky_relu` activation. + """ + + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, apply_act=True, + act_layer="leaky_relu", act_param=0.01, drop_layer=None): + super(InplaceAbn, self).__init__() + self.num_features = num_features + self.affine = affine + self.eps = eps + self.momentum = momentum + if apply_act: + if isinstance(act_layer, str): + assert act_layer in ('leaky_relu', 'elu', 'identity', '') + self.act_name = act_layer if act_layer else 'identity' + else: + # convert act layer passed as type to string + if act_layer == nn.ELU: + self.act_name = 'elu' + elif act_layer == nn.LeakyReLU: + self.act_name = 'leaky_relu' + elif act_layer is None or act_layer == nn.Identity: + self.act_name = 'identity' + else: + assert False, f'Invalid act layer {act_layer.__name__} for IABN' + else: + self.act_name = 'identity' + self.act_param = act_param + if self.affine: + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + self.register_buffer('running_mean', torch.zeros(num_features)) + self.register_buffer('running_var', torch.ones(num_features)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.constant_(self.running_mean, 0) + nn.init.constant_(self.running_var, 1) + if self.affine: + nn.init.constant_(self.weight, 1) + nn.init.constant_(self.bias, 0) + + def forward(self, x): + output = inplace_abn( + x, self.weight, self.bias, self.running_mean, self.running_var, + self.training, self.momentum, self.eps, self.act_name, self.act_param) + if isinstance(output, tuple): + output = output[0] + return output diff --git a/custom_timm/models/layers/lambda_layer.py b/custom_timm/models/layers/lambda_layer.py new 
file mode 100644 index 0000000000000000000000000000000000000000..e50b43c8c55a371fd7466dcb58bb329ec652b131 --- /dev/null +++ b/custom_timm/models/layers/lambda_layer.py @@ -0,0 +1,133 @@ +""" Lambda Layer + +Paper: `LambdaNetworks: Modeling Long-Range Interactions Without Attention` + - https://arxiv.org/abs/2102.08602 + +@misc{2102.08602, +Author = {Irwan Bello}, +Title = {LambdaNetworks: Modeling Long-Range Interactions Without Attention}, +Year = {2021}, +} + +Status: +This impl is a WIP. Code snippets in the paper were used as reference but +good chance some details are missing/wrong. + +I've only implemented local lambda conv based pos embeddings. + +For a PyTorch impl that includes other embedding options checkout +https://github.com/lucidrains/lambda-networks + +Hacked together by / Copyright 2021 Ross Wightman +""" +import torch +from torch import nn +import torch.nn.functional as F + +from .helpers import to_2tuple, make_divisible +from .weight_init import trunc_normal_ + + +def rel_pos_indices(size): + size = to_2tuple(size) + pos = torch.stack(torch.meshgrid(torch.arange(size[0]), torch.arange(size[1]))).flatten(1) + rel_pos = pos[:, None, :] - pos[:, :, None] + rel_pos[0] += size[0] - 1 + rel_pos[1] += size[1] - 1 + return rel_pos # 2, H * W, H * W + + +class LambdaLayer(nn.Module): + """Lambda Layer + + Paper: `LambdaNetworks: Modeling Long-Range Interactions Without Attention` + - https://arxiv.org/abs/2102.08602 + + NOTE: intra-depth parameter 'u' is fixed at 1. It did not appear worth the complexity to add. + + The internal dimensions of the lambda module are controlled via the interaction of several arguments. + * the output dimension of the module is specified by dim_out, which falls back to input dim if not set + * the value (v) dimension is set to dim_out // num_heads, the v projection determines the output dim + * the query (q) and key (k) dimension are determined by + * dim_head = (dim_out * attn_ratio // num_heads) if dim_head is None + * q = num_heads * dim_head, k = dim_head + * as seen above, attn_ratio determines the ratio of q and k relative to the output if dim_head not set + + Args: + dim (int): input dimension to the module + dim_out (int): output dimension of the module, same as dim if not set + feat_size (Tuple[int, int]): size of input feature_map for relative pos variant H, W + stride (int): output stride of the module, avg pool used if stride == 2 + num_heads (int): parallel attention heads. + dim_head (int): dimension of query and key heads, calculated from dim_out * attn_ratio // num_heads if not set + r (int): local lambda convolution radius. Use lambda conv if set, else relative pos if not. (default: 9) + qk_ratio (float): ratio of q and k dimensions to output dimension when dim_head not set. 
(default: 1.0) + qkv_bias (bool): add bias to q, k, and v projections + """ + def __init__( + self, dim, dim_out=None, feat_size=None, stride=1, num_heads=4, dim_head=16, r=9, + qk_ratio=1.0, qkv_bias=False): + super().__init__() + dim_out = dim_out or dim + assert dim_out % num_heads == 0, ' should be divided by num_heads' + self.dim_qk = dim_head or make_divisible(dim_out * qk_ratio, divisor=8) // num_heads + self.num_heads = num_heads + self.dim_v = dim_out // num_heads + + self.qkv = nn.Conv2d( + dim, + num_heads * self.dim_qk + self.dim_qk + self.dim_v, + kernel_size=1, bias=qkv_bias) + self.norm_q = nn.BatchNorm2d(num_heads * self.dim_qk) + self.norm_v = nn.BatchNorm2d(self.dim_v) + + if r is not None: + # local lambda convolution for pos + self.conv_lambda = nn.Conv3d(1, self.dim_qk, (r, r, 1), padding=(r // 2, r // 2, 0)) + self.pos_emb = None + self.rel_pos_indices = None + else: + # relative pos embedding + assert feat_size is not None + feat_size = to_2tuple(feat_size) + rel_size = [2 * s - 1 for s in feat_size] + self.conv_lambda = None + self.pos_emb = nn.Parameter(torch.zeros(rel_size[0], rel_size[1], self.dim_qk)) + self.register_buffer('rel_pos_indices', rel_pos_indices(feat_size), persistent=False) + + self.pool = nn.AvgPool2d(2, 2) if stride == 2 else nn.Identity() + + self.reset_parameters() + + def reset_parameters(self): + trunc_normal_(self.qkv.weight, std=self.qkv.weight.shape[1] ** -0.5) # fan-in + if self.conv_lambda is not None: + trunc_normal_(self.conv_lambda.weight, std=self.dim_qk ** -0.5) + if self.pos_emb is not None: + trunc_normal_(self.pos_emb, std=.02) + + def forward(self, x): + B, C, H, W = x.shape + M = H * W + qkv = self.qkv(x) + q, k, v = torch.split(qkv, [ + self.num_heads * self.dim_qk, self.dim_qk, self.dim_v], dim=1) + q = self.norm_q(q).reshape(B, self.num_heads, self.dim_qk, M).transpose(-1, -2) # B, num_heads, M, K + v = self.norm_v(v).reshape(B, self.dim_v, M).transpose(-1, -2) # B, M, V + k = F.softmax(k.reshape(B, self.dim_qk, M), dim=-1) # B, K, M + + content_lam = k @ v # B, K, V + content_out = q @ content_lam.unsqueeze(1) # B, num_heads, M, V + + if self.pos_emb is None: + position_lam = self.conv_lambda(v.reshape(B, 1, H, W, self.dim_v)) # B, H, W, V, K + position_lam = position_lam.reshape(B, 1, self.dim_qk, H * W, self.dim_v).transpose(2, 3) # B, 1, M, K, V + else: + # FIXME relative pos embedding path not fully verified + pos_emb = self.pos_emb[self.rel_pos_indices[0], self.rel_pos_indices[1]].expand(B, -1, -1, -1) + position_lam = (pos_emb.transpose(-1, -2) @ v.unsqueeze(1)).unsqueeze(1) # B, 1, M, K, V + position_out = (q.unsqueeze(-2) @ position_lam).squeeze(-2) # B, num_heads, M, V + + out = (content_out + position_out).transpose(-1, -2).reshape(B, C, H, W) # B, C (num_heads * V), H, W + out = self.pool(out) + return out diff --git a/custom_timm/models/layers/linear.py b/custom_timm/models/layers/linear.py new file mode 100644 index 0000000000000000000000000000000000000000..38fe3380b067ea0b275c45ffd689afdeb4598f3c --- /dev/null +++ b/custom_timm/models/layers/linear.py @@ -0,0 +1,19 @@ +""" Linear layer (alternate definition) +""" +import torch +import torch.nn.functional as F +from torch import nn as nn + + +class Linear(nn.Linear): + r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b` + + Wraps torch.nn.Linear to support AMP + torchscript usage by manually casting + weight & bias to input.dtype to work around an issue w/ torch.addmm in this use case. 
+ """ + def forward(self, input: torch.Tensor) -> torch.Tensor: + if torch.jit.is_scripting(): + bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None + return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) + else: + return F.linear(input, self.weight, self.bias) diff --git a/custom_timm/models/layers/median_pool.py b/custom_timm/models/layers/median_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..40bd71a7a3840aaebefd2af0a99605b845054cd7 --- /dev/null +++ b/custom_timm/models/layers/median_pool.py @@ -0,0 +1,49 @@ +""" Median Pool +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch.nn as nn +import torch.nn.functional as F +from .helpers import to_2tuple, to_4tuple + + +class MedianPool2d(nn.Module): + """ Median pool (usable as median filter when stride=1) module. + + Args: + kernel_size: size of pooling kernel, int or 2-tuple + stride: pool stride, int or 2-tuple + padding: pool padding, int or 4-tuple (l, r, t, b) as in pytorch F.pad + same: override padding and enforce same padding, boolean + """ + def __init__(self, kernel_size=3, stride=1, padding=0, same=False): + super(MedianPool2d, self).__init__() + self.k = to_2tuple(kernel_size) + self.stride = to_2tuple(stride) + self.padding = to_4tuple(padding) # convert to l, r, t, b + self.same = same + + def _padding(self, x): + if self.same: + ih, iw = x.size()[2:] + if ih % self.stride[0] == 0: + ph = max(self.k[0] - self.stride[0], 0) + else: + ph = max(self.k[0] - (ih % self.stride[0]), 0) + if iw % self.stride[1] == 0: + pw = max(self.k[1] - self.stride[1], 0) + else: + pw = max(self.k[1] - (iw % self.stride[1]), 0) + pl = pw // 2 + pr = pw - pl + pt = ph // 2 + pb = ph - pt + padding = (pl, pr, pt, pb) + else: + padding = self.padding + return padding + + def forward(self, x): + x = F.pad(x, self._padding(x), mode='reflect') + x = x.unfold(2, self.k[0], self.stride[0]).unfold(3, self.k[1], self.stride[1]) + x = x.contiguous().view(x.size()[:4] + (-1,)).median(dim=-1)[0] + return x diff --git a/custom_timm/models/layers/mixed_conv2d.py b/custom_timm/models/layers/mixed_conv2d.py new file mode 100644 index 0000000000000000000000000000000000000000..fa0ce565c0a9d348d4e68165960fa77fcf7f70d7 --- /dev/null +++ b/custom_timm/models/layers/mixed_conv2d.py @@ -0,0 +1,51 @@ +""" PyTorch Mixed Convolution + +Paper: MixConv: Mixed Depthwise Convolutional Kernels (https://arxiv.org/abs/1907.09595) + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +from torch import nn as nn + +from .conv2d_same import create_conv2d_pad + + +def _split_channels(num_chan, num_groups): + split = [num_chan // num_groups for _ in range(num_groups)] + split[0] += num_chan - sum(split) + return split + + +class MixedConv2d(nn.ModuleDict): + """ Mixed Grouped Convolution + + Based on MDConv and GroupedConv in MixNet impl: + https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py + """ + def __init__(self, in_channels, out_channels, kernel_size=3, + stride=1, padding='', dilation=1, depthwise=False, **kwargs): + super(MixedConv2d, self).__init__() + + kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size] + num_groups = len(kernel_size) + in_splits = _split_channels(in_channels, num_groups) + out_splits = _split_channels(out_channels, num_groups) + self.in_channels = sum(in_splits) + self.out_channels = sum(out_splits) + for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)): + 
conv_groups = in_ch if depthwise else 1 + # use add_module to keep key space clean + self.add_module( + str(idx), + create_conv2d_pad( + in_ch, out_ch, k, stride=stride, + padding=padding, dilation=dilation, groups=conv_groups, **kwargs) + ) + self.splits = in_splits + + def forward(self, x): + x_split = torch.split(x, self.splits, 1) + x_out = [c(x_split[i]) for i, c in enumerate(self.values())] + x = torch.cat(x_out, 1) + return x diff --git a/custom_timm/models/layers/ml_decoder.py b/custom_timm/models/layers/ml_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..3f828c6d0ab059dc9ce60ad1b3bf7c064f7bf0f9 --- /dev/null +++ b/custom_timm/models/layers/ml_decoder.py @@ -0,0 +1,156 @@ +from typing import Optional + +import torch +from torch import nn +from torch import nn, Tensor +from torch.nn.modules.transformer import _get_activation_fn + + +def add_ml_decoder_head(model): + if hasattr(model, 'global_pool') and hasattr(model, 'fc'): # most CNN models, like Resnet50 + model.global_pool = nn.Identity() + del model.fc + num_classes = model.num_classes + num_features = model.num_features + model.fc = MLDecoder(num_classes=num_classes, initial_num_features=num_features) + elif hasattr(model, 'global_pool') and hasattr(model, 'classifier'): # EfficientNet + model.global_pool = nn.Identity() + del model.classifier + num_classes = model.num_classes + num_features = model.num_features + model.classifier = MLDecoder(num_classes=num_classes, initial_num_features=num_features) + elif 'RegNet' in model._get_name() or 'TResNet' in model._get_name(): # hasattr(model, 'head') + del model.head + num_classes = model.num_classes + num_features = model.num_features + model.head = MLDecoder(num_classes=num_classes, initial_num_features=num_features) + else: + print("Model code-writing is not aligned currently with ml-decoder") + exit(-1) + if hasattr(model, 'drop_rate'): # Ml-Decoder has inner dropout + model.drop_rate = 0 + return model + + +class TransformerDecoderLayerOptimal(nn.Module): + def __init__(self, d_model, nhead=8, dim_feedforward=2048, dropout=0.1, activation="relu", + layer_norm_eps=1e-5) -> None: + super(TransformerDecoderLayerOptimal, self).__init__() + self.norm1 = nn.LayerNorm(d_model, eps=layer_norm_eps) + self.dropout = nn.Dropout(dropout) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) + + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm2 = nn.LayerNorm(d_model, eps=layer_norm_eps) + self.norm3 = nn.LayerNorm(d_model, eps=layer_norm_eps) + + self.activation = _get_activation_fn(activation) + + def __setstate__(self, state): + if 'activation' not in state: + state['activation'] = torch.nn.functional.relu + super(TransformerDecoderLayerOptimal, self).__setstate__(state) + + def forward(self, tgt: Tensor, memory: Tensor, tgt_mask: Optional[Tensor] = None, + memory_mask: Optional[Tensor] = None, + tgt_key_padding_mask: Optional[Tensor] = None, + memory_key_padding_mask: Optional[Tensor] = None) -> Tensor: + tgt = tgt + self.dropout1(tgt) + tgt = self.norm1(tgt) + tgt2 = self.multihead_attn(tgt, memory, memory)[0] + tgt = tgt + self.dropout2(tgt2) + tgt = self.norm2(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout3(tgt2) + tgt = self.norm3(tgt) + return tgt + 
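+
+# Usage sketch (added comment, not from the original file). add_ml_decoder_head swaps the global
+# pooling + classifier of a supported timm model for an MLDecoder head; the model name below is
+# only an illustrative assumption:
+#
+#   model = timm.create_model('resnet50', num_classes=1000)
+#   model = add_ml_decoder_head(model)            # global_pool -> Identity, fc -> MLDecoder
+#   logits = model(torch.randn(2, 3, 224, 224))   # [2, 1000] multi-label logits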
+ +# @torch.jit.script +# class ExtrapClasses(object): +# def __init__(self, num_queries: int, group_size: int): +# self.num_queries = num_queries +# self.group_size = group_size +# +# def __call__(self, h: torch.Tensor, class_embed_w: torch.Tensor, class_embed_b: torch.Tensor, out_extrap: +# torch.Tensor): +# # h = h.unsqueeze(-1).expand(-1, -1, -1, self.group_size) +# h = h[..., None].repeat(1, 1, 1, self.group_size) # torch.Size([bs, 5, 768, groups]) +# w = class_embed_w.view((self.num_queries, h.shape[2], self.group_size)) +# out = (h * w).sum(dim=2) + class_embed_b +# out = out.view((h.shape[0], self.group_size * self.num_queries)) +# return out + +@torch.jit.script +class GroupFC(object): + def __init__(self, embed_len_decoder: int): + self.embed_len_decoder = embed_len_decoder + + def __call__(self, h: torch.Tensor, duplicate_pooling: torch.Tensor, out_extrap: torch.Tensor): + for i in range(self.embed_len_decoder): + h_i = h[:, i, :] + w_i = duplicate_pooling[i, :, :] + out_extrap[:, i, :] = torch.matmul(h_i, w_i) + + +class MLDecoder(nn.Module): + def __init__(self, num_classes, num_of_groups=-1, decoder_embedding=768, initial_num_features=2048): + super(MLDecoder, self).__init__() + embed_len_decoder = 100 if num_of_groups < 0 else num_of_groups + if embed_len_decoder > num_classes: + embed_len_decoder = num_classes + + # switching to 768 initial embeddings + decoder_embedding = 768 if decoder_embedding < 0 else decoder_embedding + self.embed_standart = nn.Linear(initial_num_features, decoder_embedding) + + # decoder + decoder_dropout = 0.1 + num_layers_decoder = 1 + dim_feedforward = 2048 + layer_decode = TransformerDecoderLayerOptimal(d_model=decoder_embedding, + dim_feedforward=dim_feedforward, dropout=decoder_dropout) + self.decoder = nn.TransformerDecoder(layer_decode, num_layers=num_layers_decoder) + + # non-learnable queries + self.query_embed = nn.Embedding(embed_len_decoder, decoder_embedding) + self.query_embed.requires_grad_(False) + + # group fully-connected + self.num_classes = num_classes + self.duplicate_factor = int(num_classes / embed_len_decoder + 0.999) + self.duplicate_pooling = torch.nn.Parameter( + torch.Tensor(embed_len_decoder, decoder_embedding, self.duplicate_factor)) + self.duplicate_pooling_bias = torch.nn.Parameter(torch.Tensor(num_classes)) + torch.nn.init.xavier_normal_(self.duplicate_pooling) + torch.nn.init.constant_(self.duplicate_pooling_bias, 0) + self.group_fc = GroupFC(embed_len_decoder) + + def forward(self, x): + if len(x.shape) == 4: # [bs,2048, 7,7] + embedding_spatial = x.flatten(2).transpose(1, 2) + else: # [bs, 197,468] + embedding_spatial = x + embedding_spatial_786 = self.embed_standart(embedding_spatial) + embedding_spatial_786 = torch.nn.functional.relu(embedding_spatial_786, inplace=True) + + bs = embedding_spatial_786.shape[0] + query_embed = self.query_embed.weight + # tgt = query_embed.unsqueeze(1).repeat(1, bs, 1) + tgt = query_embed.unsqueeze(1).expand(-1, bs, -1) # no allocation of memory with expand + h = self.decoder(tgt, embedding_spatial_786.transpose(0, 1)) # [embed_len_decoder, batch, 768] + h = h.transpose(0, 1) + + out_extrap = torch.zeros(h.shape[0], h.shape[1], self.duplicate_factor, device=h.device, dtype=h.dtype) + self.group_fc(h, self.duplicate_pooling, out_extrap) + h_out = out_extrap.flatten(1)[:, :self.num_classes] + h_out += self.duplicate_pooling_bias + logits = h_out + return logits diff --git a/custom_timm/models/layers/mlp.py b/custom_timm/models/layers/mlp.py new file mode 100644 index 
0000000000000000000000000000000000000000..91e80a84c78b7c90314235fd10a50f459f36ce57 --- /dev/null +++ b/custom_timm/models/layers/mlp.py @@ -0,0 +1,126 @@ +""" MLP module w/ dropout and configurable activation layer + +Hacked together by / Copyright 2020 Ross Wightman +""" +from torch import nn as nn + +from .helpers import to_2tuple + + +class Mlp(nn.Module): + """ MLP as used in Vision Transformer, MLP-Mixer and related networks + """ + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, bias=True, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + +class GluMlp(nn.Module): + """ MLP w/ GLU style gating + See: https://arxiv.org/abs/1612.08083, https://arxiv.org/abs/2002.05202 + """ + def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.Sigmoid, bias=True, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + assert hidden_features % 2 == 0 + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + self.fc2 = nn.Linear(hidden_features // 2, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def init_weights(self): + # override init of fc1 w/ gate portion set to weight near zero, bias=1 + fc1_mid = self.fc1.bias.shape[0] // 2 + nn.init.ones_(self.fc1.bias[fc1_mid:]) + nn.init.normal_(self.fc1.weight[fc1_mid:], std=1e-6) + + def forward(self, x): + x = self.fc1(x) + x, gates = x.chunk(2, dim=-1) + x = x * self.act(gates) + x = self.drop1(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + +class GatedMlp(nn.Module): + """ MLP as used in gMLP + """ + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, + gate_layer=None, bias=True, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + drop_probs = to_2tuple(drop) + + self.fc1 = nn.Linear(in_features, hidden_features, bias=bias[0]) + self.act = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + if gate_layer is not None: + assert hidden_features % 2 == 0 + self.gate = gate_layer(hidden_features) + hidden_features = hidden_features // 2 # FIXME base reduction on gate property? 
+ else: + self.gate = nn.Identity() + self.fc2 = nn.Linear(hidden_features, out_features, bias=bias[1]) + self.drop2 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop1(x) + x = self.gate(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + +class ConvMlp(nn.Module): + """ MLP using 1x1 convs that keeps spatial dims + """ + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.ReLU, + norm_layer=None, bias=True, drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + bias = to_2tuple(bias) + + self.fc1 = nn.Conv2d(in_features, hidden_features, kernel_size=1, bias=bias[0]) + self.norm = norm_layer(hidden_features) if norm_layer else nn.Identity() + self.act = act_layer() + self.drop = nn.Dropout(drop) + self.fc2 = nn.Conv2d(hidden_features, out_features, kernel_size=1, bias=bias[1]) + + def forward(self, x): + x = self.fc1(x) + x = self.norm(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + return x diff --git a/custom_timm/models/layers/non_local_attn.py b/custom_timm/models/layers/non_local_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..670e8f2475374b1f31741f75f1dedf617e0e6546 --- /dev/null +++ b/custom_timm/models/layers/non_local_attn.py @@ -0,0 +1,145 @@ +""" Bilinear-Attention-Transform and Non-Local Attention + +Paper: `Non-Local Neural Networks With Grouped Bilinear Attentional Transforms` + - https://openaccess.thecvf.com/content_CVPR_2020/html/Chi_Non-Local_Neural_Networks_With_Grouped_Bilinear_Attentional_Transforms_CVPR_2020_paper.html +Adapted from original code: https://github.com/BA-Transform/BAT-Image-Classification +""" +import torch +from torch import nn +from torch.nn import functional as F + +from .conv_bn_act import ConvNormAct +from .helpers import make_divisible +from .trace_utils import _assert + + +class NonLocalAttn(nn.Module): + """Spatial NL block for image classification. + + This was adapted from https://github.com/BA-Transform/BAT-Image-Classification + Their NonLocal impl inspired by https://github.com/facebookresearch/video-nonlocal-net. 
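+
+    Example (an illustrative sketch added here, not part of the original code):
+
+        attn = NonLocalAttn(in_channels=256)
+        y = attn(torch.randn(2, 256, 14, 14))  # residual output, same shape as the input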
+ """ + + def __init__(self, in_channels, use_scale=True, rd_ratio=1/8, rd_channels=None, rd_divisor=8, **kwargs): + super(NonLocalAttn, self).__init__() + if rd_channels is None: + rd_channels = make_divisible(in_channels * rd_ratio, divisor=rd_divisor) + self.scale = in_channels ** -0.5 if use_scale else 1.0 + self.t = nn.Conv2d(in_channels, rd_channels, kernel_size=1, stride=1, bias=True) + self.p = nn.Conv2d(in_channels, rd_channels, kernel_size=1, stride=1, bias=True) + self.g = nn.Conv2d(in_channels, rd_channels, kernel_size=1, stride=1, bias=True) + self.z = nn.Conv2d(rd_channels, in_channels, kernel_size=1, stride=1, bias=True) + self.norm = nn.BatchNorm2d(in_channels) + self.reset_parameters() + + def forward(self, x): + shortcut = x + + t = self.t(x) + p = self.p(x) + g = self.g(x) + + B, C, H, W = t.size() + t = t.view(B, C, -1).permute(0, 2, 1) + p = p.view(B, C, -1) + g = g.view(B, C, -1).permute(0, 2, 1) + + att = torch.bmm(t, p) * self.scale + att = F.softmax(att, dim=2) + x = torch.bmm(att, g) + + x = x.permute(0, 2, 1).reshape(B, C, H, W) + x = self.z(x) + x = self.norm(x) + shortcut + + return x + + def reset_parameters(self): + for name, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + if len(list(m.parameters())) > 1: + nn.init.constant_(m.bias, 0.0) + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 0) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.GroupNorm): + nn.init.constant_(m.weight, 0) + nn.init.constant_(m.bias, 0) + + +class BilinearAttnTransform(nn.Module): + + def __init__(self, in_channels, block_size, groups, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): + super(BilinearAttnTransform, self).__init__() + + self.conv1 = ConvNormAct(in_channels, groups, 1, act_layer=act_layer, norm_layer=norm_layer) + self.conv_p = nn.Conv2d(groups, block_size * block_size * groups, kernel_size=(block_size, 1)) + self.conv_q = nn.Conv2d(groups, block_size * block_size * groups, kernel_size=(1, block_size)) + self.conv2 = ConvNormAct(in_channels, in_channels, 1, act_layer=act_layer, norm_layer=norm_layer) + self.block_size = block_size + self.groups = groups + self.in_channels = in_channels + + def resize_mat(self, x, t: int): + B, C, block_size, block_size1 = x.shape + _assert(block_size == block_size1, '') + if t <= 1: + return x + x = x.view(B * C, -1, 1, 1) + x = x * torch.eye(t, t, dtype=x.dtype, device=x.device) + x = x.view(B * C, block_size, block_size, t, t) + x = torch.cat(torch.split(x, 1, dim=1), dim=3) + x = torch.cat(torch.split(x, 1, dim=2), dim=4) + x = x.view(B, C, block_size * t, block_size * t) + return x + + def forward(self, x): + _assert(x.shape[-1] % self.block_size == 0, '') + _assert(x.shape[-2] % self.block_size == 0, '') + B, C, H, W = x.shape + out = self.conv1(x) + rp = F.adaptive_max_pool2d(out, (self.block_size, 1)) + cp = F.adaptive_max_pool2d(out, (1, self.block_size)) + p = self.conv_p(rp).view(B, self.groups, self.block_size, self.block_size).sigmoid() + q = self.conv_q(cp).view(B, self.groups, self.block_size, self.block_size).sigmoid() + p = p / p.sum(dim=3, keepdim=True) + q = q / q.sum(dim=2, keepdim=True) + p = p.view(B, self.groups, 1, self.block_size, self.block_size).expand(x.size( + 0), self.groups, C // self.groups, self.block_size, self.block_size).contiguous() + p = p.view(B, C, self.block_size, self.block_size) + q = q.view(B, self.groups, 1, self.block_size, self.block_size).expand(x.size( + 0), self.groups, C // 
self.groups, self.block_size, self.block_size).contiguous() + q = q.view(B, C, self.block_size, self.block_size) + p = self.resize_mat(p, H // self.block_size) + q = self.resize_mat(q, W // self.block_size) + y = p.matmul(x) + y = y.matmul(q) + + y = self.conv2(y) + return y + + +class BatNonLocalAttn(nn.Module): + """ BAT + Adapted from: https://github.com/BA-Transform/BAT-Image-Classification + """ + + def __init__( + self, in_channels, block_size=7, groups=2, rd_ratio=0.25, rd_channels=None, rd_divisor=8, + drop_rate=0.2, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, **_): + super().__init__() + if rd_channels is None: + rd_channels = make_divisible(in_channels * rd_ratio, divisor=rd_divisor) + self.conv1 = ConvNormAct(in_channels, rd_channels, 1, act_layer=act_layer, norm_layer=norm_layer) + self.ba = BilinearAttnTransform(rd_channels, block_size, groups, act_layer=act_layer, norm_layer=norm_layer) + self.conv2 = ConvNormAct(rd_channels, in_channels, 1, act_layer=act_layer, norm_layer=norm_layer) + self.dropout = nn.Dropout2d(p=drop_rate) + + def forward(self, x): + xl = self.conv1(x) + y = self.ba(xl) + y = self.conv2(y) + y = self.dropout(y) + return y + x diff --git a/custom_timm/models/layers/norm.py b/custom_timm/models/layers/norm.py new file mode 100644 index 0000000000000000000000000000000000000000..77d719ede3a111a32448da4c6607e6acafa1c3e5 --- /dev/null +++ b/custom_timm/models/layers/norm.py @@ -0,0 +1,117 @@ +""" Normalization layers and wrappers + +Norm layer definitions that support fast norm and consistent channel arg order (always first arg). + +Hacked together by / Copyright 2022 Ross Wightman +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .fast_norm import is_fast_norm, fast_group_norm, fast_layer_norm + + +class GroupNorm(nn.GroupNorm): + def __init__(self, num_channels, num_groups=32, eps=1e-5, affine=True): + # NOTE num_channels is swapped to first arg for consistency in swapping norm layers with BN + super().__init__(num_groups, num_channels, eps=eps, affine=affine) + self.fast_norm = is_fast_norm() # can't script unless we have these flags here (no globals) + + def forward(self, x): + if self.fast_norm: + return fast_group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + else: + return F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + + +class GroupNorm1(nn.GroupNorm): + """ Group Normalization with 1 group. 
+ Input: tensor in shape [B, C, *] + """ + + def __init__(self, num_channels, **kwargs): + super().__init__(1, num_channels, **kwargs) + self.fast_norm = is_fast_norm() # can't script unless we have these flags here (no globals) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.fast_norm: + return fast_group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + else: + return F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + + +class LayerNorm(nn.LayerNorm): + """ LayerNorm w/ fast norm option + """ + def __init__(self, num_channels, eps=1e-6, affine=True): + super().__init__(num_channels, eps=eps, elementwise_affine=affine) + self._fast_norm = is_fast_norm() # can't script unless we have these flags here (no globals) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self._fast_norm: + x = fast_layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + else: + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + return x + + +class LayerNorm2d(nn.LayerNorm): + """ LayerNorm for channels of '2D' spatial NCHW tensors """ + def __init__(self, num_channels, eps=1e-6, affine=True): + super().__init__(num_channels, eps=eps, elementwise_affine=affine) + self._fast_norm = is_fast_norm() # can't script unless we have these flags here (no globals) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.permute(0, 2, 3, 1) + if self._fast_norm: + x = fast_layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + else: + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + x = x.permute(0, 3, 1, 2) + return x + + +def _is_contiguous(tensor: torch.Tensor) -> bool: + # jit is oh so lovely :/ + if torch.jit.is_scripting(): + return tensor.is_contiguous() + else: + return tensor.is_contiguous(memory_format=torch.contiguous_format) + + +@torch.jit.script +def _layer_norm_cf(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float): + s, u = torch.var_mean(x, dim=1, unbiased=False, keepdim=True) + x = (x - u) * torch.rsqrt(s + eps) + x = x * weight[:, None, None] + bias[:, None, None] + return x + + +def _layer_norm_cf_sqm(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float): + u = x.mean(dim=1, keepdim=True) + s = ((x * x).mean(dim=1, keepdim=True) - (u * u)).clamp(0) + x = (x - u) * torch.rsqrt(s + eps) + x = x * weight.view(1, -1, 1, 1) + bias.view(1, -1, 1, 1) + return x + + +class LayerNormExp2d(nn.LayerNorm): + """ LayerNorm for channels_first tensors with 2d spatial dimensions (ie N, C, H, W). + + Experimental implementation w/ manual norm for tensors non-contiguous tensors. + + This improves throughput in some scenarios (tested on Ampere GPU), esp w/ channels_last + layout. However, benefits are not always clear and can perform worse on other GPUs. 
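+
+    Example (an illustrative sketch added here, not part of the original code):
+
+        norm = LayerNormExp2d(64)
+        y = norm(torch.randn(2, 64, 28, 28))  # LayerNorm over the channel dim of an NCHW tensor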
+ """ + + def __init__(self, num_channels, eps=1e-6): + super().__init__(num_channels, eps=eps) + + def forward(self, x) -> torch.Tensor: + if _is_contiguous(x): + x = F.layer_norm( + x.permute(0, 2, 3, 1), self.normalized_shape, self.weight, self.bias, self.eps).permute(0, 3, 1, 2) + else: + x = _layer_norm_cf(x, self.weight, self.bias, self.eps) + return x diff --git a/custom_timm/models/layers/norm_act.py b/custom_timm/models/layers/norm_act.py new file mode 100644 index 0000000000000000000000000000000000000000..ff075fbcf40d1b055704072493865f712bd5d65e --- /dev/null +++ b/custom_timm/models/layers/norm_act.py @@ -0,0 +1,252 @@ +""" Normalization + Activation Layers + +Provides Norm+Act fns for standard PyTorch norm layers such as +* BatchNorm +* GroupNorm +* LayerNorm + +This allows swapping with alternative layers that are natively both norm + act such as +* EvoNorm (evo_norm.py) +* FilterResponseNorm (filter_response_norm.py) +* InplaceABN (inplace_abn.py) + +Hacked together by / Copyright 2022 Ross Wightman +""" +from typing import Union, List, Optional, Any + +import torch +from torch import nn as nn +from torch.nn import functional as F + +from .create_act import get_act_layer +from .fast_norm import is_fast_norm, fast_group_norm, fast_layer_norm +from .trace_utils import _assert + + +class BatchNormAct2d(nn.BatchNorm2d): + """BatchNorm + Activation + + This module performs BatchNorm + Activation in a manner that will remain backwards + compatible with weights trained with separate bn, act. This is why we inherit from BN + instead of composing it as a .bn member. + """ + def __init__( + self, + num_features, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + apply_act=True, + act_layer=nn.ReLU, + inplace=True, + drop_layer=None, + device=None, + dtype=None + ): + try: + factory_kwargs = {'device': device, 'dtype': dtype} + super(BatchNormAct2d, self).__init__( + num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats, + **factory_kwargs + ) + except TypeError: + # NOTE for backwards compat with old PyTorch w/o factory device/dtype support + super(BatchNormAct2d, self).__init__( + num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats) + self.drop = drop_layer() if drop_layer is not None else nn.Identity() + act_layer = get_act_layer(act_layer) # string -> nn.Module + if act_layer is not None and apply_act: + act_args = dict(inplace=True) if inplace else {} + self.act = act_layer(**act_args) + else: + self.act = nn.Identity() + + def forward(self, x): + # cut & paste of torch.nn.BatchNorm2d.forward impl to avoid issues with torchscript and tracing + _assert(x.ndim == 4, f'expected 4D input (got {x.ndim}D input)') + + # exponential_average_factor is set to self.momentum + # (when it is available) only so that it gets updated + # in ONNX graph when this node is exported to ONNX. 
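+        # NOTE (added comment): momentum follows the torch.nn.BatchNorm2d convention, i.e. it is the
+        # weight of the new observation: running = (1 - momentum) * running + momentum * batch_stat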
+ if self.momentum is None: + exponential_average_factor = 0.0 + else: + exponential_average_factor = self.momentum + + if self.training and self.track_running_stats: + # TODO: if statement only here to tell the jit to skip emitting this when it is None + if self.num_batches_tracked is not None: # type: ignore[has-type] + self.num_batches_tracked = self.num_batches_tracked + 1 # type: ignore[has-type] + if self.momentum is None: # use cumulative moving average + exponential_average_factor = 1.0 / float(self.num_batches_tracked) + else: # use exponential moving average + exponential_average_factor = self.momentum + + r""" + Decide whether the mini-batch stats should be used for normalization rather than the buffers. + Mini-batch stats are used in training mode, and in eval mode when buffers are None. + """ + if self.training: + bn_training = True + else: + bn_training = (self.running_mean is None) and (self.running_var is None) + + r""" + Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be + passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are + used for normalization (i.e. in eval mode when buffers are not None). + """ + x = F.batch_norm( + x, + # If buffers are not to be tracked, ensure that they won't be updated + self.running_mean if not self.training or self.track_running_stats else None, + self.running_var if not self.training or self.track_running_stats else None, + self.weight, + self.bias, + bn_training, + exponential_average_factor, + self.eps, + ) + x = self.drop(x) + x = self.act(x) + return x + + +class SyncBatchNormAct(nn.SyncBatchNorm): + # Thanks to Selim Seferbekov (https://github.com/rwightman/pytorch-image-models/issues/1254) + # This is a quick workaround to support SyncBatchNorm for timm BatchNormAct2d layers + # but ONLY when used in conjunction with the timm conversion function below. + # Do not create this module directly or use the PyTorch conversion function. 
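+    # Usage sketch (added comment, not from the original file): convert an existing model with the
+    # helper defined below rather than torch.nn.SyncBatchNorm.convert_sync_batchnorm, e.g.
+    #   model = convert_sync_batchnorm(model, process_group=None)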
+ def forward(self, x: torch.Tensor) -> torch.Tensor: + x = super().forward(x) # SyncBN doesn't work with torchscript anyways, so this is fine + if hasattr(self, "drop"): + x = self.drop(x) + if hasattr(self, "act"): + x = self.act(x) + return x + + +def convert_sync_batchnorm(module, process_group=None): + # convert both BatchNorm and BatchNormAct layers to Synchronized variants + module_output = module + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): + if isinstance(module, BatchNormAct2d): + # convert timm norm + act layer + module_output = SyncBatchNormAct( + module.num_features, + module.eps, + module.momentum, + module.affine, + module.track_running_stats, + process_group=process_group, + ) + # set act and drop attr from the original module + module_output.act = module.act + module_output.drop = module.drop + else: + # convert standard BatchNorm layers + module_output = torch.nn.SyncBatchNorm( + module.num_features, + module.eps, + module.momentum, + module.affine, + module.track_running_stats, + process_group, + ) + if module.affine: + with torch.no_grad(): + module_output.weight = module.weight + module_output.bias = module.bias + module_output.running_mean = module.running_mean + module_output.running_var = module.running_var + module_output.num_batches_tracked = module.num_batches_tracked + if hasattr(module, "qconfig"): + module_output.qconfig = module.qconfig + for name, child in module.named_children(): + module_output.add_module(name, convert_sync_batchnorm(child, process_group)) + del module + return module_output + + +def _num_groups(num_channels, num_groups, group_size): + if group_size: + assert num_channels % group_size == 0 + return num_channels // group_size + return num_groups + + +class GroupNormAct(nn.GroupNorm): + # NOTE num_channel and num_groups order flipped for easier layer swaps / binding of fixed args + def __init__( + self, num_channels, num_groups=32, eps=1e-5, affine=True, group_size=None, + apply_act=True, act_layer=nn.ReLU, inplace=True, drop_layer=None): + super(GroupNormAct, self).__init__( + _num_groups(num_channels, num_groups, group_size), num_channels, eps=eps, affine=affine) + self.drop = drop_layer() if drop_layer is not None else nn.Identity() + act_layer = get_act_layer(act_layer) # string -> nn.Module + if act_layer is not None and apply_act: + act_args = dict(inplace=True) if inplace else {} + self.act = act_layer(**act_args) + else: + self.act = nn.Identity() + self._fast_norm = is_fast_norm() + + def forward(self, x): + if self._fast_norm: + x = fast_group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + else: + x = F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + x = self.drop(x) + x = self.act(x) + return x + + +class LayerNormAct(nn.LayerNorm): + def __init__( + self, normalization_shape: Union[int, List[int], torch.Size], eps=1e-5, affine=True, + apply_act=True, act_layer=nn.ReLU, inplace=True, drop_layer=None): + super(LayerNormAct, self).__init__(normalization_shape, eps=eps, elementwise_affine=affine) + self.drop = drop_layer() if drop_layer is not None else nn.Identity() + act_layer = get_act_layer(act_layer) # string -> nn.Module + if act_layer is not None and apply_act: + act_args = dict(inplace=True) if inplace else {} + self.act = act_layer(**act_args) + else: + self.act = nn.Identity() + self._fast_norm = is_fast_norm() + + def forward(self, x): + if self._fast_norm: + x = fast_layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + else: + x = F.layer_norm(x, 
self.normalized_shape, self.weight, self.bias, self.eps) + x = self.drop(x) + x = self.act(x) + return x + + +class LayerNormAct2d(nn.LayerNorm): + def __init__( + self, num_channels, eps=1e-5, affine=True, + apply_act=True, act_layer=nn.ReLU, inplace=True, drop_layer=None): + super(LayerNormAct2d, self).__init__(num_channels, eps=eps, elementwise_affine=affine) + self.drop = drop_layer() if drop_layer is not None else nn.Identity() + act_layer = get_act_layer(act_layer) # string -> nn.Module + if act_layer is not None and apply_act: + act_args = dict(inplace=True) if inplace else {} + self.act = act_layer(**act_args) + else: + self.act = nn.Identity() + self._fast_norm = is_fast_norm() + + def forward(self, x): + x = x.permute(0, 2, 3, 1) + if self._fast_norm: + x = fast_layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + else: + x = F.layer_norm(x, self.normalized_shape, self.weight, self.bias, self.eps) + x = x.permute(0, 3, 1, 2) + x = self.drop(x) + x = self.act(x) + return x diff --git a/custom_timm/models/layers/padding.py b/custom_timm/models/layers/padding.py new file mode 100644 index 0000000000000000000000000000000000000000..34afc37c6c59c8782ad29c7a779f58177011f891 --- /dev/null +++ b/custom_timm/models/layers/padding.py @@ -0,0 +1,56 @@ +""" Padding Helpers + +Hacked together by / Copyright 2020 Ross Wightman +""" +import math +from typing import List, Tuple + +import torch.nn.functional as F + + +# Calculate symmetric padding for a convolution +def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +# Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution +def get_same_padding(x: int, k: int, s: int, d: int): + return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0) + + +# Can SAME padding for given args be done statically? 
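+# (added note) 'SAME' can be applied statically when stride == 1 and the effective kernel extent
+# dilation * (kernel_size - 1) is even, so one symmetric pad works for any input size;
+# e.g. kernel_size=3, stride=1, dilation=1 -> get_padding(3, 1, 1) == 1 on each side.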
+def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): + return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 + + +# Dynamically pad input x with 'SAME' padding for conv with specified args +def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0): + ih, iw = x.size()[-2:] + pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1]) + if pad_h > 0 or pad_w > 0: + x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) + return x + + +def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: + dynamic = False + if isinstance(padding, str): + # for any string padding, the padding will be calculated for you, one of three ways + padding = padding.lower() + if padding == 'same': + # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact + if is_static_pad(kernel_size, **kwargs): + # static case, no extra overhead + padding = get_padding(kernel_size, **kwargs) + else: + # dynamic 'SAME' padding, has runtime/GPU memory overhead + padding = 0 + dynamic = True + elif padding == 'valid': + # 'VALID' padding, same as padding=0 + padding = 0 + else: + # Default to PyTorch style 'same'-ish symmetric padding + padding = get_padding(kernel_size, **kwargs) + return padding, dynamic diff --git a/custom_timm/models/layers/patch_embed.py b/custom_timm/models/layers/patch_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..be8740ce89b2ea4e37edad188b9f6d72ac8bdb8d --- /dev/null +++ b/custom_timm/models/layers/patch_embed.py @@ -0,0 +1,48 @@ +""" Image to Patch Embedding using Conv2d + +A convolution based approach to patchifying a 2D image w/ embedding projection. + +Based on the impl in https://github.com/google-research/vision_transformer + +Hacked together by / Copyright 2020 Ross Wightman +""" +from torch import nn as nn + +from .helpers import to_2tuple +from .trace_utils import _assert + + +class PatchEmbed(nn.Module): + """ 2D Image to Patch Embedding + """ + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + embed_dim=768, + norm_layer=None, + flatten=True, + bias=True, + ): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + B, C, H, W = x.shape + _assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).") + _assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).") + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + x = self.norm(x) + return x diff --git a/custom_timm/models/layers/pool2d_same.py b/custom_timm/models/layers/pool2d_same.py new file mode 100644 index 0000000000000000000000000000000000000000..4c2a1c44713e552be850865ada9623a1c3b1d836 --- /dev/null +++ b/custom_timm/models/layers/pool2d_same.py @@ -0,0 +1,73 @@ +""" AvgPool2d w/ Same Padding + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import List, Tuple, 
Optional + +from .helpers import to_2tuple +from .padding import pad_same, get_padding_value + + +def avg_pool2d_same(x, kernel_size: List[int], stride: List[int], padding: List[int] = (0, 0), + ceil_mode: bool = False, count_include_pad: bool = True): + # FIXME how to deal with count_include_pad vs not for external padding? + x = pad_same(x, kernel_size, stride) + return F.avg_pool2d(x, kernel_size, stride, (0, 0), ceil_mode, count_include_pad) + + +class AvgPool2dSame(nn.AvgPool2d): + """ Tensorflow like 'SAME' wrapper for 2D average pooling + """ + def __init__(self, kernel_size: int, stride=None, padding=0, ceil_mode=False, count_include_pad=True): + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + super(AvgPool2dSame, self).__init__(kernel_size, stride, (0, 0), ceil_mode, count_include_pad) + + def forward(self, x): + x = pad_same(x, self.kernel_size, self.stride) + return F.avg_pool2d( + x, self.kernel_size, self.stride, self.padding, self.ceil_mode, self.count_include_pad) + + +def max_pool2d_same( + x, kernel_size: List[int], stride: List[int], padding: List[int] = (0, 0), + dilation: List[int] = (1, 1), ceil_mode: bool = False): + x = pad_same(x, kernel_size, stride, value=-float('inf')) + return F.max_pool2d(x, kernel_size, stride, (0, 0), dilation, ceil_mode) + + +class MaxPool2dSame(nn.MaxPool2d): + """ Tensorflow like 'SAME' wrapper for 2D max pooling + """ + def __init__(self, kernel_size: int, stride=None, padding=0, dilation=1, ceil_mode=False): + kernel_size = to_2tuple(kernel_size) + stride = to_2tuple(stride) + dilation = to_2tuple(dilation) + super(MaxPool2dSame, self).__init__(kernel_size, stride, (0, 0), dilation, ceil_mode) + + def forward(self, x): + x = pad_same(x, self.kernel_size, self.stride, value=-float('inf')) + return F.max_pool2d(x, self.kernel_size, self.stride, (0, 0), self.dilation, self.ceil_mode) + + +def create_pool2d(pool_type, kernel_size, stride=None, **kwargs): + stride = stride or kernel_size + padding = kwargs.pop('padding', '') + padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, **kwargs) + if is_dynamic: + if pool_type == 'avg': + return AvgPool2dSame(kernel_size, stride=stride, **kwargs) + elif pool_type == 'max': + return MaxPool2dSame(kernel_size, stride=stride, **kwargs) + else: + assert False, f'Unsupported pool type {pool_type}' + else: + if pool_type == 'avg': + return nn.AvgPool2d(kernel_size, stride=stride, padding=padding, **kwargs) + elif pool_type == 'max': + return nn.MaxPool2d(kernel_size, stride=stride, padding=padding, **kwargs) + else: + assert False, f'Unsupported pool type {pool_type}' diff --git a/custom_timm/models/layers/pos_embed.py b/custom_timm/models/layers/pos_embed.py new file mode 100644 index 0000000000000000000000000000000000000000..99a122a09bdfa6a0ac12b13de91a3db7173813c7 --- /dev/null +++ b/custom_timm/models/layers/pos_embed.py @@ -0,0 +1,207 @@ +import math +from typing import List, Tuple, Optional, Union + +import torch +from torch import nn as nn + + +def pixel_freq_bands( + num_bands: int, + max_freq: float = 224., + linear_bands: bool = True, + dtype: torch.dtype = torch.float32, + device: Optional[torch.device] = None, +): + if linear_bands: + bands = torch.linspace(1.0, max_freq / 2, num_bands, dtype=dtype, device=device) + else: + bands = 2 ** torch.linspace(0, math.log(max_freq, 2) - 1, num_bands, dtype=dtype, device=device) + return bands * torch.pi + + +def inv_freq_bands( + num_bands: int, + temperature: float = 100000., + step: int = 2, + dtype: 
torch.dtype = torch.float32, + device: Optional[torch.device] = None, +) -> torch.Tensor: + inv_freq = 1. / (temperature ** (torch.arange(0, num_bands, step, dtype=dtype, device=device) / num_bands)) + return inv_freq + + +def build_sincos2d_pos_embed( + feat_shape: List[int], + dim: int = 64, + temperature: float = 10000., + reverse_coord: bool = False, + interleave_sin_cos: bool = False, + dtype: torch.dtype = torch.float32, + device: Optional[torch.device] = None +) -> torch.Tensor: + """ + + Args: + feat_shape: + dim: + temperature: + reverse_coord: stack grid order W, H instead of H, W + interleave_sin_cos: sin, cos, sin, cos stack instead of sin, sin, cos, cos + dtype: + device: + + Returns: + + """ + assert dim % 4 == 0, 'Embed dimension must be divisible by 4 for sin-cos 2D position embedding' + pos_dim = dim // 4 + bands = inv_freq_bands(pos_dim, temperature=temperature, step=1, dtype=dtype, device=device) + + if reverse_coord: + feat_shape = feat_shape[::-1] # stack W, H instead of H, W + grid = torch.stack( + torch.meshgrid([torch.arange(s, device=device, dtype=dtype) for s in feat_shape])).flatten(1).transpose(0, 1) + pos2 = grid.unsqueeze(-1) * bands.unsqueeze(0) + # FIXME add support for unflattened spatial dim? + + stack_dim = 2 if interleave_sin_cos else 1 # stack sin, cos, sin, cos instead of sin sin cos cos + pos_emb = torch.stack([torch.sin(pos2), torch.cos(pos2)], dim=stack_dim).flatten(1) + return pos_emb + + +def build_fourier_pos_embed( + feat_shape: List[int], + bands: Optional[torch.Tensor] = None, + num_bands: int = 64, + max_res: int = 224, + linear_bands: bool = False, + include_grid: bool = False, + concat_out: bool = True, + in_pixels: bool = True, + dtype: torch.dtype = torch.float32, + device: Optional[torch.device] = None, +) -> List[torch.Tensor]: + if bands is None: + if in_pixels: + bands = pixel_freq_bands(num_bands, float(max_res), linear_bands=linear_bands, dtype=dtype, device=device) + else: + bands = inv_freq_bands(num_bands, step=1, dtype=dtype, device=device) + else: + if device is None: + device = bands.device + if dtype is None: + dtype = bands.dtype + + if in_pixels: + grid = torch.stack(torch.meshgrid( + [torch.linspace(-1., 1., steps=s, device=device, dtype=dtype) for s in feat_shape]), dim=-1) + else: + grid = torch.stack(torch.meshgrid( + [torch.arange(s, device=device, dtype=dtype) for s in feat_shape]), dim=-1) + grid = grid.unsqueeze(-1) + pos = grid * bands + + pos_sin, pos_cos = pos.sin(), pos.cos() + out = (grid, pos_sin, pos_cos) if include_grid else (pos_sin, pos_cos) + # FIXME torchscript doesn't like multiple return types, probably need to always cat? 
+ if concat_out: + out = torch.cat(out, dim=-1) + return out + + +class FourierEmbed(nn.Module): + + def __init__(self, max_res: int = 224, num_bands: int = 64, concat_grid=True, keep_spatial=False): + super().__init__() + self.max_res = max_res + self.num_bands = num_bands + self.concat_grid = concat_grid + self.keep_spatial = keep_spatial + self.register_buffer('bands', pixel_freq_bands(max_res, num_bands), persistent=False) + + def forward(self, x): + B, C = x.shape[:2] + feat_shape = x.shape[2:] + emb = build_fourier_pos_embed( + feat_shape, + self.bands, + include_grid=self.concat_grid, + dtype=x.dtype, + device=x.device) + emb = emb.transpose(-1, -2).flatten(len(feat_shape)) + batch_expand = (B,) + (-1,) * (x.ndim - 1) + + # FIXME support nD + if self.keep_spatial: + x = torch.cat([x, emb.unsqueeze(0).expand(batch_expand).permute(0, 3, 1, 2)], dim=1) + else: + x = torch.cat([x.permute(0, 2, 3, 1), emb.unsqueeze(0).expand(batch_expand)], dim=-1) + x = x.reshape(B, feat_shape.numel(), -1) + + return x + + +def rot(x): + return torch.stack([-x[..., 1::2], x[..., ::2]], -1).reshape(x.shape) + + +def apply_rot_embed(x: torch.Tensor, sin_emb, cos_emb): + return x * cos_emb + rot(x) * sin_emb + + +def apply_rot_embed_list(x: List[torch.Tensor], sin_emb, cos_emb): + if isinstance(x, torch.Tensor): + x = [x] + return [t * cos_emb + rot(t) * sin_emb for t in x] + + +def apply_rot_embed_split(x: torch.Tensor, emb): + split = emb.shape[-1] // 2 + return x * emb[:, :split] + rot(x) * emb[:, split:] + + +def build_rotary_pos_embed( + feat_shape: List[int], + bands: Optional[torch.Tensor] = None, + dim: int = 64, + max_freq: float = 224, + linear_bands: bool = False, + dtype: torch.dtype = torch.float32, + device: Optional[torch.device] = None, +): + """ + NOTE: shape arg should include spatial dim only + """ + feat_shape = torch.Size(feat_shape) + + sin_emb, cos_emb = build_fourier_pos_embed( + feat_shape, bands=bands, num_bands=dim // 4, max_res=max_freq, linear_bands=linear_bands, + concat_out=False, device=device, dtype=dtype) + N = feat_shape.numel() + sin_emb = sin_emb.reshape(N, -1).repeat_interleave(2, -1) + cos_emb = cos_emb.reshape(N, -1).repeat_interleave(2, -1) + return sin_emb, cos_emb + + +class RotaryEmbedding(nn.Module): + """ Rotary position embedding + + NOTE: This is my initial attempt at impl rotary embedding for spatial use, it has not + been well tested, and will likely change. It will be moved to its own file. 
+ + The following impl/resources were referenced for this impl: + * https://github.com/lucidrains/vit-pytorch/blob/6f3a5fcf0bca1c5ec33a35ef48d97213709df4ba/vit_pytorch/rvt.py + * https://blog.eleuther.ai/rotary-embeddings/ + """ + def __init__(self, dim, max_res=224, linear_bands: bool = False): + super().__init__() + self.dim = dim + self.register_buffer('bands', pixel_freq_bands(dim // 4, max_res, linear_bands=linear_bands), persistent=False) + + def get_embed(self, shape: List[int]): + return build_rotary_pos_embed(shape, self.bands) + + def forward(self, x): + # assuming channel-first tensor where spatial dim are >= 2 + sin_emb, cos_emb = self.get_embed(x.shape[2:]) + return apply_rot_embed(x, sin_emb, cos_emb) diff --git a/custom_timm/models/layers/selective_kernel.py b/custom_timm/models/layers/selective_kernel.py new file mode 100644 index 0000000000000000000000000000000000000000..3d71e3aa696b69bbfc00b39228c0e5fb7152c83c --- /dev/null +++ b/custom_timm/models/layers/selective_kernel.py @@ -0,0 +1,119 @@ +""" Selective Kernel Convolution/Attention + +Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586) + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +from torch import nn as nn + +from .conv_bn_act import ConvNormActAa +from .helpers import make_divisible +from .trace_utils import _assert + + +def _kernel_valid(k): + if isinstance(k, (list, tuple)): + for ki in k: + return _kernel_valid(ki) + assert k >= 3 and k % 2 + + +class SelectiveKernelAttn(nn.Module): + def __init__(self, channels, num_paths=2, attn_channels=32, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): + """ Selective Kernel Attention Module + + Selective Kernel attention mechanism factored out into its own module. + + """ + super(SelectiveKernelAttn, self).__init__() + self.num_paths = num_paths + self.fc_reduce = nn.Conv2d(channels, attn_channels, kernel_size=1, bias=False) + self.bn = norm_layer(attn_channels) + self.act = act_layer(inplace=True) + self.fc_select = nn.Conv2d(attn_channels, channels * num_paths, kernel_size=1, bias=False) + + def forward(self, x): + _assert(x.shape[1] == self.num_paths, '') + x = x.sum(1).mean((2, 3), keepdim=True) + x = self.fc_reduce(x) + x = self.bn(x) + x = self.act(x) + x = self.fc_select(x) + B, C, H, W = x.shape + x = x.view(B, self.num_paths, C // self.num_paths, H, W) + x = torch.softmax(x, dim=1) + return x + + +class SelectiveKernel(nn.Module): + + def __init__(self, in_channels, out_channels=None, kernel_size=None, stride=1, dilation=1, groups=1, + rd_ratio=1./16, rd_channels=None, rd_divisor=8, keep_3x3=True, split_input=True, + act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, drop_layer=None): + """ Selective Kernel Convolution Module + + As described in Selective Kernel Networks (https://arxiv.org/abs/1903.06586) with some modifications. + + Largest change is the input split, which divides the input channels across each convolution path, this can + be viewed as a grouping of sorts, but the output channel counts expand to the module level value. This keeps + the parameter count from ballooning when the convolutions themselves don't have groups, but still provides + a noteworthy increase in performance over similar param count models without this attention layer. 
-Ross W + + Args: + in_channels (int): module input (feature) channel count + out_channels (int): module output (feature) channel count + kernel_size (int, list): kernel size for each convolution branch + stride (int): stride for convolutions + dilation (int): dilation for module as a whole, impacts dilation of each branch + groups (int): number of groups for each branch + rd_ratio (int, float): reduction factor for attention features + keep_3x3 (bool): keep all branch convolution kernels as 3x3, changing larger kernels for dilations + split_input (bool): split input channels evenly across each convolution branch, keeps param count lower, + can be viewed as grouping by path, output expands to module out_channels count + act_layer (nn.Module): activation layer to use + norm_layer (nn.Module): batchnorm/norm layer to use + aa_layer (nn.Module): anti-aliasing module + drop_layer (nn.Module): spatial drop module in convs (drop block, etc) + """ + super(SelectiveKernel, self).__init__() + out_channels = out_channels or in_channels + kernel_size = kernel_size or [3, 5] # default to one 3x3 and one 5x5 branch. 5x5 -> 3x3 + dilation + _kernel_valid(kernel_size) + if not isinstance(kernel_size, list): + kernel_size = [kernel_size] * 2 + if keep_3x3: + dilation = [dilation * (k - 1) // 2 for k in kernel_size] + kernel_size = [3] * len(kernel_size) + else: + dilation = [dilation] * len(kernel_size) + self.num_paths = len(kernel_size) + self.in_channels = in_channels + self.out_channels = out_channels + self.split_input = split_input + if self.split_input: + assert in_channels % self.num_paths == 0 + in_channels = in_channels // self.num_paths + groups = min(out_channels, groups) + + conv_kwargs = dict( + stride=stride, groups=groups, act_layer=act_layer, norm_layer=norm_layer, + aa_layer=aa_layer, drop_layer=drop_layer) + self.paths = nn.ModuleList([ + ConvNormActAa(in_channels, out_channels, kernel_size=k, dilation=d, **conv_kwargs) + for k, d in zip(kernel_size, dilation)]) + + attn_channels = rd_channels or make_divisible(out_channels * rd_ratio, divisor=rd_divisor) + self.attn = SelectiveKernelAttn(out_channels, self.num_paths, attn_channels) + + def forward(self, x): + if self.split_input: + x_split = torch.split(x, self.in_channels // self.num_paths, 1) + x_paths = [op(x_split[i]) for i, op in enumerate(self.paths)] + else: + x_paths = [op(x) for op in self.paths] + x = torch.stack(x_paths, dim=1) + x_attn = self.attn(x) + x = x * x_attn + x = torch.sum(x, dim=1) + return x diff --git a/custom_timm/models/layers/separable_conv.py b/custom_timm/models/layers/separable_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..c081e02bc45900a7220bc7ffbb709eedbb1cc4df --- /dev/null +++ b/custom_timm/models/layers/separable_conv.py @@ -0,0 +1,76 @@ +""" Depthwise Separable Conv Modules + +Basic DWS convs. Other variations of DWS exist with batch norm or activations between the +DW and PW convs such as the Depthwise modules in MobileNetV2 / EfficientNet and Xception. 
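+
+Example (an illustrative sketch added here, not part of the original module):
+
+    conv = SeparableConv2d(32, 64, kernel_size=3)
+    y = conv(torch.randn(2, 32, 56, 56))  # depthwise 3x3 then pointwise 1x1 -> [2, 64, 56, 56]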
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +from torch import nn as nn + +from .create_conv2d import create_conv2d +from .create_norm_act import get_norm_act_layer + + +class SeparableConvNormAct(nn.Module): + """ Separable Conv w/ trailing Norm and Activation + """ + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False, + channel_multiplier=1.0, pw_kernel_size=1, norm_layer=nn.BatchNorm2d, act_layer=nn.ReLU, + apply_act=True, drop_layer=None): + super(SeparableConvNormAct, self).__init__() + + self.conv_dw = create_conv2d( + in_channels, int(in_channels * channel_multiplier), kernel_size, + stride=stride, dilation=dilation, padding=padding, depthwise=True) + + self.conv_pw = create_conv2d( + int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias) + + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + norm_kwargs = dict(drop_layer=drop_layer) if drop_layer is not None else {} + self.bn = norm_act_layer(out_channels, apply_act=apply_act, **norm_kwargs) + + @property + def in_channels(self): + return self.conv_dw.in_channels + + @property + def out_channels(self): + return self.conv_pw.out_channels + + def forward(self, x): + x = self.conv_dw(x) + x = self.conv_pw(x) + x = self.bn(x) + return x + + +SeparableConvBnAct = SeparableConvNormAct + + +class SeparableConv2d(nn.Module): + """ Separable Conv + """ + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False, + channel_multiplier=1.0, pw_kernel_size=1): + super(SeparableConv2d, self).__init__() + + self.conv_dw = create_conv2d( + in_channels, int(in_channels * channel_multiplier), kernel_size, + stride=stride, dilation=dilation, padding=padding, depthwise=True) + + self.conv_pw = create_conv2d( + int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias) + + @property + def in_channels(self): + return self.conv_dw.in_channels + + @property + def out_channels(self): + return self.conv_pw.out_channels + + def forward(self, x): + x = self.conv_dw(x) + x = self.conv_pw(x) + return x diff --git a/custom_timm/models/layers/space_to_depth.py b/custom_timm/models/layers/space_to_depth.py new file mode 100644 index 0000000000000000000000000000000000000000..a7e8e0b2a486d51fe3e4ab0472d89b7f1b92e1dc --- /dev/null +++ b/custom_timm/models/layers/space_to_depth.py @@ -0,0 +1,53 @@ +import torch +import torch.nn as nn + + +class SpaceToDepth(nn.Module): + def __init__(self, block_size=4): + super().__init__() + assert block_size == 4 + self.bs = block_size + + def forward(self, x): + N, C, H, W = x.size() + x = x.view(N, C, H // self.bs, self.bs, W // self.bs, self.bs) # (N, C, H//bs, bs, W//bs, bs) + x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # (N, bs, bs, C, H//bs, W//bs) + x = x.view(N, C * (self.bs ** 2), H // self.bs, W // self.bs) # (N, C*bs^2, H//bs, W//bs) + return x + + +@torch.jit.script +class SpaceToDepthJit(object): + def __call__(self, x: torch.Tensor): + # assuming hard-coded that block_size==4 for acceleration + N, C, H, W = x.size() + x = x.view(N, C, H // 4, 4, W // 4, 4) # (N, C, H//bs, bs, W//bs, bs) + x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # (N, bs, bs, C, H//bs, W//bs) + x = x.view(N, C * 16, H // 4, W // 4) # (N, C*bs^2, H//bs, W//bs) + return x + + +class SpaceToDepthModule(nn.Module): + def __init__(self, no_jit=False): + super().__init__() + if not no_jit: + self.op = SpaceToDepthJit() + else: + self.op = 
SpaceToDepth() + + def forward(self, x): + return self.op(x) + + +class DepthToSpace(nn.Module): + + def __init__(self, block_size): + super().__init__() + self.bs = block_size + + def forward(self, x): + N, C, H, W = x.size() + x = x.view(N, self.bs, self.bs, C // (self.bs ** 2), H, W) # (N, bs, bs, C//bs^2, H, W) + x = x.permute(0, 3, 4, 1, 5, 2).contiguous() # (N, C//bs^2, H, bs, W, bs) + x = x.view(N, C // (self.bs ** 2), H * self.bs, W * self.bs) # (N, C//bs^2, H * bs, W * bs) + return x diff --git a/custom_timm/models/layers/split_attn.py b/custom_timm/models/layers/split_attn.py new file mode 100644 index 0000000000000000000000000000000000000000..ac54f8988ac6bdc9e852585692248875a016b7fb --- /dev/null +++ b/custom_timm/models/layers/split_attn.py @@ -0,0 +1,84 @@ +""" Split Attention Conv2d (for ResNeSt Models) + +Paper: `ResNeSt: Split-Attention Networks` - /https://arxiv.org/abs/2004.08955 + +Adapted from original PyTorch impl at https://github.com/zhanghang1989/ResNeSt + +Modified for torchscript compat, performance, and consistency with timm by Ross Wightman +""" +import torch +import torch.nn.functional as F +from torch import nn + +from .helpers import make_divisible + + +class RadixSoftmax(nn.Module): + def __init__(self, radix, cardinality): + super(RadixSoftmax, self).__init__() + self.radix = radix + self.cardinality = cardinality + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x + + +class SplitAttn(nn.Module): + """Split-Attention (aka Splat) + """ + def __init__(self, in_channels, out_channels=None, kernel_size=3, stride=1, padding=None, + dilation=1, groups=1, bias=False, radix=2, rd_ratio=0.25, rd_channels=None, rd_divisor=8, + act_layer=nn.ReLU, norm_layer=None, drop_layer=None, **kwargs): + super(SplitAttn, self).__init__() + out_channels = out_channels or in_channels + self.radix = radix + mid_chs = out_channels * radix + if rd_channels is None: + attn_chs = make_divisible(in_channels * radix * rd_ratio, min_value=32, divisor=rd_divisor) + else: + attn_chs = rd_channels * radix + + padding = kernel_size // 2 if padding is None else padding + self.conv = nn.Conv2d( + in_channels, mid_chs, kernel_size, stride, padding, dilation, + groups=groups * radix, bias=bias, **kwargs) + self.bn0 = norm_layer(mid_chs) if norm_layer else nn.Identity() + self.drop = drop_layer() if drop_layer is not None else nn.Identity() + self.act0 = act_layer(inplace=True) + self.fc1 = nn.Conv2d(out_channels, attn_chs, 1, groups=groups) + self.bn1 = norm_layer(attn_chs) if norm_layer else nn.Identity() + self.act1 = act_layer(inplace=True) + self.fc2 = nn.Conv2d(attn_chs, mid_chs, 1, groups=groups) + self.rsoftmax = RadixSoftmax(radix, groups) + + def forward(self, x): + x = self.conv(x) + x = self.bn0(x) + x = self.drop(x) + x = self.act0(x) + + B, RC, H, W = x.shape + if self.radix > 1: + x = x.reshape((B, self.radix, RC // self.radix, H, W)) + x_gap = x.sum(dim=1) + else: + x_gap = x + x_gap = x_gap.mean((2, 3), keepdim=True) + x_gap = self.fc1(x_gap) + x_gap = self.bn1(x_gap) + x_gap = self.act1(x_gap) + x_attn = self.fc2(x_gap) + + x_attn = self.rsoftmax(x_attn).view(B, -1, 1, 1) + if self.radix > 1: + out = (x * x_attn.reshape((B, self.radix, RC // self.radix, 1, 1))).sum(dim=1) + else: + out = x * x_attn + return out.contiguous() diff --git a/custom_timm/models/layers/split_batchnorm.py 
b/custom_timm/models/layers/split_batchnorm.py new file mode 100644 index 0000000000000000000000000000000000000000..830781b335161f8d6dd74c9458070bb1fa88a918 --- /dev/null +++ b/custom_timm/models/layers/split_batchnorm.py @@ -0,0 +1,75 @@ +""" Split BatchNorm + +A PyTorch BatchNorm layer that splits input batch into N equal parts and passes each through +a separate BN layer. The first split is passed through the parent BN layers with weight/bias +keys the same as the original BN. All other splits pass through BN sub-layers under the '.aux_bn' +namespace. + +This allows easily removing the auxiliary BN layers after training to efficiently +achieve the 'Auxiliary BatchNorm' as described in the AdvProp Paper, section 4.2, +'Disentangled Learning via An Auxiliary BN' + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn + + +class SplitBatchNorm2d(torch.nn.BatchNorm2d): + + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, + track_running_stats=True, num_splits=2): + super().__init__(num_features, eps, momentum, affine, track_running_stats) + assert num_splits > 1, 'Should have at least one aux BN layer (num_splits at least 2)' + self.num_splits = num_splits + self.aux_bn = nn.ModuleList([ + nn.BatchNorm2d(num_features, eps, momentum, affine, track_running_stats) for _ in range(num_splits - 1)]) + + def forward(self, input: torch.Tensor): + if self.training: # aux BN only relevant while training + split_size = input.shape[0] // self.num_splits + assert input.shape[0] == split_size * self.num_splits, "batch size must be evenly divisible by num_splits" + split_input = input.split(split_size) + x = [super().forward(split_input[0])] + for i, a in enumerate(self.aux_bn): + x.append(a(split_input[i + 1])) + return torch.cat(x, dim=0) + else: + return super().forward(input) + + +def convert_splitbn_model(module, num_splits=2): + """ + Recursively traverse module and its children to replace all instances of + ``torch.nn.modules.batchnorm._BatchNorm`` with `SplitBatchnorm2d`. 
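+ The parent BN keeps the original running stats and affine parameters; each `.aux_bn`
+ sub-layer starts from a cloned copy of them.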
+ Args: + module (torch.nn.Module): input module + num_splits: number of separate batchnorm layers to split input across + Example:: + >>> # model is an instance of torch.nn.Module + >>> model = timm.models.convert_splitbn_model(model, num_splits=2) + """ + mod = module + if isinstance(module, torch.nn.modules.instancenorm._InstanceNorm): + return module + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): + mod = SplitBatchNorm2d( + module.num_features, module.eps, module.momentum, module.affine, + module.track_running_stats, num_splits=num_splits) + mod.running_mean = module.running_mean + mod.running_var = module.running_var + mod.num_batches_tracked = module.num_batches_tracked + if module.affine: + mod.weight.data = module.weight.data.clone().detach() + mod.bias.data = module.bias.data.clone().detach() + for aux in mod.aux_bn: + aux.running_mean = module.running_mean.clone() + aux.running_var = module.running_var.clone() + aux.num_batches_tracked = module.num_batches_tracked.clone() + if module.affine: + aux.weight.data = module.weight.data.clone().detach() + aux.bias.data = module.bias.data.clone().detach() + for name, child in module.named_children(): + mod.add_module(name, convert_splitbn_model(child, num_splits=num_splits)) + del module + return mod diff --git a/custom_timm/models/layers/squeeze_excite.py b/custom_timm/models/layers/squeeze_excite.py new file mode 100644 index 0000000000000000000000000000000000000000..2e41d956c964fbccc369864303cdb4a6e3560d9c --- /dev/null +++ b/custom_timm/models/layers/squeeze_excite.py @@ -0,0 +1,74 @@ +""" Squeeze-and-Excitation Channel Attention + +An SE implementation originally based on PyTorch SE-Net impl. +Has since evolved with additional functionality / configuration. + +Paper: `Squeeze-and-Excitation Networks` - https://arxiv.org/abs/1709.01507 + +Also included is Effective Squeeze-Excitation (ESE). +Paper: `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 + +Hacked together by / Copyright 2021 Ross Wightman +""" +from torch import nn as nn + +from .create_act import create_act_layer +from .helpers import make_divisible + + +class SEModule(nn.Module): + """ SE Module as defined in original SE-Nets with a few additions + Additions include: + * divisor can be specified to keep channels % div == 0 (default: 8) + * reduction channels can be specified directly by arg (if rd_channels is set) + * reduction channels can be specified by float rd_ratio (default: 1/16) + * global max pooling can be added to the squeeze aggregation + * customizable activation, normalization, and gate layer + """ + def __init__( + self, channels, rd_ratio=1. / 16, rd_channels=None, rd_divisor=8, add_maxpool=False, + bias=True, act_layer=nn.ReLU, norm_layer=None, gate_layer='sigmoid'): + super(SEModule, self).__init__() + self.add_maxpool = add_maxpool + if not rd_channels: + rd_channels = make_divisible(channels * rd_ratio, rd_divisor, round_limit=0.) 
+ self.fc1 = nn.Conv2d(channels, rd_channels, kernel_size=1, bias=bias) + self.bn = norm_layer(rd_channels) if norm_layer else nn.Identity() + self.act = create_act_layer(act_layer, inplace=True) + self.fc2 = nn.Conv2d(rd_channels, channels, kernel_size=1, bias=bias) + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + x_se = x.mean((2, 3), keepdim=True) + if self.add_maxpool: + # experimental codepath, may remove or change + x_se = 0.5 * x_se + 0.5 * x.amax((2, 3), keepdim=True) + x_se = self.fc1(x_se) + x_se = self.act(self.bn(x_se)) + x_se = self.fc2(x_se) + return x * self.gate(x_se) + + +SqueezeExcite = SEModule # alias + + +class EffectiveSEModule(nn.Module): + """ 'Effective Squeeze-Excitation + From `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 + """ + def __init__(self, channels, add_maxpool=False, gate_layer='hard_sigmoid', **_): + super(EffectiveSEModule, self).__init__() + self.add_maxpool = add_maxpool + self.fc = nn.Conv2d(channels, channels, kernel_size=1, padding=0) + self.gate = create_act_layer(gate_layer) + + def forward(self, x): + x_se = x.mean((2, 3), keepdim=True) + if self.add_maxpool: + # experimental codepath, may remove or change + x_se = 0.5 * x_se + 0.5 * x.amax((2, 3), keepdim=True) + x_se = self.fc(x_se) + return x * self.gate(x_se) + + +EffectiveSqueezeExcite = EffectiveSEModule # alias diff --git a/custom_timm/models/layers/std_conv.py b/custom_timm/models/layers/std_conv.py new file mode 100644 index 0000000000000000000000000000000000000000..d896ba5c2f7f517d6ce0508d789a516e7bfb4cf1 --- /dev/null +++ b/custom_timm/models/layers/std_conv.py @@ -0,0 +1,133 @@ +""" Convolution with Weight Standardization (StdConv and ScaledStdConv) + +StdConv: +@article{weightstandardization, + author = {Siyuan Qiao and Huiyu Wang and Chenxi Liu and Wei Shen and Alan Yuille}, + title = {Weight Standardization}, + journal = {arXiv preprint arXiv:1903.10520}, + year = {2019}, +} +Code: https://github.com/joe-siyuan-qiao/WeightStandardization + +ScaledStdConv: +Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 +Official Deepmind JAX code: https://github.com/deepmind/deepmind-research/tree/master/nfnets + +Hacked together by / copyright Ross Wightman, 2021. +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .padding import get_padding, get_padding_value, pad_same + + +class StdConv2d(nn.Conv2d): + """Conv2d with Weight Standardization. Used for BiT ResNet-V2 models. + + Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - + https://arxiv.org/abs/1903.10520v2 + """ + def __init__( + self, in_channel, out_channels, kernel_size, stride=1, padding=None, + dilation=1, groups=1, bias=False, eps=1e-6): + if padding is None: + padding = get_padding(kernel_size, stride, dilation) + super().__init__( + in_channel, out_channels, kernel_size, stride=stride, + padding=padding, dilation=dilation, groups=groups, bias=bias) + self.eps = eps + + def forward(self, x): + weight = F.batch_norm( + self.weight.reshape(1, self.out_channels, -1), None, None, + training=True, momentum=0., eps=self.eps).reshape_as(self.weight) + x = F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) + return x + + +class StdConv2dSame(nn.Conv2d): + """Conv2d with Weight Standardization. TF compatible SAME padding. Used for ViT Hybrid model. 
+ + Paper: `Micro-Batch Training with Batch-Channel Normalization and Weight Standardization` - + https://arxiv.org/abs/1903.10520v2 + """ + def __init__( + self, in_channel, out_channels, kernel_size, stride=1, padding='SAME', + dilation=1, groups=1, bias=False, eps=1e-6): + padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation) + super().__init__( + in_channel, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, + groups=groups, bias=bias) + self.same_pad = is_dynamic + self.eps = eps + + def forward(self, x): + if self.same_pad: + x = pad_same(x, self.kernel_size, self.stride, self.dilation) + weight = F.batch_norm( + self.weight.reshape(1, self.out_channels, -1), None, None, + training=True, momentum=0., eps=self.eps).reshape_as(self.weight) + x = F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) + return x + + +class ScaledStdConv2d(nn.Conv2d): + """Conv2d layer with Scaled Weight Standardization. + + Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - + https://arxiv.org/abs/2101.08692 + + NOTE: the operations used in this impl differ slightly from the DeepMind Haiku impl. The impact is minor. + """ + + def __init__( + self, in_channels, out_channels, kernel_size, stride=1, padding=None, + dilation=1, groups=1, bias=True, gamma=1.0, eps=1e-6, gain_init=1.0): + if padding is None: + padding = get_padding(kernel_size, stride, dilation) + super().__init__( + in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, + groups=groups, bias=bias) + self.gain = nn.Parameter(torch.full((self.out_channels, 1, 1, 1), gain_init)) + self.scale = gamma * self.weight[0].numel() ** -0.5 # gamma * 1 / sqrt(fan-in) + self.eps = eps + + def forward(self, x): + weight = F.batch_norm( + self.weight.reshape(1, self.out_channels, -1), None, None, + weight=(self.gain * self.scale).view(-1), + training=True, momentum=0., eps=self.eps).reshape_as(self.weight) + return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) + + +class ScaledStdConv2dSame(nn.Conv2d): + """Conv2d layer with Scaled Weight Standardization and Tensorflow-like SAME padding support + + Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` - + https://arxiv.org/abs/2101.08692 + + NOTE: the operations used in this impl differ slightly from the DeepMind Haiku impl. The impact is minor. 
+ """ + + def __init__( + self, in_channels, out_channels, kernel_size, stride=1, padding='SAME', + dilation=1, groups=1, bias=True, gamma=1.0, eps=1e-6, gain_init=1.0): + padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation) + super().__init__( + in_channels, out_channels, kernel_size, stride=stride, padding=padding, dilation=dilation, + groups=groups, bias=bias) + self.gain = nn.Parameter(torch.full((self.out_channels, 1, 1, 1), gain_init)) + self.scale = gamma * self.weight[0].numel() ** -0.5 + self.same_pad = is_dynamic + self.eps = eps + + def forward(self, x): + if self.same_pad: + x = pad_same(x, self.kernel_size, self.stride, self.dilation) + weight = F.batch_norm( + self.weight.reshape(1, self.out_channels, -1), None, None, + weight=(self.gain * self.scale).view(-1), + training=True, momentum=0., eps=self.eps).reshape_as(self.weight) + return F.conv2d(x, weight, self.bias, self.stride, self.padding, self.dilation, self.groups) diff --git a/custom_timm/models/layers/test_time_pool.py b/custom_timm/models/layers/test_time_pool.py new file mode 100644 index 0000000000000000000000000000000000000000..5826d8c966d7bffa62f5f5fdd224f3f691276ce6 --- /dev/null +++ b/custom_timm/models/layers/test_time_pool.py @@ -0,0 +1,52 @@ +""" Test Time Pooling (Average-Max Pool) + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import logging +from torch import nn +import torch.nn.functional as F + +from .adaptive_avgmax_pool import adaptive_avgmax_pool2d + + +_logger = logging.getLogger(__name__) + + +class TestTimePoolHead(nn.Module): + def __init__(self, base, original_pool=7): + super(TestTimePoolHead, self).__init__() + self.base = base + self.original_pool = original_pool + base_fc = self.base.get_classifier() + if isinstance(base_fc, nn.Conv2d): + self.fc = base_fc + else: + self.fc = nn.Conv2d( + self.base.num_features, self.base.num_classes, kernel_size=1, bias=True) + self.fc.weight.data.copy_(base_fc.weight.data.view(self.fc.weight.size())) + self.fc.bias.data.copy_(base_fc.bias.data.view(self.fc.bias.size())) + self.base.reset_classifier(0) # delete original fc layer + + def forward(self, x): + x = self.base.forward_features(x) + x = F.avg_pool2d(x, kernel_size=self.original_pool, stride=1) + x = self.fc(x) + x = adaptive_avgmax_pool2d(x, 1) + return x.view(x.size(0), -1) + + +def apply_test_time_pool(model, config, use_test_size=False): + test_time_pool = False + if not hasattr(model, 'default_cfg') or not model.default_cfg: + return model, False + if use_test_size and 'test_input_size' in model.default_cfg: + df_input_size = model.default_cfg['test_input_size'] + else: + df_input_size = model.default_cfg['input_size'] + if config['input_size'][-1] > df_input_size[-1] and config['input_size'][-2] > df_input_size[-2]: + _logger.info('Target input size %s > pretrained default %s, using test time pooling' % + (str(config['input_size'][-2:]), str(df_input_size[-2:]))) + model = TestTimePoolHead(model, original_pool=model.default_cfg['pool_size']) + test_time_pool = True + return model, test_time_pool diff --git a/custom_timm/models/layers/trace_utils.py b/custom_timm/models/layers/trace_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..83970729e628b525d24162f5df37ee5bc253438f --- /dev/null +++ b/custom_timm/models/layers/trace_utils.py @@ -0,0 +1,13 @@ +try: + from torch import _assert +except ImportError: + def _assert(condition: bool, message: str): + assert condition, message + + +def _float_to_int(x: float) -> 
int: + """ + Symbolic tracing helper to substitute for inbuilt `int`. + Hint: Inbuilt `int` can't accept an argument of type `Proxy` + """ + return int(x) diff --git a/custom_timm/models/layers/weight_init.py b/custom_timm/models/layers/weight_init.py new file mode 100644 index 0000000000000000000000000000000000000000..943e4f4cec20eb03230748aa919010d017643303 --- /dev/null +++ b/custom_timm/models/layers/weight_init.py @@ -0,0 +1,125 @@ +import torch +import math +import warnings + +from torch.nn.init import _calculate_fan_in_and_fan_out + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + + NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are + applied while sampling the normal with mean/std applied, therefore a, b args + should be adjusted to match the range of mean, std args. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + with torch.no_grad(): + return _trunc_normal_(tensor, mean, std, a, b) + + +def trunc_normal_tf_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
+ + NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the + bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0 + and the result is subsquently scaled and shifted by the mean and std args. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + with torch.no_grad(): + _trunc_normal_(tensor, 0, 1.0, a, b) + tensor.mul_(std).add_(mean) + return tensor + + +def variance_scaling_(tensor, scale=1.0, mode='fan_in', distribution='normal'): + fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor) + if mode == 'fan_in': + denom = fan_in + elif mode == 'fan_out': + denom = fan_out + elif mode == 'fan_avg': + denom = (fan_in + fan_out) / 2 + + variance = scale / denom + + if distribution == "truncated_normal": + # constant is stddev of standard normal truncated to (-2, 2) + trunc_normal_tf_(tensor, std=math.sqrt(variance) / .87962566103423978) + elif distribution == "normal": + with torch.no_grad(): + tensor.normal_(std=math.sqrt(variance)) + elif distribution == "uniform": + bound = math.sqrt(3 * variance) + with torch.no_grad(): + tensor.uniform_(-bound, bound) + else: + raise ValueError(f"invalid distribution {distribution}") + + +def lecun_normal_(tensor): + variance_scaling_(tensor, mode='fan_in', distribution='truncated_normal') diff --git a/custom_timm/models/levit.py b/custom_timm/models/levit.py new file mode 100644 index 0000000000000000000000000000000000000000..3f8a360681a6d7381eb28d1ec716bb061fb7e5e5 --- /dev/null +++ b/custom_timm/models/levit.py @@ -0,0 +1,592 @@ +""" LeViT + +Paper: `LeViT: a Vision Transformer in ConvNet's Clothing for Faster Inference` + - https://arxiv.org/abs/2104.01136 + +@article{graham2021levit, + title={LeViT: a Vision Transformer in ConvNet's Clothing for Faster Inference}, + author={Benjamin Graham and Alaaeldin El-Nouby and Hugo Touvron and Pierre Stock and Armand Joulin and Herv\'e J\'egou and Matthijs Douze}, + journal={arXiv preprint arXiv:22104.01136}, + year={2021} +} + +Adapted from official impl at https://github.com/facebookresearch/LeViT, original copyright bellow. + +This version combines both conv/linear models and fixes torchscript compatibility. + +Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman +""" + +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. 
+ +# Modified from +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +# Copyright 2020 Ross Wightman, Apache-2.0 License +import itertools +from copy import deepcopy +from functools import partial +from typing import Dict + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_STD, IMAGENET_DEFAULT_MEAN +from .helpers import build_model_with_cfg, checkpoint_seq +from .layers import to_ntuple, get_act_layer +from .vision_transformer import trunc_normal_ +from .registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.0.c', 'classifier': ('head.l', 'head_dist.l'), + **kwargs + } + + +default_cfgs = dict( + levit_128s=_cfg( + url='https://dl.fbaipublicfiles.com/LeViT/LeViT-128S-96703c44.pth' + ), + levit_128=_cfg( + url='https://dl.fbaipublicfiles.com/LeViT/LeViT-128-b88c2750.pth' + ), + levit_192=_cfg( + url='https://dl.fbaipublicfiles.com/LeViT/LeViT-192-92712e41.pth' + ), + levit_256=_cfg( + url='https://dl.fbaipublicfiles.com/LeViT/LeViT-256-13b5763e.pth' + ), + levit_384=_cfg( + url='https://dl.fbaipublicfiles.com/LeViT/LeViT-384-9bdaf2e2.pth' + ), + + levit_256d=_cfg(url='', classifier='head.l'), +) + +model_cfgs = dict( + levit_128s=dict( + embed_dim=(128, 256, 384), key_dim=16, num_heads=(4, 6, 8), depth=(2, 3, 4)), + levit_128=dict( + embed_dim=(128, 256, 384), key_dim=16, num_heads=(4, 8, 12), depth=(4, 4, 4)), + levit_192=dict( + embed_dim=(192, 288, 384), key_dim=32, num_heads=(3, 5, 6), depth=(4, 4, 4)), + levit_256=dict( + embed_dim=(256, 384, 512), key_dim=32, num_heads=(4, 6, 8), depth=(4, 4, 4)), + levit_384=dict( + embed_dim=(384, 512, 768), key_dim=32, num_heads=(6, 9, 12), depth=(4, 4, 4)), + + levit_256d=dict( + embed_dim=(256, 384, 512), key_dim=32, num_heads=(4, 6, 8), depth=(4, 8, 6)), +) + +__all__ = ['Levit'] + + +@register_model +def levit_128s(pretrained=False, use_conv=False, **kwargs): + return create_levit( + 'levit_128s', pretrained=pretrained, use_conv=use_conv, **kwargs) + + +@register_model +def levit_128(pretrained=False, use_conv=False, **kwargs): + return create_levit( + 'levit_128', pretrained=pretrained, use_conv=use_conv, **kwargs) + + +@register_model +def levit_192(pretrained=False, use_conv=False, **kwargs): + return create_levit( + 'levit_192', pretrained=pretrained, use_conv=use_conv, **kwargs) + + +@register_model +def levit_256(pretrained=False, use_conv=False, **kwargs): + return create_levit( + 'levit_256', pretrained=pretrained, use_conv=use_conv, **kwargs) + + +@register_model +def levit_384(pretrained=False, use_conv=False, **kwargs): + return create_levit( + 'levit_384', pretrained=pretrained, use_conv=use_conv, **kwargs) + + +@register_model +def levit_256d(pretrained=False, use_conv=False, **kwargs): + return create_levit( + 'levit_256d', pretrained=pretrained, use_conv=use_conv, distilled=False, **kwargs) + + +class ConvNorm(nn.Sequential): + def __init__( + self, in_chs, out_chs, kernel_size=1, stride=1, pad=0, dilation=1, + groups=1, bn_weight_init=1, resolution=-10000): + super().__init__() + self.add_module('c', nn.Conv2d(in_chs, out_chs, kernel_size, stride, pad, dilation, groups, bias=False)) + self.add_module('bn', nn.BatchNorm2d(out_chs)) + + nn.init.constant_(self.bn.weight, 
bn_weight_init) + + @torch.no_grad() + def fuse(self): + c, bn = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = c.weight * w[:, None, None, None] + b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 + m = nn.Conv2d( + w.size(1), w.size(0), w.shape[2:], stride=self.c.stride, + padding=self.c.padding, dilation=self.c.dilation, groups=self.c.groups) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +class LinearNorm(nn.Sequential): + def __init__(self, in_features, out_features, bn_weight_init=1, resolution=-100000): + super().__init__() + self.add_module('c', nn.Linear(in_features, out_features, bias=False)) + self.add_module('bn', nn.BatchNorm1d(out_features)) + + nn.init.constant_(self.bn.weight, bn_weight_init) + + @torch.no_grad() + def fuse(self): + l, bn = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = l.weight * w[:, None] + b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 + m = nn.Linear(w.size(1), w.size(0)) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + def forward(self, x): + x = self.c(x) + return self.bn(x.flatten(0, 1)).reshape_as(x) + + +class NormLinear(nn.Sequential): + def __init__(self, in_features, out_features, bias=True, std=0.02): + super().__init__() + self.add_module('bn', nn.BatchNorm1d(in_features)) + self.add_module('l', nn.Linear(in_features, out_features, bias=bias)) + + trunc_normal_(self.l.weight, std=std) + if self.l.bias is not None: + nn.init.constant_(self.l.bias, 0) + + @torch.no_grad() + def fuse(self): + bn, l = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + b = bn.bias - self.bn.running_mean * self.bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = l.weight * w[None, :] + if l.bias is None: + b = b @ self.l.weight.T + else: + b = (l.weight @ b[:, None]).view(-1) + self.l.bias + m = nn.Linear(w.size(1), w.size(0)) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +def stem_b16(in_chs, out_chs, activation, resolution=224): + return nn.Sequential( + ConvNorm(in_chs, out_chs // 8, 3, 2, 1, resolution=resolution), + activation(), + ConvNorm(out_chs // 8, out_chs // 4, 3, 2, 1, resolution=resolution // 2), + activation(), + ConvNorm(out_chs // 4, out_chs // 2, 3, 2, 1, resolution=resolution // 4), + activation(), + ConvNorm(out_chs // 2, out_chs, 3, 2, 1, resolution=resolution // 8)) + + +class Residual(nn.Module): + def __init__(self, m, drop): + super().__init__() + self.m = m + self.drop = drop + + def forward(self, x): + if self.training and self.drop > 0: + return x + self.m(x) * torch.rand( + x.size(0), 1, 1, device=x.device).ge_(self.drop).div(1 - self.drop).detach() + else: + return x + self.m(x) + + +class Subsample(nn.Module): + def __init__(self, stride, resolution): + super().__init__() + self.stride = stride + self.resolution = resolution + + def forward(self, x): + B, N, C = x.shape + x = x.view(B, self.resolution, self.resolution, C)[:, ::self.stride, ::self.stride] + return x.reshape(B, -1, C) + + +class Attention(nn.Module): + ab: Dict[str, torch.Tensor] + + def __init__( + self, dim, key_dim, num_heads=8, attn_ratio=4, act_layer=None, resolution=14, use_conv=False): + super().__init__() + ln_layer = ConvNorm if use_conv else LinearNorm + self.use_conv = use_conv + self.num_heads = num_heads + self.scale = key_dim ** -0.5 + self.key_dim = key_dim + self.key_attn_dim = key_dim * num_heads + self.val_dim = int(attn_ratio * key_dim) + self.val_attn_dim = 
int(attn_ratio * key_dim) * num_heads + + self.qkv = ln_layer(dim, self.val_attn_dim + self.key_attn_dim * 2, resolution=resolution) + self.proj = nn.Sequential( + act_layer(), + ln_layer(self.val_attn_dim, dim, bn_weight_init=0, resolution=resolution) + ) + + self.attention_biases = nn.Parameter(torch.zeros(num_heads, resolution ** 2)) + pos = torch.stack(torch.meshgrid(torch.arange(resolution), torch.arange(resolution))).flatten(1) + rel_pos = (pos[..., :, None] - pos[..., None, :]).abs() + rel_pos = (rel_pos[0] * resolution) + rel_pos[1] + self.register_buffer('attention_bias_idxs', rel_pos) + self.ab = {} + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and self.ab: + self.ab = {} # clear ab cache + + def get_attention_biases(self, device: torch.device) -> torch.Tensor: + if self.training: + return self.attention_biases[:, self.attention_bias_idxs] + else: + device_key = str(device) + if device_key not in self.ab: + self.ab[device_key] = self.attention_biases[:, self.attention_bias_idxs] + return self.ab[device_key] + + def forward(self, x): # x (B,C,H,W) + if self.use_conv: + B, C, H, W = x.shape + q, k, v = self.qkv(x).view( + B, self.num_heads, -1, H * W).split([self.key_dim, self.key_dim, self.val_dim], dim=2) + + attn = (q.transpose(-2, -1) @ k) * self.scale + self.get_attention_biases(x.device) + attn = attn.softmax(dim=-1) + + x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) + else: + B, N, C = x.shape + q, k, v = self.qkv(x).view( + B, N, self.num_heads, -1).split([self.key_dim, self.key_dim, self.val_dim], dim=3) + q = q.permute(0, 2, 1, 3) + k = k.permute(0, 2, 3, 1) + v = v.permute(0, 2, 1, 3) + + attn = q @ k * self.scale + self.get_attention_biases(x.device) + attn = attn.softmax(dim=-1) + + x = (attn @ v).transpose(1, 2).reshape(B, N, self.val_attn_dim) + x = self.proj(x) + return x + + +class AttentionSubsample(nn.Module): + ab: Dict[str, torch.Tensor] + + def __init__( + self, in_dim, out_dim, key_dim, num_heads=8, attn_ratio=2, + act_layer=None, stride=2, resolution=14, resolution_out=7, use_conv=False): + super().__init__() + self.stride = stride + self.num_heads = num_heads + self.scale = key_dim ** -0.5 + self.key_dim = key_dim + self.key_attn_dim = key_dim * num_heads + self.val_dim = int(attn_ratio * key_dim) + self.val_attn_dim = self.val_dim * self.num_heads + self.resolution = resolution + self.resolution_out_area = resolution_out ** 2 + + self.use_conv = use_conv + if self.use_conv: + ln_layer = ConvNorm + sub_layer = partial(nn.AvgPool2d, kernel_size=1, padding=0) + else: + ln_layer = LinearNorm + sub_layer = partial(Subsample, resolution=resolution) + + self.kv = ln_layer(in_dim, self.val_attn_dim + self.key_attn_dim, resolution=resolution) + self.q = nn.Sequential( + sub_layer(stride=stride), + ln_layer(in_dim, self.key_attn_dim, resolution=resolution_out) + ) + self.proj = nn.Sequential( + act_layer(), + ln_layer(self.val_attn_dim, out_dim, resolution=resolution_out) + ) + + self.attention_biases = nn.Parameter(torch.zeros(num_heads, self.resolution ** 2)) + k_pos = torch.stack(torch.meshgrid(torch.arange(resolution), torch.arange(resolution))).flatten(1) + q_pos = torch.stack(torch.meshgrid( + torch.arange(0, resolution, step=stride), + torch.arange(0, resolution, step=stride))).flatten(1) + rel_pos = (q_pos[..., :, None] - k_pos[..., None, :]).abs() + rel_pos = (rel_pos[0] * resolution) + rel_pos[1] + self.register_buffer('attention_bias_idxs', rel_pos) + + self.ab = {} # per-device attention_biases cache + + 
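+ # NOTE: eval-time attention biases are gathered once per device and cached in `self.ab`;
+ # train() below clears that cache so freshly updated `attention_biases` are re-gathered.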
@torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and self.ab: + self.ab = {} # clear ab cache + + def get_attention_biases(self, device: torch.device) -> torch.Tensor: + if self.training: + return self.attention_biases[:, self.attention_bias_idxs] + else: + device_key = str(device) + if device_key not in self.ab: + self.ab[device_key] = self.attention_biases[:, self.attention_bias_idxs] + return self.ab[device_key] + + def forward(self, x): + if self.use_conv: + B, C, H, W = x.shape + k, v = self.kv(x).view(B, self.num_heads, -1, H * W).split([self.key_dim, self.val_dim], dim=2) + q = self.q(x).view(B, self.num_heads, self.key_dim, self.resolution_out_area) + + attn = (q.transpose(-2, -1) @ k) * self.scale + self.get_attention_biases(x.device) + attn = attn.softmax(dim=-1) + + x = (v @ attn.transpose(-2, -1)).reshape(B, -1, self.resolution, self.resolution) + else: + B, N, C = x.shape + k, v = self.kv(x).view(B, N, self.num_heads, -1).split([self.key_dim, self.val_dim], dim=3) + k = k.permute(0, 2, 3, 1) # BHCN + v = v.permute(0, 2, 1, 3) # BHNC + q = self.q(x).view(B, self.resolution_out_area, self.num_heads, self.key_dim).permute(0, 2, 1, 3) + + attn = q @ k * self.scale + self.get_attention_biases(x.device) + attn = attn.softmax(dim=-1) + + x = (attn @ v).transpose(1, 2).reshape(B, -1, self.val_attn_dim) + x = self.proj(x) + return x + + +class Levit(nn.Module): + """ Vision Transformer with support for patch or hybrid CNN input stage + + NOTE: distillation is defaulted to True since pretrained weights use it, will cause problems + w/ train scripts that don't take tuple outputs, + """ + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + embed_dim=(192,), + key_dim=64, + depth=(12,), + num_heads=(3,), + attn_ratio=2, + mlp_ratio=2, + hybrid_backbone=None, + down_ops=None, + act_layer='hard_swish', + attn_act_layer='hard_swish', + use_conv=False, + global_pool='avg', + drop_rate=0., + drop_path_rate=0.): + super().__init__() + act_layer = get_act_layer(act_layer) + attn_act_layer = get_act_layer(attn_act_layer) + ln_layer = ConvNorm if use_conv else LinearNorm + self.use_conv = use_conv + if isinstance(img_size, tuple): + # FIXME origin impl passes single img/res dim through whole hierarchy, + # not sure this model will be used enough to spend time fixing it. 
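+ # only square inputs are handled; collapse the (H, W) tuple to a single int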
+ assert img_size[0] == img_size[1] + img_size = img_size[0] + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = embed_dim[-1] + self.embed_dim = embed_dim + self.grad_checkpointing = False + + num_stages = len(embed_dim) + assert len(depth) == len(num_heads) == num_stages + key_dim = to_ntuple(num_stages)(key_dim) + attn_ratio = to_ntuple(num_stages)(attn_ratio) + mlp_ratio = to_ntuple(num_stages)(mlp_ratio) + down_ops = down_ops or ( + # ('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + ('Subsample', key_dim[0], embed_dim[0] // key_dim[0], 4, 2, 2), + ('Subsample', key_dim[0], embed_dim[1] // key_dim[1], 4, 2, 2), + ('',) + ) + + self.patch_embed = hybrid_backbone or stem_b16(in_chans, embed_dim[0], activation=act_layer) + + self.blocks = [] + resolution = img_size // patch_size + for i, (ed, kd, dpth, nh, ar, mr, do) in enumerate( + zip(embed_dim, key_dim, depth, num_heads, attn_ratio, mlp_ratio, down_ops)): + for _ in range(dpth): + self.blocks.append( + Residual( + Attention( + ed, kd, nh, attn_ratio=ar, act_layer=attn_act_layer, + resolution=resolution, use_conv=use_conv), + drop_path_rate)) + if mr > 0: + h = int(ed * mr) + self.blocks.append( + Residual(nn.Sequential( + ln_layer(ed, h, resolution=resolution), + act_layer(), + ln_layer(h, ed, bn_weight_init=0, resolution=resolution), + ), drop_path_rate)) + if do[0] == 'Subsample': + # ('Subsample',key_dim, num_heads, attn_ratio, mlp_ratio, stride) + resolution_out = (resolution - 1) // do[5] + 1 + self.blocks.append( + AttentionSubsample( + *embed_dim[i:i + 2], key_dim=do[1], num_heads=do[2], + attn_ratio=do[3], act_layer=attn_act_layer, stride=do[5], + resolution=resolution, resolution_out=resolution_out, use_conv=use_conv)) + resolution = resolution_out + if do[4] > 0: # mlp_ratio + h = int(embed_dim[i + 1] * do[4]) + self.blocks.append( + Residual(nn.Sequential( + ln_layer(embed_dim[i + 1], h, resolution=resolution), + act_layer(), + ln_layer(h, embed_dim[i + 1], bn_weight_init=0, resolution=resolution), + ), drop_path_rate)) + self.blocks = nn.Sequential(*self.blocks) + + # Classifier head + self.head = NormLinear(embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity() + + @torch.jit.ignore + def no_weight_decay(self): + return {x for x in self.state_dict().keys() if 'attention_biases' in x} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^cls_token|pos_embed|patch_embed', # stem and embed + blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None, distillation=None): + self.num_classes = num_classes + if global_pool is not None: + self.global_pool = global_pool + self.head = NormLinear(self.embed_dim[-1], num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + if not self.use_conv: + x = x.flatten(2).transpose(1, 2) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool == 'avg': + x = x.mean(dim=(-2, -1)) if self.use_conv else x.mean(dim=1) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = 
self.forward_head(x) + return x + + +class LevitDistilled(Levit): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.head_dist = NormLinear(self.num_features, self.num_classes) if self.num_classes > 0 else nn.Identity() + self.distilled_training = False # must set this True to train w/ distillation token + + @torch.jit.ignore + def get_classifier(self): + return self.head, self.head_dist + + def reset_classifier(self, num_classes, global_pool=None, distillation=None): + self.num_classes = num_classes + if global_pool is not None: + self.global_pool = global_pool + self.head = NormLinear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + self.head_dist = NormLinear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + @torch.jit.ignore + def set_distilled_training(self, enable=True): + self.distilled_training = enable + + def forward_head(self, x): + if self.global_pool == 'avg': + x = x.mean(dim=(-2, -1)) if self.use_conv else x.mean(dim=1) + x, x_dist = self.head(x), self.head_dist(x) + if self.distilled_training and self.training and not torch.jit.is_scripting(): + # only return separate classification predictions when training in distilled mode + return x, x_dist + else: + # during standard train/finetune, inference average the classifier predictions + return (x + x_dist) / 2 + + +def checkpoint_filter_fn(state_dict, model): + if 'model' in state_dict: + # For deit models + state_dict = state_dict['model'] + D = model.state_dict() + for k in state_dict.keys(): + if k in D and D[k].ndim == 4 and state_dict[k].ndim == 2: + state_dict[k] = state_dict[k][:, :, None, None] + return state_dict + + +def create_levit(variant, pretrained=False, distilled=True, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + model_cfg = dict(**model_cfgs[variant], **kwargs) + model = build_model_with_cfg( + LevitDistilled if distilled else Levit, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + **model_cfg) + return model + diff --git a/custom_timm/models/maxxvit.py b/custom_timm/models/maxxvit.py new file mode 100644 index 0000000000000000000000000000000000000000..f01e0812e86cb6a205d0bb18adf7de1d03a3e318 --- /dev/null +++ b/custom_timm/models/maxxvit.py @@ -0,0 +1,1914 @@ +""" MaxVit and CoAtNet Vision Transformer - CNN Hybrids in PyTorch + +This is a from-scratch implementation of both CoAtNet and MaxVit in PyTorch. + +99% of the implementation was done from papers, however last minute some adjustments were made +based on the (as yet unfinished?) public code release https://github.com/google-research/maxvit + +There are multiple sets of models defined for both architectures. Typically, names with a + `_rw` suffix are my own original configs prior to referencing https://github.com/google-research/maxvit. +These configs work well and appear to be a bit faster / lower resource than the paper. + +The models without extra prefix / suffix' (coatnet_0_224, maxvit_tiny_224, etc), are intended to +match paper, BUT, without any official pretrained weights it's difficult to confirm a 100% match. + +# FIXME / WARNING +This impl remains a WIP, some configs and models may vanish or change... 
+ +Papers: + +MaxViT: Multi-Axis Vision Transformer - https://arxiv.org/abs/2204.01697 +@article{tu2022maxvit, + title={MaxViT: Multi-Axis Vision Transformer}, + author={Tu, Zhengzhong and Talebi, Hossein and Zhang, Han and Yang, Feng and Milanfar, Peyman and Bovik, Alan and Li, Yinxiao}, + journal={ECCV}, + year={2022}, +} + +CoAtNet: Marrying Convolution and Attention for All Data Sizes - https://arxiv.org/abs/2106.04803 +@article{DBLP:journals/corr/abs-2106-04803, + author = {Zihang Dai and Hanxiao Liu and Quoc V. Le and Mingxing Tan}, + title = {CoAtNet: Marrying Convolution and Attention for All Data Sizes}, + journal = {CoRR}, + volume = {abs/2106.04803}, + year = {2021} +} + +Hacked together by / Copyright 2022, Ross Wightman +""" + +import math +from collections import OrderedDict +from dataclasses import dataclass, replace, field +from functools import partial +from typing import Callable, Optional, Union, Tuple, List + +import torch +from torch import nn +from torch.utils.checkpoint import checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, checkpoint_seq, named_apply +from .fx_features import register_notrace_function +from .layers import Mlp, ConvMlp, DropPath, ClassifierHead, trunc_normal_tf_, LayerNorm2d, LayerNorm +from .layers import create_attn, get_act_layer, get_norm_layer, get_norm_act_layer, create_conv2d +from .layers import to_2tuple, extend_tuple, make_divisible, _assert +from .registry import register_model +from .vision_transformer_relpos import RelPosMlp, RelPosBias # FIXME move these to common location + +__all__ = ['MaxxVitCfg', 'MaxxVitConvCfg', 'MaxxVitTransformerCfg', 'MaxxVit'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.95, 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + 'first_conv': 'stem.conv1', 'classifier': 'head.fc', + 'fixed_input_size': True, + **kwargs + } + + +default_cfgs = { + # Fiddling with configs / defaults / still pretraining + 'coatnet_pico_rw_224': _cfg(url=''), + 'coatnet_nano_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_nano_rw_224_sw-f53093b4.pth', + crop_pct=0.9), + 'coatnet_0_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_0_rw_224_sw-a6439706.pth'), + 'coatnet_1_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_1_rw_224_sw-5cae1ea8.pth' + ), + 'coatnet_2_rw_224': _cfg(url=''), + 'coatnet_3_rw_224': _cfg(url=''), + + # Highly experimental configs + 'coatnet_bn_0_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_bn_0_rw_224_sw-c228e218.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, + crop_pct=0.95), + 'coatnet_rmlp_nano_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_nano_rw_224_sw-bd1d51b3.pth', + crop_pct=0.9), + 'coatnet_rmlp_0_rw_224': _cfg(url=''), + 'coatnet_rmlp_1_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_1_rw_224_sw-9051e6c3.pth'), + 'coatnet_rmlp_2_rw_224': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnet_rmlp_2_rw_224_sw-5ccfac55.pth'), + 'coatnet_rmlp_3_rw_224': _cfg(url=''), + 'coatnet_nano_cc_224': _cfg(url=''), + 'coatnext_nano_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/coatnext_nano_rw_224_ad-22cb71c2.pth', + crop_pct=0.9), + + # Trying to be like the CoAtNet paper configs + 'coatnet_0_224': _cfg(url=''), + 'coatnet_1_224': _cfg(url=''), + 'coatnet_2_224': _cfg(url=''), + 'coatnet_3_224': _cfg(url=''), + 'coatnet_4_224': _cfg(url=''), + 'coatnet_5_224': _cfg(url=''), + + # Experimental configs + 'maxvit_pico_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_nano_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_nano_rw_256_sw-fb127241.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_tiny_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_tiny_rw_224_sw-7d0dffeb.pth'), + 'maxvit_tiny_rw_256': _cfg( + url='', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_rmlp_pico_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_pico_rw_256_sw-8d82f2c6.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_rmlp_nano_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_nano_rw_256_sw-c17bb0d6.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_rmlp_tiny_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_tiny_rw_256_sw-bbef0ff5.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxvit_rmlp_small_rw_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxvit_rmlp_small_rw_224_sw-6ef0ae4f.pth', + crop_pct=0.9, + ), + 'maxvit_rmlp_small_rw_256': _cfg( + url='', + input_size=(3, 256, 256), pool_size=(8, 8)), + + 'maxvit_tiny_pm_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), + + 'maxxvit_rmlp_nano_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_nano_rw_256_sw-0325d459.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxxvit_rmlp_tiny_rw_256': _cfg(url='', input_size=(3, 256, 256), pool_size=(8, 8)), + 'maxxvit_rmlp_small_rw_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-maxx/maxxvit_rmlp_small_rw_256_sw-37e217ff.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + + # Trying to be like the MaxViT paper configs + 'maxvit_tiny_224': _cfg(url=''), + 'maxvit_small_224': _cfg(url=''), + 'maxvit_base_224': _cfg(url=''), + 'maxvit_large_224': _cfg(url=''), + 'maxvit_xlarge_224': _cfg(url=''), +} + + +@dataclass +class MaxxVitTransformerCfg: + dim_head: int = 32 + expand_ratio: float = 4.0 + expand_first: bool = True + shortcut_bias: bool = True + attn_bias: bool = True + attn_drop: float = 0. + proj_drop: float = 0. 
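+ # remaining fields: pooling, relative-position bias type/dim, window/grid partitioning,
+ # init scale and norm/act layers; __post_init__ normalizes window_size / grid_size to
+ # 2-tuples and defaults grid_size to window_size when only a window size is given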
+ pool_type: str = 'avg2' + rel_pos_type: str = 'bias' + rel_pos_dim: int = 512 # for relative position types w/ MLP + partition_ratio: int = 32 + window_size: Optional[Tuple[int, int]] = None + grid_size: Optional[Tuple[int, int]] = None + init_values: Optional[float] = None + act_layer: str = 'gelu' + norm_layer: str = 'layernorm2d' + norm_layer_cl: str = 'layernorm' + norm_eps: float = 1e-6 + + def __post_init__(self): + if self.grid_size is not None: + self.grid_size = to_2tuple(self.grid_size) + if self.window_size is not None: + self.window_size = to_2tuple(self.window_size) + if self.grid_size is None: + self.grid_size = self.window_size + + +@dataclass +class MaxxVitConvCfg: + block_type: str = 'mbconv' + expand_ratio: float = 4.0 + expand_output: bool = True # calculate expansion channels from output (vs input chs) + kernel_size: int = 3 + group_size: int = 1 # 1 == depthwise + pre_norm_act: bool = False # activation after pre-norm + output_bias: bool = True # bias for shortcut + final 1x1 projection conv + stride_mode: str = 'dw' # stride done via one of 'pool', '1x1', 'dw' + pool_type: str = 'avg2' + downsample_pool_type: str = 'avg2' + attn_early: bool = False # apply attn between conv2 and norm2, instead of after norm2 + attn_layer: str = 'se' + attn_act_layer: str = 'silu' + attn_ratio: float = 0.25 + init_values: Optional[float] = 1e-6 # for ConvNeXt block, ignored by MBConv + act_layer: str = 'gelu' + norm_layer: str = '' + norm_layer_cl: str = '' + norm_eps: Optional[float] = None + + def __post_init__(self): + # mbconv vs convnext blocks have different defaults, set in post_init to avoid explicit config args + assert self.block_type in ('mbconv', 'convnext') + use_mbconv = self.block_type == 'mbconv' + if not self.norm_layer: + self.norm_layer = 'batchnorm2d' if use_mbconv else 'layernorm2d' + if not self.norm_layer_cl and not use_mbconv: + self.norm_layer_cl = 'layernorm' + if self.norm_eps is None: + self.norm_eps = 1e-5 if use_mbconv else 1e-6 + self.downsample_pool_type = self.downsample_pool_type or self.pool_type + + +@dataclass +class MaxxVitCfg: + embed_dim: Tuple[int, ...] = (96, 192, 384, 768) + depths: Tuple[int, ...] = (2, 3, 5, 2) + block_type: Tuple[Union[str, Tuple[str, ...]], ...] 
= ('C', 'C', 'T', 'T') + stem_width: Union[int, Tuple[int, int]] = 64 + stem_bias: bool = True + conv_cfg: MaxxVitConvCfg = field(default_factory=MaxxVitConvCfg) + transformer_cfg: MaxxVitTransformerCfg = field(default_factory=MaxxVitTransformerCfg) + weight_init: str = 'vit_eff' + + +def _rw_coat_cfg( + stride_mode='pool', + pool_type='avg2', + conv_output_bias=False, + conv_attn_early=False, + conv_attn_act_layer='relu', + conv_norm_layer='', + transformer_shortcut_bias=True, + transformer_norm_layer='layernorm2d', + transformer_norm_layer_cl='layernorm', + init_values=None, + rel_pos_type='bias', + rel_pos_dim=512, +): + # 'RW' timm variant models were created and trained before seeing https://github.com/google-research/maxvit + # Common differences for initial timm models: + # - pre-norm layer in MZBConv included an activation after norm + # - mbconv expansion calculated from input instead of output chs + # - mbconv shortcut and final 1x1 conv did not have a bias + # - SE act layer was relu, not silu + # - mbconv uses silu in timm, not gelu + # - expansion in attention block done via output proj, not input proj + # Variable differences (evolved over training initial models): + # - avg pool with kernel_size=2 favoured downsampling (instead of maxpool for coat) + # - SE attention was between conv2 and norm/act + # - default to avg pool for mbconv downsample instead of 1x1 or dw conv + # - transformer block shortcut has no bias + return dict( + conv_cfg=MaxxVitConvCfg( + stride_mode=stride_mode, + pool_type=pool_type, + pre_norm_act=True, + expand_output=False, + output_bias=conv_output_bias, + attn_early=conv_attn_early, + attn_act_layer=conv_attn_act_layer, + act_layer='silu', + norm_layer=conv_norm_layer, + ), + transformer_cfg=MaxxVitTransformerCfg( + expand_first=False, + shortcut_bias=transformer_shortcut_bias, + pool_type=pool_type, + init_values=init_values, + norm_layer=transformer_norm_layer, + norm_layer_cl=transformer_norm_layer_cl, + rel_pos_type=rel_pos_type, + rel_pos_dim=rel_pos_dim, + ), + ) + + +def _rw_max_cfg( + stride_mode='dw', + pool_type='avg2', + conv_output_bias=False, + conv_attn_ratio=1 / 16, + conv_norm_layer='', + transformer_norm_layer='layernorm2d', + transformer_norm_layer_cl='layernorm', + window_size=None, + dim_head=32, + init_values=None, + rel_pos_type='bias', + rel_pos_dim=512, +): + # 'RW' timm variant models were created and trained before seeing https://github.com/google-research/maxvit + # Differences of initial timm models: + # - mbconv expansion calculated from input instead of output chs + # - mbconv shortcut and final 1x1 conv did not have a bias + # - mbconv uses silu in timm, not gelu + # - expansion in attention block done via output proj, not input proj + return dict( + conv_cfg=MaxxVitConvCfg( + stride_mode=stride_mode, + pool_type=pool_type, + expand_output=False, + output_bias=conv_output_bias, + attn_ratio=conv_attn_ratio, + act_layer='silu', + norm_layer=conv_norm_layer, + ), + transformer_cfg=MaxxVitTransformerCfg( + expand_first=False, + pool_type=pool_type, + dim_head=dim_head, + window_size=window_size, + init_values=init_values, + norm_layer=transformer_norm_layer, + norm_layer_cl=transformer_norm_layer_cl, + rel_pos_type=rel_pos_type, + rel_pos_dim=rel_pos_dim, + ), + ) + + +def _next_cfg( + stride_mode='dw', + pool_type='avg2', + conv_norm_layer='layernorm2d', + conv_norm_layer_cl='layernorm', + transformer_norm_layer='layernorm2d', + transformer_norm_layer_cl='layernorm', + window_size=None, + init_values=1e-6, + 
rel_pos_type='mlp', # MLP by default for maxxvit + rel_pos_dim=512, +): + # For experimental models with convnext instead of mbconv + init_values = to_2tuple(init_values) + return dict( + conv_cfg=MaxxVitConvCfg( + block_type='convnext', + stride_mode=stride_mode, + pool_type=pool_type, + expand_output=False, + init_values=init_values[0], + norm_layer=conv_norm_layer, + norm_layer_cl=conv_norm_layer_cl, + ), + transformer_cfg=MaxxVitTransformerCfg( + expand_first=False, + pool_type=pool_type, + window_size=window_size, + init_values=init_values[1], + norm_layer=transformer_norm_layer, + norm_layer_cl=transformer_norm_layer_cl, + rel_pos_type=rel_pos_type, + rel_pos_dim=rel_pos_dim, + ), + ) + + +model_cfgs = dict( + # Fiddling with configs / defaults / still pretraining + coatnet_pico_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 3, 5, 2), + stem_width=(32, 64), + **_rw_max_cfg( # using newer max defaults here + conv_output_bias=True, + conv_attn_ratio=0.25, + ), + ), + coatnet_nano_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(3, 4, 6, 3), + stem_width=(32, 64), + **_rw_max_cfg( # using newer max defaults here + stride_mode='pool', + conv_output_bias=True, + conv_attn_ratio=0.25, + ), + ), + coatnet_0_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 3, 7, 2), # deeper than paper '0' model + stem_width=(32, 64), + **_rw_coat_cfg( + conv_attn_early=True, + transformer_shortcut_bias=False, + ), + ), + coatnet_1_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + stem_width=(32, 64), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_early=True, + transformer_shortcut_bias=False, + ) + ), + coatnet_2_rw_224=MaxxVitCfg( + embed_dim=(128, 256, 512, 1024), + depths=(2, 6, 14, 2), + stem_width=(64, 128), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_act_layer='silu', + init_values=1e-6, + ), + ), + coatnet_3_rw_224=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 6, 14, 2), + stem_width=(96, 192), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_act_layer='silu', + init_values=1e-6, + ), + ), + + # Highly experimental configs + coatnet_bn_0_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 3, 7, 2), # deeper than paper '0' model + stem_width=(32, 64), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_early=True, + transformer_shortcut_bias=False, + transformer_norm_layer='batchnorm2d', + ) + ), + coatnet_rmlp_nano_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(3, 4, 6, 3), + stem_width=(32, 64), + **_rw_max_cfg( + conv_output_bias=True, + conv_attn_ratio=0.25, + rel_pos_type='mlp', + rel_pos_dim=384, + ), + ), + coatnet_rmlp_0_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 3, 7, 2), # deeper than paper '0' model + stem_width=(32, 64), + **_rw_coat_cfg( + stride_mode='dw', + rel_pos_type='mlp', + ), + ), + coatnet_rmlp_1_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + stem_width=(32, 64), + **_rw_coat_cfg( + pool_type='max', + conv_attn_early=True, + transformer_shortcut_bias=False, + rel_pos_type='mlp', + rel_pos_dim=384, # was supposed to be 512, woops + ), + ), + coatnet_rmlp_2_rw_224=MaxxVitCfg( + embed_dim=(128, 256, 512, 1024), + depths=(2, 6, 14, 2), + stem_width=(64, 128), + **_rw_coat_cfg( + stride_mode='dw', + conv_attn_act_layer='silu', + init_values=1e-6, + rel_pos_type='mlp' + ), + ), + coatnet_rmlp_3_rw_224=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 6, 14, 2), + stem_width=(96, 192), + 
**_rw_coat_cfg( + stride_mode='dw', + conv_attn_act_layer='silu', + init_values=1e-6, + rel_pos_type='mlp' + ), + ), + + coatnet_nano_cc_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(3, 4, 6, 3), + stem_width=(32, 64), + block_type=('C', 'C', ('C', 'T'), ('C', 'T')), + **_rw_coat_cfg(), + ), + coatnext_nano_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(3, 4, 6, 3), + stem_width=(32, 64), + weight_init='normal', + **_next_cfg( + rel_pos_type='bias', + init_values=(1e-5, None) + ), + ), + + # Trying to be like the CoAtNet paper configs + coatnet_0_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 3, 5, 2), + stem_width=64, + ), + coatnet_1_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + stem_width=64, + ), + coatnet_2_224=MaxxVitCfg( + embed_dim=(128, 256, 512, 1024), + depths=(2, 6, 14, 2), + stem_width=128, + ), + coatnet_3_224=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 6, 14, 2), + stem_width=192, + ), + coatnet_4_224=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 12, 28, 2), + stem_width=192, + ), + coatnet_5_224=MaxxVitCfg( + embed_dim=(256, 512, 1280, 2048), + depths=(2, 12, 28, 2), + stem_width=192, + ), + + # Experimental MaxVit configs + maxvit_pico_rw_256=MaxxVitCfg( + embed_dim=(32, 64, 128, 256), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(24, 32), + **_rw_max_cfg(), + ), + maxvit_nano_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(1, 2, 3, 1), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(), + ), + maxvit_tiny_rw_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(), + ), + maxvit_tiny_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(), + ), + + maxvit_rmlp_pico_rw_256=MaxxVitCfg( + embed_dim=(32, 64, 128, 256), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(24, 32), + **_rw_max_cfg(rel_pos_type='mlp'), + ), + maxvit_rmlp_nano_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(1, 2, 3, 1), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(rel_pos_type='mlp'), + ), + maxvit_rmlp_tiny_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(rel_pos_type='mlp'), + ), + maxvit_rmlp_small_rw_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg( + rel_pos_type='mlp', + init_values=1e-6, + ), + ), + maxvit_rmlp_small_rw_256=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_rw_max_cfg( + rel_pos_type='mlp', + init_values=1e-6, + ), + ), + + maxvit_tiny_pm_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('PM',) * 4, + stem_width=(32, 64), + **_rw_max_cfg(), + ), + + maxxvit_rmlp_nano_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(1, 2, 3, 1), + block_type=('M',) * 4, + stem_width=(32, 64), + weight_init='normal', + **_next_cfg(), + ), + maxxvit_rmlp_tiny_rw_256=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(32, 64), + **_next_cfg(), + ), + maxxvit_rmlp_small_rw_256=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=(48, 96), + 
**_next_cfg(), + ), + + # Trying to be like the MaxViT paper configs + maxvit_tiny_224=MaxxVitCfg( + embed_dim=(64, 128, 256, 512), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=64, + ), + maxvit_small_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 2, 5, 2), + block_type=('M',) * 4, + stem_width=64, + ), + maxvit_base_224=MaxxVitCfg( + embed_dim=(96, 192, 384, 768), + depths=(2, 6, 14, 2), + block_type=('M',) * 4, + stem_width=64, + ), + maxvit_large_224=MaxxVitCfg( + embed_dim=(128, 256, 512, 1024), + depths=(2, 6, 14, 2), + block_type=('M',) * 4, + stem_width=128, + ), + maxvit_xlarge_224=MaxxVitCfg( + embed_dim=(192, 384, 768, 1536), + depths=(2, 6, 14, 2), + block_type=('M',) * 4, + stem_width=192, + ), + +) + + +class Attention2d(nn.Module): + """ multi-head attention for 2D NCHW tensors""" + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + dim_head: int = 32, + bias: bool = True, + expand_first: bool = True, + rel_pos_cls: Callable = None, + attn_drop: float = 0., + proj_drop: float = 0. + ): + super().__init__() + dim_out = dim_out or dim + dim_attn = dim_out if expand_first else dim + self.num_heads = dim_attn // dim_head + self.dim_head = dim_head + self.scale = dim_head ** -0.5 + + self.qkv = nn.Conv2d(dim, dim_attn * 3, 1, bias=bias) + self.rel_pos = rel_pos_cls(num_heads=self.num_heads) if rel_pos_cls else None + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Conv2d(dim_attn, dim_out, 1, bias=bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None): + B, C, H, W = x.shape + + q, k, v = self.qkv(x).view(B, self.num_heads, self.dim_head * 3, -1).chunk(3, dim=2) + + attn = (q.transpose(-2, -1) @ k) * self.scale + if self.rel_pos is not None: + attn = self.rel_pos(attn) + elif shared_rel_pos is not None: + attn = attn + shared_rel_pos + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (v @ attn.transpose(-2, -1)).view(B, -1, H, W) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class AttentionCl(nn.Module): + """ Channels-last multi-head attention (B, ..., C) """ + def __init__( + self, + dim: int, + dim_out: Optional[int] = None, + dim_head: int = 32, + bias: bool = True, + expand_first: bool = True, + rel_pos_cls: Callable = None, + attn_drop: float = 0., + proj_drop: float = 0. 
+ ): + super().__init__() + dim_out = dim_out or dim + dim_attn = dim_out if expand_first and dim_out > dim else dim + assert dim_attn % dim_head == 0, 'attn dim should be divisible by head_dim' + self.num_heads = dim_attn // dim_head + self.dim_head = dim_head + self.scale = dim_head ** -0.5 + + self.qkv = nn.Linear(dim, dim_attn * 3, bias=bias) + self.rel_pos = rel_pos_cls(num_heads=self.num_heads) if rel_pos_cls else None + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim_attn, dim_out, bias=bias) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None): + B = x.shape[0] + restore_shape = x.shape[:-1] + + q, k, v = self.qkv(x).view(B, -1, self.num_heads, self.dim_head * 3).transpose(1, 2).chunk(3, dim=3) + + attn = (q @ k.transpose(-2, -1)) * self.scale + if self.rel_pos is not None: + attn = self.rel_pos(attn, shared_rel_pos=shared_rel_pos) + elif shared_rel_pos is not None: + attn = attn + shared_rel_pos + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(restore_shape + (-1,)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class LayerScale(nn.Module): + def __init__(self, dim, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x): + gamma = self.gamma + return x.mul_(gamma) if self.inplace else x * gamma + + +class LayerScale2d(nn.Module): + def __init__(self, dim, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x): + gamma = self.gamma.view(1, -1, 1, 1) + return x.mul_(gamma) if self.inplace else x * gamma + + +class Downsample2d(nn.Module): + """ A downsample pooling module supporting several maxpool and avgpool modes + * 'max' - MaxPool2d w/ kernel_size 3, stride 2, padding 1 + * 'max2' - MaxPool2d w/ kernel_size = stride = 2 + * 'avg' - AvgPool2d w/ kernel_size 3, stride 2, padding 1 + * 'avg2' - AvgPool2d w/ kernel_size = stride = 2 + """ + + def __init__( + self, + dim: int, + dim_out: int, + pool_type: str = 'avg2', + bias: bool = True, + ): + super().__init__() + assert pool_type in ('max', 'max2', 'avg', 'avg2') + if pool_type == 'max': + self.pool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + elif pool_type == 'max2': + self.pool = nn.MaxPool2d(2) # kernel_size == stride == 2 + elif pool_type == 'avg': + self.pool = nn.AvgPool2d(kernel_size=3, stride=2, padding=1, count_include_pad=False) + else: + self.pool = nn.AvgPool2d(2) # kernel_size == stride == 2 + + if dim != dim_out: + self.expand = nn.Conv2d(dim, dim_out, 1, bias=bias) + else: + self.expand = nn.Identity() + + def forward(self, x): + x = self.pool(x) # spatial downsample + x = self.expand(x) # expand chs + return x + + +def _init_transformer(module, name, scheme=''): + if isinstance(module, (nn.Conv2d, nn.Linear)): + if scheme == 'normal': + nn.init.normal_(module.weight, std=.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif scheme == 'trunc_normal': + trunc_normal_tf_(module.weight, std=.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif scheme == 'xavier_normal': + nn.init.xavier_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + else: + # vit like + nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + if 'mlp' in name: + nn.init.normal_(module.bias, 
std=1e-6) + else: + nn.init.zeros_(module.bias) + + +class TransformerBlock2d(nn.Module): + """ Transformer block with 2D downsampling + '2D' NCHW tensor layout + + Some gains can be seen on GPU using a 1D / CL block, BUT w/ the need to switch back/forth to NCHW + for spatial pooling, the benefit is minimal so ended up using just this variant for CoAt configs. + + This impl was faster on TPU w/ PT XLA than the 1D experiment. + """ + + def __init__( + self, + dim: int, + dim_out: int, + stride: int = 1, + rel_pos_cls: Callable = None, + cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(), + drop_path: float = 0., + ): + super().__init__() + norm_layer = partial(get_norm_layer(cfg.norm_layer), eps=cfg.norm_eps) + act_layer = get_act_layer(cfg.act_layer) + + if stride == 2: + self.shortcut = Downsample2d(dim, dim_out, pool_type=cfg.pool_type, bias=cfg.shortcut_bias) + self.norm1 = nn.Sequential(OrderedDict([ + ('norm', norm_layer(dim)), + ('down', Downsample2d(dim, dim, pool_type=cfg.pool_type)), + ])) + else: + assert dim == dim_out + self.shortcut = nn.Identity() + self.norm1 = norm_layer(dim) + + self.attn = Attention2d( + dim, + dim_out, + dim_head=cfg.dim_head, + expand_first=cfg.expand_first, + bias=cfg.attn_bias, + rel_pos_cls=rel_pos_cls, + attn_drop=cfg.attn_drop, + proj_drop=cfg.proj_drop + ) + self.ls1 = LayerScale2d(dim_out, init_values=cfg.init_values) if cfg.init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm2 = norm_layer(dim_out) + self.mlp = ConvMlp( + in_features=dim_out, + hidden_features=int(dim_out * cfg.expand_ratio), + act_layer=act_layer, + drop=cfg.proj_drop) + self.ls2 = LayerScale2d(dim_out, init_values=cfg.init_values) if cfg.init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def init_weights(self, scheme=''): + named_apply(partial(_init_transformer, scheme=scheme), self) + + def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None): + x = self.shortcut(x) + self.drop_path1(self.ls1(self.attn(self.norm1(x), shared_rel_pos=shared_rel_pos))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + + +def _init_conv(module, name, scheme=''): + if isinstance(module, nn.Conv2d): + if scheme == 'normal': + nn.init.normal_(module.weight, std=.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif scheme == 'trunc_normal': + trunc_normal_tf_(module.weight, std=.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif scheme == 'xavier_normal': + nn.init.xavier_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + else: + # efficientnet like + fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels + fan_out //= module.groups + nn.init.normal_(module.weight, 0, math.sqrt(2.0 / fan_out)) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def num_groups(group_size, channels): + if not group_size: # 0 or None + return 1 # normal conv with 1 group + else: + # NOTE group_size == 1 -> depthwise conv + assert channels % group_size == 0 + return channels // group_size + + +class MbConvBlock(nn.Module): + """ Pre-Norm Conv Block - 1x1 - kxk - 1x1, w/ inverted bottleneck (expand) + """ + def __init__( + self, + in_chs: int, + out_chs: int, + stride: int = 1, + dilation: Tuple[int, int] = (1, 1), + cfg: MaxxVitConvCfg = MaxxVitConvCfg(), + drop_path: float = 0. 
+ ): + super(MbConvBlock, self).__init__() + norm_act_layer = partial(get_norm_act_layer(cfg.norm_layer, cfg.act_layer), eps=cfg.norm_eps) + mid_chs = make_divisible((out_chs if cfg.expand_output else in_chs) * cfg.expand_ratio) + groups = num_groups(cfg.group_size, mid_chs) + + if stride == 2: + self.shortcut = Downsample2d(in_chs, out_chs, pool_type=cfg.pool_type, bias=cfg.output_bias) + else: + self.shortcut = nn.Identity() + + assert cfg.stride_mode in ('pool', '1x1', 'dw') + stride_pool, stride_1, stride_2 = 1, 1, 1 + if cfg.stride_mode == 'pool': + # NOTE this is not described in paper, experiment to find faster option that doesn't stride in 1x1 + stride_pool, dilation_2 = stride, dilation[1] + # FIXME handle dilation of avg pool + elif cfg.stride_mode == '1x1': + # NOTE I don't like this option described in paper, 1x1 w/ stride throws info away + stride_1, dilation_2 = stride, dilation[1] + else: + stride_2, dilation_2 = stride, dilation[0] + + self.pre_norm = norm_act_layer(in_chs, apply_act=cfg.pre_norm_act) + if stride_pool > 1: + self.down = Downsample2d(in_chs, in_chs, pool_type=cfg.downsample_pool_type) + else: + self.down = nn.Identity() + self.conv1_1x1 = create_conv2d(in_chs, mid_chs, 1, stride=stride_1) + self.norm1 = norm_act_layer(mid_chs) + + self.conv2_kxk = create_conv2d( + mid_chs, mid_chs, cfg.kernel_size, stride=stride_2, dilation=dilation_2, groups=groups) + + attn_kwargs = {} + if isinstance(cfg.attn_layer, str): + if cfg.attn_layer == 'se' or cfg.attn_layer == 'eca': + attn_kwargs['act_layer'] = cfg.attn_act_layer + attn_kwargs['rd_channels'] = int(cfg.attn_ratio * (out_chs if cfg.expand_output else mid_chs)) + + # two different orderings for SE and norm2 (due to some weights and trials using SE before norm2) + if cfg.attn_early: + self.se_early = create_attn(cfg.attn_layer, mid_chs, **attn_kwargs) + self.norm2 = norm_act_layer(mid_chs) + self.se = None + else: + self.se_early = None + self.norm2 = norm_act_layer(mid_chs) + self.se = create_attn(cfg.attn_layer, mid_chs, **attn_kwargs) + + self.conv3_1x1 = create_conv2d(mid_chs, out_chs, 1, bias=cfg.output_bias) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def init_weights(self, scheme=''): + named_apply(partial(_init_conv, scheme=scheme), self) + + def forward(self, x): + shortcut = self.shortcut(x) + x = self.pre_norm(x) + x = self.down(x) + + # 1x1 expansion conv & norm-act + x = self.conv1_1x1(x) + x = self.norm1(x) + + # depthwise / grouped 3x3 conv w/ SE (or other) channel attention & norm-act + x = self.conv2_kxk(x) + if self.se_early is not None: + x = self.se_early(x) + x = self.norm2(x) + if self.se is not None: + x = self.se(x) + + # 1x1 linear projection to output width + x = self.conv3_1x1(x) + x = self.drop_path(x) + shortcut + return x + + +class ConvNeXtBlock(nn.Module): + """ ConvNeXt Block + """ + + def __init__( + self, + in_chs: int, + out_chs: Optional[int] = None, + kernel_size: int = 7, + stride: int = 1, + dilation: Tuple[int, int] = (1, 1), + cfg: MaxxVitConvCfg = MaxxVitConvCfg(), + conv_mlp: bool = True, + drop_path: float = 0. 
+ ): + super().__init__() + out_chs = out_chs or in_chs + act_layer = get_act_layer(cfg.act_layer) + if conv_mlp: + norm_layer = partial(get_norm_layer(cfg.norm_layer), eps=cfg.norm_eps) + mlp_layer = ConvMlp + else: + assert 'layernorm' in cfg.norm_layer + norm_layer = LayerNorm + mlp_layer = Mlp + self.use_conv_mlp = conv_mlp + + if stride == 2: + self.shortcut = Downsample2d(in_chs, out_chs) + elif in_chs != out_chs: + self.shortcut = nn.Conv2d(in_chs, out_chs, kernel_size=1, bias=cfg.output_bias) + else: + self.shortcut = nn.Identity() + + assert cfg.stride_mode in ('pool', 'dw') + stride_pool, stride_dw = 1, 1 + # FIXME handle dilation? + if cfg.stride_mode == 'pool': + stride_pool = stride + else: + stride_dw = stride + + if stride_pool == 2: + self.down = Downsample2d(in_chs, in_chs, pool_type=cfg.downsample_pool_type) + else: + self.down = nn.Identity() + + self.conv_dw = create_conv2d( + in_chs, out_chs, kernel_size=kernel_size, stride=stride_dw, dilation=dilation[1], + depthwise=True, bias=cfg.output_bias) + self.norm = norm_layer(out_chs) + self.mlp = mlp_layer(out_chs, int(cfg.expand_ratio * out_chs), bias=cfg.output_bias, act_layer=act_layer) + if conv_mlp: + self.ls = LayerScale2d(out_chs, cfg.init_values) if cfg.init_values else nn.Identity() + else: + self.ls = LayerScale(out_chs, cfg.init_values) if cfg.init_values else nn.Identity() + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + shortcut = self.shortcut(x) + x = self.down(x) + x = self.conv_dw(x) + if self.use_conv_mlp: + x = self.norm(x) + x = self.mlp(x) + x = self.ls(x) + else: + x = x.permute(0, 2, 3, 1) + x = self.norm(x) + x = self.mlp(x) + x = self.ls(x) + x = x.permute(0, 3, 1, 2) + + x = self.drop_path(x) + shortcut + return x + + +def window_partition(x, window_size: List[int]): + B, H, W, C = x.shape + _assert(H % window_size[0] == 0, f'height ({H}) must be divisible by window ({window_size[0]})') + _assert(W % window_size[1] == 0, '') + x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0], window_size[1], C) + return windows + + +@register_notrace_function # reason: int argument is a Proxy +def window_reverse(windows, window_size: List[int], img_size: List[int]): + H, W = img_size + C = windows.shape[-1] + x = windows.view(-1, H // window_size[0], W // window_size[1], window_size[0], window_size[1], C) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, H, W, C) + return x + + +def grid_partition(x, grid_size: List[int]): + B, H, W, C = x.shape + _assert(H % grid_size[0] == 0, f'height {H} must be divisible by grid {grid_size[0]}') + _assert(W % grid_size[1] == 0, '') + x = x.view(B, grid_size[0], H // grid_size[0], grid_size[1], W // grid_size[1], C) + windows = x.permute(0, 2, 4, 1, 3, 5).contiguous().view(-1, grid_size[0], grid_size[1], C) + return windows + + +@register_notrace_function # reason: int argument is a Proxy +def grid_reverse(windows, grid_size: List[int], img_size: List[int]): + H, W = img_size + C = windows.shape[-1] + x = windows.view(-1, H // grid_size[0], W // grid_size[1], grid_size[0], grid_size[1], C) + x = x.permute(0, 3, 1, 4, 2, 5).contiguous().view(-1, H, W, C) + return x + + +def get_rel_pos_cls(cfg: MaxxVitTransformerCfg, window_size): + rel_pos_cls = None + if cfg.rel_pos_type == 'mlp': + rel_pos_cls = partial(RelPosMlp, window_size=window_size, hidden_dim=cfg.rel_pos_dim) + elif cfg.rel_pos_type == 'bias': 
+ rel_pos_cls = partial(RelPosBias, window_size=window_size) + return rel_pos_cls + + +class PartitionAttentionCl(nn.Module): + """ Grid or Block partition + Attn + FFN. + NxC 'channels last' tensor layout. + """ + + def __init__( + self, + dim: int, + partition_type: str = 'block', + cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(), + drop_path: float = 0., + ): + super().__init__() + norm_layer = partial(get_norm_layer(cfg.norm_layer_cl), eps=cfg.norm_eps) # NOTE this block is channels-last + act_layer = get_act_layer(cfg.act_layer) + + self.partition_block = partition_type == 'block' + self.partition_size = to_2tuple(cfg.window_size if self.partition_block else cfg.grid_size) + rel_pos_cls = get_rel_pos_cls(cfg, self.partition_size) + + self.norm1 = norm_layer(dim) + self.attn = AttentionCl( + dim, + dim, + dim_head=cfg.dim_head, + bias=cfg.attn_bias, + rel_pos_cls=rel_pos_cls, + attn_drop=cfg.attn_drop, + proj_drop=cfg.proj_drop, + ) + self.ls1 = LayerScale(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = Mlp( + in_features=dim, + hidden_features=int(dim * cfg.expand_ratio), + act_layer=act_layer, + drop=cfg.proj_drop) + self.ls2 = LayerScale(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def _partition_attn(self, x): + img_size = x.shape[1:3] + if self.partition_block: + partitioned = window_partition(x, self.partition_size) + else: + partitioned = grid_partition(x, self.partition_size) + + partitioned = self.attn(partitioned) + + if self.partition_block: + x = window_reverse(partitioned, self.partition_size, img_size) + else: + x = grid_reverse(partitioned, self.partition_size, img_size) + return x + + def forward(self, x): + x = x + self.drop_path1(self.ls1(self._partition_attn(self.norm1(x)))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + + +class ParallelPartitionAttention(nn.Module): + """ Experimental. Grid and Block partition + single FFN + NxC tensor layout. + """ + + def __init__( + self, + dim: int, + cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(), + drop_path: float = 0., + ): + super().__init__() + assert dim % 2 == 0 + norm_layer = partial(get_norm_layer(cfg.norm_layer_cl), eps=cfg.norm_eps) # NOTE this block is channels-last + act_layer = get_act_layer(cfg.act_layer) + + assert cfg.window_size == cfg.grid_size + self.partition_size = to_2tuple(cfg.window_size) + rel_pos_cls = get_rel_pos_cls(cfg, self.partition_size) + + self.norm1 = norm_layer(dim) + self.attn_block = AttentionCl( + dim, + dim // 2, + dim_head=cfg.dim_head, + bias=cfg.attn_bias, + rel_pos_cls=rel_pos_cls, + attn_drop=cfg.attn_drop, + proj_drop=cfg.proj_drop, + ) + self.attn_grid = AttentionCl( + dim, + dim // 2, + dim_head=cfg.dim_head, + bias=cfg.attn_bias, + rel_pos_cls=rel_pos_cls, + attn_drop=cfg.attn_drop, + proj_drop=cfg.proj_drop, + ) + self.ls1 = LayerScale(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = Mlp( + in_features=dim, + hidden_features=int(dim * cfg.expand_ratio), + out_features=dim, + act_layer=act_layer, + drop=cfg.proj_drop) + self.ls2 = LayerScale(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def _partition_attn(self, x): + img_size = x.shape[1:3] + + partitioned_block = window_partition(x, self.partition_size) + partitioned_block = self.attn_block(partitioned_block) + x_window = window_reverse(partitioned_block, self.partition_size, img_size) + + partitioned_grid = grid_partition(x, self.partition_size) + partitioned_grid = self.attn_grid(partitioned_grid) + x_grid = grid_reverse(partitioned_grid, self.partition_size, img_size) + + return torch.cat([x_window, x_grid], dim=-1) + + def forward(self, x): + x = x + self.drop_path1(self.ls1(self._partition_attn(self.norm1(x)))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + + +def window_partition_nchw(x, window_size: List[int]): + B, C, H, W = x.shape + _assert(H % window_size[0] == 0, f'height ({H}) must be divisible by window ({window_size[0]})') + _assert(W % window_size[1] == 0, '') + x = x.view(B, C, H // window_size[0], window_size[0], W // window_size[1], window_size[1]) + windows = x.permute(0, 2, 4, 1, 3, 5).contiguous().view(-1, C, window_size[0], window_size[1]) + return windows + + +@register_notrace_function # reason: int argument is a Proxy +def window_reverse_nchw(windows, window_size: List[int], img_size: List[int]): + H, W = img_size + C = windows.shape[1] + x = windows.view(-1, H // window_size[0], W // window_size[1], C, window_size[0], window_size[1]) + x = x.permute(0, 3, 1, 4, 2, 5).contiguous().view(-1, C, H, W) + return x + + +def grid_partition_nchw(x, grid_size: List[int]): + B, C, H, W = x.shape + _assert(H % grid_size[0] == 0, f'height {H} must be divisible by grid {grid_size[0]}') + _assert(W % grid_size[1] == 0, '') + x = x.view(B, C, grid_size[0], H // grid_size[0], grid_size[1], W // grid_size[1]) + windows = x.permute(0, 3, 5, 1, 2, 4).contiguous().view(-1, C, grid_size[0], grid_size[1]) + return windows + + +@register_notrace_function # reason: int argument is a Proxy +def grid_reverse_nchw(windows, grid_size: List[int], img_size: List[int]): + H, W = img_size + C = windows.shape[1] + x = windows.view(-1, H // grid_size[0], W // grid_size[1], C, grid_size[0], grid_size[1]) + x = x.permute(0, 3, 4, 1, 5, 2).contiguous().view(-1, C, H, W) + return x + + +class PartitionAttention2d(nn.Module): + """ Grid or Block partition + Attn + FFN + + '2D' NCHW tensor layout. 
+ """ + + def __init__( + self, + dim: int, + partition_type: str = 'block', + cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(), + drop_path: float = 0., + ): + super().__init__() + norm_layer = partial(get_norm_layer(cfg.norm_layer), eps=cfg.norm_eps) # NOTE this block is channels-last + act_layer = get_act_layer(cfg.act_layer) + + self.partition_block = partition_type == 'block' + self.partition_size = to_2tuple(cfg.window_size if self.partition_block else cfg.grid_size) + rel_pos_cls = get_rel_pos_cls(cfg, self.partition_size) + + self.norm1 = norm_layer(dim) + self.attn = Attention2d( + dim, + dim, + dim_head=cfg.dim_head, + bias=cfg.attn_bias, + rel_pos_cls=rel_pos_cls, + attn_drop=cfg.attn_drop, + proj_drop=cfg.proj_drop, + ) + self.ls1 = LayerScale2d(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity() + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = ConvMlp( + in_features=dim, + hidden_features=int(dim * cfg.expand_ratio), + act_layer=act_layer, + drop=cfg.proj_drop) + self.ls2 = LayerScale2d(dim, init_values=cfg.init_values) if cfg.init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def _partition_attn(self, x): + img_size = x.shape[-2:] + if self.partition_block: + partitioned = window_partition_nchw(x, self.partition_size) + else: + partitioned = grid_partition_nchw(x, self.partition_size) + + partitioned = self.attn(partitioned) + + if self.partition_block: + x = window_reverse_nchw(partitioned, self.partition_size, img_size) + else: + x = grid_reverse_nchw(partitioned, self.partition_size, img_size) + return x + + def forward(self, x): + x = x + self.drop_path1(self.ls1(self._partition_attn(self.norm1(x)))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + + +class MaxxVitBlock(nn.Module): + """ MaxVit conv, window partition + FFN , grid partition + FFN + """ + + def __init__( + self, + dim: int, + dim_out: int, + stride: int = 1, + conv_cfg: MaxxVitConvCfg = MaxxVitConvCfg(), + transformer_cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(), + use_nchw_attn: bool = False, # FIXME move to cfg? True is ~20-30% faster on TPU, 5-10% slower on GPU + drop_path: float = 0., + ): + super().__init__() + + conv_cls = ConvNeXtBlock if conv_cfg.block_type == 'convnext' else MbConvBlock + self.conv = conv_cls(dim, dim_out, stride=stride, cfg=conv_cfg, drop_path=drop_path) + + attn_kwargs = dict(dim=dim_out, cfg=transformer_cfg, drop_path=drop_path) + partition_layer = PartitionAttention2d if use_nchw_attn else PartitionAttentionCl + self.nchw_attn = use_nchw_attn + self.attn_block = partition_layer(**attn_kwargs) + self.attn_grid = partition_layer(partition_type='grid', **attn_kwargs) + + def init_weights(self, scheme=''): + named_apply(partial(_init_transformer, scheme=scheme), self.attn_block) + named_apply(partial(_init_transformer, scheme=scheme), self.attn_grid) + named_apply(partial(_init_conv, scheme=scheme), self.conv) + + def forward(self, x): + # NCHW format + x = self.conv(x) + + if not self.nchw_attn: + x = x.permute(0, 2, 3, 1) # to NHWC (channels-last) + x = self.attn_block(x) + x = self.attn_grid(x) + if not self.nchw_attn: + x = x.permute(0, 3, 1, 2) # back to NCHW + return x + + +class ParallelMaxxVitBlock(nn.Module): + """ MaxVit block with parallel cat(window + grid), one FF + Experimental timm block. 
+ """ + + def __init__( + self, + dim, + dim_out, + stride=1, + num_conv=2, + conv_cfg: MaxxVitConvCfg = MaxxVitConvCfg(), + transformer_cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(), + drop_path=0., + ): + super().__init__() + + conv_cls = ConvNeXtBlock if conv_cfg.block_type == 'convnext' else MbConvBlock + if num_conv > 1: + convs = [conv_cls(dim, dim_out, stride=stride, cfg=conv_cfg, drop_path=drop_path)] + convs += [conv_cls(dim_out, dim_out, cfg=conv_cfg, drop_path=drop_path)] * (num_conv - 1) + self.conv = nn.Sequential(*convs) + else: + self.conv = conv_cls(dim, dim_out, stride=stride, cfg=conv_cfg, drop_path=drop_path) + self.attn = ParallelPartitionAttention(dim=dim_out, cfg=transformer_cfg, drop_path=drop_path) + + def init_weights(self, scheme=''): + named_apply(partial(_init_transformer, scheme=scheme), self.attn) + named_apply(partial(_init_conv, scheme=scheme), self.conv) + + def forward(self, x): + x = self.conv(x) + x = x.permute(0, 2, 3, 1) + x = self.attn(x) + x = x.permute(0, 3, 1, 2) + return x + + +class MaxxVitStage(nn.Module): + def __init__( + self, + in_chs: int, + out_chs: int, + stride: int = 2, + depth: int = 4, + feat_size: Tuple[int, int] = (14, 14), + block_types: Union[str, Tuple[str]] = 'C', + transformer_cfg: MaxxVitTransformerCfg = MaxxVitTransformerCfg(), + conv_cfg: MaxxVitConvCfg = MaxxVitConvCfg(), + drop_path: Union[float, List[float]] = 0., + ): + super().__init__() + self.grad_checkpointing = False + + block_types = extend_tuple(block_types, depth) + blocks = [] + for i, t in enumerate(block_types): + block_stride = stride if i == 0 else 1 + assert t in ('C', 'T', 'M', 'PM') + if t == 'C': + conv_cls = ConvNeXtBlock if conv_cfg.block_type == 'convnext' else MbConvBlock + blocks += [conv_cls( + in_chs, + out_chs, + stride=block_stride, + cfg=conv_cfg, + drop_path=drop_path[i], + )] + elif t == 'T': + rel_pos_cls = get_rel_pos_cls(transformer_cfg, feat_size) + blocks += [TransformerBlock2d( + in_chs, + out_chs, + stride=block_stride, + rel_pos_cls=rel_pos_cls, + cfg=transformer_cfg, + drop_path=drop_path[i], + )] + elif t == 'M': + blocks += [MaxxVitBlock( + in_chs, + out_chs, + stride=block_stride, + conv_cfg=conv_cfg, + transformer_cfg=transformer_cfg, + drop_path=drop_path[i], + )] + elif t == 'PM': + blocks += [ParallelMaxxVitBlock( + in_chs, + out_chs, + stride=block_stride, + conv_cfg=conv_cfg, + transformer_cfg=transformer_cfg, + drop_path=drop_path[i], + )] + in_chs = out_chs + self.blocks = nn.Sequential(*blocks) + + def forward(self, x): + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + return x + + +class Stem(nn.Module): + + def __init__( + self, + in_chs: int, + out_chs: int, + kernel_size: int = 3, + act_layer: str = 'gelu', + norm_layer: str = 'batchnorm2d', + norm_eps: float = 1e-5, + ): + super().__init__() + if not isinstance(out_chs, (list, tuple)): + out_chs = to_2tuple(out_chs) + + norm_act_layer = partial(get_norm_act_layer(norm_layer, act_layer), eps=norm_eps) + self.out_chs = out_chs[-1] + self.stride = 2 + + self.conv1 = create_conv2d(in_chs, out_chs[0], kernel_size, stride=2) + self.norm1 = norm_act_layer(out_chs[0]) + self.conv2 = create_conv2d(out_chs[0], out_chs[1], kernel_size, stride=1) + + def init_weights(self, scheme=''): + named_apply(partial(_init_conv, scheme=scheme), self) + + def forward(self, x): + x = self.conv1(x) + x = self.norm1(x) + x = self.conv2(x) + return x + + +def cfg_window_size(cfg: MaxxVitTransformerCfg, 
img_size: Tuple[int, int]): + if cfg.window_size is not None: + assert cfg.grid_size + return cfg + partition_size = img_size[0] // cfg.partition_ratio, img_size[1] // cfg.partition_ratio + cfg = replace(cfg, window_size=partition_size, grid_size=partition_size) + return cfg + + +class MaxxVit(nn.Module): + """ CoaTNet + MaxVit base model. + + Highly configurable for different block compositions, tensor layouts, pooling types. + """ + + def __init__( + self, + cfg: MaxxVitCfg, + img_size: Union[int, Tuple[int, int]] = 224, + in_chans: int = 3, + num_classes: int = 1000, + global_pool: str = 'avg', + drop_rate: float = 0., + drop_path_rate: float = 0. + ): + super().__init__() + img_size = to_2tuple(img_size) + transformer_cfg = cfg_window_size(cfg.transformer_cfg, img_size) + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = cfg.embed_dim[-1] + self.embed_dim = cfg.embed_dim + self.drop_rate = drop_rate + self.grad_checkpointing = False + + self.stem = Stem( + in_chs=in_chans, + out_chs=cfg.stem_width, + act_layer=cfg.conv_cfg.act_layer, + norm_layer=cfg.conv_cfg.norm_layer, + norm_eps=cfg.conv_cfg.norm_eps, + ) + + stride = self.stem.stride + feat_size = tuple([i // s for i, s in zip(img_size, to_2tuple(stride))]) + + num_stages = len(cfg.embed_dim) + assert len(cfg.depths) == num_stages + dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.depths)).split(cfg.depths)] + in_chs = self.stem.out_chs + stages = [] + for i in range(num_stages): + stage_stride = 2 + out_chs = cfg.embed_dim[i] + feat_size = tuple([(r - 1) // stage_stride + 1 for r in feat_size]) + stages += [MaxxVitStage( + in_chs, + out_chs, + depth=cfg.depths[i], + block_types=cfg.block_type[i], + conv_cfg=cfg.conv_cfg, + transformer_cfg=transformer_cfg, + feat_size=feat_size, + drop_path=dpr[i], + )] + stride *= stage_stride + in_chs = out_chs + self.stages = nn.Sequential(*stages) + + final_norm_layer = get_norm_layer(cfg.transformer_cfg.norm_layer) + self.norm = final_norm_layer(self.num_features, eps=cfg.transformer_cfg.norm_eps) + + # Classifier head + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) + + # Weight init (default PyTorch init works well for AdamW if scheme not set) + assert cfg.weight_init in ('', 'normal', 'trunc_normal', 'xavier_normal', 'vit_eff') + if cfg.weight_init: + named_apply(partial(self._init_weights, scheme=cfg.weight_init), self) + + def _init_weights(self, module, name, scheme=''): + if hasattr(module, 'init_weights'): + try: + module.init_weights(scheme=scheme) + except TypeError: + module.init_weights() + + @torch.jit.ignore + def no_weight_decay(self): + return { + k for k, _ in self.named_parameters() + if any(n in k for n in ["relative_position_bias_table", "rel_pos.mlp"])} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^stem', # stem and embed + blocks=[(r'^stages\.(\d+)', None), (r'^norm', (99999,))] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is None: + global_pool = self.head.global_pool.pool_type + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = 
self.stem(x) + x = self.stages(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_maxxvit(variant, cfg_variant=None, pretrained=False, **kwargs): + return build_model_with_cfg( + MaxxVit, variant, pretrained, + model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant], + feature_cfg=dict(flatten_sequential=True), + **kwargs) + + +@register_model +def coatnet_pico_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_pico_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_nano_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_nano_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_0_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_0_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_1_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_1_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_2_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_2_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_3_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_3_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_bn_0_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_bn_0_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_rmlp_nano_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_rmlp_nano_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_rmlp_0_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_rmlp_0_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_rmlp_1_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_rmlp_1_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_rmlp_2_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_rmlp_2_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_rmlp_3_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_rmlp_3_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_nano_cc_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_nano_cc_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnext_nano_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnext_nano_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_0_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_0_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_1_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_1_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_2_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_2_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_3_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_3_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_4_224(pretrained=False, **kwargs): + return _create_maxxvit('coatnet_4_224', pretrained=pretrained, **kwargs) + + +@register_model +def coatnet_5_224(pretrained=False, **kwargs): + return 
_create_maxxvit('coatnet_5_224', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_pico_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_pico_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_nano_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_nano_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_tiny_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_tiny_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_tiny_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_tiny_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_rmlp_pico_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_rmlp_pico_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_rmlp_nano_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_rmlp_nano_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_rmlp_tiny_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_rmlp_tiny_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_rmlp_small_rw_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_rmlp_small_rw_224', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_rmlp_small_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_rmlp_small_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_tiny_pm_256(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_tiny_pm_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxxvit_rmlp_nano_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxxvit_rmlp_nano_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxxvit_rmlp_tiny_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxxvit_rmlp_tiny_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxxvit_rmlp_small_rw_256(pretrained=False, **kwargs): + return _create_maxxvit('maxxvit_rmlp_small_rw_256', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_tiny_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_tiny_224', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_small_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_small_224', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_base_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_base_224', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_large_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_large_224', pretrained=pretrained, **kwargs) + + +@register_model +def maxvit_xlarge_224(pretrained=False, **kwargs): + return _create_maxxvit('maxvit_xlarge_224', pretrained=pretrained, **kwargs) + diff --git a/custom_timm/models/mlp_mixer.py b/custom_timm/models/mlp_mixer.py new file mode 100644 index 0000000000000000000000000000000000000000..b044244baa63476f32e63b63e7604748bbbf0360 --- /dev/null +++ b/custom_timm/models/mlp_mixer.py @@ -0,0 +1,681 @@ +""" MLP-Mixer, ResMLP, and gMLP in PyTorch + +This impl originally based on MLP-Mixer paper. 
+ +Official JAX impl: https://github.com/google-research/vision_transformer/blob/linen/vit_jax/models_mixer.py + +Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + +@article{tolstikhin2021, + title={MLP-Mixer: An all-MLP Architecture for Vision}, + author={Tolstikhin, Ilya and Houlsby, Neil and Kolesnikov, Alexander and Beyer, Lucas and Zhai, Xiaohua and Unterthiner, + Thomas and Yung, Jessica and Keysers, Daniel and Uszkoreit, Jakob and Lucic, Mario and Dosovitskiy, Alexey}, + journal={arXiv preprint arXiv:2105.01601}, + year={2021} +} + +Also supporting ResMlp, and a preliminary (not verified) implementations of gMLP + +Code: https://github.com/facebookresearch/deit +Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 +@misc{touvron2021resmlp, + title={ResMLP: Feedforward networks for image classification with data-efficient training}, + author={Hugo Touvron and Piotr Bojanowski and Mathilde Caron and Matthieu Cord and Alaaeldin El-Nouby and + Edouard Grave and Armand Joulin and Gabriel Synnaeve and Jakob Verbeek and Hervé Jégou}, + year={2021}, + eprint={2105.03404}, +} + +Paper: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050 +@misc{liu2021pay, + title={Pay Attention to MLPs}, + author={Hanxiao Liu and Zihang Dai and David R. So and Quoc V. Le}, + year={2021}, + eprint={2105.08050}, +} + +A thank you to paper authors for releasing code and weights. + +Hacked together by / Copyright 2021 Ross Wightman +""" +import math +from copy import deepcopy +from functools import partial + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, named_apply, checkpoint_seq +from .layers import PatchEmbed, Mlp, GluMlp, GatedMlp, DropPath, lecun_normal_, to_2tuple +from .registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': 0.875, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + 'first_conv': 'stem.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = dict( + mixer_s32_224=_cfg(), + mixer_s16_224=_cfg(), + mixer_b32_224=_cfg(), + mixer_b16_224=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_b16_224-76587d61.pth', + ), + mixer_b16_224_in21k=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_b16_224_in21k-617b3de2.pth', + num_classes=21843 + ), + mixer_l32_224=_cfg(), + mixer_l16_224=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_l16_224-92f9adc4.pth', + ), + mixer_l16_224_in21k=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_mixer_l16_224_in21k-846aa33c.pth', + num_classes=21843 + ), + + # Mixer ImageNet-21K-P pretraining + mixer_b16_224_miil_in21k=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/mixer_b16_224_miil_in21k-2a558a71.pth', + mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear', num_classes=11221, + ), + mixer_b16_224_miil=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/mixer_b16_224_miil-9229a591.pth', + mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear', + ), + + 
gmixer_12_224=_cfg(mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + gmixer_24_224=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gmixer_24_224_raa-7daf7ae6.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + + resmlp_12_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlp_12_no_dist.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + resmlp_24_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlp_24_no_dist.pth', + #url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resmlp_24_224_raa-a8256759.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + resmlp_36_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlp_36_no_dist.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + resmlp_big_24_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlpB_24_no_dist.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + + resmlp_12_distilled_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlp_12_dist.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + resmlp_24_distilled_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlp_24_dist.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + resmlp_36_distilled_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlp_36_dist.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + resmlp_big_24_distilled_224=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlpB_24_dist.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + + resmlp_big_24_224_in22ft1k=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlpB_24_22k.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + + resmlp_12_224_dino=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlp_12_dino.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + resmlp_24_224_dino=_cfg( + url='https://dl.fbaipublicfiles.com/deit/resmlp_24_dino.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD), + + gmlp_ti16_224=_cfg(), + gmlp_s16_224=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gmlp_s16_224_raa-10536d42.pth', + ), + gmlp_b16_224=_cfg(), +) + + +class MixerBlock(nn.Module): + """ Residual Block w/ token mixing and channel MLPs + Based on: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + """ + def __init__( + self, dim, seq_len, mlp_ratio=(0.5, 4.0), mlp_layer=Mlp, + norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU, drop=0., drop_path=0.): + super().__init__() + tokens_dim, channels_dim = [int(x * dim) for x in to_2tuple(mlp_ratio)] + self.norm1 = norm_layer(dim) + self.mlp_tokens = mlp_layer(seq_len, tokens_dim, act_layer=act_layer, drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp_channels = mlp_layer(dim, channels_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.mlp_tokens(self.norm1(x).transpose(1, 2)).transpose(1, 2)) + x = x + self.drop_path(self.mlp_channels(self.norm2(x))) + return x + + +class Affine(nn.Module): + def __init__(self, dim): + super().__init__() + self.alpha = nn.Parameter(torch.ones((1, 1, dim))) + self.beta = nn.Parameter(torch.zeros((1, 1, dim))) + + def forward(self, x): + return torch.addcmul(self.beta, self.alpha, x) + + +class ResBlock(nn.Module): + """ Residual MLP block w/ LayerScale and Affine 'norm' + + Based on: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + def __init__( + self, dim, seq_len, mlp_ratio=4, mlp_layer=Mlp, norm_layer=Affine, + act_layer=nn.GELU, init_values=1e-4, drop=0., drop_path=0.): + super().__init__() + channel_dim = int(dim * mlp_ratio) + self.norm1 = norm_layer(dim) + self.linear_tokens = nn.Linear(seq_len, seq_len) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp_channels = mlp_layer(dim, channel_dim, act_layer=act_layer, drop=drop) + self.ls1 = nn.Parameter(init_values * torch.ones(dim)) + self.ls2 = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x): + x = x + self.drop_path(self.ls1 * self.linear_tokens(self.norm1(x).transpose(1, 2)).transpose(1, 2)) + x = x + self.drop_path(self.ls2 * self.mlp_channels(self.norm2(x))) + return x + + +class SpatialGatingUnit(nn.Module): + """ Spatial Gating Unit + + Based on: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050 + """ + def __init__(self, dim, seq_len, norm_layer=nn.LayerNorm): + super().__init__() + gate_dim = dim // 2 + self.norm = norm_layer(gate_dim) + self.proj = nn.Linear(seq_len, seq_len) + + def init_weights(self): + # special init for the projection gate, called as override by base model init + nn.init.normal_(self.proj.weight, std=1e-6) + nn.init.ones_(self.proj.bias) + + def forward(self, x): + u, v = x.chunk(2, dim=-1) + v = self.norm(v) + v = self.proj(v.transpose(-1, -2)) + return u * v.transpose(-1, -2) + + +class SpatialGatingBlock(nn.Module): + """ Residual Block w/ Spatial Gating + + Based on: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050 + """ + def __init__( + self, dim, seq_len, mlp_ratio=4, mlp_layer=GatedMlp, + norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU, drop=0., drop_path=0.): + super().__init__() + channel_dim = int(dim * mlp_ratio) + self.norm = norm_layer(dim) + sgu = partial(SpatialGatingUnit, seq_len=seq_len) + self.mlp_channels = mlp_layer(dim, channel_dim, act_layer=act_layer, gate_layer=sgu, drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + def forward(self, x): + x = x + self.drop_path(self.mlp_channels(self.norm(x))) + return x + + +class MlpMixer(nn.Module): + + def __init__( + self, + num_classes=1000, + img_size=224, + in_chans=3, + patch_size=16, + num_blocks=8, + embed_dim=512, + mlp_ratio=(0.5, 4.0), + block_layer=MixerBlock, + mlp_layer=Mlp, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU, + drop_rate=0., + drop_path_rate=0., + nlhb=False, + stem_norm=False, + global_pool='avg', + ): + super().__init__() + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.grad_checkpointing = False + + self.stem = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, + embed_dim=embed_dim, norm_layer=norm_layer if stem_norm else None) + # FIXME drop_path (stochastic depth scaling rule or all the same?) + self.blocks = nn.Sequential(*[ + block_layer( + embed_dim, self.stem.num_patches, mlp_ratio, mlp_layer=mlp_layer, norm_layer=norm_layer, + act_layer=act_layer, drop=drop_rate, drop_path=drop_path_rate) + for _ in range(num_blocks)]) + self.norm = norm_layer(embed_dim) + self.head = nn.Linear(embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() + + self.init_weights(nlhb=nlhb) + + @torch.jit.ignore + def init_weights(self, nlhb=False): + head_bias = -math.log(self.num_classes) if nlhb else 0. + named_apply(partial(_init_weights, head_bias=head_bias), module=self) # depth-first + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^stem', # stem and embed + blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'avg') + self.global_pool = global_pool + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.stem(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + if self.global_pool == 'avg': + x = x.mean(dim=1) + x = self.head(x) + return x + + +def _init_weights(module: nn.Module, name: str, head_bias: float = 0., flax=False): + """ Mixer weight initialization (trying to match Flax defaults) + """ + if isinstance(module, nn.Linear): + if name.startswith('head'): + nn.init.zeros_(module.weight) + nn.init.constant_(module.bias, head_bias) + else: + if flax: + # Flax defaults + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + else: + # like MLP init in vit (my original init) + nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + if 'mlp' in name: + nn.init.normal_(module.bias, std=1e-6) + else: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Conv2d): + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(module.weight) + nn.init.zeros_(module.bias) + elif hasattr(module, 'init_weights'): + # NOTE if a parent module contains init_weights method, it 
can override the init of the + # child modules as this will be called in depth-first order. + module.init_weights() + + +def checkpoint_filter_fn(state_dict, model): + """ Remap checkpoints if needed """ + if 'patch_embed.proj.weight' in state_dict: + # Remap FB ResMlp models -> timm + out_dict = {} + for k, v in state_dict.items(): + k = k.replace('patch_embed.', 'stem.') + k = k.replace('attn.', 'linear_tokens.') + k = k.replace('mlp.', 'mlp_channels.') + k = k.replace('gamma_', 'ls') + if k.endswith('.alpha') or k.endswith('.beta'): + v = v.reshape(1, 1, -1) + out_dict[k] = v + return out_dict + return state_dict + + +def _create_mixer(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for MLP-Mixer models.') + + model = build_model_with_cfg( + MlpMixer, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) + return model + + +@register_model +def mixer_s32_224(pretrained=False, **kwargs): + """ Mixer-S/32 224x224 + Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + """ + model_args = dict(patch_size=32, num_blocks=8, embed_dim=512, **kwargs) + model = _create_mixer('mixer_s32_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def mixer_s16_224(pretrained=False, **kwargs): + """ Mixer-S/16 224x224 + Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + """ + model_args = dict(patch_size=16, num_blocks=8, embed_dim=512, **kwargs) + model = _create_mixer('mixer_s16_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def mixer_b32_224(pretrained=False, **kwargs): + """ Mixer-B/32 224x224 + Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + """ + model_args = dict(patch_size=32, num_blocks=12, embed_dim=768, **kwargs) + model = _create_mixer('mixer_b32_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def mixer_b16_224(pretrained=False, **kwargs): + """ Mixer-B/16 224x224. ImageNet-1k pretrained weights. + Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + """ + model_args = dict(patch_size=16, num_blocks=12, embed_dim=768, **kwargs) + model = _create_mixer('mixer_b16_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def mixer_b16_224_in21k(pretrained=False, **kwargs): + """ Mixer-B/16 224x224. ImageNet-21k pretrained weights. + Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + """ + model_args = dict(patch_size=16, num_blocks=12, embed_dim=768, **kwargs) + model = _create_mixer('mixer_b16_224_in21k', pretrained=pretrained, **model_args) + return model + + +@register_model +def mixer_l32_224(pretrained=False, **kwargs): + """ Mixer-L/32 224x224. + Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + """ + model_args = dict(patch_size=32, num_blocks=24, embed_dim=1024, **kwargs) + model = _create_mixer('mixer_l32_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def mixer_l16_224(pretrained=False, **kwargs): + """ Mixer-L/16 224x224. ImageNet-1k pretrained weights. 
+ Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + """ + model_args = dict(patch_size=16, num_blocks=24, embed_dim=1024, **kwargs) + model = _create_mixer('mixer_l16_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def mixer_l16_224_in21k(pretrained=False, **kwargs): + """ Mixer-L/16 224x224. ImageNet-21k pretrained weights. + Paper: 'MLP-Mixer: An all-MLP Architecture for Vision' - https://arxiv.org/abs/2105.01601 + """ + model_args = dict(patch_size=16, num_blocks=24, embed_dim=1024, **kwargs) + model = _create_mixer('mixer_l16_224_in21k', pretrained=pretrained, **model_args) + return model + + +@register_model +def mixer_b16_224_miil(pretrained=False, **kwargs): + """ Mixer-B/16 224x224. ImageNet-1k pretrained weights. + Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model_args = dict(patch_size=16, num_blocks=12, embed_dim=768, **kwargs) + model = _create_mixer('mixer_b16_224_miil', pretrained=pretrained, **model_args) + return model + + +@register_model +def mixer_b16_224_miil_in21k(pretrained=False, **kwargs): + """ Mixer-B/16 224x224. ImageNet-21k pretrained weights. + Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model_args = dict(patch_size=16, num_blocks=12, embed_dim=768, **kwargs) + model = _create_mixer('mixer_b16_224_miil_in21k', pretrained=pretrained, **model_args) + return model + + +@register_model +def gmixer_12_224(pretrained=False, **kwargs): + """ Glu-Mixer-12 224x224 + Experiment by Ross Wightman, adding (Si)GLU to MLP-Mixer + """ + model_args = dict( + patch_size=16, num_blocks=12, embed_dim=384, mlp_ratio=(1.0, 4.0), + mlp_layer=GluMlp, act_layer=nn.SiLU, **kwargs) + model = _create_mixer('gmixer_12_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def gmixer_24_224(pretrained=False, **kwargs): + """ Glu-Mixer-24 224x224 + Experiment by Ross Wightman, adding (Si)GLU to MLP-Mixer + """ + model_args = dict( + patch_size=16, num_blocks=24, embed_dim=384, mlp_ratio=(1.0, 4.0), + mlp_layer=GluMlp, act_layer=nn.SiLU, **kwargs) + model = _create_mixer('gmixer_24_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_12_224(pretrained=False, **kwargs): + """ ResMLP-12 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + model_args = dict( + patch_size=16, num_blocks=12, embed_dim=384, mlp_ratio=4, block_layer=ResBlock, norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_12_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_24_224(pretrained=False, **kwargs): + """ ResMLP-24 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + model_args = dict( + patch_size=16, num_blocks=24, embed_dim=384, mlp_ratio=4, + block_layer=partial(ResBlock, init_values=1e-5), norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_24_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_36_224(pretrained=False, **kwargs): + """ ResMLP-36 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + model_args = dict( + patch_size=16, num_blocks=36, embed_dim=384, mlp_ratio=4, + block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_36_224', pretrained=pretrained, **model_args) + return model + + 
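The ResMLP and gMixer factories above are thin wrappers around `_create_mixer`, registered through `@register_model`, so they are normally consumed by name rather than called directly. Below is a minimal usage sketch, assuming this `custom_timm` package keeps upstream timm's registry API (`model_entrypoint` in `custom_timm.models.registry`) and that this file is importable as `custom_timm.models.mlp_mixer`; the 10-class head is an arbitrary illustrative choice:

```python
import torch

# Sketch only: assumes custom_timm mirrors the upstream timm registry API and
# that this file lives at custom_timm/models/mlp_mixer.py as the diff paths suggest.
import custom_timm.models.mlp_mixer  # noqa: F401  (runs the @register_model decorators above)
from custom_timm.models.registry import model_entrypoint

create_fn = model_entrypoint('resmlp_12_224')         # factory registered above
model = create_fn(pretrained=False, num_classes=10)   # random init, illustrative 10-class head
model.eval()

x = torch.randn(1, 3, 224, 224)                       # 224x224 input, as the model name implies
with torch.no_grad():
    logits = model(x)
print(logits.shape)  # expected: torch.Size([1, 10])
```

`num_classes` reaches the `MlpMixer` constructor because `_create_mixer` forwards unmatched keyword arguments through `build_model_with_cfg`; the only option it rejects outright is `features_only`.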
+@register_model +def resmlp_big_24_224(pretrained=False, **kwargs): + """ ResMLP-B-24 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + model_args = dict( + patch_size=8, num_blocks=24, embed_dim=768, mlp_ratio=4, + block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_big_24_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_12_distilled_224(pretrained=False, **kwargs): + """ ResMLP-12 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + model_args = dict( + patch_size=16, num_blocks=12, embed_dim=384, mlp_ratio=4, block_layer=ResBlock, norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_12_distilled_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_24_distilled_224(pretrained=False, **kwargs): + """ ResMLP-24 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + model_args = dict( + patch_size=16, num_blocks=24, embed_dim=384, mlp_ratio=4, + block_layer=partial(ResBlock, init_values=1e-5), norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_24_distilled_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_36_distilled_224(pretrained=False, **kwargs): + """ ResMLP-36 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + model_args = dict( + patch_size=16, num_blocks=36, embed_dim=384, mlp_ratio=4, + block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_36_distilled_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_big_24_distilled_224(pretrained=False, **kwargs): + """ ResMLP-B-24 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + model_args = dict( + patch_size=8, num_blocks=24, embed_dim=768, mlp_ratio=4, + block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_big_24_distilled_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_big_24_224_in22ft1k(pretrained=False, **kwargs): + """ ResMLP-B-24 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + """ + model_args = dict( + patch_size=8, num_blocks=24, embed_dim=768, mlp_ratio=4, + block_layer=partial(ResBlock, init_values=1e-6), norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_big_24_224_in22ft1k', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_12_224_dino(pretrained=False, **kwargs): + """ ResMLP-12 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + + Model pretrained via DINO (self-supervised) - https://arxiv.org/abs/2104.14294 + """ + model_args = dict( + patch_size=16, num_blocks=12, embed_dim=384, mlp_ratio=4, block_layer=ResBlock, norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_12_224_dino', pretrained=pretrained, **model_args) + return model + + +@register_model +def resmlp_24_224_dino(pretrained=False, **kwargs): + """ ResMLP-24 + Paper: `ResMLP: Feedforward networks for image classification...` - https://arxiv.org/abs/2105.03404 + + Model pretrained via DINO (self-supervised) - 
https://arxiv.org/abs/2104.14294 + """ + model_args = dict( + patch_size=16, num_blocks=24, embed_dim=384, mlp_ratio=4, + block_layer=partial(ResBlock, init_values=1e-5), norm_layer=Affine, **kwargs) + model = _create_mixer('resmlp_24_224_dino', pretrained=pretrained, **model_args) + return model + + +@register_model +def gmlp_ti16_224(pretrained=False, **kwargs): + """ gMLP-Tiny + Paper: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050 + """ + model_args = dict( + patch_size=16, num_blocks=30, embed_dim=128, mlp_ratio=6, block_layer=SpatialGatingBlock, + mlp_layer=GatedMlp, **kwargs) + model = _create_mixer('gmlp_ti16_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def gmlp_s16_224(pretrained=False, **kwargs): + """ gMLP-Small + Paper: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050 + """ + model_args = dict( + patch_size=16, num_blocks=30, embed_dim=256, mlp_ratio=6, block_layer=SpatialGatingBlock, + mlp_layer=GatedMlp, **kwargs) + model = _create_mixer('gmlp_s16_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def gmlp_b16_224(pretrained=False, **kwargs): + """ gMLP-Base + Paper: `Pay Attention to MLPs` - https://arxiv.org/abs/2105.08050 + """ + model_args = dict( + patch_size=16, num_blocks=30, embed_dim=512, mlp_ratio=6, block_layer=SpatialGatingBlock, + mlp_layer=GatedMlp, **kwargs) + model = _create_mixer('gmlp_b16_224', pretrained=pretrained, **model_args) + return model diff --git a/custom_timm/models/mobilenetv3.py b/custom_timm/models/mobilenetv3.py new file mode 100644 index 0000000000000000000000000000000000000000..19dd8b5b4bf10ea2dc307fda75ed8d49bc312f82 --- /dev/null +++ b/custom_timm/models/mobilenetv3.py @@ -0,0 +1,739 @@ +""" MobileNet V3 + +A PyTorch impl of MobileNet-V3, compatible with TF weights from official impl. 
+ +Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244 + +Hacked together by / Copyright 2019, Ross Wightman +""" +from functools import partial +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .efficientnet_blocks import SqueezeExcite +from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights,\ + round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT +from .features import FeatureInfo, FeatureHooks +from .helpers import build_model_with_cfg, pretrained_cfg_for_features, checkpoint_seq +from .layers import SelectAdaptivePool2d, Linear, create_conv2d, get_act_fn, get_norm_act_layer +from .registry import register_model + +__all__ = ['MobileNetV3', 'MobileNetV3Features'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv_stem', 'classifier': 'classifier', + **kwargs + } + + +default_cfgs = { + 'mobilenetv3_large_075': _cfg(url=''), + 'mobilenetv3_large_100': _cfg( + interpolation='bicubic', + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_large_100_ra-f55367f5.pth'), + 'mobilenetv3_large_100_miil': _cfg( + interpolation='bilinear', mean=(0., 0., 0.), std=(1., 1., 1.), + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/mobilenetv3_large_100_1k_miil_78_0-66471c13.pth'), + 'mobilenetv3_large_100_miil_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/mobilenetv3_large_100_in21k_miil-d71cc17b.pth', + interpolation='bilinear', mean=(0., 0., 0.), std=(1., 1., 1.), num_classes=11221), + + 'mobilenetv3_small_050': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_small_050_lambc-4b7bbe87.pth', + interpolation='bicubic'), + 'mobilenetv3_small_075': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_small_075_lambc-384766db.pth', + interpolation='bicubic'), + 'mobilenetv3_small_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_small_100_lamb-266a294c.pth', + interpolation='bicubic'), + + 'mobilenetv3_rw': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth', + interpolation='bicubic'), + + 'tf_mobilenetv3_large_075': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_075-150ee8b0.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_large_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_100-427764d5.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_large_minimal_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_minimal_100-8596ae28.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_small_075': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_075-da427f52.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_small_100': _cfg( + url= 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_100-37f49e2b.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_small_minimal_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_minimal_100-922a7843.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + + 'fbnetv3_b': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetv3_b_224-ead5d2a1.pth', + test_input_size=(3, 256, 256), crop_pct=0.95), + 'fbnetv3_d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetv3_d_224-c98bce42.pth', + test_input_size=(3, 256, 256), crop_pct=0.95), + 'fbnetv3_g': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetv3_g_240-0b1df83b.pth', + input_size=(3, 240, 240), test_input_size=(3, 288, 288), crop_pct=0.95, pool_size=(8, 8)), + + "lcnet_035": _cfg(), + "lcnet_050": _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/lcnet_050-f447553b.pth', + interpolation='bicubic', + ), + "lcnet_075": _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/lcnet_075-318cad2c.pth', + interpolation='bicubic', + ), + "lcnet_100": _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/lcnet_100-a929038c.pth', + interpolation='bicubic', + ), + "lcnet_150": _cfg(), +} + + +class MobileNetV3(nn.Module): + """ MobiletNet-V3 + + Based on my EfficientNet implementation and building blocks, this model utilizes the MobileNet-v3 specific + 'efficient head', where global pooling is done before the head convolution without a final batch-norm + layer before the classifier. 
+ + Paper: `Searching for MobileNetV3` - https://arxiv.org/abs/1905.02244 + + Other architectures utilizing MobileNet-V3 efficient head that are supported by this impl include: + * HardCoRe-NAS - https://arxiv.org/abs/2102.11646 (defn in hardcorenas.py uses this class) + * FBNet-V3 - https://arxiv.org/abs/2006.02049 + * LCNet - https://arxiv.org/abs/2109.15099 + """ + + def __init__( + self, block_args, num_classes=1000, in_chans=3, stem_size=16, fix_stem=False, num_features=1280, + head_bias=True, pad_type='', act_layer=None, norm_layer=None, se_layer=None, se_from_exp=True, + round_chs_fn=round_channels, drop_rate=0., drop_path_rate=0., global_pool='avg'): + super(MobileNetV3, self).__init__() + act_layer = act_layer or nn.ReLU + norm_layer = norm_layer or nn.BatchNorm2d + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + se_layer = se_layer or SqueezeExcite + self.num_classes = num_classes + self.num_features = num_features + self.drop_rate = drop_rate + self.grad_checkpointing = False + + # Stem + if not fix_stem: + stem_size = round_chs_fn(stem_size) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_act_layer(stem_size, inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + output_stride=32, pad_type=pad_type, round_chs_fn=round_chs_fn, se_from_exp=se_from_exp, + act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, drop_path_rate=drop_path_rate) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = builder.features + head_chs = builder.in_chs + + # Head + Pooling + self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) + num_pooled_chs = head_chs * self.global_pool.feat_mult() + self.conv_head = create_conv2d(num_pooled_chs, self.num_features, 1, padding=pad_type, bias=head_bias) + self.act2 = act_layer(inplace=True) + self.flatten = nn.Flatten(1) if global_pool else nn.Identity() # don't flatten if pooling disabled + self.classifier = Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + efficientnet_init_weights(self) + + def as_sequential(self): + layers = [self.conv_stem, self.bn1] + layers.extend(self.blocks) + layers.extend([self.global_pool, self.conv_head, self.act2]) + layers.extend([nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier]) + return nn.Sequential(*layers) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^conv_stem|bn1', + blocks=r'^blocks\.(\d+)' if coarse else r'^blocks\.(\d+)\.(\d+)' + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.classifier + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + # cannot meaningfully change pooling of efficient head after creation + self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) + self.flatten = nn.Flatten(1) if global_pool else nn.Identity() # don't flatten if pooling disabled + self.classifier = Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.conv_stem(x) + x = self.bn1(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x, flatten=True) + else: + x = self.blocks(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + x = self.conv_head(x) + x = self.act2(x) + if 
pre_logits: + return x.flatten(1) + else: + x = self.flatten(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + return self.classifier(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +class MobileNetV3Features(nn.Module): + """ MobileNetV3 Feature Extractor + + A work-in-progress feature extraction module for MobileNet-V3 to use as a backbone for segmentation + and object detection models. + """ + + def __init__( + self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', in_chans=3, + stem_size=16, fix_stem=False, output_stride=32, pad_type='', round_chs_fn=round_channels, + se_from_exp=True, act_layer=None, norm_layer=None, se_layer=None, drop_rate=0., drop_path_rate=0.): + super(MobileNetV3Features, self).__init__() + act_layer = act_layer or nn.ReLU + norm_layer = norm_layer or nn.BatchNorm2d + se_layer = se_layer or SqueezeExcite + self.drop_rate = drop_rate + + # Stem + if not fix_stem: + stem_size = round_chs_fn(stem_size) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_layer(stem_size) + self.act1 = act_layer(inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + output_stride=output_stride, pad_type=pad_type, round_chs_fn=round_chs_fn, se_from_exp=se_from_exp, + act_layer=act_layer, norm_layer=norm_layer, se_layer=se_layer, + drop_path_rate=drop_path_rate, feature_location=feature_location) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = FeatureInfo(builder.features, out_indices) + self._stage_out_idx = {v['stage']: i for i, v in enumerate(self.feature_info) if i in out_indices} + + efficientnet_init_weights(self) + + # Register feature extraction hooks with FeatureHooks helper + self.feature_hooks = None + if feature_location != 'bottleneck': + hooks = self.feature_info.get_dicts(keys=('module', 'hook_type')) + self.feature_hooks = FeatureHooks(hooks, self.named_modules()) + + def forward(self, x) -> List[torch.Tensor]: + x = self.conv_stem(x) + x = self.bn1(x) + x = self.act1(x) + if self.feature_hooks is None: + features = [] + if 0 in self._stage_out_idx: + features.append(x) # add stem out + for i, b in enumerate(self.blocks): + x = b(x) + if i + 1 in self._stage_out_idx: + features.append(x) + return features + else: + self.blocks(x) + out = self.feature_hooks.get_output(x.device) + return list(out.values()) + + +def _create_mnv3(variant, pretrained=False, **kwargs): + features_only = False + model_cls = MobileNetV3 + kwargs_filter = None + if kwargs.pop('features_only', False): + features_only = True + kwargs_filter = ('num_classes', 'num_features', 'head_conv', 'head_bias', 'global_pool') + model_cls = MobileNetV3Features + model = build_model_with_cfg( + model_cls, variant, pretrained, + pretrained_strict=not features_only, + kwargs_filter=kwargs_filter, + **kwargs) + if features_only: + model.default_cfg = pretrained_cfg_for_features(model.default_cfg) + return model + + +def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MobileNet-V3 model. + + Ref impl: ? + Paper: https://arxiv.org/abs/1905.02244 + + Args: + channel_multiplier: multiplier to number of channels per layer. 
+ """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16_nre_noskip'], # relu + # stage 1, 112x112 in + ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu + # stage 3, 28x28 in + ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish + # stage 5, 14x14in + ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish + # stage 6, 7x7 in + ['cn_r1_k1_s1_c960'], # hard-swish + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + head_bias=False, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'hard_swish'), + se_layer=partial(SqueezeExcite, gate_layer='hard_sigmoid'), + **kwargs, + ) + model = _create_mnv3(variant, pretrained, **model_kwargs) + return model + + +def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MobileNet-V3 model. + + Ref impl: ? + Paper: https://arxiv.org/abs/1905.02244 + + Args: + channel_multiplier: multiplier to number of channels per layer. + """ + if 'small' in variant: + num_features = 1024 + if 'minimal' in variant: + act_layer = resolve_act_layer(kwargs, 'relu') + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s2_e1_c16'], + # stage 1, 56x56 in + ['ir_r1_k3_s2_e4.5_c24', 'ir_r1_k3_s1_e3.67_c24'], + # stage 2, 28x28 in + ['ir_r1_k3_s2_e4_c40', 'ir_r2_k3_s1_e6_c40'], + # stage 3, 14x14 in + ['ir_r2_k3_s1_e3_c48'], + # stage 4, 14x14in + ['ir_r3_k3_s2_e6_c96'], + # stage 6, 7x7 in + ['cn_r1_k1_s1_c576'], + ] + else: + act_layer = resolve_act_layer(kwargs, 'hard_swish') + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s2_e1_c16_se0.25_nre'], # relu + # stage 1, 56x56 in + ['ir_r1_k3_s2_e4.5_c24_nre', 'ir_r1_k3_s1_e3.67_c24_nre'], # relu + # stage 2, 28x28 in + ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r2_k5_s1_e6_c40_se0.25'], # hard-swish + # stage 3, 14x14 in + ['ir_r2_k5_s1_e3_c48_se0.25'], # hard-swish + # stage 4, 14x14in + ['ir_r3_k5_s2_e6_c96_se0.25'], # hard-swish + # stage 6, 7x7 in + ['cn_r1_k1_s1_c576'], # hard-swish + ] + else: + num_features = 1280 + if 'minimal' in variant: + act_layer = resolve_act_layer(kwargs, 'relu') + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16'], + # stage 1, 112x112 in + ['ir_r1_k3_s2_e4_c24', 'ir_r1_k3_s1_e3_c24'], + # stage 2, 56x56 in + ['ir_r3_k3_s2_e3_c40'], + # stage 3, 28x28 in + ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c112'], + # stage 5, 14x14in + ['ir_r3_k3_s2_e6_c160'], + # stage 6, 7x7 in + ['cn_r1_k1_s1_c960'], + ] + else: + act_layer = resolve_act_layer(kwargs, 'hard_swish') + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16_nre'], # relu + # stage 1, 112x112 in + ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu + # stage 3, 28x28 in + ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish + # stage 5, 14x14in + ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish + # stage 6, 7x7 in + ['cn_r1_k1_s1_c960'], # hard-swish + ] + se_layer = partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU, rd_round_fn=round_channels) + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + 
num_features=num_features, + stem_size=16, + fix_stem=channel_multiplier < 0.75, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=act_layer, + se_layer=se_layer, + **kwargs, + ) + model = _create_mnv3(variant, pretrained, **model_kwargs) + return model + + +def _gen_fbnetv3(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """ FBNetV3 + Paper: `FBNetV3: Joint Architecture-Recipe Search using Predictor Pretraining` + - https://arxiv.org/abs/2006.02049 + FIXME untested, this is a preliminary impl of some FBNet-V3 variants. + """ + vl = variant.split('_')[-1] + if vl in ('a', 'b'): + stem_size = 16 + arch_def = [ + ['ds_r2_k3_s1_e1_c16'], + ['ir_r1_k5_s2_e4_c24', 'ir_r3_k5_s1_e2_c24'], + ['ir_r1_k5_s2_e5_c40_se0.25', 'ir_r4_k5_s1_e3_c40_se0.25'], + ['ir_r1_k5_s2_e5_c72', 'ir_r4_k3_s1_e3_c72'], + ['ir_r1_k3_s1_e5_c120_se0.25', 'ir_r5_k5_s1_e3_c120_se0.25'], + ['ir_r1_k3_s2_e6_c184_se0.25', 'ir_r5_k5_s1_e4_c184_se0.25', 'ir_r1_k5_s1_e6_c224_se0.25'], + ['cn_r1_k1_s1_c1344'], + ] + elif vl == 'd': + stem_size = 24 + arch_def = [ + ['ds_r2_k3_s1_e1_c16'], + ['ir_r1_k3_s2_e5_c24', 'ir_r5_k3_s1_e2_c24'], + ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r4_k3_s1_e3_c40_se0.25'], + ['ir_r1_k3_s2_e5_c72', 'ir_r4_k3_s1_e3_c72'], + ['ir_r1_k3_s1_e5_c128_se0.25', 'ir_r6_k5_s1_e3_c128_se0.25'], + ['ir_r1_k3_s2_e6_c208_se0.25', 'ir_r5_k5_s1_e5_c208_se0.25', 'ir_r1_k5_s1_e6_c240_se0.25'], + ['cn_r1_k1_s1_c1440'], + ] + elif vl == 'g': + stem_size = 32 + arch_def = [ + ['ds_r3_k3_s1_e1_c24'], + ['ir_r1_k5_s2_e4_c40', 'ir_r4_k5_s1_e2_c40'], + ['ir_r1_k5_s2_e4_c56_se0.25', 'ir_r4_k5_s1_e3_c56_se0.25'], + ['ir_r1_k5_s2_e5_c104', 'ir_r4_k3_s1_e3_c104'], + ['ir_r1_k3_s1_e5_c160_se0.25', 'ir_r8_k5_s1_e3_c160_se0.25'], + ['ir_r1_k3_s2_e6_c264_se0.25', 'ir_r6_k5_s1_e5_c264_se0.25', 'ir_r2_k5_s1_e6_c288_se0.25'], + ['cn_r1_k1_s1_c1728'], + ] + else: + raise NotImplemented + round_chs_fn = partial(round_channels, multiplier=channel_multiplier, round_limit=0.95) + se_layer = partial(SqueezeExcite, gate_layer='hard_sigmoid', rd_round_fn=round_chs_fn) + act_layer = resolve_act_layer(kwargs, 'hard_swish') + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + num_features=1984, + head_bias=False, + stem_size=stem_size, + round_chs_fn=round_chs_fn, + se_from_exp=False, + norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=act_layer, + se_layer=se_layer, + **kwargs, + ) + model = _create_mnv3(variant, pretrained, **model_kwargs) + return model + + +def _gen_lcnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """ LCNet + Essentially a MobileNet-V3 crossed with a MobileNet-V1 + + Paper: `PP-LCNet: A Lightweight CPU Convolutional Neural Network` - https://arxiv.org/abs/2109.15099 + + Args: + channel_multiplier: multiplier to number of channels per layer. 
+ """ + arch_def = [ + # stage 0, 112x112 in + ['dsa_r1_k3_s1_c32'], + # stage 1, 112x112 in + ['dsa_r2_k3_s2_c64'], + # stage 2, 56x56 in + ['dsa_r2_k3_s2_c128'], + # stage 3, 28x28 in + ['dsa_r1_k3_s2_c256', 'dsa_r1_k5_s1_c256'], + # stage 4, 14x14in + ['dsa_r4_k5_s1_c256'], + # stage 5, 14x14in + ['dsa_r2_k5_s2_c512_se0.25'], + # 7x7 + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=16, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'hard_swish'), + se_layer=partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU), + num_features=1280, + **kwargs, + ) + model = _create_mnv3(variant, pretrained, **model_kwargs) + return model + + +def _gen_lcnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """ LCNet + Essentially a MobileNet-V3 crossed with a MobileNet-V1 + + Paper: `PP-LCNet: A Lightweight CPU Convolutional Neural Network` - https://arxiv.org/abs/2109.15099 + + Args: + channel_multiplier: multiplier to number of channels per layer. + """ + arch_def = [ + # stage 0, 112x112 in + ['dsa_r1_k3_s1_c32'], + # stage 1, 112x112 in + ['dsa_r2_k3_s2_c64'], + # stage 2, 56x56 in + ['dsa_r2_k3_s2_c128'], + # stage 3, 28x28 in + ['dsa_r1_k3_s2_c256', 'dsa_r1_k5_s1_c256'], + # stage 4, 14x14in + ['dsa_r4_k5_s1_c256'], + # stage 5, 14x14in + ['dsa_r2_k5_s2_c512_se0.25'], + # 7x7 + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=16, + round_chs_fn=partial(round_channels, multiplier=channel_multiplier), + norm_layer=partial(nn.BatchNorm2d, **resolve_bn_args(kwargs)), + act_layer=resolve_act_layer(kwargs, 'hard_swish'), + se_layer=partial(SqueezeExcite, gate_layer='hard_sigmoid', force_act_layer=nn.ReLU), + num_features=1280, + **kwargs, + ) + model = _create_mnv3(variant, pretrained, **model_kwargs) + return model + + +@register_model +def mobilenetv3_large_075(pretrained=False, **kwargs): + """ MobileNet V3 """ + model = _gen_mobilenet_v3('mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv3_large_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + model = _gen_mobilenet_v3('mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv3_large_100_miil(pretrained=False, **kwargs): + """ MobileNet V3 + Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model = _gen_mobilenet_v3('mobilenetv3_large_100_miil', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv3_large_100_miil_in21k(pretrained=False, **kwargs): + """ MobileNet V3, 21k pretraining + Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model = _gen_mobilenet_v3('mobilenetv3_large_100_miil_in21k', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv3_small_050(pretrained=False, **kwargs): + """ MobileNet V3 """ + model = _gen_mobilenet_v3('mobilenetv3_small_050', 0.50, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv3_small_075(pretrained=False, **kwargs): + """ MobileNet V3 """ + model = _gen_mobilenet_v3('mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv3_small_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, 
pretrained=pretrained, **kwargs) + return model + + +@register_model +def mobilenetv3_rw(pretrained=False, **kwargs): + """ MobileNet V3 """ + if pretrained: + # pretrained model trained with non-default BN epsilon + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + model = _gen_mobilenet_v3_rw('mobilenetv3_rw', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_mobilenetv3_large_075(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_mobilenetv3_large_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_mobilenetv3_large_minimal_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_mobilenetv3_small_075(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_mobilenetv3_small_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def tf_mobilenetv3_small_minimal_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def fbnetv3_b(pretrained=False, **kwargs): + """ FBNetV3-B """ + model = _gen_fbnetv3('fbnetv3_b', pretrained=pretrained, **kwargs) + return model + + +@register_model +def fbnetv3_d(pretrained=False, **kwargs): + """ FBNetV3-D """ + model = _gen_fbnetv3('fbnetv3_d', pretrained=pretrained, **kwargs) + return model + + +@register_model +def fbnetv3_g(pretrained=False, **kwargs): + """ FBNetV3-G """ + model = _gen_fbnetv3('fbnetv3_g', pretrained=pretrained, **kwargs) + return model + + +@register_model +def lcnet_035(pretrained=False, **kwargs): + """ PP-LCNet 0.35""" + model = _gen_lcnet('lcnet_035', 0.35, pretrained=pretrained, **kwargs) + return model + + +@register_model +def lcnet_050(pretrained=False, **kwargs): + """ PP-LCNet 0.5""" + model = _gen_lcnet('lcnet_050', 0.5, pretrained=pretrained, **kwargs) + return model + + +@register_model +def lcnet_075(pretrained=False, **kwargs): + """ PP-LCNet 1.0""" + model = _gen_lcnet('lcnet_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def lcnet_100(pretrained=False, **kwargs): + """ PP-LCNet 1.0""" + model = _gen_lcnet('lcnet_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def lcnet_150(pretrained=False, **kwargs): + """ PP-LCNet 1.5""" + model = _gen_lcnet('lcnet_150', 1.5, pretrained=pretrained, **kwargs) + return model diff --git a/custom_timm/models/mobilevit.py b/custom_timm/models/mobilevit.py new file mode 
100644 index 0000000000000000000000000000000000000000..bd5479a7cf9a379cc40e918a57980db6812be045 --- /dev/null +++ b/custom_timm/models/mobilevit.py @@ -0,0 +1,699 @@ +""" MobileViT + +Paper: +V1: `MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer` - https://arxiv.org/abs/2110.02178 +V2: `Separable Self-attention for Mobile Vision Transformers` - https://arxiv.org/abs/2206.02680 + +MobileVitBlock and checkpoints adapted from https://github.com/apple/ml-cvnets (original copyright below) +License: https://github.com/apple/ml-cvnets/blob/main/LICENSE (Apple open source) + +Rest of code, ByobNet, and Transformer block hacked together by / Copyright 2022, Ross Wightman +""" +# +# For licensing see accompanying LICENSE file. +# Copyright (C) 2020 Apple Inc. All Rights Reserved. +# +import math +from typing import Union, Callable, Dict, Tuple, Optional, Sequence + +import torch +from torch import nn +import torch.nn.functional as F + +from .byobnet import register_block, ByoBlockCfg, ByoModelCfg, ByobNet, LayerFn, num_groups +from .fx_features import register_notrace_module +from .layers import to_2tuple, make_divisible, LayerNorm2d, GroupNorm1, ConvMlp, DropPath +from .vision_transformer import Block as TransformerBlock +from .helpers import build_model_with_cfg +from .registry import register_model + +__all__ = [] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 256, 256), 'pool_size': (8, 8), + 'crop_pct': 0.9, 'interpolation': 'bicubic', + 'mean': (0., 0., 0.), 'std': (1., 1., 1.), + 'first_conv': 'stem.conv', 'classifier': 'head.fc', + 'fixed_input_size': False, + **kwargs + } + + +default_cfgs = { + 'mobilevit_xxs': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevit_xxs-ad385b40.pth'), + 'mobilevit_xs': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevit_xs-8fbd6366.pth'), + 'mobilevit_s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevit_s-38a5a959.pth'), + 'semobilevit_s': _cfg(), + + 'mobilevitv2_050': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_050-49951ee2.pth', + crop_pct=0.888), + 'mobilevitv2_075': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_075-b5556ef6.pth', + crop_pct=0.888), + 'mobilevitv2_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_100-e464ef3b.pth', + crop_pct=0.888), + 'mobilevitv2_125': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_125-0ae35027.pth', + crop_pct=0.888), + 'mobilevitv2_150': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_150-737c5019.pth', + crop_pct=0.888), + 'mobilevitv2_175': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_175-16462ee2.pth', + crop_pct=0.888), + 'mobilevitv2_200': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_200-b3422f67.pth', + crop_pct=0.888), + + 'mobilevitv2_150_in22ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_150_in22ft1k-0b555d7b.pth', + crop_pct=0.888), + 
'mobilevitv2_175_in22ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_175_in22ft1k-4117fa1f.pth', + crop_pct=0.888), + 'mobilevitv2_200_in22ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_200_in22ft1k-1d7c8927.pth', + crop_pct=0.888), + + 'mobilevitv2_150_384_in22ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_150_384_in22ft1k-9e142854.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), + 'mobilevitv2_175_384_in22ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_175_384_in22ft1k-059cbe56.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), + 'mobilevitv2_200_384_in22ft1k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-mvit-weights/mobilevitv2_200_384_in22ft1k-32c87503.pth', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0), +} + + +def _inverted_residual_block(d, c, s, br=4.0): + # inverted residual is a bottleneck block with bottle_ratio > 1 applied to in_chs, linear output, gs=1 (depthwise) + return ByoBlockCfg( + type='bottle', d=d, c=c, s=s, gs=1, br=br, + block_kwargs=dict(bottle_in=True, linear_out=True)) + + +def _mobilevit_block(d, c, s, transformer_dim, transformer_depth, patch_size=4, br=4.0): + # inverted residual + mobilevit blocks as per MobileViT network + return ( + _inverted_residual_block(d=d, c=c, s=s, br=br), + ByoBlockCfg( + type='mobilevit', d=1, c=c, s=1, + block_kwargs=dict( + transformer_dim=transformer_dim, + transformer_depth=transformer_depth, + patch_size=patch_size) + ) + ) + + +def _mobilevitv2_block(d, c, s, transformer_depth, patch_size=2, br=2.0, transformer_br=0.5): + # inverted residual + mobilevit blocks as per MobileViT network + return ( + _inverted_residual_block(d=d, c=c, s=s, br=br), + ByoBlockCfg( + type='mobilevit2', d=1, c=c, s=1, br=transformer_br, gs=1, + block_kwargs=dict( + transformer_depth=transformer_depth, + patch_size=patch_size) + ) + ) + + +def _mobilevitv2_cfg(multiplier=1.0): + chs = (64, 128, 256, 384, 512) + if multiplier != 1.0: + chs = tuple([int(c * multiplier) for c in chs]) + cfg = ByoModelCfg( + blocks=( + _inverted_residual_block(d=1, c=chs[0], s=1, br=2.0), + _inverted_residual_block(d=2, c=chs[1], s=2, br=2.0), + _mobilevitv2_block(d=1, c=chs[2], s=2, transformer_depth=2), + _mobilevitv2_block(d=1, c=chs[3], s=2, transformer_depth=4), + _mobilevitv2_block(d=1, c=chs[4], s=2, transformer_depth=3), + ), + stem_chs=int(32 * multiplier), + stem_type='3x3', + stem_pool='', + downsample='', + act_layer='silu', + ) + return cfg + + +model_cfgs = dict( + mobilevit_xxs=ByoModelCfg( + blocks=( + _inverted_residual_block(d=1, c=16, s=1, br=2.0), + _inverted_residual_block(d=3, c=24, s=2, br=2.0), + _mobilevit_block(d=1, c=48, s=2, transformer_dim=64, transformer_depth=2, patch_size=2, br=2.0), + _mobilevit_block(d=1, c=64, s=2, transformer_dim=80, transformer_depth=4, patch_size=2, br=2.0), + _mobilevit_block(d=1, c=80, s=2, transformer_dim=96, transformer_depth=3, patch_size=2, br=2.0), + ), + stem_chs=16, + stem_type='3x3', + stem_pool='', + downsample='', + act_layer='silu', + num_features=320, + ), + + mobilevit_xs=ByoModelCfg( + blocks=( + _inverted_residual_block(d=1, c=32, s=1), + _inverted_residual_block(d=3, c=48, s=2), + _mobilevit_block(d=1, c=64, s=2, 
transformer_dim=96, transformer_depth=2, patch_size=2), + _mobilevit_block(d=1, c=80, s=2, transformer_dim=120, transformer_depth=4, patch_size=2), + _mobilevit_block(d=1, c=96, s=2, transformer_dim=144, transformer_depth=3, patch_size=2), + ), + stem_chs=16, + stem_type='3x3', + stem_pool='', + downsample='', + act_layer='silu', + num_features=384, + ), + + mobilevit_s=ByoModelCfg( + blocks=( + _inverted_residual_block(d=1, c=32, s=1), + _inverted_residual_block(d=3, c=64, s=2), + _mobilevit_block(d=1, c=96, s=2, transformer_dim=144, transformer_depth=2, patch_size=2), + _mobilevit_block(d=1, c=128, s=2, transformer_dim=192, transformer_depth=4, patch_size=2), + _mobilevit_block(d=1, c=160, s=2, transformer_dim=240, transformer_depth=3, patch_size=2), + ), + stem_chs=16, + stem_type='3x3', + stem_pool='', + downsample='', + act_layer='silu', + num_features=640, + ), + + semobilevit_s=ByoModelCfg( + blocks=( + _inverted_residual_block(d=1, c=32, s=1), + _inverted_residual_block(d=3, c=64, s=2), + _mobilevit_block(d=1, c=96, s=2, transformer_dim=144, transformer_depth=2, patch_size=2), + _mobilevit_block(d=1, c=128, s=2, transformer_dim=192, transformer_depth=4, patch_size=2), + _mobilevit_block(d=1, c=160, s=2, transformer_dim=240, transformer_depth=3, patch_size=2), + ), + stem_chs=16, + stem_type='3x3', + stem_pool='', + downsample='', + attn_layer='se', + attn_kwargs=dict(rd_ratio=1/8), + num_features=640, + ), + + mobilevitv2_050=_mobilevitv2_cfg(.50), + mobilevitv2_075=_mobilevitv2_cfg(.75), + mobilevitv2_125=_mobilevitv2_cfg(1.25), + mobilevitv2_100=_mobilevitv2_cfg(1.0), + mobilevitv2_150=_mobilevitv2_cfg(1.5), + mobilevitv2_175=_mobilevitv2_cfg(1.75), + mobilevitv2_200=_mobilevitv2_cfg(2.0), +) + + +@register_notrace_module +class MobileVitBlock(nn.Module): + """ MobileViT block + Paper: https://arxiv.org/abs/2110.02178?context=cs.LG + """ + def __init__( + self, + in_chs: int, + out_chs: Optional[int] = None, + kernel_size: int = 3, + stride: int = 1, + bottle_ratio: float = 1.0, + group_size: Optional[int] = None, + dilation: Tuple[int, int] = (1, 1), + mlp_ratio: float = 2.0, + transformer_dim: Optional[int] = None, + transformer_depth: int = 2, + patch_size: int = 8, + num_heads: int = 4, + attn_drop: float = 0., + drop: int = 0., + no_fusion: bool = False, + drop_path_rate: float = 0., + layers: LayerFn = None, + transformer_norm_layer: Callable = nn.LayerNorm, + **kwargs, # eat unused args + ): + super(MobileVitBlock, self).__init__() + + layers = layers or LayerFn() + groups = num_groups(group_size, in_chs) + out_chs = out_chs or in_chs + transformer_dim = transformer_dim or make_divisible(bottle_ratio * in_chs) + + self.conv_kxk = layers.conv_norm_act( + in_chs, in_chs, kernel_size=kernel_size, + stride=stride, groups=groups, dilation=dilation[0]) + self.conv_1x1 = nn.Conv2d(in_chs, transformer_dim, kernel_size=1, bias=False) + + self.transformer = nn.Sequential(*[ + TransformerBlock( + transformer_dim, mlp_ratio=mlp_ratio, num_heads=num_heads, qkv_bias=True, + attn_drop=attn_drop, drop=drop, drop_path=drop_path_rate, + act_layer=layers.act, norm_layer=transformer_norm_layer) + for _ in range(transformer_depth) + ]) + self.norm = transformer_norm_layer(transformer_dim) + + self.conv_proj = layers.conv_norm_act(transformer_dim, out_chs, kernel_size=1, stride=1) + + if no_fusion: + self.conv_fusion = None + else: + self.conv_fusion = layers.conv_norm_act(in_chs + out_chs, out_chs, kernel_size=kernel_size, stride=1) + + self.patch_size = to_2tuple(patch_size) + self.patch_area 
= self.patch_size[0] * self.patch_size[1] + + def forward(self, x: torch.Tensor) -> torch.Tensor: + shortcut = x + + # Local representation + x = self.conv_kxk(x) + x = self.conv_1x1(x) + + # Unfold (feature map -> patches) + patch_h, patch_w = self.patch_size + B, C, H, W = x.shape + new_h, new_w = math.ceil(H / patch_h) * patch_h, math.ceil(W / patch_w) * patch_w + num_patch_h, num_patch_w = new_h // patch_h, new_w // patch_w # n_h, n_w + num_patches = num_patch_h * num_patch_w # N + interpolate = False + if new_h != H or new_w != W: + # Note: Padding can be done, but then it needs to be handled in attention function. + x = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=False) + interpolate = True + + # [B, C, H, W] --> [B * C * n_h, n_w, p_h, p_w] + x = x.reshape(B * C * num_patch_h, patch_h, num_patch_w, patch_w).transpose(1, 2) + # [B * C * n_h, n_w, p_h, p_w] --> [BP, N, C] where P = p_h * p_w and N = n_h * n_w + x = x.reshape(B, C, num_patches, self.patch_area).transpose(1, 3).reshape(B * self.patch_area, num_patches, -1) + + # Global representations + x = self.transformer(x) + x = self.norm(x) + + # Fold (patch -> feature map) + # [B, P, N, C] --> [B*C*n_h, n_w, p_h, p_w] + x = x.contiguous().view(B, self.patch_area, num_patches, -1) + x = x.transpose(1, 3).reshape(B * C * num_patch_h, num_patch_w, patch_h, patch_w) + # [B*C*n_h, n_w, p_h, p_w] --> [B*C*n_h, p_h, n_w, p_w] --> [B, C, H, W] + x = x.transpose(1, 2).reshape(B, C, num_patch_h * patch_h, num_patch_w * patch_w) + if interpolate: + x = F.interpolate(x, size=(H, W), mode="bilinear", align_corners=False) + + x = self.conv_proj(x) + if self.conv_fusion is not None: + x = self.conv_fusion(torch.cat((shortcut, x), dim=1)) + return x + + +class LinearSelfAttention(nn.Module): + """ + This layer applies a self-attention with linear complexity, as described in `https://arxiv.org/abs/2206.02680` + This layer can be used for self- as well as cross-attention. + Args: + embed_dim (int): :math:`C` from an expected input of size :math:`(N, C, H, W)` + attn_drop (float): Dropout value for context scores. Default: 0.0 + bias (bool): Use bias in learnable layers. Default: True + Shape: + - Input: :math:`(N, C, P, N)` where :math:`N` is the batch size, :math:`C` is the input channels, + :math:`P` is the number of pixels in the patch, and :math:`N` is the number of patches + - Output: same as the input + .. note:: + For MobileViTv2, we unfold the feature map [B, C, H, W] into [B, C, P, N] where P is the number of pixels + in a patch and N is the number of patches. Because channel is the first dimension in this unfolded tensor, + we use point-wise convolution (instead of a linear layer). This avoids a transpose operation (which may be + expensive on resource-constrained devices) that may be required to convert the unfolded tensor from + channel-first to channel-last format in case of a linear layer. 
+ """ + + def __init__( + self, + embed_dim: int, + attn_drop: float = 0.0, + proj_drop: float = 0.0, + bias: bool = True, + ) -> None: + super().__init__() + self.embed_dim = embed_dim + + self.qkv_proj = nn.Conv2d( + in_channels=embed_dim, + out_channels=1 + (2 * embed_dim), + bias=bias, + kernel_size=1, + ) + self.attn_drop = nn.Dropout(attn_drop) + self.out_proj = nn.Conv2d( + in_channels=embed_dim, + out_channels=embed_dim, + bias=bias, + kernel_size=1, + ) + self.out_drop = nn.Dropout(proj_drop) + + def _forward_self_attn(self, x: torch.Tensor) -> torch.Tensor: + # [B, C, P, N] --> [B, h + 2d, P, N] + qkv = self.qkv_proj(x) + + # Project x into query, key and value + # Query --> [B, 1, P, N] + # value, key --> [B, d, P, N] + query, key, value = qkv.split([1, self.embed_dim, self.embed_dim], dim=1) + + # apply softmax along N dimension + context_scores = F.softmax(query, dim=-1) + context_scores = self.attn_drop(context_scores) + + # Compute context vector + # [B, d, P, N] x [B, 1, P, N] -> [B, d, P, N] --> [B, d, P, 1] + context_vector = (key * context_scores).sum(dim=-1, keepdim=True) + + # combine context vector with values + # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N] + out = F.relu(value) * context_vector.expand_as(value) + out = self.out_proj(out) + out = self.out_drop(out) + return out + + @torch.jit.ignore() + def _forward_cross_attn(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor: + # x --> [B, C, P, N] + # x_prev = [B, C, P, M] + batch_size, in_dim, kv_patch_area, kv_num_patches = x.shape + q_patch_area, q_num_patches = x.shape[-2:] + + assert ( + kv_patch_area == q_patch_area + ), "The number of pixels in a patch for query and key_value should be the same" + + # compute query, key, and value + # [B, C, P, M] --> [B, 1 + d, P, M] + qk = F.conv2d( + x_prev, + weight=self.qkv_proj.weight[:self.embed_dim + 1], + bias=self.qkv_proj.bias[:self.embed_dim + 1], + ) + + # [B, 1 + d, P, M] --> [B, 1, P, M], [B, d, P, M] + query, key = qk.split([1, self.embed_dim], dim=1) + # [B, C, P, N] --> [B, d, P, N] + value = F.conv2d( + x, + weight=self.qkv_proj.weight[self.embed_dim + 1], + bias=self.qkv_proj.bias[self.embed_dim + 1] if self.qkv_proj.bias is not None else None, + ) + + # apply softmax along M dimension + context_scores = F.softmax(query, dim=-1) + context_scores = self.attn_drop(context_scores) + + # compute context vector + # [B, d, P, M] * [B, 1, P, M] -> [B, d, P, M] --> [B, d, P, 1] + context_vector = (key * context_scores).sum(dim=-1, keepdim=True) + + # combine context vector with values + # [B, d, P, N] * [B, d, P, 1] --> [B, d, P, N] + out = F.relu(value) * context_vector.expand_as(value) + out = self.out_proj(out) + out = self.out_drop(out) + return out + + def forward(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor: + if x_prev is None: + return self._forward_self_attn(x) + else: + return self._forward_cross_attn(x, x_prev=x_prev) + + +class LinearTransformerBlock(nn.Module): + """ + This class defines the pre-norm transformer encoder with linear self-attention in `MobileViTv2 paper <>`_ + Args: + embed_dim (int): :math:`C_{in}` from an expected input of size :math:`(B, C_{in}, P, N)` + mlp_ratio (float): Inner dimension ratio of the FFN relative to embed_dim + drop (float): Dropout rate. Default: 0.0 + attn_drop (float): Dropout rate for attention in multi-head attention. Default: 0.0 + drop_path (float): Stochastic depth rate Default: 0.0 + norm_layer (Callable): Normalization layer. 
Default: layer_norm_2d + Shape: + - Input: :math:`(B, C_{in}, P, N)` where :math:`B` is batch size, :math:`C_{in}` is input embedding dim, + :math:`P` is number of pixels in a patch, and :math:`N` is number of patches, + - Output: same shape as the input + """ + + def __init__( + self, + embed_dim: int, + mlp_ratio: float = 2.0, + drop: float = 0.0, + attn_drop: float = 0.0, + drop_path: float = 0.0, + act_layer=None, + norm_layer=None, + ) -> None: + super().__init__() + act_layer = act_layer or nn.SiLU + norm_layer = norm_layer or GroupNorm1 + + self.norm1 = norm_layer(embed_dim) + self.attn = LinearSelfAttention(embed_dim=embed_dim, attn_drop=attn_drop, proj_drop=drop) + self.drop_path1 = DropPath(drop_path) + + self.norm2 = norm_layer(embed_dim) + self.mlp = ConvMlp( + in_features=embed_dim, + hidden_features=int(embed_dim * mlp_ratio), + act_layer=act_layer, + drop=drop) + self.drop_path2 = DropPath(drop_path) + + def forward(self, x: torch.Tensor, x_prev: Optional[torch.Tensor] = None) -> torch.Tensor: + if x_prev is None: + # self-attention + x = x + self.drop_path1(self.attn(self.norm1(x))) + else: + # cross-attention + res = x + x = self.norm1(x) # norm + x = self.attn(x, x_prev) # attn + x = self.drop_path1(x) + res # residual + + # Feed forward network + x = x + self.drop_path2(self.mlp(self.norm2(x))) + return x + + +@register_notrace_module +class MobileVitV2Block(nn.Module): + """ + This class defines the `MobileViTv2 block <>`_ + """ + + def __init__( + self, + in_chs: int, + out_chs: Optional[int] = None, + kernel_size: int = 3, + bottle_ratio: float = 1.0, + group_size: Optional[int] = 1, + dilation: Tuple[int, int] = (1, 1), + mlp_ratio: float = 2.0, + transformer_dim: Optional[int] = None, + transformer_depth: int = 2, + patch_size: int = 8, + attn_drop: float = 0., + drop: int = 0., + drop_path_rate: float = 0., + layers: LayerFn = None, + transformer_norm_layer: Callable = GroupNorm1, + **kwargs, # eat unused args + ): + super(MobileVitV2Block, self).__init__() + layers = layers or LayerFn() + groups = num_groups(group_size, in_chs) + out_chs = out_chs or in_chs + transformer_dim = transformer_dim or make_divisible(bottle_ratio * in_chs) + + self.conv_kxk = layers.conv_norm_act( + in_chs, in_chs, kernel_size=kernel_size, + stride=1, groups=groups, dilation=dilation[0]) + self.conv_1x1 = nn.Conv2d(in_chs, transformer_dim, kernel_size=1, bias=False) + + self.transformer = nn.Sequential(*[ + LinearTransformerBlock( + transformer_dim, + mlp_ratio=mlp_ratio, + attn_drop=attn_drop, + drop=drop, + drop_path=drop_path_rate, + act_layer=layers.act, + norm_layer=transformer_norm_layer + ) + for _ in range(transformer_depth) + ]) + self.norm = transformer_norm_layer(transformer_dim) + + self.conv_proj = layers.conv_norm_act(transformer_dim, out_chs, kernel_size=1, stride=1, apply_act=False) + + self.patch_size = to_2tuple(patch_size) + self.patch_area = self.patch_size[0] * self.patch_size[1] + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, C, H, W = x.shape + patch_h, patch_w = self.patch_size + new_h, new_w = math.ceil(H / patch_h) * patch_h, math.ceil(W / patch_w) * patch_w + num_patch_h, num_patch_w = new_h // patch_h, new_w // patch_w # n_h, n_w + num_patches = num_patch_h * num_patch_w # N + if new_h != H or new_w != W: + x = F.interpolate(x, size=(new_h, new_w), mode="bilinear", align_corners=True) + + # Local representation + x = self.conv_kxk(x) + x = self.conv_1x1(x) + + # Unfold (feature map -> patches), [B, C, H, W] -> [B, C, P, N] + C = x.shape[1] + x = 
x.reshape(B, C, num_patch_h, patch_h, num_patch_w, patch_w).permute(0, 1, 3, 5, 2, 4) + x = x.reshape(B, C, -1, num_patches) + + # Global representations + x = self.transformer(x) + x = self.norm(x) + + # Fold (patches -> feature map), [B, C, P, N] --> [B, C, H, W] + x = x.reshape(B, C, patch_h, patch_w, num_patch_h, num_patch_w).permute(0, 1, 4, 2, 5, 3) + x = x.reshape(B, C, num_patch_h * patch_h, num_patch_w * patch_w) + + x = self.conv_proj(x) + return x + + +register_block('mobilevit', MobileVitBlock) +register_block('mobilevit2', MobileVitV2Block) + + +def _create_mobilevit(variant, cfg_variant=None, pretrained=False, **kwargs): + return build_model_with_cfg( + ByobNet, variant, pretrained, + model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant], + feature_cfg=dict(flatten_sequential=True), + **kwargs) + + +def _create_mobilevit2(variant, cfg_variant=None, pretrained=False, **kwargs): + return build_model_with_cfg( + ByobNet, variant, pretrained, + model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant], + feature_cfg=dict(flatten_sequential=True), + **kwargs) + + +@register_model +def mobilevit_xxs(pretrained=False, **kwargs): + return _create_mobilevit('mobilevit_xxs', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevit_xs(pretrained=False, **kwargs): + return _create_mobilevit('mobilevit_xs', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevit_s(pretrained=False, **kwargs): + return _create_mobilevit('mobilevit_s', pretrained=pretrained, **kwargs) + + +@register_model +def semobilevit_s(pretrained=False, **kwargs): + return _create_mobilevit('semobilevit_s', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_050(pretrained=False, **kwargs): + return _create_mobilevit('mobilevitv2_050', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_075(pretrained=False, **kwargs): + return _create_mobilevit('mobilevitv2_075', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_100(pretrained=False, **kwargs): + return _create_mobilevit('mobilevitv2_100', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_125(pretrained=False, **kwargs): + return _create_mobilevit('mobilevitv2_125', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_150(pretrained=False, **kwargs): + return _create_mobilevit('mobilevitv2_150', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_175(pretrained=False, **kwargs): + return _create_mobilevit('mobilevitv2_175', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_200(pretrained=False, **kwargs): + return _create_mobilevit('mobilevitv2_200', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_150_in22ft1k(pretrained=False, **kwargs): + return _create_mobilevit( + 'mobilevitv2_150_in22ft1k', cfg_variant='mobilevitv2_150', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_175_in22ft1k(pretrained=False, **kwargs): + return _create_mobilevit( + 'mobilevitv2_175_in22ft1k', cfg_variant='mobilevitv2_175', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_200_in22ft1k(pretrained=False, **kwargs): + return _create_mobilevit( + 'mobilevitv2_200_in22ft1k', cfg_variant='mobilevitv2_200', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_150_384_in22ft1k(pretrained=False, **kwargs): + return _create_mobilevit( + 'mobilevitv2_150_384_in22ft1k', cfg_variant='mobilevitv2_150', 
pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_175_384_in22ft1k(pretrained=False, **kwargs): + return _create_mobilevit( + 'mobilevitv2_175_384_in22ft1k', cfg_variant='mobilevitv2_175', pretrained=pretrained, **kwargs) + + +@register_model +def mobilevitv2_200_384_in22ft1k(pretrained=False, **kwargs): + return _create_mobilevit( + 'mobilevitv2_200_384_in22ft1k', cfg_variant='mobilevitv2_200', pretrained=pretrained, **kwargs) \ No newline at end of file diff --git a/custom_timm/models/mvitv2.py b/custom_timm/models/mvitv2.py new file mode 100644 index 0000000000000000000000000000000000000000..b7ec58979f3b2f35393f4555abcb3342d055710b --- /dev/null +++ b/custom_timm/models/mvitv2.py @@ -0,0 +1,1010 @@ +""" Multi-Scale Vision Transformer v2 + +@inproceedings{li2021improved, + title={MViTv2: Improved multiscale vision transformers for classification and detection}, + author={Li, Yanghao and Wu, Chao-Yuan and Fan, Haoqi and Mangalam, Karttikeya and Xiong, Bo and Malik, Jitendra and Feichtenhofer, Christoph}, + booktitle={CVPR}, + year={2022} +} + +Code adapted from original Apache 2.0 licensed impl at https://github.com/facebookresearch/mvit +Original copyright below. + +Modifications and timm support by / Copyright 2022, Ross Wightman +""" +# Copyright (c) Meta Platforms, Inc. and affiliates. All Rights Reserved. All Rights Reserved. +import operator +from collections import OrderedDict +from dataclasses import dataclass +from functools import partial, reduce +from typing import Union, List, Tuple, Optional + +import torch +import torch.utils.checkpoint as checkpoint +from torch import nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .fx_features import register_notrace_function +from .helpers import build_model_with_cfg +from .layers import Mlp, DropPath, trunc_normal_tf_, get_norm_layer, to_2tuple +from .registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head.fc', + 'fixed_input_size': True, + **kwargs + } + + +default_cfgs = dict( + mvitv2_tiny=_cfg(url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_T_in1k.pyth'), + mvitv2_small=_cfg(url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_S_in1k.pyth'), + mvitv2_base=_cfg(url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in1k.pyth'), + mvitv2_large=_cfg(url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in1k.pyth'), + + mvitv2_base_in21k=_cfg( + url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_B_in21k.pyth', + num_classes=19168), + mvitv2_large_in21k=_cfg( + url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_L_in21k.pyth', + num_classes=19168), + mvitv2_huge_in21k=_cfg( + url='https://dl.fbaipublicfiles.com/mvit/mvitv2_models/MViTv2_H_in21k.pyth', + num_classes=19168), + + mvitv2_small_cls=_cfg(url=''), +) + + +@dataclass +class MultiScaleVitCfg: + depths: Tuple[int, ...] = (2, 3, 16, 3) + embed_dim: Union[int, Tuple[int, ...]] = 96 + num_heads: Union[int, Tuple[int, ...]] = 1 + mlp_ratio: float = 4. 
+ pool_first: bool = False + expand_attn: bool = True + qkv_bias: bool = True + use_cls_token: bool = False + use_abs_pos: bool = False + residual_pooling: bool = True + mode: str = 'conv' + kernel_qkv: Tuple[int, int] = (3, 3) + stride_q: Optional[Tuple[Tuple[int, int]]] = ((1, 1), (2, 2), (2, 2), (2, 2)) + stride_kv: Optional[Tuple[Tuple[int, int]]] = None + stride_kv_adaptive: Optional[Tuple[int, int]] = (4, 4) + patch_kernel: Tuple[int, int] = (7, 7) + patch_stride: Tuple[int, int] = (4, 4) + patch_padding: Tuple[int, int] = (3, 3) + pool_type: str = 'max' + rel_pos_type: str = 'spatial' + act_layer: Union[str, Tuple[str, str]] = 'gelu' + norm_layer: Union[str, Tuple[str, str]] = 'layernorm' + norm_eps: float = 1e-6 + + def __post_init__(self): + num_stages = len(self.depths) + if not isinstance(self.embed_dim, (tuple, list)): + self.embed_dim = tuple(self.embed_dim * 2 ** i for i in range(num_stages)) + assert len(self.embed_dim) == num_stages + + if not isinstance(self.num_heads, (tuple, list)): + self.num_heads = tuple(self.num_heads * 2 ** i for i in range(num_stages)) + assert len(self.num_heads) == num_stages + + if self.stride_kv_adaptive is not None and self.stride_kv is None: + _stride_kv = self.stride_kv_adaptive + pool_kv_stride = [] + for i in range(num_stages): + if min(self.stride_q[i]) > 1: + _stride_kv = [ + max(_stride_kv[d] // self.stride_q[i][d], 1) + for d in range(len(_stride_kv)) + ] + pool_kv_stride.append(tuple(_stride_kv)) + self.stride_kv = tuple(pool_kv_stride) + + +model_cfgs = dict( + mvitv2_tiny=MultiScaleVitCfg( + depths=(1, 2, 5, 2), + ), + mvitv2_small=MultiScaleVitCfg( + depths=(1, 2, 11, 2), + ), + mvitv2_base=MultiScaleVitCfg( + depths=(2, 3, 16, 3), + ), + mvitv2_large=MultiScaleVitCfg( + depths=(2, 6, 36, 4), + embed_dim=144, + num_heads=2, + expand_attn=False, + ), + + mvitv2_base_in21k=MultiScaleVitCfg( + depths=(2, 3, 16, 3), + ), + mvitv2_large_in21k=MultiScaleVitCfg( + depths=(2, 6, 36, 4), + embed_dim=144, + num_heads=2, + expand_attn=False, + ), + + mvitv2_small_cls=MultiScaleVitCfg( + depths=(1, 2, 11, 2), + use_cls_token=True, + ), +) + + +def prod(iterable): + return reduce(operator.mul, iterable, 1) + + +class PatchEmbed(nn.Module): + """ + PatchEmbed. 
+ """ + + def __init__( + self, + dim_in=3, + dim_out=768, + kernel=(7, 7), + stride=(4, 4), + padding=(3, 3), + ): + super().__init__() + + self.proj = nn.Conv2d( + dim_in, + dim_out, + kernel_size=kernel, + stride=stride, + padding=padding, + ) + + def forward(self, x) -> Tuple[torch.Tensor, List[int]]: + x = self.proj(x) + # B C H W -> B HW C + return x.flatten(2).transpose(1, 2), x.shape[-2:] + + +@register_notrace_function +def reshape_pre_pool( + x, + feat_size: List[int], + has_cls_token: bool = True +) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + H, W = feat_size + if has_cls_token: + cls_tok, x = x[:, :, :1, :], x[:, :, 1:, :] + else: + cls_tok = None + x = x.reshape(-1, H, W, x.shape[-1]).permute(0, 3, 1, 2).contiguous() + return x, cls_tok + + +@register_notrace_function +def reshape_post_pool( + x, + num_heads: int, + cls_tok: Optional[torch.Tensor] = None +) -> Tuple[torch.Tensor, List[int]]: + feat_size = [x.shape[2], x.shape[3]] + L_pooled = x.shape[2] * x.shape[3] + x = x.reshape(-1, num_heads, x.shape[1], L_pooled).transpose(2, 3) + if cls_tok is not None: + x = torch.cat((cls_tok, x), dim=2) + return x, feat_size + + +@register_notrace_function +def cal_rel_pos_type( + attn: torch.Tensor, + q: torch.Tensor, + has_cls_token: bool, + q_size: List[int], + k_size: List[int], + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, +): + """ + Spatial Relative Positional Embeddings. + """ + sp_idx = 1 if has_cls_token else 0 + q_h, q_w = q_size + k_h, k_w = k_size + + # Scale up rel pos if shapes for q and k are different. + q_h_ratio = max(k_h / q_h, 1.0) + k_h_ratio = max(q_h / k_h, 1.0) + dist_h = torch.arange(q_h)[:, None] * q_h_ratio - torch.arange(k_h)[None, :] * k_h_ratio + dist_h += (k_h - 1) * k_h_ratio + q_w_ratio = max(k_w / q_w, 1.0) + k_w_ratio = max(q_w / k_w, 1.0) + dist_w = torch.arange(q_w)[:, None] * q_w_ratio - torch.arange(k_w)[None, :] * k_w_ratio + dist_w += (k_w - 1) * k_w_ratio + + Rh = rel_pos_h[dist_h.long()] + Rw = rel_pos_w[dist_w.long()] + + B, n_head, q_N, dim = q.shape + + r_q = q[:, :, sp_idx:].reshape(B, n_head, q_h, q_w, dim) + rel_h = torch.einsum("byhwc,hkc->byhwk", r_q, Rh) + rel_w = torch.einsum("byhwc,wkc->byhwk", r_q, Rw) + + attn[:, :, sp_idx:, sp_idx:] = ( + attn[:, :, sp_idx:, sp_idx:].view(B, -1, q_h, q_w, k_h, k_w) + + rel_h[:, :, :, :, :, None] + + rel_w[:, :, :, :, None, :] + ).view(B, -1, q_h * q_w, k_h * k_w) + + return attn + + +class MultiScaleAttentionPoolFirst(nn.Module): + def __init__( + self, + dim, + dim_out, + feat_size, + num_heads=8, + qkv_bias=True, + mode="conv", + kernel_q=(1, 1), + kernel_kv=(1, 1), + stride_q=(1, 1), + stride_kv=(1, 1), + has_cls_token=True, + rel_pos_type='spatial', + residual_pooling=True, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.num_heads = num_heads + self.dim_out = dim_out + self.head_dim = dim_out // num_heads + self.scale = self.head_dim ** -0.5 + self.has_cls_token = has_cls_token + padding_q = tuple([int(q // 2) for q in kernel_q]) + padding_kv = tuple([int(kv // 2) for kv in kernel_kv]) + + self.q = nn.Linear(dim, dim_out, bias=qkv_bias) + self.k = nn.Linear(dim, dim_out, bias=qkv_bias) + self.v = nn.Linear(dim, dim_out, bias=qkv_bias) + self.proj = nn.Linear(dim_out, dim_out) + + # Skip pooling with kernel and stride size of (1, 1, 1). 
+ if prod(kernel_q) == 1 and prod(stride_q) == 1: + kernel_q = None + if prod(kernel_kv) == 1 and prod(stride_kv) == 1: + kernel_kv = None + self.mode = mode + self.unshared = mode == 'conv_unshared' + self.pool_q, self.pool_k, self.pool_v = None, None, None + self.norm_q, self.norm_k, self.norm_v = None, None, None + if mode in ("avg", "max"): + pool_op = nn.MaxPool2d if mode == "max" else nn.AvgPool2d + if kernel_q: + self.pool_q = pool_op(kernel_q, stride_q, padding_q) + if kernel_kv: + self.pool_k = pool_op(kernel_kv, stride_kv, padding_kv) + self.pool_v = pool_op(kernel_kv, stride_kv, padding_kv) + elif mode == "conv" or mode == "conv_unshared": + dim_conv = dim // num_heads if mode == "conv" else dim + if kernel_q: + self.pool_q = nn.Conv2d( + dim_conv, + dim_conv, + kernel_q, + stride=stride_q, + padding=padding_q, + groups=dim_conv, + bias=False, + ) + self.norm_q = norm_layer(dim_conv) + if kernel_kv: + self.pool_k = nn.Conv2d( + dim_conv, + dim_conv, + kernel_kv, + stride=stride_kv, + padding=padding_kv, + groups=dim_conv, + bias=False, + ) + self.norm_k = norm_layer(dim_conv) + self.pool_v = nn.Conv2d( + dim_conv, + dim_conv, + kernel_kv, + stride=stride_kv, + padding=padding_kv, + groups=dim_conv, + bias=False, + ) + self.norm_v = norm_layer(dim_conv) + else: + raise NotImplementedError(f"Unsupported model {mode}") + + # relative pos embedding + self.rel_pos_type = rel_pos_type + if self.rel_pos_type == 'spatial': + assert feat_size[0] == feat_size[1] + size = feat_size[0] + q_size = size // stride_q[1] if len(stride_q) > 0 else size + kv_size = size // stride_kv[1] if len(stride_kv) > 0 else size + rel_sp_dim = 2 * max(q_size, kv_size) - 1 + + self.rel_pos_h = nn.Parameter(torch.zeros(rel_sp_dim, self.head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(rel_sp_dim, self.head_dim)) + trunc_normal_tf_(self.rel_pos_h, std=0.02) + trunc_normal_tf_(self.rel_pos_w, std=0.02) + + self.residual_pooling = residual_pooling + + def forward(self, x, feat_size: List[int]): + B, N, _ = x.shape + + fold_dim = 1 if self.unshared else self.num_heads + x = x.reshape(B, N, fold_dim, -1).permute(0, 2, 1, 3) + q = k = v = x + + if self.pool_q is not None: + q, q_tok = reshape_pre_pool(q, feat_size, self.has_cls_token) + q = self.pool_q(q) + q, q_size = reshape_post_pool(q, self.num_heads, q_tok) + else: + q_size = feat_size + if self.norm_q is not None: + q = self.norm_q(q) + + if self.pool_k is not None: + k, k_tok = reshape_pre_pool(k, feat_size, self.has_cls_token) + k = self.pool_k(k) + k, k_size = reshape_post_pool(k, self.num_heads, k_tok) + else: + k_size = feat_size + if self.norm_k is not None: + k = self.norm_k(k) + + if self.pool_v is not None: + v, v_tok = reshape_pre_pool(v, feat_size, self.has_cls_token) + v = self.pool_v(v) + v, v_size = reshape_post_pool(v, self.num_heads, v_tok) + else: + v_size = feat_size + if self.norm_v is not None: + v = self.norm_v(v) + + q_N = q_size[0] * q_size[1] + int(self.has_cls_token) + q = q.permute(0, 2, 1, 3).reshape(B, q_N, -1) + q = self.q(q).reshape(B, q_N, self.num_heads, -1).permute(0, 2, 1, 3) + + k_N = k_size[0] * k_size[1] + int(self.has_cls_token) + k = k.permute(0, 2, 1, 3).reshape(B, k_N, -1) + k = self.k(k).reshape(B, k_N, self.num_heads, -1).permute(0, 2, 1, 3) + + v_N = v_size[0] * v_size[1] + int(self.has_cls_token) + v = v.permute(0, 2, 1, 3).reshape(B, v_N, -1) + v = self.v(v).reshape(B, v_N, self.num_heads, -1).permute(0, 2, 1, 3) + + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.rel_pos_type == 'spatial': + attn = 
cal_rel_pos_type( + attn, + q, + self.has_cls_token, + q_size, + k_size, + self.rel_pos_h, + self.rel_pos_w, + ) + attn = attn.softmax(dim=-1) + x = attn @ v + + if self.residual_pooling: + x = x + q + + x = x.transpose(1, 2).reshape(B, -1, self.dim_out) + x = self.proj(x) + + return x, q_size + + +class MultiScaleAttention(nn.Module): + def __init__( + self, + dim, + dim_out, + feat_size, + num_heads=8, + qkv_bias=True, + mode="conv", + kernel_q=(1, 1), + kernel_kv=(1, 1), + stride_q=(1, 1), + stride_kv=(1, 1), + has_cls_token=True, + rel_pos_type='spatial', + residual_pooling=True, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.num_heads = num_heads + self.dim_out = dim_out + self.head_dim = dim_out // num_heads + self.scale = self.head_dim ** -0.5 + self.has_cls_token = has_cls_token + padding_q = tuple([int(q // 2) for q in kernel_q]) + padding_kv = tuple([int(kv // 2) for kv in kernel_kv]) + + self.qkv = nn.Linear(dim, dim_out * 3, bias=qkv_bias) + self.proj = nn.Linear(dim_out, dim_out) + + # Skip pooling with kernel and stride size of (1, 1, 1). + if prod(kernel_q) == 1 and prod(stride_q) == 1: + kernel_q = None + if prod(kernel_kv) == 1 and prod(stride_kv) == 1: + kernel_kv = None + self.mode = mode + self.unshared = mode == 'conv_unshared' + self.norm_q, self.norm_k, self.norm_v = None, None, None + self.pool_q, self.pool_k, self.pool_v = None, None, None + if mode in ("avg", "max"): + pool_op = nn.MaxPool2d if mode == "max" else nn.AvgPool2d + if kernel_q: + self.pool_q = pool_op(kernel_q, stride_q, padding_q) + if kernel_kv: + self.pool_k = pool_op(kernel_kv, stride_kv, padding_kv) + self.pool_v = pool_op(kernel_kv, stride_kv, padding_kv) + elif mode == "conv" or mode == "conv_unshared": + dim_conv = dim_out // num_heads if mode == "conv" else dim_out + if kernel_q: + self.pool_q = nn.Conv2d( + dim_conv, + dim_conv, + kernel_q, + stride=stride_q, + padding=padding_q, + groups=dim_conv, + bias=False, + ) + self.norm_q = norm_layer(dim_conv) + if kernel_kv: + self.pool_k = nn.Conv2d( + dim_conv, + dim_conv, + kernel_kv, + stride=stride_kv, + padding=padding_kv, + groups=dim_conv, + bias=False, + ) + self.norm_k = norm_layer(dim_conv) + self.pool_v = nn.Conv2d( + dim_conv, + dim_conv, + kernel_kv, + stride=stride_kv, + padding=padding_kv, + groups=dim_conv, + bias=False, + ) + self.norm_v = norm_layer(dim_conv) + else: + raise NotImplementedError(f"Unsupported model {mode}") + + # relative pos embedding + self.rel_pos_type = rel_pos_type + if self.rel_pos_type == 'spatial': + assert feat_size[0] == feat_size[1] + size = feat_size[0] + q_size = size // stride_q[1] if len(stride_q) > 0 else size + kv_size = size // stride_kv[1] if len(stride_kv) > 0 else size + rel_sp_dim = 2 * max(q_size, kv_size) - 1 + + self.rel_pos_h = nn.Parameter(torch.zeros(rel_sp_dim, self.head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(rel_sp_dim, self.head_dim)) + trunc_normal_tf_(self.rel_pos_h, std=0.02) + trunc_normal_tf_(self.rel_pos_w, std=0.02) + + self.residual_pooling = residual_pooling + + def forward(self, x, feat_size: List[int]): + B, N, _ = x.shape + + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(dim=0) + + if self.pool_q is not None: + q, q_tok = reshape_pre_pool(q, feat_size, self.has_cls_token) + q = self.pool_q(q) + q, q_size = reshape_post_pool(q, self.num_heads, q_tok) + else: + q_size = feat_size + if self.norm_q is not None: + q = self.norm_q(q) + + if self.pool_k is not None: + k, k_tok = reshape_pre_pool(k, 
feat_size, self.has_cls_token) + k = self.pool_k(k) + k, k_size = reshape_post_pool(k, self.num_heads, k_tok) + else: + k_size = feat_size + if self.norm_k is not None: + k = self.norm_k(k) + + if self.pool_v is not None: + v, v_tok = reshape_pre_pool(v, feat_size, self.has_cls_token) + v = self.pool_v(v) + v, _ = reshape_post_pool(v, self.num_heads, v_tok) + if self.norm_v is not None: + v = self.norm_v(v) + + attn = (q * self.scale) @ k.transpose(-2, -1) + if self.rel_pos_type == 'spatial': + attn = cal_rel_pos_type( + attn, + q, + self.has_cls_token, + q_size, + k_size, + self.rel_pos_h, + self.rel_pos_w, + ) + attn = attn.softmax(dim=-1) + x = attn @ v + + if self.residual_pooling: + x = x + q + + x = x.transpose(1, 2).reshape(B, -1, self.dim_out) + x = self.proj(x) + + return x, q_size + + +class MultiScaleBlock(nn.Module): + def __init__( + self, + dim, + dim_out, + num_heads, + feat_size, + mlp_ratio=4.0, + qkv_bias=True, + drop_path=0.0, + norm_layer=nn.LayerNorm, + kernel_q=(1, 1), + kernel_kv=(1, 1), + stride_q=(1, 1), + stride_kv=(1, 1), + mode="conv", + has_cls_token=True, + expand_attn=False, + pool_first=False, + rel_pos_type='spatial', + residual_pooling=True, + ): + super().__init__() + proj_needed = dim != dim_out + self.dim = dim + self.dim_out = dim_out + self.has_cls_token = has_cls_token + + self.norm1 = norm_layer(dim) + + self.shortcut_proj_attn = nn.Linear(dim, dim_out) if proj_needed and expand_attn else None + if stride_q and prod(stride_q) > 1: + kernel_skip = [s + 1 if s > 1 else s for s in stride_q] + stride_skip = stride_q + padding_skip = [int(skip // 2) for skip in kernel_skip] + self.shortcut_pool_attn = nn.MaxPool2d(kernel_skip, stride_skip, padding_skip) + else: + self.shortcut_pool_attn = None + + att_dim = dim_out if expand_attn else dim + attn_layer = MultiScaleAttentionPoolFirst if pool_first else MultiScaleAttention + self.attn = attn_layer( + dim, + att_dim, + num_heads=num_heads, + feat_size=feat_size, + qkv_bias=qkv_bias, + kernel_q=kernel_q, + kernel_kv=kernel_kv, + stride_q=stride_q, + stride_kv=stride_kv, + norm_layer=norm_layer, + has_cls_token=has_cls_token, + mode=mode, + rel_pos_type=rel_pos_type, + residual_pooling=residual_pooling, + ) + self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(att_dim) + mlp_dim_out = dim_out + self.shortcut_proj_mlp = nn.Linear(dim, dim_out) if proj_needed and not expand_attn else None + self.mlp = Mlp( + in_features=att_dim, + hidden_features=int(att_dim * mlp_ratio), + out_features=mlp_dim_out, + ) + self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def _shortcut_pool(self, x, feat_size: List[int]): + if self.shortcut_pool_attn is None: + return x + if self.has_cls_token: + cls_tok, x = x[:, :1, :], x[:, 1:, :] + else: + cls_tok = None + B, L, C = x.shape + H, W = feat_size + x = x.reshape(B, H, W, C).permute(0, 3, 1, 2).contiguous() + x = self.shortcut_pool_attn(x) + x = x.reshape(B, C, -1).transpose(1, 2) + if cls_tok is not None: + x = torch.cat((cls_tok, x), dim=1) + return x + + def forward(self, x, feat_size: List[int]): + x_norm = self.norm1(x) + # NOTE as per the original impl, this seems odd, but shortcut uses un-normalized input if no proj + x_shortcut = x if self.shortcut_proj_attn is None else self.shortcut_proj_attn(x_norm) + x_shortcut = self._shortcut_pool(x_shortcut, feat_size) + x, feat_size_new = self.attn(x_norm, feat_size) + x = x_shortcut + self.drop_path1(x) + + x_norm = self.norm2(x) + x_shortcut = x if 
self.shortcut_proj_mlp is None else self.shortcut_proj_mlp(x_norm) + x = x_shortcut + self.drop_path2(self.mlp(x_norm)) + return x, feat_size_new + + +class MultiScaleVitStage(nn.Module): + + def __init__( + self, + dim, + dim_out, + depth, + num_heads, + feat_size, + mlp_ratio=4.0, + qkv_bias=True, + mode="conv", + kernel_q=(1, 1), + kernel_kv=(1, 1), + stride_q=(1, 1), + stride_kv=(1, 1), + has_cls_token=True, + expand_attn=False, + pool_first=False, + rel_pos_type='spatial', + residual_pooling=True, + norm_layer=nn.LayerNorm, + drop_path=0.0, + ): + super().__init__() + self.grad_checkpointing = False + + self.blocks = nn.ModuleList() + if expand_attn: + out_dims = (dim_out,) * depth + else: + out_dims = (dim,) * (depth - 1) + (dim_out,) + + for i in range(depth): + attention_block = MultiScaleBlock( + dim=dim, + dim_out=out_dims[i], + num_heads=num_heads, + feat_size=feat_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + kernel_q=kernel_q, + kernel_kv=kernel_kv, + stride_q=stride_q if i == 0 else (1, 1), + stride_kv=stride_kv, + mode=mode, + has_cls_token=has_cls_token, + pool_first=pool_first, + rel_pos_type=rel_pos_type, + residual_pooling=residual_pooling, + expand_attn=expand_attn, + norm_layer=norm_layer, + drop_path=drop_path[i] if isinstance(drop_path, (list, tuple)) else drop_path, + ) + dim = out_dims[i] + self.blocks.append(attention_block) + if i == 0: + feat_size = tuple([size // stride for size, stride in zip(feat_size, stride_q)]) + + self.feat_size = feat_size + + def forward(self, x, feat_size: List[int]): + for blk in self.blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x, feat_size = checkpoint.checkpoint(blk, x, feat_size) + else: + x, feat_size = blk(x, feat_size) + return x, feat_size + + +class MultiScaleVit(nn.Module): + """ + Improved Multiscale Vision Transformers for Classification and Detection + Yanghao Li*, Chao-Yuan Wu*, Haoqi Fan, Karttikeya Mangalam, Bo Xiong, Jitendra Malik, + Christoph Feichtenhofer* + https://arxiv.org/abs/2112.01526 + + Multiscale Vision Transformers + Haoqi Fan*, Bo Xiong*, Karttikeya Mangalam*, Yanghao Li*, Zhicheng Yan, Jitendra Malik, + Christoph Feichtenhofer* + https://arxiv.org/abs/2104.11227 + """ + + def __init__( + self, + cfg: MultiScaleVitCfg, + img_size: Tuple[int, int] = (224, 224), + in_chans: int = 3, + global_pool: str = 'avg', + num_classes: int = 1000, + drop_path_rate: float = 0., + drop_rate: float = 0., + ): + super().__init__() + img_size = to_2tuple(img_size) + norm_layer = partial(get_norm_layer(cfg.norm_layer), eps=cfg.norm_eps) + self.num_classes = num_classes + self.drop_rate = drop_rate + self.global_pool = global_pool + self.depths = tuple(cfg.depths) + self.expand_attn = cfg.expand_attn + + embed_dim = cfg.embed_dim[0] + self.patch_embed = PatchEmbed( + dim_in=in_chans, + dim_out=embed_dim, + kernel=cfg.patch_kernel, + stride=cfg.patch_stride, + padding=cfg.patch_padding, + ) + patch_dims = (img_size[0] // cfg.patch_stride[0], img_size[1] // cfg.patch_stride[1]) + num_patches = prod(patch_dims) + + if cfg.use_cls_token: + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.num_prefix_tokens = 1 + pos_embed_dim = num_patches + 1 + else: + self.num_prefix_tokens = 0 + self.cls_token = None + pos_embed_dim = num_patches + + if cfg.use_abs_pos: + self.pos_embed = nn.Parameter(torch.zeros(1, pos_embed_dim, embed_dim)) + else: + self.pos_embed = None + + num_stages = len(cfg.embed_dim) + feat_size = patch_dims + dpr = [x.tolist() for x in torch.linspace(0, 
drop_path_rate, sum(cfg.depths)).split(cfg.depths)] + self.stages = nn.ModuleList() + for i in range(num_stages): + if cfg.expand_attn: + dim_out = cfg.embed_dim[i] + else: + dim_out = cfg.embed_dim[min(i + 1, num_stages - 1)] + stage = MultiScaleVitStage( + dim=embed_dim, + dim_out=dim_out, + depth=cfg.depths[i], + num_heads=cfg.num_heads[i], + feat_size=feat_size, + mlp_ratio=cfg.mlp_ratio, + qkv_bias=cfg.qkv_bias, + mode=cfg.mode, + pool_first=cfg.pool_first, + expand_attn=cfg.expand_attn, + kernel_q=cfg.kernel_qkv, + kernel_kv=cfg.kernel_qkv, + stride_q=cfg.stride_q[i], + stride_kv=cfg.stride_kv[i], + has_cls_token=cfg.use_cls_token, + rel_pos_type=cfg.rel_pos_type, + residual_pooling=cfg.residual_pooling, + norm_layer=norm_layer, + drop_path=dpr[i], + ) + embed_dim = dim_out + feat_size = stage.feat_size + self.stages.append(stage) + + self.num_features = embed_dim + self.norm = norm_layer(embed_dim) + self.head = nn.Sequential(OrderedDict([ + ('drop', nn.Dropout(self.drop_rate)), + ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()) + ])) + + if self.pos_embed is not None: + trunc_normal_tf_(self.pos_embed, std=0.02) + if self.cls_token is not None: + trunc_normal_tf_(self.cls_token, std=0.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_tf_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {k for k, _ in self.named_parameters() + if any(n in k for n in ["pos_embed", "rel_pos_h", "rel_pos_w", "cls_token"])} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^patch_embed', # stem and embed + blocks=[(r'^stages\.(\d+)', None), (r'^norm', (99999,))] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + self.global_pool = global_pool + self.head = nn.Sequential(OrderedDict([ + ('drop', nn.Dropout(self.drop_rate)), + ('fc', nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity()) + ])) + + def forward_features(self, x): + x, feat_size = self.patch_embed(x) + B, N, C = x.shape + + if self.cls_token is not None: + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat((cls_tokens, x), dim=1) + + if self.pos_embed is not None: + x = x + self.pos_embed + + for stage in self.stages: + x, feat_size = stage(x, feat_size) + + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool: + if self.global_pool == 'avg': + x = x[:, self.num_prefix_tokens:].mean(1) + else: + x = x[:, 0] + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def checkpoint_filter_fn(state_dict, model): + if 'stages.0.blocks.0.norm1.weight' in state_dict: + return state_dict + + import re + if 'model_state' in state_dict: + state_dict = state_dict['model_state'] + + depths = getattr(model, 'depths', None) + expand_attn = getattr(model, 'expand_attn', True) + assert depths is not None, 'model requires depth attribute to remap checkpoints' + depth_map = {} + block_idx = 0 + for stage_idx, d in enumerate(depths): + 
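+ # Build a lookup from the checkpoint's flat block index to timm's (stage index,
+ # block index within stage) layout; the regex below uses it to rewrite 'blocks.N'
+ # keys into 'stages.S.blocks.B'.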
depth_map.update({i: (stage_idx, i - block_idx) for i in range(block_idx, block_idx + d)}) + block_idx += d + + out_dict = {} + for k, v in state_dict.items(): + k = re.sub( + r'blocks\.(\d+)', + lambda x: f'stages.{depth_map[int(x.group(1))][0]}.blocks.{depth_map[int(x.group(1))][1]}', + k) + + if expand_attn: + k = re.sub(r'stages\.(\d+).blocks\.(\d+).proj', f'stages.\\1.blocks.\\2.shortcut_proj_attn', k) + else: + k = re.sub(r'stages\.(\d+).blocks\.(\d+).proj', f'stages.\\1.blocks.\\2.shortcut_proj_mlp', k) + if 'head' in k: + k = k.replace('head.projection', 'head.fc') + out_dict[k] = v + + # for k, v in state_dict.items(): + # if model.pos_embed is not None and k == 'pos_embed' and v.shape[1] != model.pos_embed.shape[1]: + # # To resize pos embedding when using model at different size from pretrained weights + # v = resize_pos_embed( + # v, + # model.pos_embed, + # 0 if getattr(model, 'no_embed_class') else getattr(model, 'num_prefix_tokens', 1), + # model.patch_embed.grid_size + # ) + + return out_dict + + +def _create_mvitv2(variant, cfg_variant=None, pretrained=False, **kwargs): + return build_model_with_cfg( + MultiScaleVit, variant, pretrained, + model_cfg=model_cfgs[variant] if not cfg_variant else model_cfgs[cfg_variant], + pretrained_filter_fn=checkpoint_filter_fn, + feature_cfg=dict(flatten_sequential=True), + **kwargs) + + +@register_model +def mvitv2_tiny(pretrained=False, **kwargs): + return _create_mvitv2('mvitv2_tiny', pretrained=pretrained, **kwargs) + + +@register_model +def mvitv2_small(pretrained=False, **kwargs): + return _create_mvitv2('mvitv2_small', pretrained=pretrained, **kwargs) + + +@register_model +def mvitv2_base(pretrained=False, **kwargs): + return _create_mvitv2('mvitv2_base', pretrained=pretrained, **kwargs) + + +@register_model +def mvitv2_large(pretrained=False, **kwargs): + return _create_mvitv2('mvitv2_large', pretrained=pretrained, **kwargs) + + +# @register_model +# def mvitv2_base_in21k(pretrained=False, **kwargs): +# return _create_mvitv2('mvitv2_base_in21k', pretrained=pretrained, **kwargs) +# +# +# @register_model +# def mvitv2_large_in21k(pretrained=False, **kwargs): +# return _create_mvitv2('mvitv2_large_in21k', pretrained=pretrained, **kwargs) +# +# +# @register_model +# def mvitv2_huge_in21k(pretrained=False, **kwargs): +# return _create_mvitv2('mvitv2_huge_in21k', pretrained=pretrained, **kwargs) + + +@register_model +def mvitv2_small_cls(pretrained=False, **kwargs): + return _create_mvitv2('mvitv2_small_cls', pretrained=pretrained, **kwargs) diff --git a/custom_timm/models/nasnet.py b/custom_timm/models/nasnet.py new file mode 100644 index 0000000000000000000000000000000000000000..50db1a3d351db0e9caa2002e16b8003b561050f8 --- /dev/null +++ b/custom_timm/models/nasnet.py @@ -0,0 +1,588 @@ +""" NasNet-A (Large) + nasnetalarge implementation grabbed from Cadene's pretrained models + https://github.com/Cadene/pretrained-models.pytorch +""" +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .helpers import build_model_with_cfg +from .layers import ConvNormAct, create_conv2d, create_pool2d, create_classifier +from .registry import register_model + +__all__ = ['NASNetALarge'] + +default_cfgs = { + 'nasnetalarge': { + 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nasnetalarge-dc4a7b8b.pth', + 'input_size': (3, 331, 331), + 'pool_size': (11, 11), + 'crop_pct': 0.911, + 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), + 'std': (0.5, 0.5, 
0.5), + 'num_classes': 1000, + 'first_conv': 'conv0.conv', + 'classifier': 'last_linear', + 'label_offset': 1, # 1001 classes in pretrained weights + }, +} + + +class ActConvBn(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=''): + super(ActConvBn, self).__init__() + self.act = nn.ReLU() + self.conv = create_conv2d( + in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) + self.bn = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.1) + + def forward(self, x): + x = self.act(x) + x = self.conv(x) + x = self.bn(x) + return x + + +class SeparableConv2d(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride, padding=''): + super(SeparableConv2d, self).__init__() + self.depthwise_conv2d = create_conv2d( + in_channels, in_channels, kernel_size=kernel_size, + stride=stride, padding=padding, groups=in_channels) + self.pointwise_conv2d = create_conv2d( + in_channels, out_channels, kernel_size=1, padding=0) + + def forward(self, x): + x = self.depthwise_conv2d(x) + x = self.pointwise_conv2d(x) + return x + + +class BranchSeparables(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, pad_type='', stem_cell=False): + super(BranchSeparables, self).__init__() + middle_channels = out_channels if stem_cell else in_channels + self.act_1 = nn.ReLU() + self.separable_1 = SeparableConv2d( + in_channels, middle_channels, kernel_size, stride=stride, padding=pad_type) + self.bn_sep_1 = nn.BatchNorm2d(middle_channels, eps=0.001, momentum=0.1) + self.act_2 = nn.ReLU(inplace=True) + self.separable_2 = SeparableConv2d( + middle_channels, out_channels, kernel_size, stride=1, padding=pad_type) + self.bn_sep_2 = nn.BatchNorm2d(out_channels, eps=0.001, momentum=0.1) + + def forward(self, x): + x = self.act_1(x) + x = self.separable_1(x) + x = self.bn_sep_1(x) + x = self.act_2(x) + x = self.separable_2(x) + x = self.bn_sep_2(x) + return x + + +class CellStem0(nn.Module): + def __init__(self, stem_size, num_channels=42, pad_type=''): + super(CellStem0, self).__init__() + self.num_channels = num_channels + self.stem_size = stem_size + self.conv_1x1 = ActConvBn(self.stem_size, self.num_channels, 1, stride=1) + + self.comb_iter_0_left = BranchSeparables(self.num_channels, self.num_channels, 5, 2, pad_type) + self.comb_iter_0_right = BranchSeparables(self.stem_size, self.num_channels, 7, 2, pad_type, stem_cell=True) + + self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type) + self.comb_iter_1_right = BranchSeparables(self.stem_size, self.num_channels, 7, 2, pad_type, stem_cell=True) + + self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type) + self.comb_iter_2_right = BranchSeparables(self.stem_size, self.num_channels, 5, 2, pad_type, stem_cell=True) + + self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + + self.comb_iter_4_left = BranchSeparables(self.num_channels, self.num_channels, 3, 1, pad_type) + self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type) + + def forward(self, x): + x1 = self.conv_1x1(x) + + x_comb_iter_0_left = self.comb_iter_0_left(x1) + x_comb_iter_0_right = self.comb_iter_0_right(x) + x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right + + x_comb_iter_1_left = self.comb_iter_1_left(x1) + x_comb_iter_1_right = self.comb_iter_1_right(x) + x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right + + x_comb_iter_2_left = self.comb_iter_2_left(x1) + 
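+ # Each combination iteration sums a left and a right branch; here the avg-pooled
+ # 1x1 projection (left) is paired with a 5x5 separable-conv branch of the raw stem
+ # features (right).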
x_comb_iter_2_right = self.comb_iter_2_right(x) + x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right + + x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0) + x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1 + + x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0) + x_comb_iter_4_right = self.comb_iter_4_right(x1) + x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right + + x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) + return x_out + + +class CellStem1(nn.Module): + + def __init__(self, stem_size, num_channels, pad_type=''): + super(CellStem1, self).__init__() + self.num_channels = num_channels + self.stem_size = stem_size + self.conv_1x1 = ActConvBn(2 * self.num_channels, self.num_channels, 1, stride=1) + + self.act = nn.ReLU() + self.path_1 = nn.Sequential() + self.path_1.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)) + self.path_1.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels // 2, 1, stride=1, bias=False)) + + self.path_2 = nn.Sequential() + self.path_2.add_module('pad', nn.ZeroPad2d((-1, 1, -1, 1))) + self.path_2.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)) + self.path_2.add_module('conv', nn.Conv2d(self.stem_size, self.num_channels // 2, 1, stride=1, bias=False)) + + self.final_path_bn = nn.BatchNorm2d(self.num_channels, eps=0.001, momentum=0.1) + + self.comb_iter_0_left = BranchSeparables(self.num_channels, self.num_channels, 5, 2, pad_type) + self.comb_iter_0_right = BranchSeparables(self.num_channels, self.num_channels, 7, 2, pad_type) + + self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type) + self.comb_iter_1_right = BranchSeparables(self.num_channels, self.num_channels, 7, 2, pad_type) + + self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type) + self.comb_iter_2_right = BranchSeparables(self.num_channels, self.num_channels, 5, 2, pad_type) + + self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + + self.comb_iter_4_left = BranchSeparables(self.num_channels, self.num_channels, 3, 1, pad_type) + self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type) + + def forward(self, x_conv0, x_stem_0): + x_left = self.conv_1x1(x_stem_0) + + x_relu = self.act(x_conv0) + # path 1 + x_path1 = self.path_1(x_relu) + # path 2 + x_path2 = self.path_2(x_relu) + # final path + x_right = self.final_path_bn(torch.cat([x_path1, x_path2], 1)) + + x_comb_iter_0_left = self.comb_iter_0_left(x_left) + x_comb_iter_0_right = self.comb_iter_0_right(x_right) + x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right + + x_comb_iter_1_left = self.comb_iter_1_left(x_left) + x_comb_iter_1_right = self.comb_iter_1_right(x_right) + x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right + + x_comb_iter_2_left = self.comb_iter_2_left(x_left) + x_comb_iter_2_right = self.comb_iter_2_right(x_right) + x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right + + x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0) + x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1 + + x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0) + x_comb_iter_4_right = self.comb_iter_4_right(x_left) + x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right + + x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) + return x_out + + +class FirstCell(nn.Module): + + def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): + 
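+ # FirstCell sits right after a stem or reduction step: x_prev is still at the previous
+ # (higher) resolution, so it is downsampled by two stride-2 average-pool paths (path_2
+ # with a one-pixel spatial shift) whose 1x1 projections are concatenated and
+ # batch-normed, while x only goes through a 1x1 ActConvBn.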
super(FirstCell, self).__init__() + self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1) + + self.act = nn.ReLU() + self.path_1 = nn.Sequential() + self.path_1.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)) + self.path_1.add_module('conv', nn.Conv2d(in_chs_left, out_chs_left, 1, stride=1, bias=False)) + + self.path_2 = nn.Sequential() + self.path_2.add_module('pad', nn.ZeroPad2d((-1, 1, -1, 1))) + self.path_2.add_module('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)) + self.path_2.add_module('conv', nn.Conv2d(in_chs_left, out_chs_left, 1, stride=1, bias=False)) + + self.final_path_bn = nn.BatchNorm2d(out_chs_left * 2, eps=0.001, momentum=0.1) + + self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 1, pad_type) + self.comb_iter_0_right = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) + + self.comb_iter_1_left = BranchSeparables(out_chs_right, out_chs_right, 5, 1, pad_type) + self.comb_iter_1_right = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) + + self.comb_iter_2_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + + self.comb_iter_3_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + + self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) + + def forward(self, x, x_prev): + x_relu = self.act(x_prev) + x_path1 = self.path_1(x_relu) + x_path2 = self.path_2(x_relu) + x_left = self.final_path_bn(torch.cat([x_path1, x_path2], 1)) + x_right = self.conv_1x1(x) + + x_comb_iter_0_left = self.comb_iter_0_left(x_right) + x_comb_iter_0_right = self.comb_iter_0_right(x_left) + x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right + + x_comb_iter_1_left = self.comb_iter_1_left(x_left) + x_comb_iter_1_right = self.comb_iter_1_right(x_left) + x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right + + x_comb_iter_2_left = self.comb_iter_2_left(x_right) + x_comb_iter_2 = x_comb_iter_2_left + x_left + + x_comb_iter_3_left = self.comb_iter_3_left(x_left) + x_comb_iter_3_right = self.comb_iter_3_right(x_left) + x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right + + x_comb_iter_4_left = self.comb_iter_4_left(x_right) + x_comb_iter_4 = x_comb_iter_4_left + x_right + + x_out = torch.cat([x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) + return x_out + + +class NormalCell(nn.Module): + + def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): + super(NormalCell, self).__init__() + self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, 1, stride=1, padding=pad_type) + self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1, padding=pad_type) + + self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 1, pad_type) + self.comb_iter_0_right = BranchSeparables(out_chs_left, out_chs_left, 3, 1, pad_type) + + self.comb_iter_1_left = BranchSeparables(out_chs_left, out_chs_left, 5, 1, pad_type) + self.comb_iter_1_right = BranchSeparables(out_chs_left, out_chs_left, 3, 1, pad_type) + + self.comb_iter_2_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + + self.comb_iter_3_left = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + + self.comb_iter_4_left = 
BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) + + def forward(self, x, x_prev): + x_left = self.conv_prev_1x1(x_prev) + x_right = self.conv_1x1(x) + + x_comb_iter_0_left = self.comb_iter_0_left(x_right) + x_comb_iter_0_right = self.comb_iter_0_right(x_left) + x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right + + x_comb_iter_1_left = self.comb_iter_1_left(x_left) + x_comb_iter_1_right = self.comb_iter_1_right(x_left) + x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right + + x_comb_iter_2_left = self.comb_iter_2_left(x_right) + x_comb_iter_2 = x_comb_iter_2_left + x_left + + x_comb_iter_3_left = self.comb_iter_3_left(x_left) + x_comb_iter_3_right = self.comb_iter_3_right(x_left) + x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right + + x_comb_iter_4_left = self.comb_iter_4_left(x_right) + x_comb_iter_4 = x_comb_iter_4_left + x_right + + x_out = torch.cat([x_left, x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) + return x_out + + +class ReductionCell0(nn.Module): + + def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): + super(ReductionCell0, self).__init__() + self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, 1, stride=1, padding=pad_type) + self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1, padding=pad_type) + + self.comb_iter_0_left = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type) + self.comb_iter_0_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type) + + self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type) + self.comb_iter_1_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type) + + self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type) + self.comb_iter_2_right = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type) + + self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + + self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) + self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type) + + def forward(self, x, x_prev): + x_left = self.conv_prev_1x1(x_prev) + x_right = self.conv_1x1(x) + + x_comb_iter_0_left = self.comb_iter_0_left(x_right) + x_comb_iter_0_right = self.comb_iter_0_right(x_left) + x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right + + x_comb_iter_1_left = self.comb_iter_1_left(x_right) + x_comb_iter_1_right = self.comb_iter_1_right(x_left) + x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right + + x_comb_iter_2_left = self.comb_iter_2_left(x_right) + x_comb_iter_2_right = self.comb_iter_2_right(x_left) + x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right + + x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0) + x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1 + + x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0) + x_comb_iter_4_right = self.comb_iter_4_right(x_right) + x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right + + x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) + return x_out + + +class ReductionCell1(nn.Module): + + def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): + super(ReductionCell1, self).__init__() + self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, 1, stride=1, padding=pad_type) + self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, 1, stride=1, padding=pad_type) + + self.comb_iter_0_left = 
BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type) + self.comb_iter_0_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type) + + self.comb_iter_1_left = create_pool2d('max', 3, 2, padding=pad_type) + self.comb_iter_1_right = BranchSeparables(out_chs_right, out_chs_right, 7, 2, pad_type) + + self.comb_iter_2_left = create_pool2d('avg', 3, 2, count_include_pad=False, padding=pad_type) + self.comb_iter_2_right = BranchSeparables(out_chs_right, out_chs_right, 5, 2, pad_type) + + self.comb_iter_3_right = create_pool2d('avg', 3, 1, count_include_pad=False, padding=pad_type) + + self.comb_iter_4_left = BranchSeparables(out_chs_right, out_chs_right, 3, 1, pad_type) + self.comb_iter_4_right = create_pool2d('max', 3, 2, padding=pad_type) + + def forward(self, x, x_prev): + x_left = self.conv_prev_1x1(x_prev) + x_right = self.conv_1x1(x) + + x_comb_iter_0_left = self.comb_iter_0_left(x_right) + x_comb_iter_0_right = self.comb_iter_0_right(x_left) + x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right + + x_comb_iter_1_left = self.comb_iter_1_left(x_right) + x_comb_iter_1_right = self.comb_iter_1_right(x_left) + x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right + + x_comb_iter_2_left = self.comb_iter_2_left(x_right) + x_comb_iter_2_right = self.comb_iter_2_right(x_left) + x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right + + x_comb_iter_3_right = self.comb_iter_3_right(x_comb_iter_0) + x_comb_iter_3 = x_comb_iter_3_right + x_comb_iter_1 + + x_comb_iter_4_left = self.comb_iter_4_left(x_comb_iter_0) + x_comb_iter_4_right = self.comb_iter_4_right(x_right) + x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right + + x_out = torch.cat([x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) + return x_out + + +class NASNetALarge(nn.Module): + """NASNetALarge (6 @ 4032) """ + + def __init__( + self, num_classes=1000, in_chans=3, stem_size=96, channel_multiplier=2, + num_features=4032, output_stride=32, drop_rate=0., global_pool='avg', pad_type='same'): + super(NASNetALarge, self).__init__() + self.num_classes = num_classes + self.stem_size = stem_size + self.num_features = num_features + self.channel_multiplier = channel_multiplier + self.drop_rate = drop_rate + assert output_stride == 32 + + channels = self.num_features // 24 + # 24 is default value for the architecture + + self.conv0 = ConvNormAct( + in_channels=in_chans, out_channels=self.stem_size, kernel_size=3, padding=0, stride=2, + norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.1), apply_act=False) + + self.cell_stem_0 = CellStem0( + self.stem_size, num_channels=channels // (channel_multiplier ** 2), pad_type=pad_type) + self.cell_stem_1 = CellStem1( + self.stem_size, num_channels=channels // channel_multiplier, pad_type=pad_type) + + self.cell_0 = FirstCell( + in_chs_left=channels, out_chs_left=channels // 2, + in_chs_right=2 * channels, out_chs_right=channels, pad_type=pad_type) + self.cell_1 = NormalCell( + in_chs_left=2 * channels, out_chs_left=channels, + in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type) + self.cell_2 = NormalCell( + in_chs_left=6 * channels, out_chs_left=channels, + in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type) + self.cell_3 = NormalCell( + in_chs_left=6 * channels, out_chs_left=channels, + in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type) + self.cell_4 = NormalCell( + in_chs_left=6 * channels, out_chs_left=channels, + in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type) + 
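+ # cell_5 below completes the first stack of six normal cells at the base width;
+ # reduction_cell_0 then halves the spatial resolution and doubles the channel count
+ # for cells 6-11, and reduction_cell_1 doubles it again for cells 12-17.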
self.cell_5 = NormalCell( + in_chs_left=6 * channels, out_chs_left=channels, + in_chs_right=6 * channels, out_chs_right=channels, pad_type=pad_type) + + self.reduction_cell_0 = ReductionCell0( + in_chs_left=6 * channels, out_chs_left=2 * channels, + in_chs_right=6 * channels, out_chs_right=2 * channels, pad_type=pad_type) + self.cell_6 = FirstCell( + in_chs_left=6 * channels, out_chs_left=channels, + in_chs_right=8 * channels, out_chs_right=2 * channels, pad_type=pad_type) + self.cell_7 = NormalCell( + in_chs_left=8 * channels, out_chs_left=2 * channels, + in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) + self.cell_8 = NormalCell( + in_chs_left=12 * channels, out_chs_left=2 * channels, + in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) + self.cell_9 = NormalCell( + in_chs_left=12 * channels, out_chs_left=2 * channels, + in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) + self.cell_10 = NormalCell( + in_chs_left=12 * channels, out_chs_left=2 * channels, + in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) + self.cell_11 = NormalCell( + in_chs_left=12 * channels, out_chs_left=2 * channels, + in_chs_right=12 * channels, out_chs_right=2 * channels, pad_type=pad_type) + + self.reduction_cell_1 = ReductionCell1( + in_chs_left=12 * channels, out_chs_left=4 * channels, + in_chs_right=12 * channels, out_chs_right=4 * channels, pad_type=pad_type) + self.cell_12 = FirstCell( + in_chs_left=12 * channels, out_chs_left=2 * channels, + in_chs_right=16 * channels, out_chs_right=4 * channels, pad_type=pad_type) + self.cell_13 = NormalCell( + in_chs_left=16 * channels, out_chs_left=4 * channels, + in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) + self.cell_14 = NormalCell( + in_chs_left=24 * channels, out_chs_left=4 * channels, + in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) + self.cell_15 = NormalCell( + in_chs_left=24 * channels, out_chs_left=4 * channels, + in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) + self.cell_16 = NormalCell( + in_chs_left=24 * channels, out_chs_left=4 * channels, + in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) + self.cell_17 = NormalCell( + in_chs_left=24 * channels, out_chs_left=4 * channels, + in_chs_right=24 * channels, out_chs_right=4 * channels, pad_type=pad_type) + self.act = nn.ReLU(inplace=True) + self.feature_info = [ + dict(num_chs=96, reduction=2, module='conv0'), + dict(num_chs=168, reduction=4, module='cell_stem_1.conv_1x1.act'), + dict(num_chs=1008, reduction=8, module='reduction_cell_0.conv_1x1.act'), + dict(num_chs=2016, reduction=16, module='reduction_cell_1.conv_1x1.act'), + dict(num_chs=4032, reduction=32, module='act'), + ] + + self.global_pool, self.last_linear = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^conv0|cell_stem_[01]', + blocks=[ + (r'^cell_(\d+)', None), + (r'^reduction_cell_0', (6,)), + (r'^reduction_cell_1', (12,)), + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.last_linear + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.last_linear = create_classifier( + 
self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x_conv0 = self.conv0(x) + + x_stem_0 = self.cell_stem_0(x_conv0) + x_stem_1 = self.cell_stem_1(x_conv0, x_stem_0) + + x_cell_0 = self.cell_0(x_stem_1, x_stem_0) + x_cell_1 = self.cell_1(x_cell_0, x_stem_1) + x_cell_2 = self.cell_2(x_cell_1, x_cell_0) + x_cell_3 = self.cell_3(x_cell_2, x_cell_1) + x_cell_4 = self.cell_4(x_cell_3, x_cell_2) + x_cell_5 = self.cell_5(x_cell_4, x_cell_3) + + x_reduction_cell_0 = self.reduction_cell_0(x_cell_5, x_cell_4) + x_cell_6 = self.cell_6(x_reduction_cell_0, x_cell_4) + x_cell_7 = self.cell_7(x_cell_6, x_reduction_cell_0) + x_cell_8 = self.cell_8(x_cell_7, x_cell_6) + x_cell_9 = self.cell_9(x_cell_8, x_cell_7) + x_cell_10 = self.cell_10(x_cell_9, x_cell_8) + x_cell_11 = self.cell_11(x_cell_10, x_cell_9) + + x_reduction_cell_1 = self.reduction_cell_1(x_cell_11, x_cell_10) + x_cell_12 = self.cell_12(x_reduction_cell_1, x_cell_10) + x_cell_13 = self.cell_13(x_cell_12, x_reduction_cell_1) + x_cell_14 = self.cell_14(x_cell_13, x_cell_12) + x_cell_15 = self.cell_15(x_cell_14, x_cell_13) + x_cell_16 = self.cell_16(x_cell_15, x_cell_14) + x_cell_17 = self.cell_17(x_cell_16, x_cell_15) + x = self.act(x_cell_17) + return x + + def forward_head(self, x): + x = self.global_pool(x) + if self.drop_rate > 0: + x = F.dropout(x, self.drop_rate, training=self.training) + x = self.last_linear(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_nasnet(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + NASNetALarge, variant, pretrained, + feature_cfg=dict(feature_cls='hook', no_rewrite=True), # not possible to re-write this model + **kwargs) + + +@register_model +def nasnetalarge(pretrained=False, **kwargs): + """NASNet-A large model architecture. + """ + model_kwargs = dict(pad_type='same', **kwargs) + return _create_nasnet('nasnetalarge', pretrained, **model_kwargs) diff --git a/custom_timm/models/nest.py b/custom_timm/models/nest.py new file mode 100644 index 0000000000000000000000000000000000000000..f626a2e61b5b6137170f42e7b8bf8f1f62d7e48f --- /dev/null +++ b/custom_timm/models/nest.py @@ -0,0 +1,486 @@ +""" Nested Transformer (NesT) in PyTorch + +A PyTorch implement of Aggregating Nested Transformers as described in: + +'Aggregating Nested Transformers' + - https://arxiv.org/abs/2105.12723 + +The official Jax code is released and available at https://github.com/google-research/nested-transformer. 
The weights +have been converted with convert/convert_nest_flax.py + +Acknowledgments: +* The paper authors for sharing their research, code, and model weights +* Ross Wightman's existing code off which I based this + +Copyright 2021 Alexander Soare +""" + +import collections.abc +import logging +import math +from functools import partial + +import torch +import torch.nn.functional as F +from torch import nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .fx_features import register_notrace_function +from .helpers import build_model_with_cfg, named_apply, checkpoint_seq +from .layers import PatchEmbed, Mlp, DropPath, create_classifier, trunc_normal_ +from .layers import _assert +from .layers import create_conv2d, create_pool2d, to_ntuple +from .registry import register_model + +_logger = logging.getLogger(__name__) + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': [14, 14], + 'crop_pct': .875, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # (weights from official Google JAX impl) + 'nest_base': _cfg(), + 'nest_small': _cfg(), + 'nest_tiny': _cfg(), + 'jx_nest_base': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/jx_nest_base-8bc41011.pth'), + 'jx_nest_small': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/jx_nest_small-422eaded.pth'), + 'jx_nest_tiny': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/jx_nest_tiny-e3428fb9.pth'), +} + + +class Attention(nn.Module): + """ + This is much like `.vision_transformer.Attention` but uses *localised* self attention by accepting an input with + an extra "image block" dim + """ + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, 3*dim, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + """ + x is shape: B (batch_size), T (image blocks), N (seq length per image block), C (embed dim) + """ + B, T, N, C = x.shape + # result of next line is (qkv, B, num (H)eads, T, N, (C')hannels per head) + qkv = self.qkv(x).reshape(B, T, N, 3, self.num_heads, C // self.num_heads).permute(3, 0, 4, 1, 2, 5) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale # (B, H, T, N, N) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + # (B, H, T, N, C'), permute -> (B, T, N, C', H) + x = (attn @ v).permute(0, 2, 3, 4, 1).reshape(B, T, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x # (B, T, N, C) + + +class TransformerLayer(nn.Module): + """ + This is much like `.vision_transformer.Block` but: + - Called TransformerLayer here to allow for "block" as defined in the paper ("non-overlapping image blocks") + - Uses modified Attention layer that handles the "block" dimension + """ + def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + 
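+ # Standard pre-norm transformer layer, except that Attention operates on (B, T, N, C)
+ # inputs, i.e. self-attention is computed independently within each of the T
+ # non-overlapping image blocks; drop_path applies stochastic depth to both residual
+ # branches.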
self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + y = self.norm1(x) + x = x + self.drop_path(self.attn(y)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class ConvPool(nn.Module): + def __init__(self, in_channels, out_channels, norm_layer, pad_type=''): + super().__init__() + self.conv = create_conv2d(in_channels, out_channels, kernel_size=3, padding=pad_type, bias=True) + self.norm = norm_layer(out_channels) + self.pool = create_pool2d('max', kernel_size=3, stride=2, padding=pad_type) + + def forward(self, x): + """ + x is expected to have shape (B, C, H, W) + """ + _assert(x.shape[-2] % 2 == 0, 'BlockAggregation requires even input spatial dims') + _assert(x.shape[-1] % 2 == 0, 'BlockAggregation requires even input spatial dims') + x = self.conv(x) + # Layer norm done over channel dim only + x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + x = self.pool(x) + return x # (B, C, H//2, W//2) + + +def blockify(x, block_size: int): + """image to blocks + Args: + x (Tensor): with shape (B, H, W, C) + block_size (int): edge length of a single square block in units of H, W + """ + B, H, W, C = x.shape + _assert(H % block_size == 0, '`block_size` must divide input height evenly') + _assert(W % block_size == 0, '`block_size` must divide input width evenly') + grid_height = H // block_size + grid_width = W // block_size + x = x.reshape(B, grid_height, block_size, grid_width, block_size, C) + x = x.transpose(2, 3).reshape(B, grid_height * grid_width, -1, C) + return x # (B, T, N, C) + + +@register_notrace_function # reason: int receives Proxy +def deblockify(x, block_size: int): + """blocks to image + Args: + x (Tensor): with shape (B, T, N, C) where T is number of blocks and N is sequence size per block + block_size (int): edge length of a single square block in units of desired H, W + """ + B, T, _, C = x.shape + grid_size = int(math.sqrt(T)) + height = width = grid_size * block_size + x = x.reshape(B, grid_size, grid_size, block_size, block_size, C) + x = x.transpose(2, 3).reshape(B, height, width, C) + return x # (B, H, W, C) + + +class NestLevel(nn.Module): + """ Single hierarchical level of a Nested Transformer + """ + def __init__( + self, num_blocks, block_size, seq_length, num_heads, depth, embed_dim, prev_embed_dim=None, + mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rates=[], + norm_layer=None, act_layer=None, pad_type=''): + super().__init__() + self.block_size = block_size + self.grad_checkpointing = False + + self.pos_embed = nn.Parameter(torch.zeros(1, num_blocks, seq_length, embed_dim)) + + if prev_embed_dim is not None: + self.pool = ConvPool(prev_embed_dim, embed_dim, norm_layer=norm_layer, pad_type=pad_type) + else: + self.pool = nn.Identity() + + # Transformer encoder + if len(drop_path_rates): + assert len(drop_path_rates) == depth, 'Must provide as many drop path rates as there are transformer layers' + self.transformer_encoder = nn.Sequential(*[ + TransformerLayer( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=drop_path_rates[i], + norm_layer=norm_layer, act_layer=act_layer) + for i in range(depth)]) + + def 
forward(self, x): + """ + expects x as (B, C, H, W) + """ + x = self.pool(x) + x = x.permute(0, 2, 3, 1) # (B, H', W', C), switch to channels last for transformer + x = blockify(x, self.block_size) # (B, T, N, C') + x = x + self.pos_embed + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.transformer_encoder, x) + else: + x = self.transformer_encoder(x) # (B, T, N, C') + x = deblockify(x, self.block_size) # (B, H', W', C') + # Channel-first for block aggregation, and generally to replicate convnet feature map at each stage + return x.permute(0, 3, 1, 2) # (B, C, H', W') + + +class Nest(nn.Module): + """ Nested Transformer (NesT) + + A PyTorch impl of : `Aggregating Nested Transformers` + - https://arxiv.org/abs/2105.12723 + """ + + def __init__( + self, img_size=224, in_chans=3, patch_size=4, num_levels=3, embed_dims=(128, 256, 512), + num_heads=(4, 8, 16), depths=(2, 2, 20), num_classes=1000, mlp_ratio=4., qkv_bias=True, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.5, norm_layer=None, act_layer=None, + pad_type='', weight_init='', global_pool='avg' + ): + """ + Args: + img_size (int, tuple): input image size + in_chans (int): number of input channels + patch_size (int): patch size + num_levels (int): number of block hierarchies (T_d in the paper) + embed_dims (int, tuple): embedding dimensions of each level + num_heads (int, tuple): number of attention heads for each level + depths (int, tuple): number of transformer layers for each level + num_classes (int): number of classes for classification head + mlp_ratio (int): ratio of mlp hidden dim to embedding dim for MLP of transformer layers + qkv_bias (bool): enable bias for qkv if True + drop_rate (float): dropout rate for MLP of transformer layers, MSA final projection layer, and classifier + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + norm_layer: (nn.Module): normalization layer for transformer layers + act_layer: (nn.Module): activation layer in MLP of transformer layers + pad_type: str: Type of padding to use '' for PyTorch symmetric, 'same' for TF SAME + weight_init: (str): weight init scheme + global_pool: (str): type of pooling operation to apply to final feature map + + Notes: + - Default values follow NesT-B from the original Jax code. + - `embed_dims`, `num_heads`, `depths` should be ints or tuples with length `num_levels`. + - For those following the paper, Table A1 may have errors! 
+ - https://github.com/google-research/nested-transformer/issues/2 + """ + super().__init__() + + for param_name in ['embed_dims', 'num_heads', 'depths']: + param_value = locals()[param_name] + if isinstance(param_value, collections.abc.Sequence): + assert len(param_value) == num_levels, f'Require `len({param_name}) == num_levels`' + + embed_dims = to_ntuple(num_levels)(embed_dims) + num_heads = to_ntuple(num_levels)(num_heads) + depths = to_ntuple(num_levels)(depths) + self.num_classes = num_classes + self.num_features = embed_dims[-1] + self.feature_info = [] + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + self.drop_rate = drop_rate + self.num_levels = num_levels + if isinstance(img_size, collections.abc.Sequence): + assert img_size[0] == img_size[1], 'Model only handles square inputs' + img_size = img_size[0] + assert img_size % patch_size == 0, '`patch_size` must divide `img_size` evenly' + self.patch_size = patch_size + + # Number of blocks at each level + self.num_blocks = (4 ** torch.arange(num_levels)).flip(0).tolist() + assert (img_size // patch_size) % math.sqrt(self.num_blocks[0]) == 0, \ + 'First level blocks don\'t fit evenly. Check `img_size`, `patch_size`, and `num_levels`' + + # Block edge size in units of patches + # Hint: (img_size // patch_size) gives number of patches along edge of image. sqrt(self.num_blocks[0]) is the + # number of blocks along edge of image + self.block_size = int((img_size // patch_size) // math.sqrt(self.num_blocks[0])) + + # Patch embedding + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dims[0], flatten=False) + self.num_patches = self.patch_embed.num_patches + self.seq_length = self.num_patches // self.num_blocks[0] + + # Build up each hierarchical level + levels = [] + dp_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)] + prev_dim = None + curr_stride = 4 + for i in range(len(self.num_blocks)): + dim = embed_dims[i] + levels.append(NestLevel( + self.num_blocks[i], self.block_size, self.seq_length, num_heads[i], depths[i], dim, prev_dim, + mlp_ratio, qkv_bias, drop_rate, attn_drop_rate, dp_rates[i], norm_layer, act_layer, pad_type=pad_type)) + self.feature_info += [dict(num_chs=dim, reduction=curr_stride, module=f'levels.{i}')] + prev_dim = dim + curr_stride *= 2 + self.levels = nn.Sequential(*levels) + + # Final normalization layer + self.norm = norm_layer(embed_dims[-1]) + + # Classifier + self.global_pool, self.head = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + self.init_weights(weight_init) + + @torch.jit.ignore + def init_weights(self, mode=''): + assert mode in ('nlhb', '') + head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0. 
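+ # 'nlhb' (negative log head bias): starting the classifier bias at -log(num_classes) makes the initial class probabilities roughly uniform.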
+ for level in self.levels: + trunc_normal_(level.pos_embed, std=.02, a=-2, b=2) + named_apply(partial(_init_nest_weights, head_bias=head_bias), self) + + @torch.jit.ignore + def no_weight_decay(self): + return {f'level.{i}.pos_embed' for i in range(len(self.levels))} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^patch_embed', # stem and embed + blocks=[ + (r'^levels\.(\d+)' if coarse else r'^levels\.(\d+)\.transformer_encoder\.(\d+)', None), + (r'^levels\.(\d+)\.(?:pool|pos_embed)', (0,)), + (r'^norm', (99999,)) + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for l in self.levels: + l.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.head = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.patch_embed(x) + x = self.levels(x) + # Layer norm done over channel dim only (to NHWC and back) + x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _init_nest_weights(module: nn.Module, name: str = '', head_bias: float = 0.): + """ NesT weight initialization + Can replicate Jax implementation. Otherwise follows vision_transformer.py + """ + if isinstance(module, nn.Linear): + if name.startswith('head'): + trunc_normal_(module.weight, std=.02, a=-2, b=2) + nn.init.constant_(module.bias, head_bias) + else: + trunc_normal_(module.weight, std=.02, a=-2, b=2) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Conv2d): + trunc_normal_(module.weight, std=.02, a=-2, b=2) + if module.bias is not None: + nn.init.zeros_(module.bias) + + +def resize_pos_embed(posemb, posemb_new): + """ + Rescale the grid of position embeddings when loading from state_dict + Expected shape of position embeddings is (1, T, N, C), and considers only square images + """ + _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape) + seq_length_old = posemb.shape[2] + num_blocks_new, seq_length_new = posemb_new.shape[1:3] + size_new = int(math.sqrt(num_blocks_new*seq_length_new)) + # First change to (1, C, H, W) + posemb = deblockify(posemb, int(math.sqrt(seq_length_old))).permute(0, 3, 1, 2) + posemb = F.interpolate(posemb, size=[size_new, size_new], mode='bicubic', align_corners=False) + # Now change to new (1, T, N, C) + posemb = blockify(posemb.permute(0, 2, 3, 1), int(math.sqrt(seq_length_new))) + return posemb + + +def checkpoint_filter_fn(state_dict, model): + """ resize positional embeddings of pretrained weights """ + pos_embed_keys = [k for k in state_dict.keys() if k.startswith('pos_embed_')] + for k in pos_embed_keys: + if state_dict[k].shape != getattr(model, k).shape: + state_dict[k] = resize_pos_embed(state_dict[k], getattr(model, k)) + return state_dict + + +def _create_nest(variant, pretrained=False, **kwargs): + model = build_model_with_cfg( + Nest, variant, pretrained, + feature_cfg=dict(out_indices=(0, 1, 2), flatten_sequential=True), + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) 
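+ # pretrained_filter_fn (checkpoint_filter_fn above) resizes stored (1, T, N, C) position embeddings via resize_pos_embed when the checkpoint grid does not match the model.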
+ + return model + + +@register_model +def nest_base(pretrained=False, **kwargs): + """ Nest-B @ 224x224 + """ + model_kwargs = dict( + embed_dims=(128, 256, 512), num_heads=(4, 8, 16), depths=(2, 2, 20), **kwargs) + model = _create_nest('nest_base', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def nest_small(pretrained=False, **kwargs): + """ Nest-S @ 224x224 + """ + model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 20), **kwargs) + model = _create_nest('nest_small', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def nest_tiny(pretrained=False, **kwargs): + """ Nest-T @ 224x224 + """ + model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 8), **kwargs) + model = _create_nest('nest_tiny', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def jx_nest_base(pretrained=False, **kwargs): + """ Nest-B @ 224x224, Pretrained weights converted from official Jax impl. + """ + kwargs['pad_type'] = 'same' + model_kwargs = dict(embed_dims=(128, 256, 512), num_heads=(4, 8, 16), depths=(2, 2, 20), **kwargs) + model = _create_nest('jx_nest_base', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def jx_nest_small(pretrained=False, **kwargs): + """ Nest-S @ 224x224, Pretrained weights converted from official Jax impl. + """ + kwargs['pad_type'] = 'same' + model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 20), **kwargs) + model = _create_nest('jx_nest_small', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def jx_nest_tiny(pretrained=False, **kwargs): + """ Nest-T @ 224x224, Pretrained weights converted from official Jax impl. + """ + kwargs['pad_type'] = 'same' + model_kwargs = dict(embed_dims=(96, 192, 384), num_heads=(3, 6, 12), depths=(2, 2, 8), **kwargs) + model = _create_nest('jx_nest_tiny', pretrained=pretrained, **model_kwargs) + return model diff --git a/custom_timm/models/nfnet.py b/custom_timm/models/nfnet.py new file mode 100644 index 0000000000000000000000000000000000000000..e65151f4b9108ba19143cba01ac282b4c3f3c973 --- /dev/null +++ b/custom_timm/models/nfnet.py @@ -0,0 +1,893 @@ +""" Normalization Free Nets. NFNet, NF-RegNet, NF-ResNet (pre-activation) Models + +Paper: `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + +Paper: `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + +Official Deepmind JAX code: https://github.com/deepmind/deepmind-research/tree/master/nfnets + +Status: +* These models are a work in progress, experiments ongoing. +* Pretrained weights for two models so far, more to come. +* Model details updated to closer match official JAX code now that it's released +* NF-ResNet, NF-RegNet-B, and NFNet-F models supported + +Hacked together by / copyright Ross Wightman, 2021. 
+""" +import math +from dataclasses import dataclass, field +from collections import OrderedDict +from typing import Tuple, Optional +from functools import partial + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .fx_features import register_notrace_module +from .helpers import build_model_with_cfg, checkpoint_seq +from .registry import register_model +from .layers import ClassifierHead, DropPath, AvgPool2dSame, ScaledStdConv2d, ScaledStdConv2dSame,\ + get_act_layer, get_act_fn, get_attn, make_divisible + + +def _dcfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.9, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv1', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = dict( + dm_nfnet_f0=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f0-604f9c3a.pth', + pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256), crop_pct=.9), + dm_nfnet_f1=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f1-fc540f82.pth', + pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 320, 320), crop_pct=0.91), + dm_nfnet_f2=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f2-89875923.pth', + pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 352, 352), crop_pct=0.92), + dm_nfnet_f3=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f3-d74ab3aa.pth', + pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 416, 416), crop_pct=0.94), + dm_nfnet_f4=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f4-0ac5b10b.pth', + pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 512, 512), crop_pct=0.951), + dm_nfnet_f5=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f5-ecb20ab1.pth', + pool_size=(13, 13), input_size=(3, 416, 416), test_input_size=(3, 544, 544), crop_pct=0.954), + dm_nfnet_f6=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-dnf-weights/dm_nfnet_f6-e0f12116.pth', + pool_size=(14, 14), input_size=(3, 448, 448), test_input_size=(3, 576, 576), crop_pct=0.956), + + nfnet_f0=_dcfg( + url='', pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256)), + nfnet_f1=_dcfg( + url='', pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 320, 320)), + nfnet_f2=_dcfg( + url='', pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 352, 352)), + nfnet_f3=_dcfg( + url='', pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 416, 416)), + nfnet_f4=_dcfg( + url='', pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 512, 512)), + nfnet_f5=_dcfg( + url='', pool_size=(13, 13), input_size=(3, 416, 416), test_input_size=(3, 544, 544)), + nfnet_f6=_dcfg( + url='', pool_size=(14, 14), input_size=(3, 448, 448), test_input_size=(3, 576, 576)), + nfnet_f7=_dcfg( + url='', pool_size=(15, 15), input_size=(3, 480, 480), test_input_size=(3, 608, 608)), + + nfnet_l0=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nfnet_l0_ra2-45c6688d.pth', + pool_size=(7, 
7), input_size=(3, 224, 224), test_input_size=(3, 288, 288), crop_pct=1.0), + eca_nfnet_l0=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecanfnet_l0_ra2-e3e9ac50.pth', + hf_hub_id='timm/eca_nfnet_l0', + pool_size=(7, 7), input_size=(3, 224, 224), test_input_size=(3, 288, 288), crop_pct=1.0), + eca_nfnet_l1=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecanfnet_l1_ra2-7dce93cd.pth', + pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 320, 320), crop_pct=1.0), + eca_nfnet_l2=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecanfnet_l2_ra3-da781a61.pth', + pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 384, 384), crop_pct=1.0), + eca_nfnet_l3=_dcfg( + url='', + pool_size=(11, 11), input_size=(3, 352, 352), test_input_size=(3, 448, 448), crop_pct=1.0), + + nf_regnet_b0=_dcfg( + url='', pool_size=(6, 6), input_size=(3, 192, 192), test_input_size=(3, 256, 256), first_conv='stem.conv'), + nf_regnet_b1=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nf_regnet_b1_256_ra2-ad85cfef.pth', + pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 288, 288), first_conv='stem.conv'), # NOT to paper spec + nf_regnet_b2=_dcfg( + url='', pool_size=(8, 8), input_size=(3, 240, 240), test_input_size=(3, 272, 272), first_conv='stem.conv'), + nf_regnet_b3=_dcfg( + url='', pool_size=(9, 9), input_size=(3, 288, 288), test_input_size=(3, 320, 320), first_conv='stem.conv'), + nf_regnet_b4=_dcfg( + url='', pool_size=(10, 10), input_size=(3, 320, 320), test_input_size=(3, 384, 384), first_conv='stem.conv'), + nf_regnet_b5=_dcfg( + url='', pool_size=(12, 12), input_size=(3, 384, 384), test_input_size=(3, 456, 456), first_conv='stem.conv'), + + nf_resnet26=_dcfg(url='', first_conv='stem.conv'), + nf_resnet50=_dcfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/nf_resnet50_ra2-9f236009.pth', + pool_size=(8, 8), input_size=(3, 256, 256), test_input_size=(3, 288, 288), crop_pct=0.94, first_conv='stem.conv'), + nf_resnet101=_dcfg(url='', first_conv='stem.conv'), + + nf_seresnet26=_dcfg(url='', first_conv='stem.conv'), + nf_seresnet50=_dcfg(url='', first_conv='stem.conv'), + nf_seresnet101=_dcfg(url='', first_conv='stem.conv'), + + nf_ecaresnet26=_dcfg(url='', first_conv='stem.conv'), + nf_ecaresnet50=_dcfg(url='', first_conv='stem.conv'), + nf_ecaresnet101=_dcfg(url='', first_conv='stem.conv'), +) + + +@dataclass +class NfCfg: + depths: Tuple[int, int, int, int] + channels: Tuple[int, int, int, int] + alpha: float = 0.2 + stem_type: str = '3x3' + stem_chs: Optional[int] = None + group_size: Optional[int] = None + attn_layer: Optional[str] = None + attn_kwargs: dict = None + attn_gain: float = 2.0 # NF correction gain to apply if attn layer is used + width_factor: float = 1.0 + bottle_ratio: float = 0.5 + num_features: int = 0 # num out_channels for final conv, no final_conv if 0 + ch_div: int = 8 # round channels % 8 == 0 to keep tensor-core use optimal + reg: bool = False # enables EfficientNet-like options used in RegNet variants, expand from in_chs, se in middle + extra_conv: bool = False # extra 3x3 bottleneck convolution for NFNet models + gamma_in_act: bool = False + same_padding: bool = False + std_conv_eps: float = 1e-5 + skipinit: bool = False # disabled by default, non-trivial performance impact + zero_init_fc: bool = False + act_layer: str = 
'silu' + + +def _nfres_cfg( + depths, channels=(256, 512, 1024, 2048), group_size=None, act_layer='relu', attn_layer=None, attn_kwargs=None): + attn_kwargs = attn_kwargs or {} + cfg = NfCfg( + depths=depths, channels=channels, stem_type='7x7_pool', stem_chs=64, bottle_ratio=0.25, + group_size=group_size, act_layer=act_layer, attn_layer=attn_layer, attn_kwargs=attn_kwargs) + return cfg + + +def _nfreg_cfg(depths, channels=(48, 104, 208, 440)): + num_features = 1280 * channels[-1] // 440 + attn_kwargs = dict(rd_ratio=0.5) + cfg = NfCfg( + depths=depths, channels=channels, stem_type='3x3', group_size=8, width_factor=0.75, bottle_ratio=2.25, + num_features=num_features, reg=True, attn_layer='se', attn_kwargs=attn_kwargs) + return cfg + + +def _nfnet_cfg( + depths, channels=(256, 512, 1536, 1536), group_size=128, bottle_ratio=0.5, feat_mult=2., + act_layer='gelu', attn_layer='se', attn_kwargs=None): + num_features = int(channels[-1] * feat_mult) + attn_kwargs = attn_kwargs if attn_kwargs is not None else dict(rd_ratio=0.5) + cfg = NfCfg( + depths=depths, channels=channels, stem_type='deep_quad', stem_chs=128, group_size=group_size, + bottle_ratio=bottle_ratio, extra_conv=True, num_features=num_features, act_layer=act_layer, + attn_layer=attn_layer, attn_kwargs=attn_kwargs) + return cfg + + +def _dm_nfnet_cfg(depths, channels=(256, 512, 1536, 1536), act_layer='gelu', skipinit=True): + cfg = NfCfg( + depths=depths, channels=channels, stem_type='deep_quad', stem_chs=128, group_size=128, + bottle_ratio=0.5, extra_conv=True, gamma_in_act=True, same_padding=True, skipinit=skipinit, + num_features=int(channels[-1] * 2.0), act_layer=act_layer, attn_layer='se', attn_kwargs=dict(rd_ratio=0.5)) + return cfg + + +model_cfgs = dict( + # NFNet-F models w/ GELU compatible with DeepMind weights + dm_nfnet_f0=_dm_nfnet_cfg(depths=(1, 2, 6, 3)), + dm_nfnet_f1=_dm_nfnet_cfg(depths=(2, 4, 12, 6)), + dm_nfnet_f2=_dm_nfnet_cfg(depths=(3, 6, 18, 9)), + dm_nfnet_f3=_dm_nfnet_cfg(depths=(4, 8, 24, 12)), + dm_nfnet_f4=_dm_nfnet_cfg(depths=(5, 10, 30, 15)), + dm_nfnet_f5=_dm_nfnet_cfg(depths=(6, 12, 36, 18)), + dm_nfnet_f6=_dm_nfnet_cfg(depths=(7, 14, 42, 21)), + + # NFNet-F models w/ GELU + nfnet_f0=_nfnet_cfg(depths=(1, 2, 6, 3)), + nfnet_f1=_nfnet_cfg(depths=(2, 4, 12, 6)), + nfnet_f2=_nfnet_cfg(depths=(3, 6, 18, 9)), + nfnet_f3=_nfnet_cfg(depths=(4, 8, 24, 12)), + nfnet_f4=_nfnet_cfg(depths=(5, 10, 30, 15)), + nfnet_f5=_nfnet_cfg(depths=(6, 12, 36, 18)), + nfnet_f6=_nfnet_cfg(depths=(7, 14, 42, 21)), + nfnet_f7=_nfnet_cfg(depths=(8, 16, 48, 24)), + + # Experimental 'light' versions of NFNet-F that are little leaner + nfnet_l0=_nfnet_cfg( + depths=(1, 2, 6, 3), feat_mult=1.5, group_size=64, bottle_ratio=0.25, + attn_kwargs=dict(rd_ratio=0.25, rd_divisor=8), act_layer='silu'), + eca_nfnet_l0=_nfnet_cfg( + depths=(1, 2, 6, 3), feat_mult=1.5, group_size=64, bottle_ratio=0.25, + attn_layer='eca', attn_kwargs=dict(), act_layer='silu'), + eca_nfnet_l1=_nfnet_cfg( + depths=(2, 4, 12, 6), feat_mult=2, group_size=64, bottle_ratio=0.25, + attn_layer='eca', attn_kwargs=dict(), act_layer='silu'), + eca_nfnet_l2=_nfnet_cfg( + depths=(3, 6, 18, 9), feat_mult=2, group_size=64, bottle_ratio=0.25, + attn_layer='eca', attn_kwargs=dict(), act_layer='silu'), + eca_nfnet_l3=_nfnet_cfg( + depths=(4, 8, 24, 12), feat_mult=2, group_size=64, bottle_ratio=0.25, + attn_layer='eca', attn_kwargs=dict(), act_layer='silu'), + + # EffNet influenced RegNet defs. + # NOTE: These aren't quite the official ver, ch_div=1 must be set for exact ch counts. 
I round to ch_div=8. + nf_regnet_b0=_nfreg_cfg(depths=(1, 3, 6, 6)), + nf_regnet_b1=_nfreg_cfg(depths=(2, 4, 7, 7)), + nf_regnet_b2=_nfreg_cfg(depths=(2, 4, 8, 8), channels=(56, 112, 232, 488)), + nf_regnet_b3=_nfreg_cfg(depths=(2, 5, 9, 9), channels=(56, 128, 248, 528)), + nf_regnet_b4=_nfreg_cfg(depths=(2, 6, 11, 11), channels=(64, 144, 288, 616)), + nf_regnet_b5=_nfreg_cfg(depths=(3, 7, 14, 14), channels=(80, 168, 336, 704)), + # FIXME add B6-B8 + + # ResNet (preact, D style deep stem/avg down) defs + nf_resnet26=_nfres_cfg(depths=(2, 2, 2, 2)), + nf_resnet50=_nfres_cfg(depths=(3, 4, 6, 3)), + nf_resnet101=_nfres_cfg(depths=(3, 4, 23, 3)), + + nf_seresnet26=_nfres_cfg(depths=(2, 2, 2, 2), attn_layer='se', attn_kwargs=dict(rd_ratio=1/16)), + nf_seresnet50=_nfres_cfg(depths=(3, 4, 6, 3), attn_layer='se', attn_kwargs=dict(rd_ratio=1/16)), + nf_seresnet101=_nfres_cfg(depths=(3, 4, 23, 3), attn_layer='se', attn_kwargs=dict(rd_ratio=1/16)), + + nf_ecaresnet26=_nfres_cfg(depths=(2, 2, 2, 2), attn_layer='eca', attn_kwargs=dict()), + nf_ecaresnet50=_nfres_cfg(depths=(3, 4, 6, 3), attn_layer='eca', attn_kwargs=dict()), + nf_ecaresnet101=_nfres_cfg(depths=(3, 4, 23, 3), attn_layer='eca', attn_kwargs=dict()), + +) + + +class GammaAct(nn.Module): + def __init__(self, act_type='relu', gamma: float = 1.0, inplace=False): + super().__init__() + self.act_fn = get_act_fn(act_type) + self.gamma = gamma + self.inplace = inplace + + def forward(self, x): + return self.act_fn(x, inplace=self.inplace).mul_(self.gamma) + + +def act_with_gamma(act_type, gamma: float = 1.): + def _create(inplace=False): + return GammaAct(act_type, gamma=gamma, inplace=inplace) + return _create + + +class DownsampleAvg(nn.Module): + def __init__( + self, in_chs, out_chs, stride=1, dilation=1, first_dilation=None, conv_layer=ScaledStdConv2d): + """ AvgPool Downsampling as in 'D' ResNet variants. Support for dilation.""" + super(DownsampleAvg, self).__init__() + avg_stride = stride if dilation == 1 else 1 + if stride > 1 or dilation > 1: + avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d + self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) + else: + self.pool = nn.Identity() + self.conv = conv_layer(in_chs, out_chs, 1, stride=1) + + def forward(self, x): + return self.conv(self.pool(x)) + + +@register_notrace_module # reason: mul_ causes FX to drop a relevant node. https://github.com/pytorch/pytorch/issues/68301 +class NormFreeBlock(nn.Module): + """Normalization-Free pre-activation block. 
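+ Residual form (see forward below): out = shortcut + alpha * f(act(x) * beta), where beta is set to 1 / expected input std by NormFreeNet and alpha controls how quickly variance grows across blocks.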
+ """ + + def __init__( + self, in_chs, out_chs=None, stride=1, dilation=1, first_dilation=None, + alpha=1.0, beta=1.0, bottle_ratio=0.25, group_size=None, ch_div=1, reg=True, extra_conv=False, + skipinit=False, attn_layer=None, attn_gain=2.0, act_layer=None, conv_layer=None, drop_path_rate=0.): + super().__init__() + first_dilation = first_dilation or dilation + out_chs = out_chs or in_chs + # RegNet variants scale bottleneck from in_chs, otherwise scale from out_chs like ResNet + mid_chs = make_divisible(in_chs * bottle_ratio if reg else out_chs * bottle_ratio, ch_div) + groups = 1 if not group_size else mid_chs // group_size + if group_size and group_size % ch_div == 0: + mid_chs = group_size * groups # correct mid_chs if group_size divisible by ch_div, otherwise error + self.alpha = alpha + self.beta = beta + self.attn_gain = attn_gain + + if in_chs != out_chs or stride != 1 or dilation != first_dilation: + self.downsample = DownsampleAvg( + in_chs, out_chs, stride=stride, dilation=dilation, first_dilation=first_dilation, conv_layer=conv_layer) + else: + self.downsample = None + + self.act1 = act_layer() + self.conv1 = conv_layer(in_chs, mid_chs, 1) + self.act2 = act_layer(inplace=True) + self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) + if extra_conv: + self.act2b = act_layer(inplace=True) + self.conv2b = conv_layer(mid_chs, mid_chs, 3, stride=1, dilation=dilation, groups=groups) + else: + self.act2b = None + self.conv2b = None + if reg and attn_layer is not None: + self.attn = attn_layer(mid_chs) # RegNet blocks apply attn btw conv2 & 3 + else: + self.attn = None + self.act3 = act_layer() + self.conv3 = conv_layer(mid_chs, out_chs, 1, gain_init=1. if skipinit else 0.) + if not reg and attn_layer is not None: + self.attn_last = attn_layer(out_chs) # ResNet blocks apply attn after conv3 + else: + self.attn_last = None + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.skipinit_gain = nn.Parameter(torch.tensor(0.)) if skipinit else None + + def forward(self, x): + out = self.act1(x) * self.beta + + # shortcut branch + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(out) + + # residual branch + out = self.conv1(out) + out = self.conv2(self.act2(out)) + if self.conv2b is not None: + out = self.conv2b(self.act2b(out)) + if self.attn is not None: + out = self.attn_gain * self.attn(out) + out = self.conv3(self.act3(out)) + if self.attn_last is not None: + out = self.attn_gain * self.attn_last(out) + out = self.drop_path(out) + + if self.skipinit_gain is not None: + out.mul_(self.skipinit_gain) # this slows things down more than expected, TBD + out = out * self.alpha + shortcut + return out + + +def create_stem(in_chs, out_chs, stem_type='', conv_layer=None, act_layer=None, preact_feature=True): + stem_stride = 2 + stem_feature = dict(num_chs=out_chs, reduction=2, module='stem.conv') + stem = OrderedDict() + assert stem_type in ('', 'deep', 'deep_tiered', 'deep_quad', '3x3', '7x7', 'deep_pool', '3x3_pool', '7x7_pool') + if 'deep' in stem_type: + if 'quad' in stem_type: + # 4 deep conv stack as in NFNet-F models + assert not 'pool' in stem_type + stem_chs = (out_chs // 8, out_chs // 4, out_chs // 2, out_chs) + strides = (2, 1, 1, 2) + stem_stride = 4 + stem_feature = dict(num_chs=out_chs // 2, reduction=2, module='stem.conv3') + else: + if 'tiered' in stem_type: + stem_chs = (3 * out_chs // 8, out_chs // 2, out_chs) # 'T' resnets in resnet.py + else: + stem_chs = (out_chs // 
2, out_chs // 2, out_chs) # 'D' ResNets + strides = (2, 1, 1) + stem_feature = dict(num_chs=out_chs // 2, reduction=2, module='stem.conv2') + last_idx = len(stem_chs) - 1 + for i, (c, s) in enumerate(zip(stem_chs, strides)): + stem[f'conv{i + 1}'] = conv_layer(in_chs, c, kernel_size=3, stride=s) + if i != last_idx: + stem[f'act{i + 2}'] = act_layer(inplace=True) + in_chs = c + elif '3x3' in stem_type: + # 3x3 stem conv as in RegNet + stem['conv'] = conv_layer(in_chs, out_chs, kernel_size=3, stride=2) + else: + # 7x7 stem conv as in ResNet + stem['conv'] = conv_layer(in_chs, out_chs, kernel_size=7, stride=2) + + if 'pool' in stem_type: + stem['pool'] = nn.MaxPool2d(3, stride=2, padding=1) + stem_stride = 4 + + return nn.Sequential(stem), stem_stride, stem_feature + + +# from https://github.com/deepmind/deepmind-research/tree/master/nfnets +_nonlin_gamma = dict( + identity=1.0, + celu=1.270926833152771, + elu=1.2716004848480225, + gelu=1.7015043497085571, + leaky_relu=1.70590341091156, + log_sigmoid=1.9193484783172607, + log_softmax=1.0002083778381348, + relu=1.7139588594436646, + relu6=1.7131484746932983, + selu=1.0008515119552612, + sigmoid=4.803835391998291, + silu=1.7881293296813965, + softsign=2.338853120803833, + softplus=1.9203323125839233, + tanh=1.5939117670059204, +) + + +class NormFreeNet(nn.Module): + """ Normalization-Free Network + + As described in : + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + and + `High-Performance Large-Scale Image Recognition Without Normalization` - https://arxiv.org/abs/2102.06171 + + This model aims to cover both the NFRegNet-Bx models as detailed in the paper's code snippets and + the (preact) ResNet models described earlier in the paper. + + There are a few differences: + * channels are rounded to be divisible by 8 by default (keep tensor core kernels happy), + this changes channel dim and param counts slightly from the paper models + * activation correcting gamma constants are moved into the ScaledStdConv as it has less performance + impact in PyTorch when done with the weight scaling there. This likely wasn't a concern in the JAX impl. + * a config option `gamma_in_act` can be enabled to not apply gamma in StdConv as described above, but + apply it in each activation. This is slightly slower, numerically different, but matches official impl. + * skipinit is disabled by default, it seems to have a rather drastic impact on GPU memory use and throughput + for what it is/does. Approx 8-10% throughput loss. + """ + def __init__( + self, cfg: NfCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32, + drop_rate=0., drop_path_rate=0. + ): + super().__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + self.grad_checkpointing = False + + assert cfg.act_layer in _nonlin_gamma, f"Please add non-linearity constants for activation ({cfg.act_layer})." 
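+ # The _nonlin_gamma constants above rescale each activation so its output variance stays near 1 for unit-normal input (roughly gamma = 1 / std(act(z)) for z ~ N(0, 1)); depending on cfg.gamma_in_act they are applied either inside the activation or folded into the ScaledStdConv weight scaling below.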
+ conv_layer = ScaledStdConv2dSame if cfg.same_padding else ScaledStdConv2d + if cfg.gamma_in_act: + act_layer = act_with_gamma(cfg.act_layer, gamma=_nonlin_gamma[cfg.act_layer]) + conv_layer = partial(conv_layer, eps=cfg.std_conv_eps) + else: + act_layer = get_act_layer(cfg.act_layer) + conv_layer = partial(conv_layer, gamma=_nonlin_gamma[cfg.act_layer], eps=cfg.std_conv_eps) + attn_layer = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None + + stem_chs = make_divisible((cfg.stem_chs or cfg.channels[0]) * cfg.width_factor, cfg.ch_div) + self.stem, stem_stride, stem_feat = create_stem( + in_chans, stem_chs, cfg.stem_type, conv_layer=conv_layer, act_layer=act_layer) + + self.feature_info = [stem_feat] + drop_path_rates = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(cfg.depths)).split(cfg.depths)] + prev_chs = stem_chs + net_stride = stem_stride + dilation = 1 + expected_var = 1.0 + stages = [] + for stage_idx, stage_depth in enumerate(cfg.depths): + stride = 1 if stage_idx == 0 and stem_stride > 2 else 2 + if net_stride >= output_stride and stride > 1: + dilation *= stride + stride = 1 + net_stride *= stride + first_dilation = 1 if dilation in (1, 2) else 2 + + blocks = [] + for block_idx in range(cfg.depths[stage_idx]): + first_block = block_idx == 0 and stage_idx == 0 + out_chs = make_divisible(cfg.channels[stage_idx] * cfg.width_factor, cfg.ch_div) + blocks += [NormFreeBlock( + in_chs=prev_chs, out_chs=out_chs, + alpha=cfg.alpha, + beta=1. / expected_var ** 0.5, + stride=stride if block_idx == 0 else 1, + dilation=dilation, + first_dilation=first_dilation, + group_size=cfg.group_size, + bottle_ratio=1. if cfg.reg and first_block else cfg.bottle_ratio, + ch_div=cfg.ch_div, + reg=cfg.reg, + extra_conv=cfg.extra_conv, + skipinit=cfg.skipinit, + attn_layer=attn_layer, + attn_gain=cfg.attn_gain, + act_layer=act_layer, + conv_layer=conv_layer, + drop_path_rate=drop_path_rates[stage_idx][block_idx], + )] + if block_idx == 0: + expected_var = 1. # expected var is reset after first block of each stage + expected_var += cfg.alpha ** 2 # Even if reset occurs, increment expected variance + first_dilation = dilation + prev_chs = out_chs + self.feature_info += [dict(num_chs=prev_chs, reduction=net_stride, module=f'stages.{stage_idx}')] + stages += [nn.Sequential(*blocks)] + self.stages = nn.Sequential(*stages) + + if cfg.num_features: + # The paper NFRegNet models have an EfficientNet-like final head convolution. 
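+ # final_conv widens the last stage to cfg.num_features (scaled by width_factor, rounded to ch_div) before global pooling, mirroring an EfficientNet-style head.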
+ self.num_features = make_divisible(cfg.width_factor * cfg.num_features, cfg.ch_div) + self.final_conv = conv_layer(prev_chs, self.num_features, 1) + self.feature_info[-1] = dict(num_chs=self.num_features, reduction=net_stride, module=f'final_conv') + else: + self.num_features = prev_chs + self.final_conv = nn.Identity() + self.final_act = act_layer(inplace=cfg.num_features > 0) + + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + for n, m in self.named_modules(): + if 'fc' in n and isinstance(m, nn.Linear): + if cfg.zero_init_fc: + nn.init.zeros_(m.weight) + else: + nn.init.normal_(m.weight, 0., .01) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='linear') + if m.bias is not None: + nn.init.zeros_(m.bias) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^stem', + blocks=[ + (r'^stages\.(\d+)' if coarse else r'^stages\.(\d+)\.(\d+)', None), + (r'^final_conv', (99999,)) + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = self.stem(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.stages, x) + else: + x = self.stages(x) + x = self.final_conv(x) + x = self.final_act(x) + return x + + def forward_head(self, x): + return self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_normfreenet(variant, pretrained=False, **kwargs): + model_cfg = model_cfgs[variant] + feature_cfg = dict(flatten_sequential=True) + return build_model_with_cfg( + NormFreeNet, variant, pretrained, + model_cfg=model_cfg, + feature_cfg=feature_cfg, + **kwargs) + + +@register_model +def dm_nfnet_f0(pretrained=False, **kwargs): + """ NFNet-F0 (DeepMind weight compatible) + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('dm_nfnet_f0', pretrained=pretrained, **kwargs) + + +@register_model +def dm_nfnet_f1(pretrained=False, **kwargs): + """ NFNet-F1 (DeepMind weight compatible) + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('dm_nfnet_f1', pretrained=pretrained, **kwargs) + + +@register_model +def dm_nfnet_f2(pretrained=False, **kwargs): + """ NFNet-F2 (DeepMind weight compatible) + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('dm_nfnet_f2', pretrained=pretrained, **kwargs) + + +@register_model +def dm_nfnet_f3(pretrained=False, **kwargs): + """ NFNet-F3 (DeepMind weight compatible) + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('dm_nfnet_f3', pretrained=pretrained, **kwargs) + + +@register_model +def dm_nfnet_f4(pretrained=False, **kwargs): + """ NFNet-F4 (DeepMind weight compatible) + `High-Performance Large-Scale Image Recognition Without Normalization` + - 
https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('dm_nfnet_f4', pretrained=pretrained, **kwargs) + + +@register_model +def dm_nfnet_f5(pretrained=False, **kwargs): + """ NFNet-F5 (DeepMind weight compatible) + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('dm_nfnet_f5', pretrained=pretrained, **kwargs) + + +@register_model +def dm_nfnet_f6(pretrained=False, **kwargs): + """ NFNet-F6 (DeepMind weight compatible) + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('dm_nfnet_f6', pretrained=pretrained, **kwargs) + + +@register_model +def nfnet_f0(pretrained=False, **kwargs): + """ NFNet-F0 + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('nfnet_f0', pretrained=pretrained, **kwargs) + + +@register_model +def nfnet_f1(pretrained=False, **kwargs): + """ NFNet-F1 + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('nfnet_f1', pretrained=pretrained, **kwargs) + + +@register_model +def nfnet_f2(pretrained=False, **kwargs): + """ NFNet-F2 + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('nfnet_f2', pretrained=pretrained, **kwargs) + + +@register_model +def nfnet_f3(pretrained=False, **kwargs): + """ NFNet-F3 + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('nfnet_f3', pretrained=pretrained, **kwargs) + + +@register_model +def nfnet_f4(pretrained=False, **kwargs): + """ NFNet-F4 + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('nfnet_f4', pretrained=pretrained, **kwargs) + + +@register_model +def nfnet_f5(pretrained=False, **kwargs): + """ NFNet-F5 + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('nfnet_f5', pretrained=pretrained, **kwargs) + + +@register_model +def nfnet_f6(pretrained=False, **kwargs): + """ NFNet-F6 + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('nfnet_f6', pretrained=pretrained, **kwargs) + + +@register_model +def nfnet_f7(pretrained=False, **kwargs): + """ NFNet-F7 + `High-Performance Large-Scale Image Recognition Without Normalization` + - https://arxiv.org/abs/2102.06171 + """ + return _create_normfreenet('nfnet_f7', pretrained=pretrained, **kwargs) + + +@register_model +def nfnet_l0(pretrained=False, **kwargs): + """ NFNet-L0b w/ SiLU + My experimental 'light' model w/ F0 repeats, 1.5x final_conv mult, 64 group_size, .25 bottleneck & SE ratio + """ + return _create_normfreenet('nfnet_l0', pretrained=pretrained, **kwargs) + + +@register_model +def eca_nfnet_l0(pretrained=False, **kwargs): + """ ECA-NFNet-L0 w/ SiLU + My experimental 'light' model w/ F0 repeats, 1.5x final_conv mult, 64 group_size, .25 bottleneck & ECA attn + """ + return _create_normfreenet('eca_nfnet_l0', pretrained=pretrained, **kwargs) + + +@register_model +def eca_nfnet_l1(pretrained=False, **kwargs): + """ ECA-NFNet-L1 w/ SiLU + My 
experimental 'light' model w/ F1 repeats, 2.0x final_conv mult, 64 group_size, .25 bottleneck & ECA attn + """ + return _create_normfreenet('eca_nfnet_l1', pretrained=pretrained, **kwargs) + + +@register_model +def eca_nfnet_l2(pretrained=False, **kwargs): + """ ECA-NFNet-L2 w/ SiLU + My experimental 'light' model w/ F2 repeats, 2.0x final_conv mult, 64 group_size, .25 bottleneck & ECA attn + """ + return _create_normfreenet('eca_nfnet_l2', pretrained=pretrained, **kwargs) + + +@register_model +def eca_nfnet_l3(pretrained=False, **kwargs): + """ ECA-NFNet-L3 w/ SiLU + My experimental 'light' model w/ F3 repeats, 2.0x final_conv mult, 64 group_size, .25 bottleneck & ECA attn + """ + return _create_normfreenet('eca_nfnet_l3', pretrained=pretrained, **kwargs) + + +@register_model +def nf_regnet_b0(pretrained=False, **kwargs): + """ Normalization-Free RegNet-B0 + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + """ + return _create_normfreenet('nf_regnet_b0', pretrained=pretrained, **kwargs) + + +@register_model +def nf_regnet_b1(pretrained=False, **kwargs): + """ Normalization-Free RegNet-B1 + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + """ + return _create_normfreenet('nf_regnet_b1', pretrained=pretrained, **kwargs) + + +@register_model +def nf_regnet_b2(pretrained=False, **kwargs): + """ Normalization-Free RegNet-B2 + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + """ + return _create_normfreenet('nf_regnet_b2', pretrained=pretrained, **kwargs) + + +@register_model +def nf_regnet_b3(pretrained=False, **kwargs): + """ Normalization-Free RegNet-B3 + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + """ + return _create_normfreenet('nf_regnet_b3', pretrained=pretrained, **kwargs) + + +@register_model +def nf_regnet_b4(pretrained=False, **kwargs): + """ Normalization-Free RegNet-B4 + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + """ + return _create_normfreenet('nf_regnet_b4', pretrained=pretrained, **kwargs) + + +@register_model +def nf_regnet_b5(pretrained=False, **kwargs): + """ Normalization-Free RegNet-B5 + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + """ + return _create_normfreenet('nf_regnet_b5', pretrained=pretrained, **kwargs) + + +@register_model +def nf_resnet26(pretrained=False, **kwargs): + """ Normalization-Free ResNet-26 + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + """ + return _create_normfreenet('nf_resnet26', pretrained=pretrained, **kwargs) + + +@register_model +def nf_resnet50(pretrained=False, **kwargs): + """ Normalization-Free ResNet-50 + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + """ + return _create_normfreenet('nf_resnet50', pretrained=pretrained, **kwargs) + + +@register_model +def nf_resnet101(pretrained=False, **kwargs): + """ Normalization-Free ResNet-101 + `Characterizing signal propagation to close the performance gap in unnormalized ResNets` + - https://arxiv.org/abs/2101.08692 + """ + return 
_create_normfreenet('nf_resnet101', pretrained=pretrained, **kwargs) + + +@register_model +def nf_seresnet26(pretrained=False, **kwargs): + """ Normalization-Free SE-ResNet26 + """ + return _create_normfreenet('nf_seresnet26', pretrained=pretrained, **kwargs) + + +@register_model +def nf_seresnet50(pretrained=False, **kwargs): + """ Normalization-Free SE-ResNet50 + """ + return _create_normfreenet('nf_seresnet50', pretrained=pretrained, **kwargs) + + +@register_model +def nf_seresnet101(pretrained=False, **kwargs): + """ Normalization-Free SE-ResNet101 + """ + return _create_normfreenet('nf_seresnet101', pretrained=pretrained, **kwargs) + + +@register_model +def nf_ecaresnet26(pretrained=False, **kwargs): + """ Normalization-Free ECA-ResNet26 + """ + return _create_normfreenet('nf_ecaresnet26', pretrained=pretrained, **kwargs) + + +@register_model +def nf_ecaresnet50(pretrained=False, **kwargs): + """ Normalization-Free ECA-ResNet50 + """ + return _create_normfreenet('nf_ecaresnet50', pretrained=pretrained, **kwargs) + + +@register_model +def nf_ecaresnet101(pretrained=False, **kwargs): + """ Normalization-Free ECA-ResNet101 + """ + return _create_normfreenet('nf_ecaresnet101', pretrained=pretrained, **kwargs) diff --git a/custom_timm/models/pit.py b/custom_timm/models/pit.py new file mode 100644 index 0000000000000000000000000000000000000000..3dd79c0638fccbe52b91eab348f5abf61bdac67e --- /dev/null +++ b/custom_timm/models/pit.py @@ -0,0 +1,404 @@ +""" Pooling-based Vision Transformer (PiT) in PyTorch + +A PyTorch implement of Pooling-based Vision Transformers as described in +'Rethinking Spatial Dimensions of Vision Transformers' - https://arxiv.org/abs/2103.16302 + +This code was adapted from the original version at https://github.com/naver-ai/pit, original copyright below. + +Modifications for timm by / Copyright 2020 Ross Wightman +""" +# PiT +# Copyright 2021-present NAVER Corp. 
+# Apache License v2.0 + +import math +import re +from copy import deepcopy +from functools import partial +from typing import Tuple + +import torch +from torch import nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import trunc_normal_, to_2tuple +from .registry import register_model +from .vision_transformer import Block + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.conv', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # deit models (FB weights) + 'pit_ti_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_ti_730.pth'), + 'pit_xs_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_xs_781.pth'), + 'pit_s_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_s_809.pth'), + 'pit_b_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_b_820.pth'), + 'pit_ti_distilled_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_ti_distill_746.pth', + classifier=('head', 'head_dist')), + 'pit_xs_distilled_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_xs_distill_791.pth', + classifier=('head', 'head_dist')), + 'pit_s_distilled_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_s_distill_819.pth', + classifier=('head', 'head_dist')), + 'pit_b_distilled_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-pit-weights/pit_b_distill_840.pth', + classifier=('head', 'head_dist')), +} + + +class SequentialTuple(nn.Sequential): + """ This module exists to work around torchscript typing issues list -> list""" + def __init__(self, *args): + super(SequentialTuple, self).__init__(*args) + + def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + for module in self: + x = module(x) + return x + + +class Transformer(nn.Module): + def __init__( + self, base_dim, depth, heads, mlp_ratio, pool=None, drop_rate=.0, attn_drop_rate=.0, drop_path_prob=None): + super(Transformer, self).__init__() + self.layers = nn.ModuleList([]) + embed_dim = base_dim * heads + + self.blocks = nn.Sequential(*[ + Block( + dim=embed_dim, + num_heads=heads, + mlp_ratio=mlp_ratio, + qkv_bias=True, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=drop_path_prob[i], + norm_layer=partial(nn.LayerNorm, eps=1e-6) + ) + for i in range(depth)]) + + self.pool = pool + + def forward(self, x: Tuple[torch.Tensor, torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]: + x, cls_tokens = x + B, C, H, W = x.shape + token_length = cls_tokens.shape[1] + + x = x.flatten(2).transpose(1, 2) + x = torch.cat((cls_tokens, x), dim=1) + + x = self.blocks(x) + + cls_tokens = x[:, :token_length] + x = x[:, token_length:] + x = x.transpose(1, 2).reshape(B, C, H, W) + + if self.pool is not None: + x, cls_tokens = self.pool(x, cls_tokens) + return x, cls_tokens + + +class ConvHeadPooling(nn.Module): + def __init__(self, in_feature, out_feature, stride, 
padding_mode='zeros'): + super(ConvHeadPooling, self).__init__() + + self.conv = nn.Conv2d( + in_feature, out_feature, kernel_size=stride + 1, padding=stride // 2, stride=stride, + padding_mode=padding_mode, groups=in_feature) + self.fc = nn.Linear(in_feature, out_feature) + + def forward(self, x, cls_token) -> Tuple[torch.Tensor, torch.Tensor]: + x = self.conv(x) + cls_token = self.fc(cls_token) + return x, cls_token + + +class ConvEmbedding(nn.Module): + def __init__(self, in_channels, out_channels, patch_size, stride, padding): + super(ConvEmbedding, self).__init__() + self.conv = nn.Conv2d( + in_channels, out_channels, kernel_size=patch_size, stride=stride, padding=padding, bias=True) + + def forward(self, x): + x = self.conv(x) + return x + + +class PoolingVisionTransformer(nn.Module): + """ Pooling-based Vision Transformer + + A PyTorch implement of 'Rethinking Spatial Dimensions of Vision Transformers' + - https://arxiv.org/abs/2103.16302 + """ + def __init__( + self, img_size, patch_size, stride, base_dims, depth, heads, + mlp_ratio, num_classes=1000, in_chans=3, global_pool='token', + distilled=False, attn_drop_rate=.0, drop_rate=.0, drop_path_rate=.0): + super(PoolingVisionTransformer, self).__init__() + assert global_pool in ('token',) + + padding = 0 + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + height = math.floor((img_size[0] + 2 * padding - patch_size[0]) / stride + 1) + width = math.floor((img_size[1] + 2 * padding - patch_size[1]) / stride + 1) + + self.base_dims = base_dims + self.heads = heads + self.num_classes = num_classes + self.global_pool = global_pool + self.num_tokens = 2 if distilled else 1 + + self.patch_size = patch_size + self.pos_embed = nn.Parameter(torch.randn(1, base_dims[0] * heads[0], height, width)) + self.patch_embed = ConvEmbedding(in_chans, base_dims[0] * heads[0], patch_size, stride, padding) + + self.cls_token = nn.Parameter(torch.randn(1, self.num_tokens, base_dims[0] * heads[0])) + self.pos_drop = nn.Dropout(p=drop_rate) + + transformers = [] + # stochastic depth decay rule + dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depth)).split(depth)] + for stage in range(len(depth)): + pool = None + if stage < len(heads) - 1: + pool = ConvHeadPooling( + base_dims[stage] * heads[stage], base_dims[stage + 1] * heads[stage + 1], stride=2) + transformers += [Transformer( + base_dims[stage], depth[stage], heads[stage], mlp_ratio, pool=pool, + drop_rate=drop_rate, attn_drop_rate=attn_drop_rate, drop_path_prob=dpr[stage]) + ] + self.transformers = SequentialTuple(*transformers) + self.norm = nn.LayerNorm(base_dims[-1] * heads[-1], eps=1e-6) + self.num_features = self.embed_dim = base_dims[-1] * heads[-1] + + # Classifier head + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + self.head_dist = None + if distilled: + self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() + self.distilled_training = False # must set this True to train w/ distillation token + + trunc_normal_(self.pos_embed, std=.02) + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + @torch.jit.ignore + def set_distilled_training(self, enable=True): + self.distilled_training = enable + + @torch.jit.ignore + def 
set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + def get_classifier(self): + if self.head_dist is not None: + return self.head, self.head_dist + else: + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + if self.head_dist is not None: + self.head_dist = nn.Linear(self.embed_dim, self.num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + x = self.pos_drop(x + self.pos_embed) + cls_tokens = self.cls_token.expand(x.shape[0], -1, -1) + x, cls_tokens = self.transformers((x, cls_tokens)) + cls_tokens = self.norm(cls_tokens) + return cls_tokens + + def forward_head(self, x, pre_logits: bool = False) -> torch.Tensor: + if self.head_dist is not None: + assert self.global_pool == 'token' + x, x_dist = x[:, 0], x[:, 1] + if not pre_logits: + x = self.head(x) + x_dist = self.head_dist(x_dist) + if self.distilled_training and self.training and not torch.jit.is_scripting(): + # only return separate classification predictions when training in distilled mode + return x, x_dist + else: + # during standard train / finetune, inference average the classifier predictions + return (x + x_dist) / 2 + else: + if self.global_pool == 'token': + x = x[:, 0] + if not pre_logits: + x = self.head(x) + return x + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def checkpoint_filter_fn(state_dict, model): + """ preprocess checkpoints """ + out_dict = {} + p_blocks = re.compile(r'pools\.(\d)\.') + for k, v in state_dict.items(): + # FIXME need to update resize for PiT impl + # if k == 'pos_embed' and v.shape != model.pos_embed.shape: + # # To resize pos embedding when using model at different size from pretrained weights + # v = resize_pos_embed(v, model.pos_embed) + k = p_blocks.sub(lambda exp: f'transformers.{int(exp.group(1))}.pool.', k) + out_dict[k] = v + return out_dict + + +def _create_pit(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + model = build_model_with_cfg( + PoolingVisionTransformer, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) + return model + + +@register_model +def pit_b_224(pretrained, **kwargs): + model_kwargs = dict( + patch_size=14, + stride=7, + base_dims=[64, 64, 64], + depth=[3, 6, 4], + heads=[4, 8, 16], + mlp_ratio=4, + **kwargs + ) + return _create_pit('pit_b_224', pretrained, **model_kwargs) + + +@register_model +def pit_s_224(pretrained, **kwargs): + model_kwargs = dict( + patch_size=16, + stride=8, + base_dims=[48, 48, 48], + depth=[2, 6, 4], + heads=[3, 6, 12], + mlp_ratio=4, + **kwargs + ) + return _create_pit('pit_s_224', pretrained, **model_kwargs) + + +@register_model +def pit_xs_224(pretrained, **kwargs): + model_kwargs = dict( + patch_size=16, + stride=8, + base_dims=[48, 48, 48], + depth=[2, 6, 4], + heads=[2, 4, 8], + mlp_ratio=4, + **kwargs + ) + return _create_pit('pit_xs_224', pretrained, **model_kwargs) + + +@register_model +def pit_ti_224(pretrained, **kwargs): + model_kwargs = dict( + patch_size=16, + stride=8, + base_dims=[32, 32, 32], + depth=[2, 6, 4], + heads=[2, 4, 8], + mlp_ratio=4, + **kwargs + ) + return _create_pit('pit_ti_224', pretrained, **model_kwargs) + + +@register_model +def 
pit_b_distilled_224(pretrained, **kwargs): + model_kwargs = dict( + patch_size=14, + stride=7, + base_dims=[64, 64, 64], + depth=[3, 6, 4], + heads=[4, 8, 16], + mlp_ratio=4, + distilled=True, + **kwargs + ) + return _create_pit('pit_b_distilled_224', pretrained, **model_kwargs) + + +@register_model +def pit_s_distilled_224(pretrained, **kwargs): + model_kwargs = dict( + patch_size=16, + stride=8, + base_dims=[48, 48, 48], + depth=[2, 6, 4], + heads=[3, 6, 12], + mlp_ratio=4, + distilled=True, + **kwargs + ) + return _create_pit('pit_s_distilled_224', pretrained, **model_kwargs) + + +@register_model +def pit_xs_distilled_224(pretrained, **kwargs): + model_kwargs = dict( + patch_size=16, + stride=8, + base_dims=[48, 48, 48], + depth=[2, 6, 4], + heads=[2, 4, 8], + mlp_ratio=4, + distilled=True, + **kwargs + ) + return _create_pit('pit_xs_distilled_224', pretrained, **model_kwargs) + + +@register_model +def pit_ti_distilled_224(pretrained, **kwargs): + model_kwargs = dict( + patch_size=16, + stride=8, + base_dims=[32, 32, 32], + depth=[2, 6, 4], + heads=[2, 4, 8], + mlp_ratio=4, + distilled=True, + **kwargs + ) + return _create_pit('pit_ti_distilled_224', pretrained, **model_kwargs) \ No newline at end of file diff --git a/custom_timm/models/pnasnet.py b/custom_timm/models/pnasnet.py new file mode 100644 index 0000000000000000000000000000000000000000..81067845befcfaf5436d112af73359ae4128c2d5 --- /dev/null +++ b/custom_timm/models/pnasnet.py @@ -0,0 +1,361 @@ +""" + pnasnet5large implementation grabbed from Cadene's pretrained models + Additional credit to https://github.com/creafz + + https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/pnasnet.py + +""" +from collections import OrderedDict +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .helpers import build_model_with_cfg +from .layers import ConvNormAct, create_conv2d, create_pool2d, create_classifier +from .registry import register_model + +__all__ = ['PNASNet5Large'] + +default_cfgs = { + 'pnasnet5large': { + 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/pnasnet5large-bf079911.pth', + 'input_size': (3, 331, 331), + 'pool_size': (11, 11), + 'crop_pct': 0.911, + 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), + 'std': (0.5, 0.5, 0.5), + 'num_classes': 1000, + 'first_conv': 'conv_0.conv', + 'classifier': 'last_linear', + 'label_offset': 1, # 1001 classes in pretrained weights + }, +} + + +class SeparableConv2d(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride, padding=''): + super(SeparableConv2d, self).__init__() + self.depthwise_conv2d = create_conv2d( + in_channels, in_channels, kernel_size=kernel_size, + stride=stride, padding=padding, groups=in_channels) + self.pointwise_conv2d = create_conv2d( + in_channels, out_channels, kernel_size=1, padding=padding) + + def forward(self, x): + x = self.depthwise_conv2d(x) + x = self.pointwise_conv2d(x) + return x + + +class BranchSeparables(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, stem_cell=False, padding=''): + super(BranchSeparables, self).__init__() + middle_channels = out_channels if stem_cell else in_channels + self.act_1 = nn.ReLU() + self.separable_1 = SeparableConv2d( + in_channels, middle_channels, kernel_size, stride=stride, padding=padding) + self.bn_sep_1 = nn.BatchNorm2d(middle_channels, eps=0.001) + self.act_2 = nn.ReLU() + self.separable_2 = SeparableConv2d( + 
middle_channels, out_channels, kernel_size, stride=1, padding=padding) + self.bn_sep_2 = nn.BatchNorm2d(out_channels, eps=0.001) + + def forward(self, x): + x = self.act_1(x) + x = self.separable_1(x) + x = self.bn_sep_1(x) + x = self.act_2(x) + x = self.separable_2(x) + x = self.bn_sep_2(x) + return x + + +class ActConvBn(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=''): + super(ActConvBn, self).__init__() + self.act = nn.ReLU() + self.conv = create_conv2d( + in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=padding) + self.bn = nn.BatchNorm2d(out_channels, eps=0.001) + + def forward(self, x): + x = self.act(x) + x = self.conv(x) + x = self.bn(x) + return x + + +class FactorizedReduction(nn.Module): + + def __init__(self, in_channels, out_channels, padding=''): + super(FactorizedReduction, self).__init__() + self.act = nn.ReLU() + self.path_1 = nn.Sequential(OrderedDict([ + ('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)), + ('conv', create_conv2d(in_channels, out_channels // 2, kernel_size=1, padding=padding)), + ])) + self.path_2 = nn.Sequential(OrderedDict([ + ('pad', nn.ZeroPad2d((-1, 1, -1, 1))), # shift + ('avgpool', nn.AvgPool2d(1, stride=2, count_include_pad=False)), + ('conv', create_conv2d(in_channels, out_channels // 2, kernel_size=1, padding=padding)), + ])) + self.final_path_bn = nn.BatchNorm2d(out_channels, eps=0.001) + + def forward(self, x): + x = self.act(x) + x_path1 = self.path_1(x) + x_path2 = self.path_2(x) + out = self.final_path_bn(torch.cat([x_path1, x_path2], 1)) + return out + + +class CellBase(nn.Module): + + def cell_forward(self, x_left, x_right): + x_comb_iter_0_left = self.comb_iter_0_left(x_left) + x_comb_iter_0_right = self.comb_iter_0_right(x_left) + x_comb_iter_0 = x_comb_iter_0_left + x_comb_iter_0_right + + x_comb_iter_1_left = self.comb_iter_1_left(x_right) + x_comb_iter_1_right = self.comb_iter_1_right(x_right) + x_comb_iter_1 = x_comb_iter_1_left + x_comb_iter_1_right + + x_comb_iter_2_left = self.comb_iter_2_left(x_right) + x_comb_iter_2_right = self.comb_iter_2_right(x_right) + x_comb_iter_2 = x_comb_iter_2_left + x_comb_iter_2_right + + x_comb_iter_3_left = self.comb_iter_3_left(x_comb_iter_2) + x_comb_iter_3_right = self.comb_iter_3_right(x_right) + x_comb_iter_3 = x_comb_iter_3_left + x_comb_iter_3_right + + x_comb_iter_4_left = self.comb_iter_4_left(x_left) + if self.comb_iter_4_right is not None: + x_comb_iter_4_right = self.comb_iter_4_right(x_right) + else: + x_comb_iter_4_right = x_right + x_comb_iter_4 = x_comb_iter_4_left + x_comb_iter_4_right + + x_out = torch.cat([x_comb_iter_0, x_comb_iter_1, x_comb_iter_2, x_comb_iter_3, x_comb_iter_4], 1) + return x_out + + +class CellStem0(CellBase): + + def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type=''): + super(CellStem0, self).__init__() + self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, kernel_size=1, padding=pad_type) + + self.comb_iter_0_left = BranchSeparables( + in_chs_left, out_chs_left, kernel_size=5, stride=2, stem_cell=True, padding=pad_type) + self.comb_iter_0_right = nn.Sequential(OrderedDict([ + ('max_pool', create_pool2d('max', 3, stride=2, padding=pad_type)), + ('conv', create_conv2d(in_chs_left, out_chs_left, kernel_size=1, padding=pad_type)), + ('bn', nn.BatchNorm2d(out_chs_left, eps=0.001)), + ])) + + self.comb_iter_1_left = BranchSeparables( + out_chs_right, out_chs_right, kernel_size=7, stride=2, padding=pad_type) + self.comb_iter_1_right = 
create_pool2d('max', 3, stride=2, padding=pad_type) + + self.comb_iter_2_left = BranchSeparables( + out_chs_right, out_chs_right, kernel_size=5, stride=2, padding=pad_type) + self.comb_iter_2_right = BranchSeparables( + out_chs_right, out_chs_right, kernel_size=3, stride=2, padding=pad_type) + + self.comb_iter_3_left = BranchSeparables( + out_chs_right, out_chs_right, kernel_size=3, padding=pad_type) + self.comb_iter_3_right = create_pool2d('max', 3, stride=2, padding=pad_type) + + self.comb_iter_4_left = BranchSeparables( + in_chs_right, out_chs_right, kernel_size=3, stride=2, stem_cell=True, padding=pad_type) + self.comb_iter_4_right = ActConvBn( + out_chs_right, out_chs_right, kernel_size=1, stride=2, padding=pad_type) + + def forward(self, x_left): + x_right = self.conv_1x1(x_left) + x_out = self.cell_forward(x_left, x_right) + return x_out + + +class Cell(CellBase): + + def __init__(self, in_chs_left, out_chs_left, in_chs_right, out_chs_right, pad_type='', + is_reduction=False, match_prev_layer_dims=False): + super(Cell, self).__init__() + + # If `is_reduction` is set to `True` stride 2 is used for + # convolution and pooling layers to reduce the spatial size of + # the output of a cell approximately by a factor of 2. + stride = 2 if is_reduction else 1 + + # If `match_prev_layer_dimensions` is set to `True` + # `FactorizedReduction` is used to reduce the spatial size + # of the left input of a cell approximately by a factor of 2. + self.match_prev_layer_dimensions = match_prev_layer_dims + if match_prev_layer_dims: + self.conv_prev_1x1 = FactorizedReduction(in_chs_left, out_chs_left, padding=pad_type) + else: + self.conv_prev_1x1 = ActConvBn(in_chs_left, out_chs_left, kernel_size=1, padding=pad_type) + self.conv_1x1 = ActConvBn(in_chs_right, out_chs_right, kernel_size=1, padding=pad_type) + + self.comb_iter_0_left = BranchSeparables( + out_chs_left, out_chs_left, kernel_size=5, stride=stride, padding=pad_type) + self.comb_iter_0_right = create_pool2d('max', 3, stride=stride, padding=pad_type) + + self.comb_iter_1_left = BranchSeparables( + out_chs_right, out_chs_right, kernel_size=7, stride=stride, padding=pad_type) + self.comb_iter_1_right = create_pool2d('max', 3, stride=stride, padding=pad_type) + + self.comb_iter_2_left = BranchSeparables( + out_chs_right, out_chs_right, kernel_size=5, stride=stride, padding=pad_type) + self.comb_iter_2_right = BranchSeparables( + out_chs_right, out_chs_right, kernel_size=3, stride=stride, padding=pad_type) + + self.comb_iter_3_left = BranchSeparables(out_chs_right, out_chs_right, kernel_size=3) + self.comb_iter_3_right = create_pool2d('max', 3, stride=stride, padding=pad_type) + + self.comb_iter_4_left = BranchSeparables( + out_chs_left, out_chs_left, kernel_size=3, stride=stride, padding=pad_type) + if is_reduction: + self.comb_iter_4_right = ActConvBn( + out_chs_right, out_chs_right, kernel_size=1, stride=stride, padding=pad_type) + else: + self.comb_iter_4_right = None + + def forward(self, x_left, x_right): + x_left = self.conv_prev_1x1(x_left) + x_right = self.conv_1x1(x_right) + x_out = self.cell_forward(x_left, x_right) + return x_out + + +class PNASNet5Large(nn.Module): + def __init__(self, num_classes=1000, in_chans=3, output_stride=32, drop_rate=0., global_pool='avg', pad_type=''): + super(PNASNet5Large, self).__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + self.num_features = 4320 + assert output_stride == 32 + + self.conv_0 = ConvNormAct( + in_chans, 96, kernel_size=3, stride=2, padding=0, + 
norm_layer=partial(nn.BatchNorm2d, eps=0.001, momentum=0.1), apply_act=False) + + self.cell_stem_0 = CellStem0( + in_chs_left=96, out_chs_left=54, in_chs_right=96, out_chs_right=54, pad_type=pad_type) + + self.cell_stem_1 = Cell( + in_chs_left=96, out_chs_left=108, in_chs_right=270, out_chs_right=108, pad_type=pad_type, + match_prev_layer_dims=True, is_reduction=True) + self.cell_0 = Cell( + in_chs_left=270, out_chs_left=216, in_chs_right=540, out_chs_right=216, pad_type=pad_type, + match_prev_layer_dims=True) + self.cell_1 = Cell( + in_chs_left=540, out_chs_left=216, in_chs_right=1080, out_chs_right=216, pad_type=pad_type) + self.cell_2 = Cell( + in_chs_left=1080, out_chs_left=216, in_chs_right=1080, out_chs_right=216, pad_type=pad_type) + self.cell_3 = Cell( + in_chs_left=1080, out_chs_left=216, in_chs_right=1080, out_chs_right=216, pad_type=pad_type) + + self.cell_4 = Cell( + in_chs_left=1080, out_chs_left=432, in_chs_right=1080, out_chs_right=432, pad_type=pad_type, + is_reduction=True) + self.cell_5 = Cell( + in_chs_left=1080, out_chs_left=432, in_chs_right=2160, out_chs_right=432, pad_type=pad_type, + match_prev_layer_dims=True) + self.cell_6 = Cell( + in_chs_left=2160, out_chs_left=432, in_chs_right=2160, out_chs_right=432, pad_type=pad_type) + self.cell_7 = Cell( + in_chs_left=2160, out_chs_left=432, in_chs_right=2160, out_chs_right=432, pad_type=pad_type) + + self.cell_8 = Cell( + in_chs_left=2160, out_chs_left=864, in_chs_right=2160, out_chs_right=864, pad_type=pad_type, + is_reduction=True) + self.cell_9 = Cell( + in_chs_left=2160, out_chs_left=864, in_chs_right=4320, out_chs_right=864, pad_type=pad_type, + match_prev_layer_dims=True) + self.cell_10 = Cell( + in_chs_left=4320, out_chs_left=864, in_chs_right=4320, out_chs_right=864, pad_type=pad_type) + self.cell_11 = Cell( + in_chs_left=4320, out_chs_left=864, in_chs_right=4320, out_chs_right=864, pad_type=pad_type) + self.act = nn.ReLU() + self.feature_info = [ + dict(num_chs=96, reduction=2, module='conv_0'), + dict(num_chs=270, reduction=4, module='cell_stem_1.conv_1x1.act'), + dict(num_chs=1080, reduction=8, module='cell_4.conv_1x1.act'), + dict(num_chs=2160, reduction=16, module='cell_8.conv_1x1.act'), + dict(num_chs=4320, reduction=32, module='act'), + ] + + self.global_pool, self.last_linear = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict(stem=r'^conv_0|cell_stem_[01]', blocks=r'^cell_(\d+)') + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.last_linear + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.last_linear = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x_conv_0 = self.conv_0(x) + x_stem_0 = self.cell_stem_0(x_conv_0) + x_stem_1 = self.cell_stem_1(x_conv_0, x_stem_0) + x_cell_0 = self.cell_0(x_stem_0, x_stem_1) + x_cell_1 = self.cell_1(x_stem_1, x_cell_0) + x_cell_2 = self.cell_2(x_cell_0, x_cell_1) + x_cell_3 = self.cell_3(x_cell_1, x_cell_2) + x_cell_4 = self.cell_4(x_cell_2, x_cell_3) + x_cell_5 = self.cell_5(x_cell_3, x_cell_4) + x_cell_6 = self.cell_6(x_cell_4, x_cell_5) + x_cell_7 = self.cell_7(x_cell_5, x_cell_6) + x_cell_8 = self.cell_8(x_cell_6, x_cell_7) + x_cell_9 = self.cell_9(x_cell_7, x_cell_8) + x_cell_10 
= self.cell_10(x_cell_8, x_cell_9) + x_cell_11 = self.cell_11(x_cell_9, x_cell_10) + x = self.act(x_cell_11) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0: + x = F.dropout(x, self.drop_rate, training=self.training) + return x if pre_logits else self.last_linear(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_pnasnet(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + PNASNet5Large, variant, pretrained, + feature_cfg=dict(feature_cls='hook', no_rewrite=True), # not possible to re-write this model + **kwargs) + + +@register_model +def pnasnet5large(pretrained=False, **kwargs): + r"""PNASNet-5 model architecture from the + `"Progressive Neural Architecture Search" + <https://arxiv.org/abs/1712.00559>`_ paper. + """ + model_kwargs = dict(pad_type='same', **kwargs) + return _create_pnasnet('pnasnet5large', pretrained, **model_kwargs) diff --git a/custom_timm/models/poolformer.py b/custom_timm/models/poolformer.py new file mode 100644 index 0000000000000000000000000000000000000000..ee7167af586b63ae7ee03c8bb609061cf9244c08 --- /dev/null +++ b/custom_timm/models/poolformer.py @@ -0,0 +1,313 @@ +""" PoolFormer implementation + +Paper: `PoolFormer: MetaFormer is Actually What You Need for Vision` - https://arxiv.org/abs/2111.11418 + +Code adapted from official impl at https://github.com/sail-sg/poolformer, original copyright in comment below + +Modifications and additions for timm by / Copyright 2022, Ross Wightman +""" +# Copyright 2021 Garena Online Private Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import copy +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, checkpoint_seq +from .layers import DropPath, trunc_normal_, to_2tuple, ConvMlp, GroupNorm1 +from .registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .95, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = dict( + poolformer_s12=_cfg( + url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s12.pth.tar', + crop_pct=0.9), + poolformer_s24=_cfg( + url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s24.pth.tar', + crop_pct=0.9), + poolformer_s36=_cfg( + url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_s36.pth.tar', + crop_pct=0.9), + poolformer_m36=_cfg( + url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_m36.pth.tar', + crop_pct=0.95), + poolformer_m48=_cfg( + url='https://github.com/sail-sg/poolformer/releases/download/v1.0/poolformer_m48.pth.tar', + crop_pct=0.95), +) + + +class PatchEmbed(nn.Module): + """ Patch Embedding that is implemented by a layer of conv. + Input: tensor in shape [B, C, H, W] + Output: tensor in shape [B, C, H/stride, W/stride] + """ + + def __init__(self, in_chs=3, embed_dim=768, patch_size=16, stride=16, padding=0, norm_layer=None): + super().__init__() + patch_size = to_2tuple(patch_size) + stride = to_2tuple(stride) + padding = to_2tuple(padding) + self.proj = nn.Conv2d(in_chs, embed_dim, kernel_size=patch_size, stride=stride, padding=padding) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + x = self.proj(x) + x = self.norm(x) + return x + + +class Pooling(nn.Module): + def __init__(self, pool_size=3): + super().__init__() + self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False) + + def forward(self, x): + return self.pool(x) - x + + +class PoolFormerBlock(nn.Module): + """ + Args: + dim: embedding dim + pool_size: pooling size + mlp_ratio: mlp expansion ratio + act_layer: activation + norm_layer: normalization + drop: dropout rate + drop path: Stochastic Depth, refer to https://arxiv.org/abs/1603.09382 + use_layer_scale, --layer_scale_init_value: LayerScale, refer to https://arxiv.org/abs/2103.17239 + """ + + def __init__( + self, dim, pool_size=3, mlp_ratio=4., + act_layer=nn.GELU, norm_layer=GroupNorm1, + drop=0., drop_path=0., layer_scale_init_value=1e-5): + + super().__init__() + + self.norm1 = norm_layer(dim) + self.token_mixer = Pooling(pool_size=pool_size) + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = ConvMlp(dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + self.drop_path2 = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + if layer_scale_init_value: + self.layer_scale_1 = nn.Parameter(layer_scale_init_value * torch.ones(dim)) + self.layer_scale_2 = nn.Parameter(layer_scale_init_value * torch.ones(dim)) + else: + self.layer_scale_1 = None + self.layer_scale_2 = None + + def forward(self, x): + if self.layer_scale_1 is not None: + x = x + self.drop_path1(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * self.token_mixer(self.norm1(x))) + x = x + self.drop_path2(self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(self.norm2(x))) + else: + x = x + self.drop_path1(self.token_mixer(self.norm1(x))) + x = x + self.drop_path2(self.mlp(self.norm2(x))) + return x + + +def basic_blocks( + dim, index, layers, + pool_size=3, mlp_ratio=4., + act_layer=nn.GELU, norm_layer=GroupNorm1, + drop_rate=.0, drop_path_rate=0., + layer_scale_init_value=1e-5, +): + """ generate PoolFormer blocks for a stage """ + blocks = [] + for block_idx in range(layers[index]): + block_dpr = drop_path_rate * (block_idx + sum(layers[:index])) / (sum(layers) - 1) + blocks.append(PoolFormerBlock( + dim, pool_size=pool_size, mlp_ratio=mlp_ratio, + act_layer=act_layer, norm_layer=norm_layer, + drop=drop_rate, drop_path=block_dpr, + layer_scale_init_value=layer_scale_init_value, + )) + blocks = nn.Sequential(*blocks) + return blocks + + +class PoolFormer(nn.Module): + """ PoolFormer + """ + + def __init__( + self, + layers, + embed_dims=(64, 128, 320, 512), + mlp_ratios=(4, 4, 4, 4), + downsamples=(True, True, True, True), + pool_size=3, + in_chans=3, + num_classes=1000, + global_pool='avg', + norm_layer=GroupNorm1, + act_layer=nn.GELU, + in_patch_size=7, + in_stride=4, + in_pad=2, + down_patch_size=3, + down_stride=2, + down_pad=1, + drop_rate=0., drop_path_rate=0., + layer_scale_init_value=1e-5, + **kwargs): + + super().__init__() + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = embed_dims[-1] + self.grad_checkpointing = False + + self.patch_embed = PatchEmbed( + patch_size=in_patch_size, stride=in_stride, padding=in_pad, + in_chs=in_chans, embed_dim=embed_dims[0]) + + # set the main block in network + network = [] + for i in range(len(layers)): + network.append(basic_blocks( + embed_dims[i], i, layers, + pool_size=pool_size, mlp_ratio=mlp_ratios[i], + act_layer=act_layer, norm_layer=norm_layer, + drop_rate=drop_rate, drop_path_rate=drop_path_rate, + layer_scale_init_value=layer_scale_init_value) + ) + if i < len(layers) - 1 and (downsamples[i] or embed_dims[i] != embed_dims[i + 1]): + # downsampling between stages + network.append(PatchEmbed( + in_chs=embed_dims[i], embed_dim=embed_dims[i + 1], + patch_size=down_patch_size, stride=down_stride, padding=down_pad) + ) + + self.network = nn.Sequential(*network) + self.norm = norm_layer(self.num_features) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + # init for classification + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^patch_embed', # stem and embed + blocks=[ + (r'^network\.(\d+).*\.proj', (99999,)), + (r'^network\.(\d+)', None) if coarse else (r'^network\.(\d+)\.(\d+)', None), + (r'^norm', (99999,)) + ], + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + 
@torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + x = self.network(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool == 'avg': + x = x.mean([-2, -1]) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_poolformer(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + model = build_model_with_cfg(PoolFormer, variant, pretrained, **kwargs) + return model + + +@register_model +def poolformer_s12(pretrained=False, **kwargs): + """ PoolFormer-S12 model, Params: 12M """ + model = _create_poolformer('poolformer_s12', pretrained=pretrained, layers=(2, 2, 6, 2), **kwargs) + return model + + +@register_model +def poolformer_s24(pretrained=False, **kwargs): + """ PoolFormer-S24 model, Params: 21M """ + model = _create_poolformer('poolformer_s24', pretrained=pretrained, layers=(4, 4, 12, 4), **kwargs) + return model + + +@register_model +def poolformer_s36(pretrained=False, **kwargs): + """ PoolFormer-S36 model, Params: 31M """ + model = _create_poolformer( + 'poolformer_s36', pretrained=pretrained, layers=(6, 6, 18, 6), layer_scale_init_value=1e-6, **kwargs) + return model + + +@register_model +def poolformer_m36(pretrained=False, **kwargs): + """ PoolFormer-M36 model, Params: 56M """ + layers = (6, 6, 18, 6) + embed_dims = (96, 192, 384, 768) + model = _create_poolformer( + 'poolformer_m36', pretrained=pretrained, layers=layers, embed_dims=embed_dims, + layer_scale_init_value=1e-6, **kwargs) + return model + + +@register_model +def poolformer_m48(pretrained=False, **kwargs): + """ PoolFormer-M48 model, Params: 73M """ + layers = (8, 8, 24, 8) + embed_dims = (96, 192, 384, 768) + model = _create_poolformer( + 'poolformer_m48', pretrained=pretrained, layers=layers, embed_dims=embed_dims, + layer_scale_init_value=1e-6, **kwargs) + return model diff --git a/custom_timm/models/pruned/ecaresnet101d_pruned.txt b/custom_timm/models/pruned/ecaresnet101d_pruned.txt new file mode 100644 index 0000000000000000000000000000000000000000..2589b2f9dd3f0d1e02e1d5ddc1fbcd5c143e02c6 --- /dev/null +++ b/custom_timm/models/pruned/ecaresnet101d_pruned.txt @@ -0,0 +1 @@ +conv1.0.weight:[32, 3, 3, 3]***conv1.1.weight:[32]***conv1.3.weight:[32, 32, 3, 3]***conv1.4.weight:[32]***conv1.6.weight:[64, 32, 3, 3]***bn1.weight:[64]***layer1.0.conv1.weight:[45, 64, 1, 1]***layer1.0.bn1.weight:[45]***layer1.0.conv2.weight:[25, 45, 3, 3]***layer1.0.bn2.weight:[25]***layer1.0.conv3.weight:[26, 25, 1, 1]***layer1.0.bn3.weight:[26]***layer1.0.se.conv.weight:[1, 1, 5]***layer1.0.downsample.1.weight:[26, 64, 1, 1]***layer1.0.downsample.2.weight:[26]***layer1.1.conv1.weight:[53, 26, 1, 1]***layer1.1.bn1.weight:[53]***layer1.1.conv2.weight:[20, 53, 3, 3]***layer1.1.bn2.weight:[20]***layer1.1.conv3.weight:[26, 20, 1, 1]***layer1.1.bn3.weight:[26]***layer1.1.se.conv.weight:[1, 1, 5]***layer1.2.conv1.weight:[60, 26, 1, 1]***layer1.2.bn1.weight:[60]***layer1.2.conv2.weight:[27, 60, 3, 3]***layer1.2.bn2.weight:[27]***layer1.2.conv3.weight:[26, 27, 1, 
1]***layer1.2.bn3.weight:[26]***layer1.2.se.conv.weight:[1, 1, 5]***layer2.0.conv1.weight:[81, 26, 1, 1]***layer2.0.bn1.weight:[81]***layer2.0.conv2.weight:[24, 81, 3, 3]***layer2.0.bn2.weight:[24]***layer2.0.conv3.weight:[142, 24, 1, 1]***layer2.0.bn3.weight:[142]***layer2.0.se.conv.weight:[1, 1, 5]***layer2.0.downsample.1.weight:[142, 26, 1, 1]***layer2.0.downsample.2.weight:[142]***layer2.1.conv1.weight:[93, 142, 1, 1]***layer2.1.bn1.weight:[93]***layer2.1.conv2.weight:[49, 93, 3, 3]***layer2.1.bn2.weight:[49]***layer2.1.conv3.weight:[142, 49, 1, 1]***layer2.1.bn3.weight:[142]***layer2.1.se.conv.weight:[1, 1, 5]***layer2.2.conv1.weight:[102, 142, 1, 1]***layer2.2.bn1.weight:[102]***layer2.2.conv2.weight:[54, 102, 3, 3]***layer2.2.bn2.weight:[54]***layer2.2.conv3.weight:[142, 54, 1, 1]***layer2.2.bn3.weight:[142]***layer2.2.se.conv.weight:[1, 1, 5]***layer2.3.conv1.weight:[122, 142, 1, 1]***layer2.3.bn1.weight:[122]***layer2.3.conv2.weight:[78, 122, 3, 3]***layer2.3.bn2.weight:[78]***layer2.3.conv3.weight:[142, 78, 1, 1]***layer2.3.bn3.weight:[142]***layer2.3.se.conv.weight:[1, 1, 5]***layer3.0.conv1.weight:[101, 142, 1, 1]***layer3.0.bn1.weight:[101]***layer3.0.conv2.weight:[25, 101, 3, 3]***layer3.0.bn2.weight:[25]***layer3.0.conv3.weight:[278, 25, 1, 1]***layer3.0.bn3.weight:[278]***layer3.0.se.conv.weight:[1, 1, 5]***layer3.0.downsample.1.weight:[278, 142, 1, 1]***layer3.0.downsample.2.weight:[278]***layer3.1.conv1.weight:[239, 278, 1, 1]***layer3.1.bn1.weight:[239]***layer3.1.conv2.weight:[160, 239, 3, 3]***layer3.1.bn2.weight:[160]***layer3.1.conv3.weight:[278, 160, 1, 1]***layer3.1.bn3.weight:[278]***layer3.1.se.conv.weight:[1, 1, 5]***layer3.2.conv1.weight:[234, 278, 1, 1]***layer3.2.bn1.weight:[234]***layer3.2.conv2.weight:[156, 234, 3, 3]***layer3.2.bn2.weight:[156]***layer3.2.conv3.weight:[278, 156, 1, 1]***layer3.2.bn3.weight:[278]***layer3.2.se.conv.weight:[1, 1, 5]***layer3.3.conv1.weight:[250, 278, 1, 1]***layer3.3.bn1.weight:[250]***layer3.3.conv2.weight:[176, 250, 3, 3]***layer3.3.bn2.weight:[176]***layer3.3.conv3.weight:[278, 176, 1, 1]***layer3.3.bn3.weight:[278]***layer3.3.se.conv.weight:[1, 1, 5]***layer3.4.conv1.weight:[253, 278, 1, 1]***layer3.4.bn1.weight:[253]***layer3.4.conv2.weight:[191, 253, 3, 3]***layer3.4.bn2.weight:[191]***layer3.4.conv3.weight:[278, 191, 1, 1]***layer3.4.bn3.weight:[278]***layer3.4.se.conv.weight:[1, 1, 5]***layer3.5.conv1.weight:[251, 278, 1, 1]***layer3.5.bn1.weight:[251]***layer3.5.conv2.weight:[175, 251, 3, 3]***layer3.5.bn2.weight:[175]***layer3.5.conv3.weight:[278, 175, 1, 1]***layer3.5.bn3.weight:[278]***layer3.5.se.conv.weight:[1, 1, 5]***layer3.6.conv1.weight:[230, 278, 1, 1]***layer3.6.bn1.weight:[230]***layer3.6.conv2.weight:[128, 230, 3, 3]***layer3.6.bn2.weight:[128]***layer3.6.conv3.weight:[278, 128, 1, 1]***layer3.6.bn3.weight:[278]***layer3.6.se.conv.weight:[1, 1, 5]***layer3.7.conv1.weight:[244, 278, 1, 1]***layer3.7.bn1.weight:[244]***layer3.7.conv2.weight:[154, 244, 3, 3]***layer3.7.bn2.weight:[154]***layer3.7.conv3.weight:[278, 154, 1, 1]***layer3.7.bn3.weight:[278]***layer3.7.se.conv.weight:[1, 1, 5]***layer3.8.conv1.weight:[244, 278, 1, 1]***layer3.8.bn1.weight:[244]***layer3.8.conv2.weight:[159, 244, 3, 3]***layer3.8.bn2.weight:[159]***layer3.8.conv3.weight:[278, 159, 1, 1]***layer3.8.bn3.weight:[278]***layer3.8.se.conv.weight:[1, 1, 5]***layer3.9.conv1.weight:[238, 278, 1, 1]***layer3.9.bn1.weight:[238]***layer3.9.conv2.weight:[97, 238, 3, 3]***layer3.9.bn2.weight:[97]***layer3.9.conv3.weight:[278, 97, 1, 
1]***layer3.9.bn3.weight:[278]***layer3.9.se.conv.weight:[1, 1, 5]***layer3.10.conv1.weight:[244, 278, 1, 1]***layer3.10.bn1.weight:[244]***layer3.10.conv2.weight:[149, 244, 3, 3]***layer3.10.bn2.weight:[149]***layer3.10.conv3.weight:[278, 149, 1, 1]***layer3.10.bn3.weight:[278]***layer3.10.se.conv.weight:[1, 1, 5]***layer3.11.conv1.weight:[253, 278, 1, 1]***layer3.11.bn1.weight:[253]***layer3.11.conv2.weight:[181, 253, 3, 3]***layer3.11.bn2.weight:[181]***layer3.11.conv3.weight:[278, 181, 1, 1]***layer3.11.bn3.weight:[278]***layer3.11.se.conv.weight:[1, 1, 5]***layer3.12.conv1.weight:[245, 278, 1, 1]***layer3.12.bn1.weight:[245]***layer3.12.conv2.weight:[119, 245, 3, 3]***layer3.12.bn2.weight:[119]***layer3.12.conv3.weight:[278, 119, 1, 1]***layer3.12.bn3.weight:[278]***layer3.12.se.conv.weight:[1, 1, 5]***layer3.13.conv1.weight:[255, 278, 1, 1]***layer3.13.bn1.weight:[255]***layer3.13.conv2.weight:[216, 255, 3, 3]***layer3.13.bn2.weight:[216]***layer3.13.conv3.weight:[278, 216, 1, 1]***layer3.13.bn3.weight:[278]***layer3.13.se.conv.weight:[1, 1, 5]***layer3.14.conv1.weight:[256, 278, 1, 1]***layer3.14.bn1.weight:[256]***layer3.14.conv2.weight:[201, 256, 3, 3]***layer3.14.bn2.weight:[201]***layer3.14.conv3.weight:[278, 201, 1, 1]***layer3.14.bn3.weight:[278]***layer3.14.se.conv.weight:[1, 1, 5]***layer3.15.conv1.weight:[253, 278, 1, 1]***layer3.15.bn1.weight:[253]***layer3.15.conv2.weight:[149, 253, 3, 3]***layer3.15.bn2.weight:[149]***layer3.15.conv3.weight:[278, 149, 1, 1]***layer3.15.bn3.weight:[278]***layer3.15.se.conv.weight:[1, 1, 5]***layer3.16.conv1.weight:[254, 278, 1, 1]***layer3.16.bn1.weight:[254]***layer3.16.conv2.weight:[141, 254, 3, 3]***layer3.16.bn2.weight:[141]***layer3.16.conv3.weight:[278, 141, 1, 1]***layer3.16.bn3.weight:[278]***layer3.16.se.conv.weight:[1, 1, 5]***layer3.17.conv1.weight:[256, 278, 1, 1]***layer3.17.bn1.weight:[256]***layer3.17.conv2.weight:[190, 256, 3, 3]***layer3.17.bn2.weight:[190]***layer3.17.conv3.weight:[278, 190, 1, 1]***layer3.17.bn3.weight:[278]***layer3.17.se.conv.weight:[1, 1, 5]***layer3.18.conv1.weight:[256, 278, 1, 1]***layer3.18.bn1.weight:[256]***layer3.18.conv2.weight:[217, 256, 3, 3]***layer3.18.bn2.weight:[217]***layer3.18.conv3.weight:[278, 217, 1, 1]***layer3.18.bn3.weight:[278]***layer3.18.se.conv.weight:[1, 1, 5]***layer3.19.conv1.weight:[255, 278, 1, 1]***layer3.19.bn1.weight:[255]***layer3.19.conv2.weight:[156, 255, 3, 3]***layer3.19.bn2.weight:[156]***layer3.19.conv3.weight:[278, 156, 1, 1]***layer3.19.bn3.weight:[278]***layer3.19.se.conv.weight:[1, 1, 5]***layer3.20.conv1.weight:[256, 278, 1, 1]***layer3.20.bn1.weight:[256]***layer3.20.conv2.weight:[155, 256, 3, 3]***layer3.20.bn2.weight:[155]***layer3.20.conv3.weight:[278, 155, 1, 1]***layer3.20.bn3.weight:[278]***layer3.20.se.conv.weight:[1, 1, 5]***layer3.21.conv1.weight:[256, 278, 1, 1]***layer3.21.bn1.weight:[256]***layer3.21.conv2.weight:[232, 256, 3, 3]***layer3.21.bn2.weight:[232]***layer3.21.conv3.weight:[278, 232, 1, 1]***layer3.21.bn3.weight:[278]***layer3.21.se.conv.weight:[1, 1, 5]***layer3.22.conv1.weight:[256, 278, 1, 1]***layer3.22.bn1.weight:[256]***layer3.22.conv2.weight:[214, 256, 3, 3]***layer3.22.bn2.weight:[214]***layer3.22.conv3.weight:[278, 214, 1, 1]***layer3.22.bn3.weight:[278]***layer3.22.se.conv.weight:[1, 1, 5]***layer4.0.conv1.weight:[499, 278, 1, 1]***layer4.0.bn1.weight:[499]***layer4.0.conv2.weight:[289, 499, 3, 3]***layer4.0.bn2.weight:[289]***layer4.0.conv3.weight:[2042, 289, 1, 
1]***layer4.0.bn3.weight:[2042]***layer4.0.se.conv.weight:[1, 1, 7]***layer4.0.downsample.1.weight:[2042, 278, 1, 1]***layer4.0.downsample.2.weight:[2042]***layer4.1.conv1.weight:[512, 2042, 1, 1]***layer4.1.bn1.weight:[512]***layer4.1.conv2.weight:[512, 512, 3, 3]***layer4.1.bn2.weight:[512]***layer4.1.conv3.weight:[2042, 512, 1, 1]***layer4.1.bn3.weight:[2042]***layer4.1.se.conv.weight:[1, 1, 7]***layer4.2.conv1.weight:[512, 2042, 1, 1]***layer4.2.bn1.weight:[512]***layer4.2.conv2.weight:[502, 512, 3, 3]***layer4.2.bn2.weight:[502]***layer4.2.conv3.weight:[2042, 502, 1, 1]***layer4.2.bn3.weight:[2042]***layer4.2.se.conv.weight:[1, 1, 7]***fc.weight:[1000, 2042]***layer1_2_conv3_M.weight:[256, 26]***layer2_3_conv3_M.weight:[512, 142]***layer3_22_conv3_M.weight:[1024, 278]***layer4_2_conv3_M.weight:[2048, 2042] \ No newline at end of file diff --git a/custom_timm/models/pruned/ecaresnet50d_pruned.txt b/custom_timm/models/pruned/ecaresnet50d_pruned.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a8b2bf50e0631dce74d66a1a98e26cae10572a7 --- /dev/null +++ b/custom_timm/models/pruned/ecaresnet50d_pruned.txt @@ -0,0 +1 @@ +conv1.0.weight:[32, 3, 3, 3]***conv1.1.weight:[32]***conv1.3.weight:[32, 32, 3, 3]***conv1.4.weight:[32]***conv1.6.weight:[64, 32, 3, 3]***bn1.weight:[64]***layer1.0.conv1.weight:[47, 64, 1, 1]***layer1.0.bn1.weight:[47]***layer1.0.conv2.weight:[18, 47, 3, 3]***layer1.0.bn2.weight:[18]***layer1.0.conv3.weight:[19, 18, 1, 1]***layer1.0.bn3.weight:[19]***layer1.0.se.conv.weight:[1, 1, 5]***layer1.0.downsample.1.weight:[19, 64, 1, 1]***layer1.0.downsample.2.weight:[19]***layer1.1.conv1.weight:[52, 19, 1, 1]***layer1.1.bn1.weight:[52]***layer1.1.conv2.weight:[22, 52, 3, 3]***layer1.1.bn2.weight:[22]***layer1.1.conv3.weight:[19, 22, 1, 1]***layer1.1.bn3.weight:[19]***layer1.1.se.conv.weight:[1, 1, 5]***layer1.2.conv1.weight:[64, 19, 1, 1]***layer1.2.bn1.weight:[64]***layer1.2.conv2.weight:[35, 64, 3, 3]***layer1.2.bn2.weight:[35]***layer1.2.conv3.weight:[19, 35, 1, 1]***layer1.2.bn3.weight:[19]***layer1.2.se.conv.weight:[1, 1, 5]***layer2.0.conv1.weight:[85, 19, 1, 1]***layer2.0.bn1.weight:[85]***layer2.0.conv2.weight:[37, 85, 3, 3]***layer2.0.bn2.weight:[37]***layer2.0.conv3.weight:[171, 37, 1, 1]***layer2.0.bn3.weight:[171]***layer2.0.se.conv.weight:[1, 1, 5]***layer2.0.downsample.1.weight:[171, 19, 1, 1]***layer2.0.downsample.2.weight:[171]***layer2.1.conv1.weight:[107, 171, 1, 1]***layer2.1.bn1.weight:[107]***layer2.1.conv2.weight:[80, 107, 3, 3]***layer2.1.bn2.weight:[80]***layer2.1.conv3.weight:[171, 80, 1, 1]***layer2.1.bn3.weight:[171]***layer2.1.se.conv.weight:[1, 1, 5]***layer2.2.conv1.weight:[120, 171, 1, 1]***layer2.2.bn1.weight:[120]***layer2.2.conv2.weight:[85, 120, 3, 3]***layer2.2.bn2.weight:[85]***layer2.2.conv3.weight:[171, 85, 1, 1]***layer2.2.bn3.weight:[171]***layer2.2.se.conv.weight:[1, 1, 5]***layer2.3.conv1.weight:[125, 171, 1, 1]***layer2.3.bn1.weight:[125]***layer2.3.conv2.weight:[87, 125, 3, 3]***layer2.3.bn2.weight:[87]***layer2.3.conv3.weight:[171, 87, 1, 1]***layer2.3.bn3.weight:[171]***layer2.3.se.conv.weight:[1, 1, 5]***layer3.0.conv1.weight:[198, 171, 1, 1]***layer3.0.bn1.weight:[198]***layer3.0.conv2.weight:[126, 198, 3, 3]***layer3.0.bn2.weight:[126]***layer3.0.conv3.weight:[818, 126, 1, 1]***layer3.0.bn3.weight:[818]***layer3.0.se.conv.weight:[1, 1, 5]***layer3.0.downsample.1.weight:[818, 171, 1, 1]***layer3.0.downsample.2.weight:[818]***layer3.1.conv1.weight:[255, 818, 1, 
1]***layer3.1.bn1.weight:[255]***layer3.1.conv2.weight:[232, 255, 3, 3]***layer3.1.bn2.weight:[232]***layer3.1.conv3.weight:[818, 232, 1, 1]***layer3.1.bn3.weight:[818]***layer3.1.se.conv.weight:[1, 1, 5]***layer3.2.conv1.weight:[256, 818, 1, 1]***layer3.2.bn1.weight:[256]***layer3.2.conv2.weight:[233, 256, 3, 3]***layer3.2.bn2.weight:[233]***layer3.2.conv3.weight:[818, 233, 1, 1]***layer3.2.bn3.weight:[818]***layer3.2.se.conv.weight:[1, 1, 5]***layer3.3.conv1.weight:[253, 818, 1, 1]***layer3.3.bn1.weight:[253]***layer3.3.conv2.weight:[235, 253, 3, 3]***layer3.3.bn2.weight:[235]***layer3.3.conv3.weight:[818, 235, 1, 1]***layer3.3.bn3.weight:[818]***layer3.3.se.conv.weight:[1, 1, 5]***layer3.4.conv1.weight:[256, 818, 1, 1]***layer3.4.bn1.weight:[256]***layer3.4.conv2.weight:[225, 256, 3, 3]***layer3.4.bn2.weight:[225]***layer3.4.conv3.weight:[818, 225, 1, 1]***layer3.4.bn3.weight:[818]***layer3.4.se.conv.weight:[1, 1, 5]***layer3.5.conv1.weight:[256, 818, 1, 1]***layer3.5.bn1.weight:[256]***layer3.5.conv2.weight:[239, 256, 3, 3]***layer3.5.bn2.weight:[239]***layer3.5.conv3.weight:[818, 239, 1, 1]***layer3.5.bn3.weight:[818]***layer3.5.se.conv.weight:[1, 1, 5]***layer4.0.conv1.weight:[492, 818, 1, 1]***layer4.0.bn1.weight:[492]***layer4.0.conv2.weight:[237, 492, 3, 3]***layer4.0.bn2.weight:[237]***layer4.0.conv3.weight:[2022, 237, 1, 1]***layer4.0.bn3.weight:[2022]***layer4.0.se.conv.weight:[1, 1, 7]***layer4.0.downsample.1.weight:[2022, 818, 1, 1]***layer4.0.downsample.2.weight:[2022]***layer4.1.conv1.weight:[512, 2022, 1, 1]***layer4.1.bn1.weight:[512]***layer4.1.conv2.weight:[500, 512, 3, 3]***layer4.1.bn2.weight:[500]***layer4.1.conv3.weight:[2022, 500, 1, 1]***layer4.1.bn3.weight:[2022]***layer4.1.se.conv.weight:[1, 1, 7]***layer4.2.conv1.weight:[512, 2022, 1, 1]***layer4.2.bn1.weight:[512]***layer4.2.conv2.weight:[490, 512, 3, 3]***layer4.2.bn2.weight:[490]***layer4.2.conv3.weight:[2022, 490, 1, 1]***layer4.2.bn3.weight:[2022]***layer4.2.se.conv.weight:[1, 1, 7]***fc.weight:[1000, 2022]***layer1_2_conv3_M.weight:[256, 19]***layer2_3_conv3_M.weight:[512, 171]***layer3_5_conv3_M.weight:[1024, 818]***layer4_2_conv3_M.weight:[2048, 2022] \ No newline at end of file diff --git a/custom_timm/models/pruned/efficientnet_b1_pruned.txt b/custom_timm/models/pruned/efficientnet_b1_pruned.txt new file mode 100644 index 0000000000000000000000000000000000000000..0972b527612b283fd242cc5eaeb6e767ea106c66 --- /dev/null +++ b/custom_timm/models/pruned/efficientnet_b1_pruned.txt @@ -0,0 +1 @@ +conv_stem.weight:[32, 3, 3, 3]***bn1.weight:[32]***bn1.bias:[32]***bn1.running_mean:[32]***bn1.running_var:[32]***bn1.num_batches_tracked:[]***blocks.0.0.conv_dw.weight:[32, 1, 3, 3]***blocks.0.0.bn1.weight:[32]***blocks.0.0.bn1.bias:[32]***blocks.0.0.bn1.running_mean:[32]***blocks.0.0.bn1.running_var:[32]***blocks.0.0.bn1.num_batches_tracked:[]***blocks.0.0.se.conv_reduce.weight:[8, 32, 1, 1]***blocks.0.0.se.conv_reduce.bias:[8]***blocks.0.0.se.conv_expand.weight:[32, 8, 1, 1]***blocks.0.0.se.conv_expand.bias:[32]***blocks.0.0.conv_pw.weight:[16, 32, 1, 1]***blocks.0.0.bn2.weight:[16]***blocks.0.0.bn2.bias:[16]***blocks.0.0.bn2.running_mean:[16]***blocks.0.0.bn2.running_var:[16]***blocks.0.0.bn2.num_batches_tracked:[]***blocks.0.1.conv_dw.weight:[16, 1, 3, 3]***blocks.0.1.bn1.weight:[16]***blocks.0.1.bn1.bias:[16]***blocks.0.1.bn1.running_mean:[16]***blocks.0.1.bn1.running_var:[16]***blocks.0.1.bn1.num_batches_tracked:[]***blocks.0.1.se.conv_reduce.weight:[4, 16, 1, 
1]***blocks.0.1.se.conv_reduce.bias:[4]***blocks.0.1.se.conv_expand.weight:[16, 4, 1, 1]***blocks.0.1.se.conv_expand.bias:[16]***blocks.0.1.conv_pw.weight:[16, 16, 1, 1]***blocks.0.1.bn2.weight:[16]***blocks.0.1.bn2.bias:[16]***blocks.0.1.bn2.running_mean:[16]***blocks.0.1.bn2.running_var:[16]***blocks.0.1.bn2.num_batches_tracked:[]***blocks.1.0.conv_pw.weight:[48, 16, 1, 1]***blocks.1.0.bn1.weight:[48]***blocks.1.0.bn1.bias:[48]***blocks.1.0.bn1.running_mean:[48]***blocks.1.0.bn1.running_var:[48]***blocks.1.0.bn1.num_batches_tracked:[]***blocks.1.0.conv_dw.weight:[48, 1, 3, 3]***blocks.1.0.bn2.weight:[48]***blocks.1.0.bn2.bias:[48]***blocks.1.0.bn2.running_mean:[48]***blocks.1.0.bn2.running_var:[48]***blocks.1.0.bn2.num_batches_tracked:[]***blocks.1.0.se.conv_reduce.weight:[4, 48, 1, 1]***blocks.1.0.se.conv_reduce.bias:[4]***blocks.1.0.se.conv_expand.weight:[48, 4, 1, 1]***blocks.1.0.se.conv_expand.bias:[48]***blocks.1.0.conv_pwl.weight:[12, 48, 1, 1]***blocks.1.0.bn3.weight:[12]***blocks.1.0.bn3.bias:[12]***blocks.1.0.bn3.running_mean:[12]***blocks.1.0.bn3.running_var:[12]***blocks.1.0.bn3.num_batches_tracked:[]***blocks.1.1.conv_pw.weight:[62, 12, 1, 1]***blocks.1.1.bn1.weight:[62]***blocks.1.1.bn1.bias:[62]***blocks.1.1.bn1.running_mean:[62]***blocks.1.1.bn1.running_var:[62]***blocks.1.1.bn1.num_batches_tracked:[]***blocks.1.1.conv_dw.weight:[62, 1, 3, 3]***blocks.1.1.bn2.weight:[62]***blocks.1.1.bn2.bias:[62]***blocks.1.1.bn2.running_mean:[62]***blocks.1.1.bn2.running_var:[62]***blocks.1.1.bn2.num_batches_tracked:[]***blocks.1.1.se.conv_reduce.weight:[6, 62, 1, 1]***blocks.1.1.se.conv_reduce.bias:[6]***blocks.1.1.se.conv_expand.weight:[62, 6, 1, 1]***blocks.1.1.se.conv_expand.bias:[62]***blocks.1.1.conv_pwl.weight:[12, 62, 1, 1]***blocks.1.1.bn3.weight:[12]***blocks.1.1.bn3.bias:[12]***blocks.1.1.bn3.running_mean:[12]***blocks.1.1.bn3.running_var:[12]***blocks.1.1.bn3.num_batches_tracked:[]***blocks.1.2.conv_pw.weight:[48, 12, 1, 1]***blocks.1.2.bn1.weight:[48]***blocks.1.2.bn1.bias:[48]***blocks.1.2.bn1.running_mean:[48]***blocks.1.2.bn1.running_var:[48]***blocks.1.2.bn1.num_batches_tracked:[]***blocks.1.2.conv_dw.weight:[48, 1, 3, 3]***blocks.1.2.bn2.weight:[48]***blocks.1.2.bn2.bias:[48]***blocks.1.2.bn2.running_mean:[48]***blocks.1.2.bn2.running_var:[48]***blocks.1.2.bn2.num_batches_tracked:[]***blocks.1.2.se.conv_reduce.weight:[6, 48, 1, 1]***blocks.1.2.se.conv_reduce.bias:[6]***blocks.1.2.se.conv_expand.weight:[48, 6, 1, 1]***blocks.1.2.se.conv_expand.bias:[48]***blocks.1.2.conv_pwl.weight:[12, 48, 1, 1]***blocks.1.2.bn3.weight:[12]***blocks.1.2.bn3.bias:[12]***blocks.1.2.bn3.running_mean:[12]***blocks.1.2.bn3.running_var:[12]***blocks.1.2.bn3.num_batches_tracked:[]***blocks.2.0.conv_pw.weight:[70, 12, 1, 1]***blocks.2.0.bn1.weight:[70]***blocks.2.0.bn1.bias:[70]***blocks.2.0.bn1.running_mean:[70]***blocks.2.0.bn1.running_var:[70]***blocks.2.0.bn1.num_batches_tracked:[]***blocks.2.0.conv_dw.weight:[70, 1, 5, 5]***blocks.2.0.bn2.weight:[70]***blocks.2.0.bn2.bias:[70]***blocks.2.0.bn2.running_mean:[70]***blocks.2.0.bn2.running_var:[70]***blocks.2.0.bn2.num_batches_tracked:[]***blocks.2.0.se.conv_reduce.weight:[6, 70, 1, 1]***blocks.2.0.se.conv_reduce.bias:[6]***blocks.2.0.se.conv_expand.weight:[70, 6, 1, 1]***blocks.2.0.se.conv_expand.bias:[70]***blocks.2.0.conv_pwl.weight:[35, 70, 1, 
1]***blocks.2.0.bn3.weight:[35]***blocks.2.0.bn3.bias:[35]***blocks.2.0.bn3.running_mean:[35]***blocks.2.0.bn3.running_var:[35]***blocks.2.0.bn3.num_batches_tracked:[]***blocks.2.1.conv_pw.weight:[61, 35, 1, 1]***blocks.2.1.bn1.weight:[61]***blocks.2.1.bn1.bias:[61]***blocks.2.1.bn1.running_mean:[61]***blocks.2.1.bn1.running_var:[61]***blocks.2.1.bn1.num_batches_tracked:[]***blocks.2.1.conv_dw.weight:[61, 1, 5, 5]***blocks.2.1.bn2.weight:[61]***blocks.2.1.bn2.bias:[61]***blocks.2.1.bn2.running_mean:[61]***blocks.2.1.bn2.running_var:[61]***blocks.2.1.bn2.num_batches_tracked:[]***blocks.2.1.se.conv_reduce.weight:[10, 61, 1, 1]***blocks.2.1.se.conv_reduce.bias:[10]***blocks.2.1.se.conv_expand.weight:[61, 10, 1, 1]***blocks.2.1.se.conv_expand.bias:[61]***blocks.2.1.conv_pwl.weight:[35, 61, 1, 1]***blocks.2.1.bn3.weight:[35]***blocks.2.1.bn3.bias:[35]***blocks.2.1.bn3.running_mean:[35]***blocks.2.1.bn3.running_var:[35]***blocks.2.1.bn3.num_batches_tracked:[]***blocks.2.2.conv_pw.weight:[51, 35, 1, 1]***blocks.2.2.bn1.weight:[51]***blocks.2.2.bn1.bias:[51]***blocks.2.2.bn1.running_mean:[51]***blocks.2.2.bn1.running_var:[51]***blocks.2.2.bn1.num_batches_tracked:[]***blocks.2.2.conv_dw.weight:[51, 1, 5, 5]***blocks.2.2.bn2.weight:[51]***blocks.2.2.bn2.bias:[51]***blocks.2.2.bn2.running_mean:[51]***blocks.2.2.bn2.running_var:[51]***blocks.2.2.bn2.num_batches_tracked:[]***blocks.2.2.se.conv_reduce.weight:[10, 51, 1, 1]***blocks.2.2.se.conv_reduce.bias:[10]***blocks.2.2.se.conv_expand.weight:[51, 10, 1, 1]***blocks.2.2.se.conv_expand.bias:[51]***blocks.2.2.conv_pwl.weight:[35, 51, 1, 1]***blocks.2.2.bn3.weight:[35]***blocks.2.2.bn3.bias:[35]***blocks.2.2.bn3.running_mean:[35]***blocks.2.2.bn3.running_var:[35]***blocks.2.2.bn3.num_batches_tracked:[]***blocks.3.0.conv_pw.weight:[175, 35, 1, 1]***blocks.3.0.bn1.weight:[175]***blocks.3.0.bn1.bias:[175]***blocks.3.0.bn1.running_mean:[175]***blocks.3.0.bn1.running_var:[175]***blocks.3.0.bn1.num_batches_tracked:[]***blocks.3.0.conv_dw.weight:[175, 1, 3, 3]***blocks.3.0.bn2.weight:[175]***blocks.3.0.bn2.bias:[175]***blocks.3.0.bn2.running_mean:[175]***blocks.3.0.bn2.running_var:[175]***blocks.3.0.bn2.num_batches_tracked:[]***blocks.3.0.se.conv_reduce.weight:[10, 175, 1, 1]***blocks.3.0.se.conv_reduce.bias:[10]***blocks.3.0.se.conv_expand.weight:[175, 10, 1, 1]***blocks.3.0.se.conv_expand.bias:[175]***blocks.3.0.conv_pwl.weight:[74, 175, 1, 1]***blocks.3.0.bn3.weight:[74]***blocks.3.0.bn3.bias:[74]***blocks.3.0.bn3.running_mean:[74]***blocks.3.0.bn3.running_var:[74]***blocks.3.0.bn3.num_batches_tracked:[]***blocks.3.1.conv_pw.weight:[188, 74, 1, 1]***blocks.3.1.bn1.weight:[188]***blocks.3.1.bn1.bias:[188]***blocks.3.1.bn1.running_mean:[188]***blocks.3.1.bn1.running_var:[188]***blocks.3.1.bn1.num_batches_tracked:[]***blocks.3.1.conv_dw.weight:[188, 1, 3, 3]***blocks.3.1.bn2.weight:[188]***blocks.3.1.bn2.bias:[188]***blocks.3.1.bn2.running_mean:[188]***blocks.3.1.bn2.running_var:[188]***blocks.3.1.bn2.num_batches_tracked:[]***blocks.3.1.se.conv_reduce.weight:[20, 188, 1, 1]***blocks.3.1.se.conv_reduce.bias:[20]***blocks.3.1.se.conv_expand.weight:[188, 20, 1, 1]***blocks.3.1.se.conv_expand.bias:[188]***blocks.3.1.conv_pwl.weight:[74, 188, 1, 1]***blocks.3.1.bn3.weight:[74]***blocks.3.1.bn3.bias:[74]***blocks.3.1.bn3.running_mean:[74]***blocks.3.1.bn3.running_var:[74]***blocks.3.1.bn3.num_batches_tracked:[]***blocks.3.2.conv_pw.weight:[137, 74, 1, 
1]***blocks.3.2.bn1.weight:[137]***blocks.3.2.bn1.bias:[137]***blocks.3.2.bn1.running_mean:[137]***blocks.3.2.bn1.running_var:[137]***blocks.3.2.bn1.num_batches_tracked:[]***blocks.3.2.conv_dw.weight:[137, 1, 3, 3]***blocks.3.2.bn2.weight:[137]***blocks.3.2.bn2.bias:[137]***blocks.3.2.bn2.running_mean:[137]***blocks.3.2.bn2.running_var:[137]***blocks.3.2.bn2.num_batches_tracked:[]***blocks.3.2.se.conv_reduce.weight:[20, 137, 1, 1]***blocks.3.2.se.conv_reduce.bias:[20]***blocks.3.2.se.conv_expand.weight:[137, 20, 1, 1]***blocks.3.2.se.conv_expand.bias:[137]***blocks.3.2.conv_pwl.weight:[74, 137, 1, 1]***blocks.3.2.bn3.weight:[74]***blocks.3.2.bn3.bias:[74]***blocks.3.2.bn3.running_mean:[74]***blocks.3.2.bn3.running_var:[74]***blocks.3.2.bn3.num_batches_tracked:[]***blocks.3.3.conv_pw.weight:[164, 74, 1, 1]***blocks.3.3.bn1.weight:[164]***blocks.3.3.bn1.bias:[164]***blocks.3.3.bn1.running_mean:[164]***blocks.3.3.bn1.running_var:[164]***blocks.3.3.bn1.num_batches_tracked:[]***blocks.3.3.conv_dw.weight:[164, 1, 3, 3]***blocks.3.3.bn2.weight:[164]***blocks.3.3.bn2.bias:[164]***blocks.3.3.bn2.running_mean:[164]***blocks.3.3.bn2.running_var:[164]***blocks.3.3.bn2.num_batches_tracked:[]***blocks.3.3.se.conv_reduce.weight:[20, 164, 1, 1]***blocks.3.3.se.conv_reduce.bias:[20]***blocks.3.3.se.conv_expand.weight:[164, 20, 1, 1]***blocks.3.3.se.conv_expand.bias:[164]***blocks.3.3.conv_pwl.weight:[74, 164, 1, 1]***blocks.3.3.bn3.weight:[74]***blocks.3.3.bn3.bias:[74]***blocks.3.3.bn3.running_mean:[74]***blocks.3.3.bn3.running_var:[74]***blocks.3.3.bn3.num_batches_tracked:[]***blocks.4.0.conv_pw.weight:[399, 74, 1, 1]***blocks.4.0.bn1.weight:[399]***blocks.4.0.bn1.bias:[399]***blocks.4.0.bn1.running_mean:[399]***blocks.4.0.bn1.running_var:[399]***blocks.4.0.bn1.num_batches_tracked:[]***blocks.4.0.conv_dw.weight:[399, 1, 5, 5]***blocks.4.0.bn2.weight:[399]***blocks.4.0.bn2.bias:[399]***blocks.4.0.bn2.running_mean:[399]***blocks.4.0.bn2.running_var:[399]***blocks.4.0.bn2.num_batches_tracked:[]***blocks.4.0.se.conv_reduce.weight:[20, 399, 1, 1]***blocks.4.0.se.conv_reduce.bias:[20]***blocks.4.0.se.conv_expand.weight:[399, 20, 1, 1]***blocks.4.0.se.conv_expand.bias:[399]***blocks.4.0.conv_pwl.weight:[67, 399, 1, 1]***blocks.4.0.bn3.weight:[67]***blocks.4.0.bn3.bias:[67]***blocks.4.0.bn3.running_mean:[67]***blocks.4.0.bn3.running_var:[67]***blocks.4.0.bn3.num_batches_tracked:[]***blocks.4.1.conv_pw.weight:[201, 67, 1, 1]***blocks.4.1.bn1.weight:[201]***blocks.4.1.bn1.bias:[201]***blocks.4.1.bn1.running_mean:[201]***blocks.4.1.bn1.running_var:[201]***blocks.4.1.bn1.num_batches_tracked:[]***blocks.4.1.conv_dw.weight:[201, 1, 5, 5]***blocks.4.1.bn2.weight:[201]***blocks.4.1.bn2.bias:[201]***blocks.4.1.bn2.running_mean:[201]***blocks.4.1.bn2.running_var:[201]***blocks.4.1.bn2.num_batches_tracked:[]***blocks.4.1.se.conv_reduce.weight:[28, 201, 1, 1]***blocks.4.1.se.conv_reduce.bias:[28]***blocks.4.1.se.conv_expand.weight:[201, 28, 1, 1]***blocks.4.1.se.conv_expand.bias:[201]***blocks.4.1.conv_pwl.weight:[67, 201, 1, 1]***blocks.4.1.bn3.weight:[67]***blocks.4.1.bn3.bias:[67]***blocks.4.1.bn3.running_mean:[67]***blocks.4.1.bn3.running_var:[67]***blocks.4.1.bn3.num_batches_tracked:[]***blocks.4.2.conv_pw.weight:[160, 67, 1, 1]***blocks.4.2.bn1.weight:[160]***blocks.4.2.bn1.bias:[160]***blocks.4.2.bn1.running_mean:[160]***blocks.4.2.bn1.running_var:[160]***blocks.4.2.bn1.num_batches_tracked:[]***blocks.4.2.conv_dw.weight:[160, 1, 5, 
5]***blocks.4.2.bn2.weight:[160]***blocks.4.2.bn2.bias:[160]***blocks.4.2.bn2.running_mean:[160]***blocks.4.2.bn2.running_var:[160]***blocks.4.2.bn2.num_batches_tracked:[]***blocks.4.2.se.conv_reduce.weight:[28, 160, 1, 1]***blocks.4.2.se.conv_reduce.bias:[28]***blocks.4.2.se.conv_expand.weight:[160, 28, 1, 1]***blocks.4.2.se.conv_expand.bias:[160]***blocks.4.2.conv_pwl.weight:[67, 160, 1, 1]***blocks.4.2.bn3.weight:[67]***blocks.4.2.bn3.bias:[67]***blocks.4.2.bn3.running_mean:[67]***blocks.4.2.bn3.running_var:[67]***blocks.4.2.bn3.num_batches_tracked:[]***blocks.4.3.conv_pw.weight:[213, 67, 1, 1]***blocks.4.3.bn1.weight:[213]***blocks.4.3.bn1.bias:[213]***blocks.4.3.bn1.running_mean:[213]***blocks.4.3.bn1.running_var:[213]***blocks.4.3.bn1.num_batches_tracked:[]***blocks.4.3.conv_dw.weight:[213, 1, 5, 5]***blocks.4.3.bn2.weight:[213]***blocks.4.3.bn2.bias:[213]***blocks.4.3.bn2.running_mean:[213]***blocks.4.3.bn2.running_var:[213]***blocks.4.3.bn2.num_batches_tracked:[]***blocks.4.3.se.conv_reduce.weight:[28, 213, 1, 1]***blocks.4.3.se.conv_reduce.bias:[28]***blocks.4.3.se.conv_expand.weight:[213, 28, 1, 1]***blocks.4.3.se.conv_expand.bias:[213]***blocks.4.3.conv_pwl.weight:[67, 213, 1, 1]***blocks.4.3.bn3.weight:[67]***blocks.4.3.bn3.bias:[67]***blocks.4.3.bn3.running_mean:[67]***blocks.4.3.bn3.running_var:[67]***blocks.4.3.bn3.num_batches_tracked:[]***blocks.5.0.conv_pw.weight:[637, 67, 1, 1]***blocks.5.0.bn1.weight:[637]***blocks.5.0.bn1.bias:[637]***blocks.5.0.bn1.running_mean:[637]***blocks.5.0.bn1.running_var:[637]***blocks.5.0.bn1.num_batches_tracked:[]***blocks.5.0.conv_dw.weight:[637, 1, 5, 5]***blocks.5.0.bn2.weight:[637]***blocks.5.0.bn2.bias:[637]***blocks.5.0.bn2.running_mean:[637]***blocks.5.0.bn2.running_var:[637]***blocks.5.0.bn2.num_batches_tracked:[]***blocks.5.0.se.conv_reduce.weight:[27, 637, 1, 1]***blocks.5.0.se.conv_reduce.bias:[27]***blocks.5.0.se.conv_expand.weight:[637, 27, 1, 1]***blocks.5.0.se.conv_expand.bias:[637]***blocks.5.0.conv_pwl.weight:[192, 637, 1, 1]***blocks.5.0.bn3.weight:[192]***blocks.5.0.bn3.bias:[192]***blocks.5.0.bn3.running_mean:[192]***blocks.5.0.bn3.running_var:[192]***blocks.5.0.bn3.num_batches_tracked:[]***blocks.5.1.conv_pw.weight:[806, 192, 1, 1]***blocks.5.1.bn1.weight:[806]***blocks.5.1.bn1.bias:[806]***blocks.5.1.bn1.running_mean:[806]***blocks.5.1.bn1.running_var:[806]***blocks.5.1.bn1.num_batches_tracked:[]***blocks.5.1.conv_dw.weight:[806, 1, 5, 5]***blocks.5.1.bn2.weight:[806]***blocks.5.1.bn2.bias:[806]***blocks.5.1.bn2.running_mean:[806]***blocks.5.1.bn2.running_var:[806]***blocks.5.1.bn2.num_batches_tracked:[]***blocks.5.1.se.conv_reduce.weight:[48, 806, 1, 1]***blocks.5.1.se.conv_reduce.bias:[48]***blocks.5.1.se.conv_expand.weight:[806, 48, 1, 1]***blocks.5.1.se.conv_expand.bias:[806]***blocks.5.1.conv_pwl.weight:[192, 806, 1, 1]***blocks.5.1.bn3.weight:[192]***blocks.5.1.bn3.bias:[192]***blocks.5.1.bn3.running_mean:[192]***blocks.5.1.bn3.running_var:[192]***blocks.5.1.bn3.num_batches_tracked:[]***blocks.5.2.conv_pw.weight:[798, 192, 1, 1]***blocks.5.2.bn1.weight:[798]***blocks.5.2.bn1.bias:[798]***blocks.5.2.bn1.running_mean:[798]***blocks.5.2.bn1.running_var:[798]***blocks.5.2.bn1.num_batches_tracked:[]***blocks.5.2.conv_dw.weight:[798, 1, 5, 5]***blocks.5.2.bn2.weight:[798]***blocks.5.2.bn2.bias:[798]***blocks.5.2.bn2.running_mean:[798]***blocks.5.2.bn2.running_var:[798]***blocks.5.2.bn2.num_batches_tracked:[]***blocks.5.2.se.conv_reduce.weight:[48, 798, 1, 
1]***blocks.5.2.se.conv_reduce.bias:[48]***blocks.5.2.se.conv_expand.weight:[798, 48, 1, 1]***blocks.5.2.se.conv_expand.bias:[798]***blocks.5.2.conv_pwl.weight:[192, 798, 1, 1]***blocks.5.2.bn3.weight:[192]***blocks.5.2.bn3.bias:[192]***blocks.5.2.bn3.running_mean:[192]***blocks.5.2.bn3.running_var:[192]***blocks.5.2.bn3.num_batches_tracked:[]***blocks.5.3.conv_pw.weight:[891, 192, 1, 1]***blocks.5.3.bn1.weight:[891]***blocks.5.3.bn1.bias:[891]***blocks.5.3.bn1.running_mean:[891]***blocks.5.3.bn1.running_var:[891]***blocks.5.3.bn1.num_batches_tracked:[]***blocks.5.3.conv_dw.weight:[891, 1, 5, 5]***blocks.5.3.bn2.weight:[891]***blocks.5.3.bn2.bias:[891]***blocks.5.3.bn2.running_mean:[891]***blocks.5.3.bn2.running_var:[891]***blocks.5.3.bn2.num_batches_tracked:[]***blocks.5.3.se.conv_reduce.weight:[48, 891, 1, 1]***blocks.5.3.se.conv_reduce.bias:[48]***blocks.5.3.se.conv_expand.weight:[891, 48, 1, 1]***blocks.5.3.se.conv_expand.bias:[891]***blocks.5.3.conv_pwl.weight:[192, 891, 1, 1]***blocks.5.3.bn3.weight:[192]***blocks.5.3.bn3.bias:[192]***blocks.5.3.bn3.running_mean:[192]***blocks.5.3.bn3.running_var:[192]***blocks.5.3.bn3.num_batches_tracked:[]***blocks.5.4.conv_pw.weight:[990, 192, 1, 1]***blocks.5.4.bn1.weight:[990]***blocks.5.4.bn1.bias:[990]***blocks.5.4.bn1.running_mean:[990]***blocks.5.4.bn1.running_var:[990]***blocks.5.4.bn1.num_batches_tracked:[]***blocks.5.4.conv_dw.weight:[990, 1, 5, 5]***blocks.5.4.bn2.weight:[990]***blocks.5.4.bn2.bias:[990]***blocks.5.4.bn2.running_mean:[990]***blocks.5.4.bn2.running_var:[990]***blocks.5.4.bn2.num_batches_tracked:[]***blocks.5.4.se.conv_reduce.weight:[48, 990, 1, 1]***blocks.5.4.se.conv_reduce.bias:[48]***blocks.5.4.se.conv_expand.weight:[990, 48, 1, 1]***blocks.5.4.se.conv_expand.bias:[990]***blocks.5.4.conv_pwl.weight:[192, 990, 1, 1]***blocks.5.4.bn3.weight:[192]***blocks.5.4.bn3.bias:[192]***blocks.5.4.bn3.running_mean:[192]***blocks.5.4.bn3.running_var:[192]***blocks.5.4.bn3.num_batches_tracked:[]***blocks.6.0.conv_pw.weight:[1152, 192, 1, 1]***blocks.6.0.bn1.weight:[1152]***blocks.6.0.bn1.bias:[1152]***blocks.6.0.bn1.running_mean:[1152]***blocks.6.0.bn1.running_var:[1152]***blocks.6.0.bn1.num_batches_tracked:[]***blocks.6.0.conv_dw.weight:[1152, 1, 3, 3]***blocks.6.0.bn2.weight:[1152]***blocks.6.0.bn2.bias:[1152]***blocks.6.0.bn2.running_mean:[1152]***blocks.6.0.bn2.running_var:[1152]***blocks.6.0.bn2.num_batches_tracked:[]***blocks.6.0.se.conv_reduce.weight:[48, 1152, 1, 1]***blocks.6.0.se.conv_reduce.bias:[48]***blocks.6.0.se.conv_expand.weight:[1152, 48, 1, 1]***blocks.6.0.se.conv_expand.bias:[1152]***blocks.6.0.conv_pwl.weight:[320, 1152, 1, 1]***blocks.6.0.bn3.weight:[320]***blocks.6.0.bn3.bias:[320]***blocks.6.0.bn3.running_mean:[320]***blocks.6.0.bn3.running_var:[320]***blocks.6.0.bn3.num_batches_tracked:[]***blocks.6.1.conv_pw.weight:[1912, 320, 1, 1]***blocks.6.1.bn1.weight:[1912]***blocks.6.1.bn1.bias:[1912]***blocks.6.1.bn1.running_mean:[1912]***blocks.6.1.bn1.running_var:[1912]***blocks.6.1.bn1.num_batches_tracked:[]***blocks.6.1.conv_dw.weight:[1912, 1, 3, 3]***blocks.6.1.bn2.weight:[1912]***blocks.6.1.bn2.bias:[1912]***blocks.6.1.bn2.running_mean:[1912]***blocks.6.1.bn2.running_var:[1912]***blocks.6.1.bn2.num_batches_tracked:[]***blocks.6.1.se.conv_reduce.weight:[80, 1912, 1, 1]***blocks.6.1.se.conv_reduce.bias:[80]***blocks.6.1.se.conv_expand.weight:[1912, 80, 1, 1]***blocks.6.1.se.conv_expand.bias:[1912]***blocks.6.1.conv_pwl.weight:[320, 1912, 1, 
1]***blocks.6.1.bn3.weight:[320]***blocks.6.1.bn3.bias:[320]***blocks.6.1.bn3.running_mean:[320]***blocks.6.1.bn3.running_var:[320]***blocks.6.1.bn3.num_batches_tracked:[]***conv_head.weight:[1280, 320, 1, 1]***bn2.weight:[1280]***bn2.bias:[1280]***bn2.running_mean:[1280]***bn2.running_var:[1280]***bn2.num_batches_tracked:[]***classifier.weight:[1000, 1280]***classifier.bias:[1000] \ No newline at end of file diff --git a/custom_timm/models/pruned/efficientnet_b2_pruned.txt b/custom_timm/models/pruned/efficientnet_b2_pruned.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e3fadee3e9f92eaade96afd8691a5e4437551ee --- /dev/null +++ b/custom_timm/models/pruned/efficientnet_b2_pruned.txt @@ -0,0 +1 @@ +conv_stem.weight:[32, 3, 3, 3]***bn1.weight:[32]***bn1.bias:[32]***bn1.running_mean:[32]***bn1.running_var:[32]***bn1.num_batches_tracked:[]***blocks.0.0.conv_dw.weight:[32, 1, 3, 3]***blocks.0.0.bn1.weight:[32]***blocks.0.0.bn1.bias:[32]***blocks.0.0.bn1.running_mean:[32]***blocks.0.0.bn1.running_var:[32]***blocks.0.0.bn1.num_batches_tracked:[]***blocks.0.0.se.conv_reduce.weight:[8, 32, 1, 1]***blocks.0.0.se.conv_reduce.bias:[8]***blocks.0.0.se.conv_expand.weight:[32, 8, 1, 1]***blocks.0.0.se.conv_expand.bias:[32]***blocks.0.0.conv_pw.weight:[16, 32, 1, 1]***blocks.0.0.bn2.weight:[16]***blocks.0.0.bn2.bias:[16]***blocks.0.0.bn2.running_mean:[16]***blocks.0.0.bn2.running_var:[16]***blocks.0.0.bn2.num_batches_tracked:[]***blocks.0.1.conv_dw.weight:[16, 1, 3, 3]***blocks.0.1.bn1.weight:[16]***blocks.0.1.bn1.bias:[16]***blocks.0.1.bn1.running_mean:[16]***blocks.0.1.bn1.running_var:[16]***blocks.0.1.bn1.num_batches_tracked:[]***blocks.0.1.se.conv_reduce.weight:[4, 16, 1, 1]***blocks.0.1.se.conv_reduce.bias:[4]***blocks.0.1.se.conv_expand.weight:[16, 4, 1, 1]***blocks.0.1.se.conv_expand.bias:[16]***blocks.0.1.conv_pw.weight:[16, 16, 1, 1]***blocks.0.1.bn2.weight:[16]***blocks.0.1.bn2.bias:[16]***blocks.0.1.bn2.running_mean:[16]***blocks.0.1.bn2.running_var:[16]***blocks.0.1.bn2.num_batches_tracked:[]***blocks.1.0.conv_pw.weight:[54, 16, 1, 1]***blocks.1.0.bn1.weight:[54]***blocks.1.0.bn1.bias:[54]***blocks.1.0.bn1.running_mean:[54]***blocks.1.0.bn1.running_var:[54]***blocks.1.0.bn1.num_batches_tracked:[]***blocks.1.0.conv_dw.weight:[54, 1, 3, 3]***blocks.1.0.bn2.weight:[54]***blocks.1.0.bn2.bias:[54]***blocks.1.0.bn2.running_mean:[54]***blocks.1.0.bn2.running_var:[54]***blocks.1.0.bn2.num_batches_tracked:[]***blocks.1.0.se.conv_reduce.weight:[4, 54, 1, 1]***blocks.1.0.se.conv_reduce.bias:[4]***blocks.1.0.se.conv_expand.weight:[54, 4, 1, 1]***blocks.1.0.se.conv_expand.bias:[54]***blocks.1.0.conv_pwl.weight:[17, 54, 1, 1]***blocks.1.0.bn3.weight:[17]***blocks.1.0.bn3.bias:[17]***blocks.1.0.bn3.running_mean:[17]***blocks.1.0.bn3.running_var:[17]***blocks.1.0.bn3.num_batches_tracked:[]***blocks.1.1.conv_pw.weight:[69, 17, 1, 1]***blocks.1.1.bn1.weight:[69]***blocks.1.1.bn1.bias:[69]***blocks.1.1.bn1.running_mean:[69]***blocks.1.1.bn1.running_var:[69]***blocks.1.1.bn1.num_batches_tracked:[]***blocks.1.1.conv_dw.weight:[69, 1, 3, 3]***blocks.1.1.bn2.weight:[69]***blocks.1.1.bn2.bias:[69]***blocks.1.1.bn2.running_mean:[69]***blocks.1.1.bn2.running_var:[69]***blocks.1.1.bn2.num_batches_tracked:[]***blocks.1.1.se.conv_reduce.weight:[6, 69, 1, 1]***blocks.1.1.se.conv_reduce.bias:[6]***blocks.1.1.se.conv_expand.weight:[69, 6, 1, 1]***blocks.1.1.se.conv_expand.bias:[69]***blocks.1.1.conv_pwl.weight:[17, 69, 1, 
1]***blocks.1.1.bn3.weight:[17]***blocks.1.1.bn3.bias:[17]***blocks.1.1.bn3.running_mean:[17]***blocks.1.1.bn3.running_var:[17]***blocks.1.1.bn3.num_batches_tracked:[]***blocks.1.2.conv_pw.weight:[61, 17, 1, 1]***blocks.1.2.bn1.weight:[61]***blocks.1.2.bn1.bias:[61]***blocks.1.2.bn1.running_mean:[61]***blocks.1.2.bn1.running_var:[61]***blocks.1.2.bn1.num_batches_tracked:[]***blocks.1.2.conv_dw.weight:[61, 1, 3, 3]***blocks.1.2.bn2.weight:[61]***blocks.1.2.bn2.bias:[61]***blocks.1.2.bn2.running_mean:[61]***blocks.1.2.bn2.running_var:[61]***blocks.1.2.bn2.num_batches_tracked:[]***blocks.1.2.se.conv_reduce.weight:[6, 61, 1, 1]***blocks.1.2.se.conv_reduce.bias:[6]***blocks.1.2.se.conv_expand.weight:[61, 6, 1, 1]***blocks.1.2.se.conv_expand.bias:[61]***blocks.1.2.conv_pwl.weight:[17, 61, 1, 1]***blocks.1.2.bn3.weight:[17]***blocks.1.2.bn3.bias:[17]***blocks.1.2.bn3.running_mean:[17]***blocks.1.2.bn3.running_var:[17]***blocks.1.2.bn3.num_batches_tracked:[]***blocks.2.0.conv_pw.weight:[86, 17, 1, 1]***blocks.2.0.bn1.weight:[86]***blocks.2.0.bn1.bias:[86]***blocks.2.0.bn1.running_mean:[86]***blocks.2.0.bn1.running_var:[86]***blocks.2.0.bn1.num_batches_tracked:[]***blocks.2.0.conv_dw.weight:[86, 1, 5, 5]***blocks.2.0.bn2.weight:[86]***blocks.2.0.bn2.bias:[86]***blocks.2.0.bn2.running_mean:[86]***blocks.2.0.bn2.running_var:[86]***blocks.2.0.bn2.num_batches_tracked:[]***blocks.2.0.se.conv_reduce.weight:[6, 86, 1, 1]***blocks.2.0.se.conv_reduce.bias:[6]***blocks.2.0.se.conv_expand.weight:[86, 6, 1, 1]***blocks.2.0.se.conv_expand.bias:[86]***blocks.2.0.conv_pwl.weight:[42, 86, 1, 1]***blocks.2.0.bn3.weight:[42]***blocks.2.0.bn3.bias:[42]***blocks.2.0.bn3.running_mean:[42]***blocks.2.0.bn3.running_var:[42]***blocks.2.0.bn3.num_batches_tracked:[]***blocks.2.1.conv_pw.weight:[72, 42, 1, 1]***blocks.2.1.bn1.weight:[72]***blocks.2.1.bn1.bias:[72]***blocks.2.1.bn1.running_mean:[72]***blocks.2.1.bn1.running_var:[72]***blocks.2.1.bn1.num_batches_tracked:[]***blocks.2.1.conv_dw.weight:[72, 1, 5, 5]***blocks.2.1.bn2.weight:[72]***blocks.2.1.bn2.bias:[72]***blocks.2.1.bn2.running_mean:[72]***blocks.2.1.bn2.running_var:[72]***blocks.2.1.bn2.num_batches_tracked:[]***blocks.2.1.se.conv_reduce.weight:[12, 72, 1, 1]***blocks.2.1.se.conv_reduce.bias:[12]***blocks.2.1.se.conv_expand.weight:[72, 12, 1, 1]***blocks.2.1.se.conv_expand.bias:[72]***blocks.2.1.conv_pwl.weight:[42, 72, 1, 1]***blocks.2.1.bn3.weight:[42]***blocks.2.1.bn3.bias:[42]***blocks.2.1.bn3.running_mean:[42]***blocks.2.1.bn3.running_var:[42]***blocks.2.1.bn3.num_batches_tracked:[]***blocks.2.2.conv_pw.weight:[98, 42, 1, 1]***blocks.2.2.bn1.weight:[98]***blocks.2.2.bn1.bias:[98]***blocks.2.2.bn1.running_mean:[98]***blocks.2.2.bn1.running_var:[98]***blocks.2.2.bn1.num_batches_tracked:[]***blocks.2.2.conv_dw.weight:[98, 1, 5, 5]***blocks.2.2.bn2.weight:[98]***blocks.2.2.bn2.bias:[98]***blocks.2.2.bn2.running_mean:[98]***blocks.2.2.bn2.running_var:[98]***blocks.2.2.bn2.num_batches_tracked:[]***blocks.2.2.se.conv_reduce.weight:[12, 98, 1, 1]***blocks.2.2.se.conv_reduce.bias:[12]***blocks.2.2.se.conv_expand.weight:[98, 12, 1, 1]***blocks.2.2.se.conv_expand.bias:[98]***blocks.2.2.conv_pwl.weight:[42, 98, 1, 1]***blocks.2.2.bn3.weight:[42]***blocks.2.2.bn3.bias:[42]***blocks.2.2.bn3.running_mean:[42]***blocks.2.2.bn3.running_var:[42]***blocks.2.2.bn3.num_batches_tracked:[]***blocks.3.0.conv_pw.weight:[245, 42, 1, 
1]***blocks.3.0.bn1.weight:[245]***blocks.3.0.bn1.bias:[245]***blocks.3.0.bn1.running_mean:[245]***blocks.3.0.bn1.running_var:[245]***blocks.3.0.bn1.num_batches_tracked:[]***blocks.3.0.conv_dw.weight:[245, 1, 3, 3]***blocks.3.0.bn2.weight:[245]***blocks.3.0.bn2.bias:[245]***blocks.3.0.bn2.running_mean:[245]***blocks.3.0.bn2.running_var:[245]***blocks.3.0.bn2.num_batches_tracked:[]***blocks.3.0.se.conv_reduce.weight:[12, 245, 1, 1]***blocks.3.0.se.conv_reduce.bias:[12]***blocks.3.0.se.conv_expand.weight:[245, 12, 1, 1]***blocks.3.0.se.conv_expand.bias:[245]***blocks.3.0.conv_pwl.weight:[85, 245, 1, 1]***blocks.3.0.bn3.weight:[85]***blocks.3.0.bn3.bias:[85]***blocks.3.0.bn3.running_mean:[85]***blocks.3.0.bn3.running_var:[85]***blocks.3.0.bn3.num_batches_tracked:[]***blocks.3.1.conv_pw.weight:[274, 85, 1, 1]***blocks.3.1.bn1.weight:[274]***blocks.3.1.bn1.bias:[274]***blocks.3.1.bn1.running_mean:[274]***blocks.3.1.bn1.running_var:[274]***blocks.3.1.bn1.num_batches_tracked:[]***blocks.3.1.conv_dw.weight:[274, 1, 3, 3]***blocks.3.1.bn2.weight:[274]***blocks.3.1.bn2.bias:[274]***blocks.3.1.bn2.running_mean:[274]***blocks.3.1.bn2.running_var:[274]***blocks.3.1.bn2.num_batches_tracked:[]***blocks.3.1.se.conv_reduce.weight:[22, 274, 1, 1]***blocks.3.1.se.conv_reduce.bias:[22]***blocks.3.1.se.conv_expand.weight:[274, 22, 1, 1]***blocks.3.1.se.conv_expand.bias:[274]***blocks.3.1.conv_pwl.weight:[85, 274, 1, 1]***blocks.3.1.bn3.weight:[85]***blocks.3.1.bn3.bias:[85]***blocks.3.1.bn3.running_mean:[85]***blocks.3.1.bn3.running_var:[85]***blocks.3.1.bn3.num_batches_tracked:[]***blocks.3.2.conv_pw.weight:[254, 85, 1, 1]***blocks.3.2.bn1.weight:[254]***blocks.3.2.bn1.bias:[254]***blocks.3.2.bn1.running_mean:[254]***blocks.3.2.bn1.running_var:[254]***blocks.3.2.bn1.num_batches_tracked:[]***blocks.3.2.conv_dw.weight:[254, 1, 3, 3]***blocks.3.2.bn2.weight:[254]***blocks.3.2.bn2.bias:[254]***blocks.3.2.bn2.running_mean:[254]***blocks.3.2.bn2.running_var:[254]***blocks.3.2.bn2.num_batches_tracked:[]***blocks.3.2.se.conv_reduce.weight:[22, 254, 1, 1]***blocks.3.2.se.conv_reduce.bias:[22]***blocks.3.2.se.conv_expand.weight:[254, 22, 1, 1]***blocks.3.2.se.conv_expand.bias:[254]***blocks.3.2.conv_pwl.weight:[85, 254, 1, 1]***blocks.3.2.bn3.weight:[85]***blocks.3.2.bn3.bias:[85]***blocks.3.2.bn3.running_mean:[85]***blocks.3.2.bn3.running_var:[85]***blocks.3.2.bn3.num_batches_tracked:[]***blocks.3.3.conv_pw.weight:[292, 85, 1, 1]***blocks.3.3.bn1.weight:[292]***blocks.3.3.bn1.bias:[292]***blocks.3.3.bn1.running_mean:[292]***blocks.3.3.bn1.running_var:[292]***blocks.3.3.bn1.num_batches_tracked:[]***blocks.3.3.conv_dw.weight:[292, 1, 3, 3]***blocks.3.3.bn2.weight:[292]***blocks.3.3.bn2.bias:[292]***blocks.3.3.bn2.running_mean:[292]***blocks.3.3.bn2.running_var:[292]***blocks.3.3.bn2.num_batches_tracked:[]***blocks.3.3.se.conv_reduce.weight:[22, 292, 1, 1]***blocks.3.3.se.conv_reduce.bias:[22]***blocks.3.3.se.conv_expand.weight:[292, 22, 1, 1]***blocks.3.3.se.conv_expand.bias:[292]***blocks.3.3.conv_pwl.weight:[85, 292, 1, 1]***blocks.3.3.bn3.weight:[85]***blocks.3.3.bn3.bias:[85]***blocks.3.3.bn3.running_mean:[85]***blocks.3.3.bn3.running_var:[85]***blocks.3.3.bn3.num_batches_tracked:[]***blocks.4.0.conv_pw.weight:[502, 85, 1, 1]***blocks.4.0.bn1.weight:[502]***blocks.4.0.bn1.bias:[502]***blocks.4.0.bn1.running_mean:[502]***blocks.4.0.bn1.running_var:[502]***blocks.4.0.bn1.num_batches_tracked:[]***blocks.4.0.conv_dw.weight:[502, 1, 5, 
5]***blocks.4.0.bn2.weight:[502]***blocks.4.0.bn2.bias:[502]***blocks.4.0.bn2.running_mean:[502]***blocks.4.0.bn2.running_var:[502]***blocks.4.0.bn2.num_batches_tracked:[]***blocks.4.0.se.conv_reduce.weight:[22, 502, 1, 1]***blocks.4.0.se.conv_reduce.bias:[22]***blocks.4.0.se.conv_expand.weight:[502, 22, 1, 1]***blocks.4.0.se.conv_expand.bias:[502]***blocks.4.0.conv_pwl.weight:[116, 502, 1, 1]***blocks.4.0.bn3.weight:[116]***blocks.4.0.bn3.bias:[116]***blocks.4.0.bn3.running_mean:[116]***blocks.4.0.bn3.running_var:[116]***blocks.4.0.bn3.num_batches_tracked:[]***blocks.4.1.conv_pw.weight:[315, 116, 1, 1]***blocks.4.1.bn1.weight:[315]***blocks.4.1.bn1.bias:[315]***blocks.4.1.bn1.running_mean:[315]***blocks.4.1.bn1.running_var:[315]***blocks.4.1.bn1.num_batches_tracked:[]***blocks.4.1.conv_dw.weight:[315, 1, 5, 5]***blocks.4.1.bn2.weight:[315]***blocks.4.1.bn2.bias:[315]***blocks.4.1.bn2.running_mean:[315]***blocks.4.1.bn2.running_var:[315]***blocks.4.1.bn2.num_batches_tracked:[]***blocks.4.1.se.conv_reduce.weight:[30, 315, 1, 1]***blocks.4.1.se.conv_reduce.bias:[30]***blocks.4.1.se.conv_expand.weight:[315, 30, 1, 1]***blocks.4.1.se.conv_expand.bias:[315]***blocks.4.1.conv_pwl.weight:[116, 315, 1, 1]***blocks.4.1.bn3.weight:[116]***blocks.4.1.bn3.bias:[116]***blocks.4.1.bn3.running_mean:[116]***blocks.4.1.bn3.running_var:[116]***blocks.4.1.bn3.num_batches_tracked:[]***blocks.4.2.conv_pw.weight:[354, 116, 1, 1]***blocks.4.2.bn1.weight:[354]***blocks.4.2.bn1.bias:[354]***blocks.4.2.bn1.running_mean:[354]***blocks.4.2.bn1.running_var:[354]***blocks.4.2.bn1.num_batches_tracked:[]***blocks.4.2.conv_dw.weight:[354, 1, 5, 5]***blocks.4.2.bn2.weight:[354]***blocks.4.2.bn2.bias:[354]***blocks.4.2.bn2.running_mean:[354]***blocks.4.2.bn2.running_var:[354]***blocks.4.2.bn2.num_batches_tracked:[]***blocks.4.2.se.conv_reduce.weight:[30, 354, 1, 1]***blocks.4.2.se.conv_reduce.bias:[30]***blocks.4.2.se.conv_expand.weight:[354, 30, 1, 1]***blocks.4.2.se.conv_expand.bias:[354]***blocks.4.2.conv_pwl.weight:[116, 354, 1, 1]***blocks.4.2.bn3.weight:[116]***blocks.4.2.bn3.bias:[116]***blocks.4.2.bn3.running_mean:[116]***blocks.4.2.bn3.running_var:[116]***blocks.4.2.bn3.num_batches_tracked:[]***blocks.4.3.conv_pw.weight:[443, 116, 1, 1]***blocks.4.3.bn1.weight:[443]***blocks.4.3.bn1.bias:[443]***blocks.4.3.bn1.running_mean:[443]***blocks.4.3.bn1.running_var:[443]***blocks.4.3.bn1.num_batches_tracked:[]***blocks.4.3.conv_dw.weight:[443, 1, 5, 5]***blocks.4.3.bn2.weight:[443]***blocks.4.3.bn2.bias:[443]***blocks.4.3.bn2.running_mean:[443]***blocks.4.3.bn2.running_var:[443]***blocks.4.3.bn2.num_batches_tracked:[]***blocks.4.3.se.conv_reduce.weight:[30, 443, 1, 1]***blocks.4.3.se.conv_reduce.bias:[30]***blocks.4.3.se.conv_expand.weight:[443, 30, 1, 1]***blocks.4.3.se.conv_expand.bias:[443]***blocks.4.3.conv_pwl.weight:[116, 443, 1, 1]***blocks.4.3.bn3.weight:[116]***blocks.4.3.bn3.bias:[116]***blocks.4.3.bn3.running_mean:[116]***blocks.4.3.bn3.running_var:[116]***blocks.4.3.bn3.num_batches_tracked:[]***blocks.5.0.conv_pw.weight:[719, 116, 1, 1]***blocks.5.0.bn1.weight:[719]***blocks.5.0.bn1.bias:[719]***blocks.5.0.bn1.running_mean:[719]***blocks.5.0.bn1.running_var:[719]***blocks.5.0.bn1.num_batches_tracked:[]***blocks.5.0.conv_dw.weight:[719, 1, 5, 5]***blocks.5.0.bn2.weight:[719]***blocks.5.0.bn2.bias:[719]***blocks.5.0.bn2.running_mean:[719]***blocks.5.0.bn2.running_var:[719]***blocks.5.0.bn2.num_batches_tracked:[]***blocks.5.0.se.conv_reduce.weight:[30, 719, 1, 
1]***blocks.5.0.se.conv_reduce.bias:[30]***blocks.5.0.se.conv_expand.weight:[719, 30, 1, 1]***blocks.5.0.se.conv_expand.bias:[719]***blocks.5.0.conv_pwl.weight:[208, 719, 1, 1]***blocks.5.0.bn3.weight:[208]***blocks.5.0.bn3.bias:[208]***blocks.5.0.bn3.running_mean:[208]***blocks.5.0.bn3.running_var:[208]***blocks.5.0.bn3.num_batches_tracked:[]***blocks.5.1.conv_pw.weight:[1148, 208, 1, 1]***blocks.5.1.bn1.weight:[1148]***blocks.5.1.bn1.bias:[1148]***blocks.5.1.bn1.running_mean:[1148]***blocks.5.1.bn1.running_var:[1148]***blocks.5.1.bn1.num_batches_tracked:[]***blocks.5.1.conv_dw.weight:[1148, 1, 5, 5]***blocks.5.1.bn2.weight:[1148]***blocks.5.1.bn2.bias:[1148]***blocks.5.1.bn2.running_mean:[1148]***blocks.5.1.bn2.running_var:[1148]***blocks.5.1.bn2.num_batches_tracked:[]***blocks.5.1.se.conv_reduce.weight:[52, 1148, 1, 1]***blocks.5.1.se.conv_reduce.bias:[52]***blocks.5.1.se.conv_expand.weight:[1148, 52, 1, 1]***blocks.5.1.se.conv_expand.bias:[1148]***blocks.5.1.conv_pwl.weight:[208, 1148, 1, 1]***blocks.5.1.bn3.weight:[208]***blocks.5.1.bn3.bias:[208]***blocks.5.1.bn3.running_mean:[208]***blocks.5.1.bn3.running_var:[208]***blocks.5.1.bn3.num_batches_tracked:[]***blocks.5.2.conv_pw.weight:[1160, 208, 1, 1]***blocks.5.2.bn1.weight:[1160]***blocks.5.2.bn1.bias:[1160]***blocks.5.2.bn1.running_mean:[1160]***blocks.5.2.bn1.running_var:[1160]***blocks.5.2.bn1.num_batches_tracked:[]***blocks.5.2.conv_dw.weight:[1160, 1, 5, 5]***blocks.5.2.bn2.weight:[1160]***blocks.5.2.bn2.bias:[1160]***blocks.5.2.bn2.running_mean:[1160]***blocks.5.2.bn2.running_var:[1160]***blocks.5.2.bn2.num_batches_tracked:[]***blocks.5.2.se.conv_reduce.weight:[52, 1160, 1, 1]***blocks.5.2.se.conv_reduce.bias:[52]***blocks.5.2.se.conv_expand.weight:[1160, 52, 1, 1]***blocks.5.2.se.conv_expand.bias:[1160]***blocks.5.2.conv_pwl.weight:[208, 1160, 1, 1]***blocks.5.2.bn3.weight:[208]***blocks.5.2.bn3.bias:[208]***blocks.5.2.bn3.running_mean:[208]***blocks.5.2.bn3.running_var:[208]***blocks.5.2.bn3.num_batches_tracked:[]***blocks.5.3.conv_pw.weight:[1182, 208, 1, 1]***blocks.5.3.bn1.weight:[1182]***blocks.5.3.bn1.bias:[1182]***blocks.5.3.bn1.running_mean:[1182]***blocks.5.3.bn1.running_var:[1182]***blocks.5.3.bn1.num_batches_tracked:[]***blocks.5.3.conv_dw.weight:[1182, 1, 5, 5]***blocks.5.3.bn2.weight:[1182]***blocks.5.3.bn2.bias:[1182]***blocks.5.3.bn2.running_mean:[1182]***blocks.5.3.bn2.running_var:[1182]***blocks.5.3.bn2.num_batches_tracked:[]***blocks.5.3.se.conv_reduce.weight:[52, 1182, 1, 1]***blocks.5.3.se.conv_reduce.bias:[52]***blocks.5.3.se.conv_expand.weight:[1182, 52, 1, 1]***blocks.5.3.se.conv_expand.bias:[1182]***blocks.5.3.conv_pwl.weight:[208, 1182, 1, 1]***blocks.5.3.bn3.weight:[208]***blocks.5.3.bn3.bias:[208]***blocks.5.3.bn3.running_mean:[208]***blocks.5.3.bn3.running_var:[208]***blocks.5.3.bn3.num_batches_tracked:[]***blocks.5.4.conv_pw.weight:[1228, 208, 1, 1]***blocks.5.4.bn1.weight:[1228]***blocks.5.4.bn1.bias:[1228]***blocks.5.4.bn1.running_mean:[1228]***blocks.5.4.bn1.running_var:[1228]***blocks.5.4.bn1.num_batches_tracked:[]***blocks.5.4.conv_dw.weight:[1228, 1, 5, 5]***blocks.5.4.bn2.weight:[1228]***blocks.5.4.bn2.bias:[1228]***blocks.5.4.bn2.running_mean:[1228]***blocks.5.4.bn2.running_var:[1228]***blocks.5.4.bn2.num_batches_tracked:[]***blocks.5.4.se.conv_reduce.weight:[52, 1228, 1, 1]***blocks.5.4.se.conv_reduce.bias:[52]***blocks.5.4.se.conv_expand.weight:[1228, 52, 1, 1]***blocks.5.4.se.conv_expand.bias:[1228]***blocks.5.4.conv_pwl.weight:[208, 1228, 1, 
1]***blocks.5.4.bn3.weight:[208]***blocks.5.4.bn3.bias:[208]***blocks.5.4.bn3.running_mean:[208]***blocks.5.4.bn3.running_var:[208]***blocks.5.4.bn3.num_batches_tracked:[]***blocks.6.0.conv_pw.weight:[1248, 208, 1, 1]***blocks.6.0.bn1.weight:[1248]***blocks.6.0.bn1.bias:[1248]***blocks.6.0.bn1.running_mean:[1248]***blocks.6.0.bn1.running_var:[1248]***blocks.6.0.bn1.num_batches_tracked:[]***blocks.6.0.conv_dw.weight:[1248, 1, 3, 3]***blocks.6.0.bn2.weight:[1248]***blocks.6.0.bn2.bias:[1248]***blocks.6.0.bn2.running_mean:[1248]***blocks.6.0.bn2.running_var:[1248]***blocks.6.0.bn2.num_batches_tracked:[]***blocks.6.0.se.conv_reduce.weight:[52, 1248, 1, 1]***blocks.6.0.se.conv_reduce.bias:[52]***blocks.6.0.se.conv_expand.weight:[1248, 52, 1, 1]***blocks.6.0.se.conv_expand.bias:[1248]***blocks.6.0.conv_pwl.weight:[352, 1248, 1, 1]***blocks.6.0.bn3.weight:[352]***blocks.6.0.bn3.bias:[352]***blocks.6.0.bn3.running_mean:[352]***blocks.6.0.bn3.running_var:[352]***blocks.6.0.bn3.num_batches_tracked:[]***blocks.6.1.conv_pw.weight:[2112, 352, 1, 1]***blocks.6.1.bn1.weight:[2112]***blocks.6.1.bn1.bias:[2112]***blocks.6.1.bn1.running_mean:[2112]***blocks.6.1.bn1.running_var:[2112]***blocks.6.1.bn1.num_batches_tracked:[]***blocks.6.1.conv_dw.weight:[2112, 1, 3, 3]***blocks.6.1.bn2.weight:[2112]***blocks.6.1.bn2.bias:[2112]***blocks.6.1.bn2.running_mean:[2112]***blocks.6.1.bn2.running_var:[2112]***blocks.6.1.bn2.num_batches_tracked:[]***blocks.6.1.se.conv_reduce.weight:[88, 2112, 1, 1]***blocks.6.1.se.conv_reduce.bias:[88]***blocks.6.1.se.conv_expand.weight:[2112, 88, 1, 1]***blocks.6.1.se.conv_expand.bias:[2112]***blocks.6.1.conv_pwl.weight:[352, 2112, 1, 1]***blocks.6.1.bn3.weight:[352]***blocks.6.1.bn3.bias:[352]***blocks.6.1.bn3.running_mean:[352]***blocks.6.1.bn3.running_var:[352]***blocks.6.1.bn3.num_batches_tracked:[]***conv_head.weight:[1408, 352, 1, 1]***bn2.weight:[1408]***bn2.bias:[1408]***bn2.running_mean:[1408]***bn2.running_var:[1408]***bn2.num_batches_tracked:[]***classifier.weight:[1000, 1408]***classifier.bias:[1000] \ No newline at end of file diff --git a/custom_timm/models/pruned/efficientnet_b3_pruned.txt b/custom_timm/models/pruned/efficientnet_b3_pruned.txt new file mode 100644 index 0000000000000000000000000000000000000000..489781736de08e5cf40bf76528a735fff4a3f61c --- /dev/null +++ b/custom_timm/models/pruned/efficientnet_b3_pruned.txt @@ -0,0 +1 @@ +conv_stem.weight:[40, 3, 3, 3]***bn1.weight:[40]***bn1.bias:[40]***bn1.running_mean:[40]***bn1.running_var:[40]***bn1.num_batches_tracked:[]***blocks.0.0.conv_dw.weight:[40, 1, 3, 3]***blocks.0.0.bn1.weight:[40]***blocks.0.0.bn1.bias:[40]***blocks.0.0.bn1.running_mean:[40]***blocks.0.0.bn1.running_var:[40]***blocks.0.0.bn1.num_batches_tracked:[]***blocks.0.0.se.conv_reduce.weight:[10, 40, 1, 1]***blocks.0.0.se.conv_reduce.bias:[10]***blocks.0.0.se.conv_expand.weight:[40, 10, 1, 1]***blocks.0.0.se.conv_expand.bias:[40]***blocks.0.0.conv_pw.weight:[24, 40, 1, 1]***blocks.0.0.bn2.weight:[24]***blocks.0.0.bn2.bias:[24]***blocks.0.0.bn2.running_mean:[24]***blocks.0.0.bn2.running_var:[24]***blocks.0.0.bn2.num_batches_tracked:[]***blocks.0.1.conv_dw.weight:[24, 1, 3, 3]***blocks.0.1.bn1.weight:[24]***blocks.0.1.bn1.bias:[24]***blocks.0.1.bn1.running_mean:[24]***blocks.0.1.bn1.running_var:[24]***blocks.0.1.bn1.num_batches_tracked:[]***blocks.0.1.se.conv_reduce.weight:[6, 24, 1, 1]***blocks.0.1.se.conv_reduce.bias:[6]***blocks.0.1.se.conv_expand.weight:[24, 6, 1, 1]***blocks.0.1.se.conv_expand.bias:[24]***blocks.0.1.conv_pw.weight:[24, 24, 1, 
1]***blocks.0.1.bn2.weight:[24]***blocks.0.1.bn2.bias:[24]***blocks.0.1.bn2.running_mean:[24]***blocks.0.1.bn2.running_var:[24]***blocks.0.1.bn2.num_batches_tracked:[]***blocks.1.0.conv_pw.weight:[27, 24, 1, 1]***blocks.1.0.bn1.weight:[27]***blocks.1.0.bn1.bias:[27]***blocks.1.0.bn1.running_mean:[27]***blocks.1.0.bn1.running_var:[27]***blocks.1.0.bn1.num_batches_tracked:[]***blocks.1.0.conv_dw.weight:[27, 1, 3, 3]***blocks.1.0.bn2.weight:[27]***blocks.1.0.bn2.bias:[27]***blocks.1.0.bn2.running_mean:[27]***blocks.1.0.bn2.running_var:[27]***blocks.1.0.bn2.num_batches_tracked:[]***blocks.1.0.se.conv_reduce.weight:[6, 27, 1, 1]***blocks.1.0.se.conv_reduce.bias:[6]***blocks.1.0.se.conv_expand.weight:[27, 6, 1, 1]***blocks.1.0.se.conv_expand.bias:[27]***blocks.1.0.conv_pwl.weight:[12, 27, 1, 1]***blocks.1.0.bn3.weight:[12]***blocks.1.0.bn3.bias:[12]***blocks.1.0.bn3.running_mean:[12]***blocks.1.0.bn3.running_var:[12]***blocks.1.0.bn3.num_batches_tracked:[]***blocks.1.1.conv_pw.weight:[49, 12, 1, 1]***blocks.1.1.bn1.weight:[49]***blocks.1.1.bn1.bias:[49]***blocks.1.1.bn1.running_mean:[49]***blocks.1.1.bn1.running_var:[49]***blocks.1.1.bn1.num_batches_tracked:[]***blocks.1.1.conv_dw.weight:[49, 1, 3, 3]***blocks.1.1.bn2.weight:[49]***blocks.1.1.bn2.bias:[49]***blocks.1.1.bn2.running_mean:[49]***blocks.1.1.bn2.running_var:[49]***blocks.1.1.bn2.num_batches_tracked:[]***blocks.1.1.se.conv_reduce.weight:[8, 49, 1, 1]***blocks.1.1.se.conv_reduce.bias:[8]***blocks.1.1.se.conv_expand.weight:[49, 8, 1, 1]***blocks.1.1.se.conv_expand.bias:[49]***blocks.1.1.conv_pwl.weight:[12, 49, 1, 1]***blocks.1.1.bn3.weight:[12]***blocks.1.1.bn3.bias:[12]***blocks.1.1.bn3.running_mean:[12]***blocks.1.1.bn3.running_var:[12]***blocks.1.1.bn3.num_batches_tracked:[]***blocks.1.2.conv_pw.weight:[48, 12, 1, 1]***blocks.1.2.bn1.weight:[48]***blocks.1.2.bn1.bias:[48]***blocks.1.2.bn1.running_mean:[48]***blocks.1.2.bn1.running_var:[48]***blocks.1.2.bn1.num_batches_tracked:[]***blocks.1.2.conv_dw.weight:[48, 1, 3, 3]***blocks.1.2.bn2.weight:[48]***blocks.1.2.bn2.bias:[48]***blocks.1.2.bn2.running_mean:[48]***blocks.1.2.bn2.running_var:[48]***blocks.1.2.bn2.num_batches_tracked:[]***blocks.1.2.se.conv_reduce.weight:[8, 48, 1, 1]***blocks.1.2.se.conv_reduce.bias:[8]***blocks.1.2.se.conv_expand.weight:[48, 8, 1, 1]***blocks.1.2.se.conv_expand.bias:[48]***blocks.1.2.conv_pwl.weight:[12, 48, 1, 1]***blocks.1.2.bn3.weight:[12]***blocks.1.2.bn3.bias:[12]***blocks.1.2.bn3.running_mean:[12]***blocks.1.2.bn3.running_var:[12]***blocks.1.2.bn3.num_batches_tracked:[]***blocks.2.0.conv_pw.weight:[83, 12, 1, 1]***blocks.2.0.bn1.weight:[83]***blocks.2.0.bn1.bias:[83]***blocks.2.0.bn1.running_mean:[83]***blocks.2.0.bn1.running_var:[83]***blocks.2.0.bn1.num_batches_tracked:[]***blocks.2.0.conv_dw.weight:[83, 1, 5, 5]***blocks.2.0.bn2.weight:[83]***blocks.2.0.bn2.bias:[83]***blocks.2.0.bn2.running_mean:[83]***blocks.2.0.bn2.running_var:[83]***blocks.2.0.bn2.num_batches_tracked:[]***blocks.2.0.se.conv_reduce.weight:[8, 83, 1, 1]***blocks.2.0.se.conv_reduce.bias:[8]***blocks.2.0.se.conv_expand.weight:[83, 8, 1, 1]***blocks.2.0.se.conv_expand.bias:[83]***blocks.2.0.conv_pwl.weight:[40, 83, 1, 1]***blocks.2.0.bn3.weight:[40]***blocks.2.0.bn3.bias:[40]***blocks.2.0.bn3.running_mean:[40]***blocks.2.0.bn3.running_var:[40]***blocks.2.0.bn3.num_batches_tracked:[]***blocks.2.1.conv_pw.weight:[90, 40, 1, 
1]***blocks.2.1.bn1.weight:[90]***blocks.2.1.bn1.bias:[90]***blocks.2.1.bn1.running_mean:[90]***blocks.2.1.bn1.running_var:[90]***blocks.2.1.bn1.num_batches_tracked:[]***blocks.2.1.conv_dw.weight:[90, 1, 5, 5]***blocks.2.1.bn2.weight:[90]***blocks.2.1.bn2.bias:[90]***blocks.2.1.bn2.running_mean:[90]***blocks.2.1.bn2.running_var:[90]***blocks.2.1.bn2.num_batches_tracked:[]***blocks.2.1.se.conv_reduce.weight:[12, 90, 1, 1]***blocks.2.1.se.conv_reduce.bias:[12]***blocks.2.1.se.conv_expand.weight:[90, 12, 1, 1]***blocks.2.1.se.conv_expand.bias:[90]***blocks.2.1.conv_pwl.weight:[40, 90, 1, 1]***blocks.2.1.bn3.weight:[40]***blocks.2.1.bn3.bias:[40]***blocks.2.1.bn3.running_mean:[40]***blocks.2.1.bn3.running_var:[40]***blocks.2.1.bn3.num_batches_tracked:[]***blocks.2.2.conv_pw.weight:[85, 40, 1, 1]***blocks.2.2.bn1.weight:[85]***blocks.2.2.bn1.bias:[85]***blocks.2.2.bn1.running_mean:[85]***blocks.2.2.bn1.running_var:[85]***blocks.2.2.bn1.num_batches_tracked:[]***blocks.2.2.conv_dw.weight:[85, 1, 5, 5]***blocks.2.2.bn2.weight:[85]***blocks.2.2.bn2.bias:[85]***blocks.2.2.bn2.running_mean:[85]***blocks.2.2.bn2.running_var:[85]***blocks.2.2.bn2.num_batches_tracked:[]***blocks.2.2.se.conv_reduce.weight:[12, 85, 1, 1]***blocks.2.2.se.conv_reduce.bias:[12]***blocks.2.2.se.conv_expand.weight:[85, 12, 1, 1]***blocks.2.2.se.conv_expand.bias:[85]***blocks.2.2.conv_pwl.weight:[40, 85, 1, 1]***blocks.2.2.bn3.weight:[40]***blocks.2.2.bn3.bias:[40]***blocks.2.2.bn3.running_mean:[40]***blocks.2.2.bn3.running_var:[40]***blocks.2.2.bn3.num_batches_tracked:[]***blocks.3.0.conv_pw.weight:[215, 40, 1, 1]***blocks.3.0.bn1.weight:[215]***blocks.3.0.bn1.bias:[215]***blocks.3.0.bn1.running_mean:[215]***blocks.3.0.bn1.running_var:[215]***blocks.3.0.bn1.num_batches_tracked:[]***blocks.3.0.conv_dw.weight:[215, 1, 3, 3]***blocks.3.0.bn2.weight:[215]***blocks.3.0.bn2.bias:[215]***blocks.3.0.bn2.running_mean:[215]***blocks.3.0.bn2.running_var:[215]***blocks.3.0.bn2.num_batches_tracked:[]***blocks.3.0.se.conv_reduce.weight:[12, 215, 1, 1]***blocks.3.0.se.conv_reduce.bias:[12]***blocks.3.0.se.conv_expand.weight:[215, 12, 1, 1]***blocks.3.0.se.conv_expand.bias:[215]***blocks.3.0.conv_pwl.weight:[93, 215, 1, 1]***blocks.3.0.bn3.weight:[93]***blocks.3.0.bn3.bias:[93]***blocks.3.0.bn3.running_mean:[93]***blocks.3.0.bn3.running_var:[93]***blocks.3.0.bn3.num_batches_tracked:[]***blocks.3.1.conv_pw.weight:[261, 93, 1, 1]***blocks.3.1.bn1.weight:[261]***blocks.3.1.bn1.bias:[261]***blocks.3.1.bn1.running_mean:[261]***blocks.3.1.bn1.running_var:[261]***blocks.3.1.bn1.num_batches_tracked:[]***blocks.3.1.conv_dw.weight:[261, 1, 3, 3]***blocks.3.1.bn2.weight:[261]***blocks.3.1.bn2.bias:[261]***blocks.3.1.bn2.running_mean:[261]***blocks.3.1.bn2.running_var:[261]***blocks.3.1.bn2.num_batches_tracked:[]***blocks.3.1.se.conv_reduce.weight:[24, 261, 1, 1]***blocks.3.1.se.conv_reduce.bias:[24]***blocks.3.1.se.conv_expand.weight:[261, 24, 1, 1]***blocks.3.1.se.conv_expand.bias:[261]***blocks.3.1.conv_pwl.weight:[93, 261, 1, 1]***blocks.3.1.bn3.weight:[93]***blocks.3.1.bn3.bias:[93]***blocks.3.1.bn3.running_mean:[93]***blocks.3.1.bn3.running_var:[93]***blocks.3.1.bn3.num_batches_tracked:[]***blocks.3.2.conv_pw.weight:[219, 93, 1, 1]***blocks.3.2.bn1.weight:[219]***blocks.3.2.bn1.bias:[219]***blocks.3.2.bn1.running_mean:[219]***blocks.3.2.bn1.running_var:[219]***blocks.3.2.bn1.num_batches_tracked:[]***blocks.3.2.conv_dw.weight:[219, 1, 3, 
3]***blocks.3.2.bn2.weight:[219]***blocks.3.2.bn2.bias:[219]***blocks.3.2.bn2.running_mean:[219]***blocks.3.2.bn2.running_var:[219]***blocks.3.2.bn2.num_batches_tracked:[]***blocks.3.2.se.conv_reduce.weight:[24, 219, 1, 1]***blocks.3.2.se.conv_reduce.bias:[24]***blocks.3.2.se.conv_expand.weight:[219, 24, 1, 1]***blocks.3.2.se.conv_expand.bias:[219]***blocks.3.2.conv_pwl.weight:[93, 219, 1, 1]***blocks.3.2.bn3.weight:[93]***blocks.3.2.bn3.bias:[93]***blocks.3.2.bn3.running_mean:[93]***blocks.3.2.bn3.running_var:[93]***blocks.3.2.bn3.num_batches_tracked:[]***blocks.3.3.conv_pw.weight:[254, 93, 1, 1]***blocks.3.3.bn1.weight:[254]***blocks.3.3.bn1.bias:[254]***blocks.3.3.bn1.running_mean:[254]***blocks.3.3.bn1.running_var:[254]***blocks.3.3.bn1.num_batches_tracked:[]***blocks.3.3.conv_dw.weight:[254, 1, 3, 3]***blocks.3.3.bn2.weight:[254]***blocks.3.3.bn2.bias:[254]***blocks.3.3.bn2.running_mean:[254]***blocks.3.3.bn2.running_var:[254]***blocks.3.3.bn2.num_batches_tracked:[]***blocks.3.3.se.conv_reduce.weight:[24, 254, 1, 1]***blocks.3.3.se.conv_reduce.bias:[24]***blocks.3.3.se.conv_expand.weight:[254, 24, 1, 1]***blocks.3.3.se.conv_expand.bias:[254]***blocks.3.3.conv_pwl.weight:[93, 254, 1, 1]***blocks.3.3.bn3.weight:[93]***blocks.3.3.bn3.bias:[93]***blocks.3.3.bn3.running_mean:[93]***blocks.3.3.bn3.running_var:[93]***blocks.3.3.bn3.num_batches_tracked:[]***blocks.3.4.conv_pw.weight:[236, 93, 1, 1]***blocks.3.4.bn1.weight:[236]***blocks.3.4.bn1.bias:[236]***blocks.3.4.bn1.running_mean:[236]***blocks.3.4.bn1.running_var:[236]***blocks.3.4.bn1.num_batches_tracked:[]***blocks.3.4.conv_dw.weight:[236, 1, 3, 3]***blocks.3.4.bn2.weight:[236]***blocks.3.4.bn2.bias:[236]***blocks.3.4.bn2.running_mean:[236]***blocks.3.4.bn2.running_var:[236]***blocks.3.4.bn2.num_batches_tracked:[]***blocks.3.4.se.conv_reduce.weight:[24, 236, 1, 1]***blocks.3.4.se.conv_reduce.bias:[24]***blocks.3.4.se.conv_expand.weight:[236, 24, 1, 1]***blocks.3.4.se.conv_expand.bias:[236]***blocks.3.4.conv_pwl.weight:[93, 236, 1, 1]***blocks.3.4.bn3.weight:[93]***blocks.3.4.bn3.bias:[93]***blocks.3.4.bn3.running_mean:[93]***blocks.3.4.bn3.running_var:[93]***blocks.3.4.bn3.num_batches_tracked:[]***blocks.4.0.conv_pw.weight:[480, 93, 1, 1]***blocks.4.0.bn1.weight:[480]***blocks.4.0.bn1.bias:[480]***blocks.4.0.bn1.running_mean:[480]***blocks.4.0.bn1.running_var:[480]***blocks.4.0.bn1.num_batches_tracked:[]***blocks.4.0.conv_dw.weight:[480, 1, 5, 5]***blocks.4.0.bn2.weight:[480]***blocks.4.0.bn2.bias:[480]***blocks.4.0.bn2.running_mean:[480]***blocks.4.0.bn2.running_var:[480]***blocks.4.0.bn2.num_batches_tracked:[]***blocks.4.0.se.conv_reduce.weight:[24, 480, 1, 1]***blocks.4.0.se.conv_reduce.bias:[24]***blocks.4.0.se.conv_expand.weight:[480, 24, 1, 1]***blocks.4.0.se.conv_expand.bias:[480]***blocks.4.0.conv_pwl.weight:[120, 480, 1, 1]***blocks.4.0.bn3.weight:[120]***blocks.4.0.bn3.bias:[120]***blocks.4.0.bn3.running_mean:[120]***blocks.4.0.bn3.running_var:[120]***blocks.4.0.bn3.num_batches_tracked:[]***blocks.4.1.conv_pw.weight:[235, 120, 1, 1]***blocks.4.1.bn1.weight:[235]***blocks.4.1.bn1.bias:[235]***blocks.4.1.bn1.running_mean:[235]***blocks.4.1.bn1.running_var:[235]***blocks.4.1.bn1.num_batches_tracked:[]***blocks.4.1.conv_dw.weight:[235, 1, 5, 5]***blocks.4.1.bn2.weight:[235]***blocks.4.1.bn2.bias:[235]***blocks.4.1.bn2.running_mean:[235]***blocks.4.1.bn2.running_var:[235]***blocks.4.1.bn2.num_batches_tracked:[]***blocks.4.1.se.conv_reduce.weight:[34, 235, 1, 
1]***blocks.4.1.se.conv_reduce.bias:[34]***blocks.4.1.se.conv_expand.weight:[235, 34, 1, 1]***blocks.4.1.se.conv_expand.bias:[235]***blocks.4.1.conv_pwl.weight:[120, 235, 1, 1]***blocks.4.1.bn3.weight:[120]***blocks.4.1.bn3.bias:[120]***blocks.4.1.bn3.running_mean:[120]***blocks.4.1.bn3.running_var:[120]***blocks.4.1.bn3.num_batches_tracked:[]***blocks.4.2.conv_pw.weight:[217, 120, 1, 1]***blocks.4.2.bn1.weight:[217]***blocks.4.2.bn1.bias:[217]***blocks.4.2.bn1.running_mean:[217]***blocks.4.2.bn1.running_var:[217]***blocks.4.2.bn1.num_batches_tracked:[]***blocks.4.2.conv_dw.weight:[217, 1, 5, 5]***blocks.4.2.bn2.weight:[217]***blocks.4.2.bn2.bias:[217]***blocks.4.2.bn2.running_mean:[217]***blocks.4.2.bn2.running_var:[217]***blocks.4.2.bn2.num_batches_tracked:[]***blocks.4.2.se.conv_reduce.weight:[34, 217, 1, 1]***blocks.4.2.se.conv_reduce.bias:[34]***blocks.4.2.se.conv_expand.weight:[217, 34, 1, 1]***blocks.4.2.se.conv_expand.bias:[217]***blocks.4.2.conv_pwl.weight:[120, 217, 1, 1]***blocks.4.2.bn3.weight:[120]***blocks.4.2.bn3.bias:[120]***blocks.4.2.bn3.running_mean:[120]***blocks.4.2.bn3.running_var:[120]***blocks.4.2.bn3.num_batches_tracked:[]***blocks.4.3.conv_pw.weight:[226, 120, 1, 1]***blocks.4.3.bn1.weight:[226]***blocks.4.3.bn1.bias:[226]***blocks.4.3.bn1.running_mean:[226]***blocks.4.3.bn1.running_var:[226]***blocks.4.3.bn1.num_batches_tracked:[]***blocks.4.3.conv_dw.weight:[226, 1, 5, 5]***blocks.4.3.bn2.weight:[226]***blocks.4.3.bn2.bias:[226]***blocks.4.3.bn2.running_mean:[226]***blocks.4.3.bn2.running_var:[226]***blocks.4.3.bn2.num_batches_tracked:[]***blocks.4.3.se.conv_reduce.weight:[33, 226, 1, 1]***blocks.4.3.se.conv_reduce.bias:[33]***blocks.4.3.se.conv_expand.weight:[226, 33, 1, 1]***blocks.4.3.se.conv_expand.bias:[226]***blocks.4.3.conv_pwl.weight:[120, 226, 1, 1]***blocks.4.3.bn3.weight:[120]***blocks.4.3.bn3.bias:[120]***blocks.4.3.bn3.running_mean:[120]***blocks.4.3.bn3.running_var:[120]***blocks.4.3.bn3.num_batches_tracked:[]***blocks.4.4.conv_pw.weight:[340, 120, 1, 1]***blocks.4.4.bn1.weight:[340]***blocks.4.4.bn1.bias:[340]***blocks.4.4.bn1.running_mean:[340]***blocks.4.4.bn1.running_var:[340]***blocks.4.4.bn1.num_batches_tracked:[]***blocks.4.4.conv_dw.weight:[340, 1, 5, 5]***blocks.4.4.bn2.weight:[340]***blocks.4.4.bn2.bias:[340]***blocks.4.4.bn2.running_mean:[340]***blocks.4.4.bn2.running_var:[340]***blocks.4.4.bn2.num_batches_tracked:[]***blocks.4.4.se.conv_reduce.weight:[34, 340, 1, 1]***blocks.4.4.se.conv_reduce.bias:[34]***blocks.4.4.se.conv_expand.weight:[340, 34, 1, 1]***blocks.4.4.se.conv_expand.bias:[340]***blocks.4.4.conv_pwl.weight:[120, 340, 1, 1]***blocks.4.4.bn3.weight:[120]***blocks.4.4.bn3.bias:[120]***blocks.4.4.bn3.running_mean:[120]***blocks.4.4.bn3.running_var:[120]***blocks.4.4.bn3.num_batches_tracked:[]***blocks.5.0.conv_pw.weight:[802, 120, 1, 1]***blocks.5.0.bn1.weight:[802]***blocks.5.0.bn1.bias:[802]***blocks.5.0.bn1.running_mean:[802]***blocks.5.0.bn1.running_var:[802]***blocks.5.0.bn1.num_batches_tracked:[]***blocks.5.0.conv_dw.weight:[802, 1, 5, 5]***blocks.5.0.bn2.weight:[802]***blocks.5.0.bn2.bias:[802]***blocks.5.0.bn2.running_mean:[802]***blocks.5.0.bn2.running_var:[802]***blocks.5.0.bn2.num_batches_tracked:[]***blocks.5.0.se.conv_reduce.weight:[34, 802, 1, 1]***blocks.5.0.se.conv_reduce.bias:[34]***blocks.5.0.se.conv_expand.weight:[802, 34, 1, 1]***blocks.5.0.se.conv_expand.bias:[802]***blocks.5.0.conv_pwl.weight:[232, 802, 1, 
1]***blocks.5.0.bn3.weight:[232]***blocks.5.0.bn3.bias:[232]***blocks.5.0.bn3.running_mean:[232]***blocks.5.0.bn3.running_var:[232]***blocks.5.0.bn3.num_batches_tracked:[]***blocks.5.1.conv_pw.weight:[1030, 232, 1, 1]***blocks.5.1.bn1.weight:[1030]***blocks.5.1.bn1.bias:[1030]***blocks.5.1.bn1.running_mean:[1030]***blocks.5.1.bn1.running_var:[1030]***blocks.5.1.bn1.num_batches_tracked:[]***blocks.5.1.conv_dw.weight:[1030, 1, 5, 5]***blocks.5.1.bn2.weight:[1030]***blocks.5.1.bn2.bias:[1030]***blocks.5.1.bn2.running_mean:[1030]***blocks.5.1.bn2.running_var:[1030]***blocks.5.1.bn2.num_batches_tracked:[]***blocks.5.1.se.conv_reduce.weight:[58, 1030, 1, 1]***blocks.5.1.se.conv_reduce.bias:[58]***blocks.5.1.se.conv_expand.weight:[1030, 58, 1, 1]***blocks.5.1.se.conv_expand.bias:[1030]***blocks.5.1.conv_pwl.weight:[232, 1030, 1, 1]***blocks.5.1.bn3.weight:[232]***blocks.5.1.bn3.bias:[232]***blocks.5.1.bn3.running_mean:[232]***blocks.5.1.bn3.running_var:[232]***blocks.5.1.bn3.num_batches_tracked:[]***blocks.5.2.conv_pw.weight:[924, 232, 1, 1]***blocks.5.2.bn1.weight:[924]***blocks.5.2.bn1.bias:[924]***blocks.5.2.bn1.running_mean:[924]***blocks.5.2.bn1.running_var:[924]***blocks.5.2.bn1.num_batches_tracked:[]***blocks.5.2.conv_dw.weight:[924, 1, 5, 5]***blocks.5.2.bn2.weight:[924]***blocks.5.2.bn2.bias:[924]***blocks.5.2.bn2.running_mean:[924]***blocks.5.2.bn2.running_var:[924]***blocks.5.2.bn2.num_batches_tracked:[]***blocks.5.2.se.conv_reduce.weight:[58, 924, 1, 1]***blocks.5.2.se.conv_reduce.bias:[58]***blocks.5.2.se.conv_expand.weight:[924, 58, 1, 1]***blocks.5.2.se.conv_expand.bias:[924]***blocks.5.2.conv_pwl.weight:[232, 924, 1, 1]***blocks.5.2.bn3.weight:[232]***blocks.5.2.bn3.bias:[232]***blocks.5.2.bn3.running_mean:[232]***blocks.5.2.bn3.running_var:[232]***blocks.5.2.bn3.num_batches_tracked:[]***blocks.5.3.conv_pw.weight:[1016, 232, 1, 1]***blocks.5.3.bn1.weight:[1016]***blocks.5.3.bn1.bias:[1016]***blocks.5.3.bn1.running_mean:[1016]***blocks.5.3.bn1.running_var:[1016]***blocks.5.3.bn1.num_batches_tracked:[]***blocks.5.3.conv_dw.weight:[1016, 1, 5, 5]***blocks.5.3.bn2.weight:[1016]***blocks.5.3.bn2.bias:[1016]***blocks.5.3.bn2.running_mean:[1016]***blocks.5.3.bn2.running_var:[1016]***blocks.5.3.bn2.num_batches_tracked:[]***blocks.5.3.se.conv_reduce.weight:[58, 1016, 1, 1]***blocks.5.3.se.conv_reduce.bias:[58]***blocks.5.3.se.conv_expand.weight:[1016, 58, 1, 1]***blocks.5.3.se.conv_expand.bias:[1016]***blocks.5.3.conv_pwl.weight:[232, 1016, 1, 1]***blocks.5.3.bn3.weight:[232]***blocks.5.3.bn3.bias:[232]***blocks.5.3.bn3.running_mean:[232]***blocks.5.3.bn3.running_var:[232]***blocks.5.3.bn3.num_batches_tracked:[]***blocks.5.4.conv_pw.weight:[1130, 232, 1, 1]***blocks.5.4.bn1.weight:[1130]***blocks.5.4.bn1.bias:[1130]***blocks.5.4.bn1.running_mean:[1130]***blocks.5.4.bn1.running_var:[1130]***blocks.5.4.bn1.num_batches_tracked:[]***blocks.5.4.conv_dw.weight:[1130, 1, 5, 5]***blocks.5.4.bn2.weight:[1130]***blocks.5.4.bn2.bias:[1130]***blocks.5.4.bn2.running_mean:[1130]***blocks.5.4.bn2.running_var:[1130]***blocks.5.4.bn2.num_batches_tracked:[]***blocks.5.4.se.conv_reduce.weight:[58, 1130, 1, 1]***blocks.5.4.se.conv_reduce.bias:[58]***blocks.5.4.se.conv_expand.weight:[1130, 58, 1, 1]***blocks.5.4.se.conv_expand.bias:[1130]***blocks.5.4.conv_pwl.weight:[232, 1130, 1, 1]***blocks.5.4.bn3.weight:[232]***blocks.5.4.bn3.bias:[232]***blocks.5.4.bn3.running_mean:[232]***blocks.5.4.bn3.running_var:[232]***blocks.5.4.bn3.num_batches_tracked:[]***blocks.5.5.conv_pw.weight:[1266, 232, 1, 
1]***blocks.5.5.bn1.weight:[1266]***blocks.5.5.bn1.bias:[1266]***blocks.5.5.bn1.running_mean:[1266]***blocks.5.5.bn1.running_var:[1266]***blocks.5.5.bn1.num_batches_tracked:[]***blocks.5.5.conv_dw.weight:[1266, 1, 5, 5]***blocks.5.5.bn2.weight:[1266]***blocks.5.5.bn2.bias:[1266]***blocks.5.5.bn2.running_mean:[1266]***blocks.5.5.bn2.running_var:[1266]***blocks.5.5.bn2.num_batches_tracked:[]***blocks.5.5.se.conv_reduce.weight:[58, 1266, 1, 1]***blocks.5.5.se.conv_reduce.bias:[58]***blocks.5.5.se.conv_expand.weight:[1266, 58, 1, 1]***blocks.5.5.se.conv_expand.bias:[1266]***blocks.5.5.conv_pwl.weight:[232, 1266, 1, 1]***blocks.5.5.bn3.weight:[232]***blocks.5.5.bn3.bias:[232]***blocks.5.5.bn3.running_mean:[232]***blocks.5.5.bn3.running_var:[232]***blocks.5.5.bn3.num_batches_tracked:[]***blocks.6.0.conv_pw.weight:[1392, 232, 1, 1]***blocks.6.0.bn1.weight:[1392]***blocks.6.0.bn1.bias:[1392]***blocks.6.0.bn1.running_mean:[1392]***blocks.6.0.bn1.running_var:[1392]***blocks.6.0.bn1.num_batches_tracked:[]***blocks.6.0.conv_dw.weight:[1392, 1, 3, 3]***blocks.6.0.bn2.weight:[1392]***blocks.6.0.bn2.bias:[1392]***blocks.6.0.bn2.running_mean:[1392]***blocks.6.0.bn2.running_var:[1392]***blocks.6.0.bn2.num_batches_tracked:[]***blocks.6.0.se.conv_reduce.weight:[58, 1392, 1, 1]***blocks.6.0.se.conv_reduce.bias:[58]***blocks.6.0.se.conv_expand.weight:[1392, 58, 1, 1]***blocks.6.0.se.conv_expand.bias:[1392]***blocks.6.0.conv_pwl.weight:[384, 1392, 1, 1]***blocks.6.0.bn3.weight:[384]***blocks.6.0.bn3.bias:[384]***blocks.6.0.bn3.running_mean:[384]***blocks.6.0.bn3.running_var:[384]***blocks.6.0.bn3.num_batches_tracked:[]***blocks.6.1.conv_pw.weight:[2301, 384, 1, 1]***blocks.6.1.bn1.weight:[2301]***blocks.6.1.bn1.bias:[2301]***blocks.6.1.bn1.running_mean:[2301]***blocks.6.1.bn1.running_var:[2301]***blocks.6.1.bn1.num_batches_tracked:[]***blocks.6.1.conv_dw.weight:[2301, 1, 3, 3]***blocks.6.1.bn2.weight:[2301]***blocks.6.1.bn2.bias:[2301]***blocks.6.1.bn2.running_mean:[2301]***blocks.6.1.bn2.running_var:[2301]***blocks.6.1.bn2.num_batches_tracked:[]***blocks.6.1.se.conv_reduce.weight:[96, 2301, 1, 1]***blocks.6.1.se.conv_reduce.bias:[96]***blocks.6.1.se.conv_expand.weight:[2301, 96, 1, 1]***blocks.6.1.se.conv_expand.bias:[2301]***blocks.6.1.conv_pwl.weight:[384, 2301, 1, 1]***blocks.6.1.bn3.weight:[384]***blocks.6.1.bn3.bias:[384]***blocks.6.1.bn3.running_mean:[384]***blocks.6.1.bn3.running_var:[384]***blocks.6.1.bn3.num_batches_tracked:[]***conv_head.weight:[1536, 384, 1, 1]***bn2.weight:[1536]***bn2.bias:[1536]***bn2.running_mean:[1536]***bn2.running_var:[1536]***bn2.num_batches_tracked:[]***classifier.weight:[1000, 1536]***classifier.bias:[1000] \ No newline at end of file diff --git a/custom_timm/models/pvt_v2.py b/custom_timm/models/pvt_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..6e024f43c05c624fada3b682b7efedbf41e51008 --- /dev/null +++ b/custom_timm/models/pvt_v2.py @@ -0,0 +1,476 @@ +""" Pyramid Vision Transformer v2 + +@misc{wang2021pvtv2, + title={PVTv2: Improved Baselines with Pyramid Vision Transformer}, + author={Wenhai Wang and Enze Xie and Xiang Li and Deng-Ping Fan and Kaitao Song and Ding Liang and + Tong Lu and Ping Luo and Ling Shao}, + year={2021}, + eprint={2106.13797}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} + +Based on Apache 2.0 licensed code at https://github.com/whai362/PVT + +Modifications and timm support by / Copyright 2022, Ross Wightman +""" + +import math +from functools import partial +from typing import Tuple, List, Callable, Union + 
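+# Layout of this module: OverlapPatchEmbed produces overlapping patch tokens,
+# MlpWithDepthwiseConv and Attention form the transformer Block, PyramidVisionTransformerStage
+# stacks blocks behind an optional downsampling patch embed, and PyramidVisionTransformerV2
+# wires the stages (four by default) into a hierarchical backbone with an optional classification head.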
+import torch +import torch.nn as nn +import torch.utils.checkpoint as checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import DropPath, to_2tuple, to_ntuple, trunc_normal_ +from .registry import register_model + +__all__ = ['PyramidVisionTransformerV2'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.9, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', 'fixed_input_size': False, + **kwargs + } + + +default_cfgs = { + 'pvt_v2_b0': _cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b0.pth'), + 'pvt_v2_b1': _cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b1.pth'), + 'pvt_v2_b2': _cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2.pth'), + 'pvt_v2_b3': _cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b3.pth'), + 'pvt_v2_b4': _cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b4.pth'), + 'pvt_v2_b5': _cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b5.pth'), + 'pvt_v2_b2_li': _cfg(url='https://github.com/whai362/PVT/releases/download/v2/pvt_v2_b2_li.pth') +} + + +class MlpWithDepthwiseConv(nn.Module): + def __init__( + self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, + drop=0., extra_relu=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.relu = nn.ReLU() if extra_relu else nn.Identity() + self.dwconv = nn.Conv2d(hidden_features, hidden_features, 3, 1, 1, bias=True, groups=hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x, feat_size: List[int]): + x = self.fc1(x) + B, N, C = x.shape + x = x.transpose(1, 2).view(B, C, feat_size[0], feat_size[1]) + x = self.relu(x) + x = self.dwconv(x) + x = x.flatten(2).transpose(1, 2) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(nn.Module): + def __init__( + self, + dim, + num_heads=8, + sr_ratio=1, + linear_attn=False, + qkv_bias=True, + attn_drop=0., + proj_drop=0. + ): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
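+ # The modules set up below select between PVTv2's two key/value reduction schemes:
+ # - linear_attn=False with sr_ratio > 1: a strided conv ("sr") plus LayerNorm downsamples
+ #   the token map before K/V are computed, cutting the number of key/value tokens by
+ #   roughly sr_ratio**2.
+ # - linear_attn=True: K/V always come from a 7x7 adaptive average pool followed by a 1x1
+ #   conv, LayerNorm and GELU (the linear SRA variant described in the PVTv2 paper).
+ # Queries are always taken from the full-resolution token sequence.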
+ + self.dim = dim + self.num_heads = num_heads + self.head_dim = dim // num_heads + self.scale = self.head_dim ** -0.5 + + self.q = nn.Linear(dim, dim, bias=qkv_bias) + self.kv = nn.Linear(dim, dim * 2, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + if not linear_attn: + self.pool = None + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + else: + self.sr = None + self.norm = None + self.act = None + else: + self.pool = nn.AdaptiveAvgPool2d(7) + self.sr = nn.Conv2d(dim, dim, kernel_size=1, stride=1) + self.norm = nn.LayerNorm(dim) + self.act = nn.GELU() + + def forward(self, x, feat_size: List[int]): + B, N, C = x.shape + H, W = feat_size + q = self.q(x).reshape(B, N, self.num_heads, -1).permute(0, 2, 1, 3) + + if self.pool is not None: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(self.pool(x_)).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + x_ = self.act(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + else: + if self.sr is not None: + x_ = x.permute(0, 2, 1).reshape(B, C, H, W) + x_ = self.sr(x_).reshape(B, C, -1).permute(0, 2, 1) + x_ = self.norm(x_) + kv = self.kv(x_).reshape(B, -1, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + else: + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + k, v = kv.unbind(0) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + + def __init__( + self, dim, num_heads, mlp_ratio=4., sr_ratio=1, linear_attn=False, qkv_bias=False, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + sr_ratio=sr_ratio, + linear_attn=linear_attn, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop, + ) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = MlpWithDepthwiseConv( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + act_layer=act_layer, + drop=drop, + extra_relu=linear_attn + ) + + def forward(self, x, feat_size: List[int]): + x = x + self.drop_path(self.attn(self.norm1(x), feat_size)) + x = x + self.drop_path(self.mlp(self.norm2(x), feat_size)) + + return x + + +class OverlapPatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + def __init__(self, patch_size=7, stride=4, in_chans=3, embed_dim=768): + super().__init__() + patch_size = to_2tuple(patch_size) + assert max(patch_size) > stride, "Set larger patch_size than stride" + self.patch_size = patch_size + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=patch_size, stride=stride, + padding=(patch_size[0] // 2, patch_size[1] // 2)) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x): + x = self.proj(x) + feat_size = x.shape[-2:] + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + return x, feat_size + + +class PyramidVisionTransformerStage(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + depth: int, + downsample: bool = True, + num_heads: int = 8, + sr_ratio: int = 1, + linear_attn: bool = False, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + drop: float = 0., + attn_drop: float = 0., + drop_path: Union[List[float], float] = 0.0, + norm_layer: Callable = nn.LayerNorm, + ): + super().__init__() + self.grad_checkpointing = False + + if downsample: + self.downsample = OverlapPatchEmbed( + patch_size=3, + stride=2, + in_chans=dim, + embed_dim=dim_out) + else: + assert dim == dim_out + self.downsample = None + + self.blocks = nn.ModuleList([Block( + dim=dim_out, + num_heads=num_heads, + sr_ratio=sr_ratio, + linear_attn=linear_attn, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + ) for i in range(depth)]) + + self.norm = norm_layer(dim_out) + + def forward(self, x, feat_size: List[int]) -> Tuple[torch.Tensor, List[int]]: + if self.downsample is not None: + x, feat_size = self.downsample(x) + for blk in self.blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint.checkpoint(blk, x, feat_size) + else: + x = blk(x, feat_size) + x = self.norm(x) + x = x.reshape(x.shape[0], feat_size[0], feat_size[1], -1).permute(0, 3, 1, 2).contiguous() + return x, feat_size + + +class PyramidVisionTransformerV2(nn.Module): + def __init__( + self, + img_size=None, + in_chans=3, + num_classes=1000, + global_pool='avg', + depths=(3, 4, 6, 3), + embed_dims=(64, 128, 256, 512), + num_heads=(1, 2, 4, 8), + sr_ratios=(8, 4, 2, 1), + mlp_ratios=(8., 8., 4., 4.), + qkv_bias=True, + linear=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.num_classes = num_classes + assert global_pool in ('avg', '') + self.global_pool = global_pool + self.depths = depths + num_stages = len(depths) + mlp_ratios = to_ntuple(num_stages)(mlp_ratios) + num_heads = to_ntuple(num_stages)(num_heads) + sr_ratios = to_ntuple(num_stages)(sr_ratios) + assert(len(embed_dims)) == num_stages + + self.patch_embed = OverlapPatchEmbed( + patch_size=7, + stride=4, + in_chans=in_chans, + embed_dim=embed_dims[0]) + + dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)] + cur = 0 + prev_dim = embed_dims[0] + self.stages = nn.ModuleList() + for i in 
range(num_stages): + self.stages.append(PyramidVisionTransformerStage( + dim=prev_dim, + dim_out=embed_dims[i], + depth=depths[i], + downsample=i > 0, + num_heads=num_heads[i], + sr_ratio=sr_ratios[i], + mlp_ratio=mlp_ratios[i], + linear_attn=linear, + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer + )) + prev_dim = embed_dims[i] + cur += depths[i] + + # classification head + self.num_features = embed_dims[-1] + self.head = nn.Linear(embed_dims[-1], num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def freeze_patch_emb(self): + self.patch_embed.requires_grad = False + + @torch.jit.ignore + def no_weight_decay(self): + return {} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^patch_embed', # stem and embed + blocks=r'^stages\.(\d+)' + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('avg', '') + self.global_pool = global_pool + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x, feat_size = self.patch_embed(x) + for stage in self.stages: + x, feat_size = stage(x, feat_size=feat_size) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool: + x = x.mean(dim=(-1, -2)) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _checkpoint_filter_fn(state_dict, model): + """ Remap original checkpoints -> timm """ + if 'patch_embed.proj.weight' in state_dict: + return state_dict # non-original checkpoint, no remapping needed + + out_dict = {} + import re + for k, v in state_dict.items(): + if k.startswith('patch_embed'): + k = k.replace('patch_embed1', 'patch_embed') + k = k.replace('patch_embed2', 'stages.1.downsample') + k = k.replace('patch_embed3', 'stages.2.downsample') + k = k.replace('patch_embed4', 'stages.3.downsample') + k = k.replace('dwconv.dwconv', 'dwconv') + k = re.sub(r'block(\d+).(\d+)', lambda x: f'stages.{int(x.group(1)) - 1}.blocks.{x.group(2)}', k) + k = re.sub(r'^norm(\d+)', lambda x: f'stages.{int(x.group(1)) - 1}.norm', k) + out_dict[k] = v + return out_dict + + +def _create_pvt2(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + model = build_model_with_cfg( + PyramidVisionTransformerV2, variant, pretrained, + pretrained_filter_fn=_checkpoint_filter_fn, + **kwargs + ) + return model + + +@register_model +def pvt_v2_b0(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(2, 2, 2, 2), embed_dims=(32, 64, 160, 256), num_heads=(1, 2, 5, 8), + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return _create_pvt2('pvt_v2_b0', 
pretrained=pretrained, **model_kwargs) + + +@register_model +def pvt_v2_b1(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(2, 2, 2, 2), embed_dims=(64, 128, 320, 512), num_heads=(1, 2, 5, 8), + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return _create_pvt2('pvt_v2_b1', pretrained=pretrained, **model_kwargs) + + +@register_model +def pvt_v2_b2(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 4, 6, 3), embed_dims=(64, 128, 320, 512), num_heads=(1, 2, 5, 8), + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return _create_pvt2('pvt_v2_b2', pretrained=pretrained, **model_kwargs) + + +@register_model +def pvt_v2_b3(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 4, 18, 3), embed_dims=(64, 128, 320, 512), num_heads=(1, 2, 5, 8), + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return _create_pvt2('pvt_v2_b3', pretrained=pretrained, **model_kwargs) + + +@register_model +def pvt_v2_b4(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 8, 27, 3), embed_dims=(64, 128, 320, 512), num_heads=(1, 2, 5, 8), + norm_layer=partial(nn.LayerNorm, eps=1e-6), **kwargs) + return _create_pvt2('pvt_v2_b4', pretrained=pretrained, **model_kwargs) + + +@register_model +def pvt_v2_b5(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 6, 40, 3), embed_dims=(64, 128, 320, 512), num_heads=(1, 2, 5, 8), + mlp_ratios=(4, 4, 4, 4), norm_layer=partial(nn.LayerNorm, eps=1e-6), + **kwargs) + return _create_pvt2('pvt_v2_b5', pretrained=pretrained, **model_kwargs) + + +@register_model +def pvt_v2_b2_li(pretrained=False, **kwargs): + model_kwargs = dict( + depths=(3, 4, 6, 3), embed_dims=(64, 128, 320, 512), num_heads=(1, 2, 5, 8), + norm_layer=partial(nn.LayerNorm, eps=1e-6), linear=True, **kwargs) + return _create_pvt2('pvt_v2_b2_li', pretrained=pretrained, **model_kwargs) + diff --git a/custom_timm/models/registry.py b/custom_timm/models/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..9f58060fd0fdf1a2b3256327d479efd0bba77fc0 --- /dev/null +++ b/custom_timm/models/registry.py @@ -0,0 +1,159 @@ +""" Model Registry +Hacked together by / Copyright 2020 Ross Wightman +""" + +import sys +import re +import fnmatch +from collections import defaultdict +from copy import deepcopy + +__all__ = ['list_models', 'is_model', 'model_entrypoint', 'list_modules', 'is_model_in_modules', + 'is_pretrained_cfg_key', 'has_pretrained_cfg_key', 'get_pretrained_cfg_value', 'is_model_pretrained'] + +_module_to_models = defaultdict(set) # dict of sets to check membership of model in module +_model_to_module = {} # mapping of model names to module names +_model_entrypoints = {} # mapping of model names to entrypoint fns +_model_has_pretrained = set() # set of model names that have pretrained weight url present +_model_pretrained_cfgs = dict() # central repo for model default_cfgs + + +def register_model(fn): + # lookup containing module + mod = sys.modules[fn.__module__] + module_name_split = fn.__module__.split('.') + module_name = module_name_split[-1] if len(module_name_split) else '' + + # add model to __all__ in module + model_name = fn.__name__ + if hasattr(mod, '__all__'): + mod.__all__.append(model_name) + else: + mod.__all__ = [model_name] + + # add entries to registry dict/sets + _model_entrypoints[model_name] = fn + _model_to_module[model_name] = module_name + _module_to_models[module_name].add(model_name) + has_valid_pretrained = False # check if model has a pretrained url to allow filtering on this + if 
hasattr(mod, 'default_cfgs') and model_name in mod.default_cfgs: + # this will catch all models that have entrypoint matching cfg key, but miss any aliasing + # entrypoints or non-matching combos + cfg = mod.default_cfgs[model_name] + has_valid_pretrained = ( + ('url' in cfg and 'http' in cfg['url']) or + ('file' in cfg and cfg['file']) or + ('hf_hub_id' in cfg and cfg['hf_hub_id']) + ) + _model_pretrained_cfgs[model_name] = mod.default_cfgs[model_name] + if has_valid_pretrained: + _model_has_pretrained.add(model_name) + return fn + + +def _natural_key(string_): + return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] + + +def list_models(filter='', module='', pretrained=False, exclude_filters='', name_matches_cfg=False): + """ Return list of available model names, sorted alphabetically + + Args: + filter (str) - Wildcard filter string that works with fnmatch + module (str) - Limit model selection to a specific sub-module (ie 'gen_efficientnet') + pretrained (bool) - Include only models with pretrained weights if True + exclude_filters (str or list[str]) - Wildcard filters to exclude models after including them with filter + name_matches_cfg (bool) - Include only models w/ model_name matching default_cfg name (excludes some aliases) + + Example: + model_list('gluon_resnet*') -- returns all models starting with 'gluon_resnet' + model_list('*resnext*, 'resnet') -- returns all models with 'resnext' in 'resnet' module + """ + if module: + all_models = list(_module_to_models[module]) + else: + all_models = _model_entrypoints.keys() + if filter: + models = [] + include_filters = filter if isinstance(filter, (tuple, list)) else [filter] + for f in include_filters: + include_models = fnmatch.filter(all_models, f) # include these models + if len(include_models): + models = set(models).union(include_models) + else: + models = all_models + if exclude_filters: + if not isinstance(exclude_filters, (tuple, list)): + exclude_filters = [exclude_filters] + for xf in exclude_filters: + exclude_models = fnmatch.filter(models, xf) # exclude these models + if len(exclude_models): + models = set(models).difference(exclude_models) + if pretrained: + models = _model_has_pretrained.intersection(models) + if name_matches_cfg: + models = set(_model_pretrained_cfgs).intersection(models) + return list(sorted(models, key=_natural_key)) + + +def is_model(model_name): + """ Check if a model name exists + """ + return model_name in _model_entrypoints + + +def model_entrypoint(model_name): + """Fetch a model entrypoint for specified model name + """ + return _model_entrypoints[model_name] + + +def list_modules(): + """ Return list of module names that contain models / model entrypoints + """ + modules = _module_to_models.keys() + return list(sorted(modules)) + + +def is_model_in_modules(model_name, module_names): + """Check if a model exists within a subset of modules + Args: + model_name (str) - name of model to check + module_names (tuple, list, set) - names of modules to search in + """ + assert isinstance(module_names, (tuple, list, set)) + return any(model_name in _module_to_models[n] for n in module_names) + + +def is_model_pretrained(model_name): + return model_name in _model_has_pretrained + + +def get_pretrained_cfg(model_name): + if model_name in _model_pretrained_cfgs: + return deepcopy(_model_pretrained_cfgs[model_name]) + return {} + + +def has_pretrained_cfg_key(model_name, cfg_key): + """ Query model default_cfgs for existence of a specific key. 
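+    Illustrative example: once a model module (e.g. resnet) has been imported and its entrypoints
+    registered, has_pretrained_cfg_key('resnet50', 'url') returns True because that model's
+    default_cfg defines a 'url' entry.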
+ """ + if model_name in _model_pretrained_cfgs and cfg_key in _model_pretrained_cfgs[model_name]: + return True + return False + + +def is_pretrained_cfg_key(model_name, cfg_key): + """ Return truthy value for specified model default_cfg key, False if does not exist. + """ + if model_name in _model_pretrained_cfgs and _model_pretrained_cfgs[model_name].get(cfg_key, False): + return True + return False + + +def get_pretrained_cfg_value(model_name, cfg_key): + """ Get a specific model default_cfg value by key. None if it doesn't exist. + """ + if model_name in _model_pretrained_cfgs: + return _model_pretrained_cfgs[model_name].get(cfg_key, None) + return None \ No newline at end of file diff --git a/custom_timm/models/regnet.py b/custom_timm/models/regnet.py new file mode 100644 index 0000000000000000000000000000000000000000..3ead5d9e9fe6a060e1559c2affed4698e3a4b57f --- /dev/null +++ b/custom_timm/models/regnet.py @@ -0,0 +1,711 @@ +"""RegNet + +Paper: `Designing Network Design Spaces` - https://arxiv.org/abs/2003.13678 +Original Impl: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py + +Based on original PyTorch impl linked above, but re-wrote to use my own blocks (adapted from ResNet here) +and cleaned up with more descriptive variable names. + +Weights from original impl have been modified +* first layer from BGR -> RGB as most PyTorch models are +* removed training specific dict entries from checkpoints and keep model state_dict only +* remap names to match the ones here + +Hacked together by / Copyright 2020 Ross Wightman +""" +import math +from dataclasses import dataclass +from functools import partial +from typing import Optional, Union, Callable + +import numpy as np +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, named_apply, checkpoint_seq +from .layers import ClassifierHead, AvgPool2dSame, ConvNormAct, SEModule, DropPath, GroupNormAct +from .layers import get_act_layer, get_norm_act_layer, create_conv2d +from .registry import register_model + + +@dataclass +class RegNetCfg: + depth: int = 21 + w0: int = 80 + wa: float = 42.63 + wm: float = 2.66 + group_size: int = 24 + bottle_ratio: float = 1. + se_ratio: float = 0. 
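+    # se_ratio = 0. disables the SE block (the bottleneck blocks fall back to nn.Identity)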
+ stem_width: int = 32 + downsample: Optional[str] = 'conv1x1' + linear_out: bool = False + preact: bool = False + num_features: int = 0 + act_layer: Union[str, Callable] = 'relu' + norm_layer: Union[str, Callable] = 'batchnorm' + + +# Model FLOPS = three trailing digits * 10^8 +model_cfgs = dict( + # RegNet-X + regnetx_002=RegNetCfg(w0=24, wa=36.44, wm=2.49, group_size=8, depth=13), + regnetx_004=RegNetCfg(w0=24, wa=24.48, wm=2.54, group_size=16, depth=22), + regnetx_006=RegNetCfg(w0=48, wa=36.97, wm=2.24, group_size=24, depth=16), + regnetx_008=RegNetCfg(w0=56, wa=35.73, wm=2.28, group_size=16, depth=16), + regnetx_016=RegNetCfg(w0=80, wa=34.01, wm=2.25, group_size=24, depth=18), + regnetx_032=RegNetCfg(w0=88, wa=26.31, wm=2.25, group_size=48, depth=25), + regnetx_040=RegNetCfg(w0=96, wa=38.65, wm=2.43, group_size=40, depth=23), + regnetx_064=RegNetCfg(w0=184, wa=60.83, wm=2.07, group_size=56, depth=17), + regnetx_080=RegNetCfg(w0=80, wa=49.56, wm=2.88, group_size=120, depth=23), + regnetx_120=RegNetCfg(w0=168, wa=73.36, wm=2.37, group_size=112, depth=19), + regnetx_160=RegNetCfg(w0=216, wa=55.59, wm=2.1, group_size=128, depth=22), + regnetx_320=RegNetCfg(w0=320, wa=69.86, wm=2.0, group_size=168, depth=23), + + # RegNet-Y + regnety_002=RegNetCfg(w0=24, wa=36.44, wm=2.49, group_size=8, depth=13, se_ratio=0.25), + regnety_004=RegNetCfg(w0=48, wa=27.89, wm=2.09, group_size=8, depth=16, se_ratio=0.25), + regnety_006=RegNetCfg(w0=48, wa=32.54, wm=2.32, group_size=16, depth=15, se_ratio=0.25), + regnety_008=RegNetCfg(w0=56, wa=38.84, wm=2.4, group_size=16, depth=14, se_ratio=0.25), + regnety_016=RegNetCfg(w0=48, wa=20.71, wm=2.65, group_size=24, depth=27, se_ratio=0.25), + regnety_032=RegNetCfg(w0=80, wa=42.63, wm=2.66, group_size=24, depth=21, se_ratio=0.25), + regnety_040=RegNetCfg(w0=96, wa=31.41, wm=2.24, group_size=64, depth=22, se_ratio=0.25), + regnety_064=RegNetCfg(w0=112, wa=33.22, wm=2.27, group_size=72, depth=25, se_ratio=0.25), + regnety_080=RegNetCfg(w0=192, wa=76.82, wm=2.19, group_size=56, depth=17, se_ratio=0.25), + regnety_120=RegNetCfg(w0=168, wa=73.36, wm=2.37, group_size=112, depth=19, se_ratio=0.25), + regnety_160=RegNetCfg(w0=200, wa=106.23, wm=2.48, group_size=112, depth=18, se_ratio=0.25), + regnety_320=RegNetCfg(w0=232, wa=115.89, wm=2.53, group_size=232, depth=20, se_ratio=0.25), + + # Experimental + regnety_040s_gn=RegNetCfg( + w0=96, wa=31.41, wm=2.24, group_size=64, depth=22, se_ratio=0.25, + act_layer='silu', norm_layer=partial(GroupNormAct, group_size=16)), + + # regnetv = 'preact regnet y' + regnetv_040=RegNetCfg( + depth=22, w0=96, wa=31.41, wm=2.24, group_size=64, se_ratio=0.25, preact=True, act_layer='silu'), + regnetv_064=RegNetCfg( + depth=25, w0=112, wa=33.22, wm=2.27, group_size=72, se_ratio=0.25, preact=True, act_layer='silu', + downsample='avg'), + + # RegNet-Z (unverified) + regnetz_005=RegNetCfg( + depth=21, w0=16, wa=10.7, wm=2.51, group_size=4, bottle_ratio=4.0, se_ratio=0.25, + downsample=None, linear_out=True, num_features=1024, act_layer='silu', + ), + regnetz_040=RegNetCfg( + depth=28, w0=48, wa=14.5, wm=2.226, group_size=8, bottle_ratio=4.0, se_ratio=0.25, + downsample=None, linear_out=True, num_features=0, act_layer='silu', + ), + regnetz_040h=RegNetCfg( + depth=28, w0=48, wa=14.5, wm=2.226, group_size=8, bottle_ratio=4.0, se_ratio=0.25, + downsample=None, linear_out=True, num_features=1536, act_layer='silu', + ), +) + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), 
+ 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = dict( + regnetx_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_002-e7e85e5c.pth'), + regnetx_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_004-7d0e9424.pth'), + regnetx_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_006-85ec1baa.pth'), + regnetx_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_008-d8b470eb.pth'), + regnetx_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_016-65ca972a.pth'), + regnetx_032=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_032-ed0c7f7e.pth'), + regnetx_040=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_040-73c2a654.pth'), + regnetx_064=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_064-29278baa.pth'), + regnetx_080=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_080-7c7fcab1.pth'), + regnetx_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_120-65d5521e.pth'), + regnetx_160=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_160-c98c4112.pth'), + regnetx_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnetx_320-8ea38b93.pth'), + + regnety_002=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_002-e68ca334.pth'), + regnety_004=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_004-0db870e6.pth'), + regnety_006=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_006-c67e57ec.pth'), + regnety_008=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_008-dc900dbe.pth'), + regnety_016=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_016-54367f74.pth'), + regnety_032=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/regnety_032_ra-7f2439f9.pth', + crop_pct=1.0, test_input_size=(3, 288, 288)), + regnety_040=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnety_040_ra3-670e1166.pth', + crop_pct=1.0, test_input_size=(3, 288, 288)), + regnety_064=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnety_064_ra3-aa26dc7d.pth', + crop_pct=1.0, test_input_size=(3, 288, 288)), + regnety_080=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnety_080_ra3-1fdc4344.pth', + crop_pct=1.0, test_input_size=(3, 288, 288)), + regnety_120=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_120-721ba79a.pth'), + regnety_160=_cfg( + url='https://dl.fbaipublicfiles.com/deit/regnety_160-a5fe301d.pth', # from Facebook DeiT GitHub repository + crop_pct=1.0, test_input_size=(3, 288, 288)), + 
regnety_320=_cfg(url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-regnet/regnety_320-ba464b29.pth'), + + regnety_040s_gn=_cfg(url=''), + regnetv_040=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetv_040_ra3-c248f51f.pth', + first_conv='stem', crop_pct=1.0, test_input_size=(3, 288, 288)), + regnetv_064=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetv_064_ra3-530616c2.pth', + first_conv='stem', crop_pct=1.0, test_input_size=(3, 288, 288)), + + regnetz_005=_cfg(url=''), + regnetz_040=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetz_040_ra3-9007edf5.pth', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 320, 320)), + regnetz_040h=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/regnetz_040h_ra3-f594343b.pth', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 320, 320)), +) + + +def quantize_float(f, q): + """Converts a float to closest non-zero int divisible by q.""" + return int(round(f / q) * q) + + +def adjust_widths_groups_comp(widths, bottle_ratios, groups): + """Adjusts the compatibility of widths and groups.""" + bottleneck_widths = [int(w * b) for w, b in zip(widths, bottle_ratios)] + groups = [min(g, w_bot) for g, w_bot in zip(groups, bottleneck_widths)] + bottleneck_widths = [quantize_float(w_bot, g) for w_bot, g in zip(bottleneck_widths, groups)] + widths = [int(w_bot / b) for w_bot, b in zip(bottleneck_widths, bottle_ratios)] + return widths, groups + + +def generate_regnet(width_slope, width_initial, width_mult, depth, group_size, q=8): + """Generates per block widths from RegNet parameters.""" + assert width_slope >= 0 and width_initial > 0 and width_mult > 1 and width_initial % q == 0 + # TODO dWr scaling? + # depth = int(depth * (scale ** 0.1)) + # width_scale = scale ** 0.4 # dWr scale, exp 0.8 / 2, applied to both group and layer widths + widths_cont = np.arange(depth) * width_slope + width_initial + width_exps = np.round(np.log(widths_cont / width_initial) / np.log(width_mult)) + widths = width_initial * np.power(width_mult, width_exps) + widths = np.round(np.divide(widths, q)) * q + num_stages, max_stage = len(np.unique(widths)), width_exps.max() + 1 + groups = np.array([group_size for _ in range(num_stages)]) + return widths.astype(int).tolist(), num_stages, groups.astype(int).tolist() + + +def downsample_conv(in_chs, out_chs, kernel_size=1, stride=1, dilation=1, norm_layer=None, preact=False): + norm_layer = norm_layer or nn.BatchNorm2d + kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size + dilation = dilation if kernel_size > 1 else 1 + if preact: + return create_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation) + else: + return ConvNormAct( + in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, norm_layer=norm_layer, apply_act=False) + + +def downsample_avg(in_chs, out_chs, kernel_size=1, stride=1, dilation=1, norm_layer=None, preact=False): + """ AvgPool Downsampling as in 'D' ResNet variants. 
This is not in RegNet space but I might experiment.""" + norm_layer = norm_layer or nn.BatchNorm2d + avg_stride = stride if dilation == 1 else 1 + pool = nn.Identity() + if stride > 1 or dilation > 1: + avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d + pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) + if preact: + conv = create_conv2d(in_chs, out_chs, 1, stride=1) + else: + conv = ConvNormAct(in_chs, out_chs, 1, stride=1, norm_layer=norm_layer, apply_act=False) + return nn.Sequential(*[pool, conv]) + + +def create_shortcut( + downsample_type, in_chs, out_chs, kernel_size, stride, dilation=(1, 1), norm_layer=None, preact=False): + assert downsample_type in ('avg', 'conv1x1', '', None) + if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]: + dargs = dict(stride=stride, dilation=dilation[0], norm_layer=norm_layer, preact=preact) + if not downsample_type: + return None # no shortcut, no downsample + elif downsample_type == 'avg': + return downsample_avg(in_chs, out_chs, **dargs) + else: + return downsample_conv(in_chs, out_chs, kernel_size=kernel_size, **dargs) + else: + return nn.Identity() # identity shortcut (no downsample) + + +class Bottleneck(nn.Module): + """ RegNet Bottleneck + + This is almost exactly the same as a ResNet Bottlneck. The main difference is the SE block is moved from + after conv3 to after conv2. Otherwise, it's just redefining the arguments for groups/bottleneck channels. + """ + + def __init__( + self, in_chs, out_chs, stride=1, dilation=(1, 1), bottle_ratio=1, group_size=1, se_ratio=0.25, + downsample='conv1x1', linear_out=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + drop_block=None, drop_path_rate=0.): + super(Bottleneck, self).__init__() + act_layer = get_act_layer(act_layer) + bottleneck_chs = int(round(out_chs * bottle_ratio)) + groups = bottleneck_chs // group_size + + cargs = dict(act_layer=act_layer, norm_layer=norm_layer) + self.conv1 = ConvNormAct(in_chs, bottleneck_chs, kernel_size=1, **cargs) + self.conv2 = ConvNormAct( + bottleneck_chs, bottleneck_chs, kernel_size=3, stride=stride, dilation=dilation[0], + groups=groups, drop_layer=drop_block, **cargs) + if se_ratio: + se_channels = int(round(in_chs * se_ratio)) + self.se = SEModule(bottleneck_chs, rd_channels=se_channels, act_layer=act_layer) + else: + self.se = nn.Identity() + self.conv3 = ConvNormAct(bottleneck_chs, out_chs, kernel_size=1, apply_act=False, **cargs) + self.act3 = nn.Identity() if linear_out else act_layer() + self.downsample = create_shortcut(downsample, in_chs, out_chs, 1, stride, dilation, norm_layer=norm_layer) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + + def zero_init_last(self): + nn.init.zeros_(self.conv3.bn.weight) + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.conv2(x) + x = self.se(x) + x = self.conv3(x) + if self.downsample is not None: + # NOTE stuck with downsample as the attr name due to weight compatibility + # now represents the shortcut, no shortcut if None, and non-downsample shortcut == nn.Identity() + x = self.drop_path(x) + self.downsample(shortcut) + x = self.act3(x) + return x + + +class PreBottleneck(nn.Module): + """ RegNet Bottleneck + + This is almost exactly the same as a ResNet Bottlneck. The main difference is the SE block is moved from + after conv3 to after conv2. Otherwise, it's just redefining the arguments for groups/bottleneck channels. 
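+    This pre-activation variant is selected when RegNetCfg.preact is True (the 'regnetv_*' configs above).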
+ """ + + def __init__( + self, in_chs, out_chs, stride=1, dilation=(1, 1), bottle_ratio=1, group_size=1, se_ratio=0.25, + downsample='conv1x1', linear_out=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + drop_block=None, drop_path_rate=0.): + super(PreBottleneck, self).__init__() + norm_act_layer = get_norm_act_layer(norm_layer, act_layer) + bottleneck_chs = int(round(out_chs * bottle_ratio)) + groups = bottleneck_chs // group_size + + self.norm1 = norm_act_layer(in_chs) + self.conv1 = create_conv2d(in_chs, bottleneck_chs, kernel_size=1) + self.norm2 = norm_act_layer(bottleneck_chs) + self.conv2 = create_conv2d( + bottleneck_chs, bottleneck_chs, kernel_size=3, stride=stride, dilation=dilation[0], groups=groups) + if se_ratio: + se_channels = int(round(in_chs * se_ratio)) + self.se = SEModule(bottleneck_chs, rd_channels=se_channels, act_layer=act_layer) + else: + self.se = nn.Identity() + self.norm3 = norm_act_layer(bottleneck_chs) + self.conv3 = create_conv2d(bottleneck_chs, out_chs, kernel_size=1) + self.downsample = create_shortcut(downsample, in_chs, out_chs, 1, stride, dilation, preact=True) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + + def zero_init_last(self): + pass + + def forward(self, x): + x = self.norm1(x) + shortcut = x + x = self.conv1(x) + x = self.norm2(x) + x = self.conv2(x) + x = self.se(x) + x = self.norm3(x) + x = self.conv3(x) + if self.downsample is not None: + # NOTE stuck with downsample as the attr name due to weight compatibility + # now represents the shortcut, no shortcut if None, and non-downsample shortcut == nn.Identity() + x = self.drop_path(x) + self.downsample(shortcut) + return x + + +class RegStage(nn.Module): + """Stage (sequence of blocks w/ the same output shape).""" + + def __init__( + self, depth, in_chs, out_chs, stride, dilation, + drop_path_rates=None, block_fn=Bottleneck, **block_kwargs): + super(RegStage, self).__init__() + self.grad_checkpointing = False + + first_dilation = 1 if dilation in (1, 2) else 2 + for i in range(depth): + block_stride = stride if i == 0 else 1 + block_in_chs = in_chs if i == 0 else out_chs + block_dilation = (first_dilation, dilation) + dpr = drop_path_rates[i] if drop_path_rates is not None else 0. 
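+            # blocks are registered as child modules b1..b{depth}; RegNet.group_matcher below
+            # matches this naming together with the stage names s1..s4 (r'^s(\d+)\.b(\d+)')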
+ name = "b{}".format(i + 1) + self.add_module( + name, block_fn( + block_in_chs, out_chs, stride=block_stride, dilation=block_dilation, + drop_path_rate=dpr, **block_kwargs) + ) + first_dilation = dilation + + def forward(self, x): + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.children(), x) + else: + for block in self.children(): + x = block(x) + return x + + +class RegNet(nn.Module): + """RegNet-X, Y, and Z Models + + Paper: https://arxiv.org/abs/2003.13678 + Original Impl: https://github.com/facebookresearch/pycls/blob/master/pycls/models/regnet.py + """ + + def __init__( + self, cfg: RegNetCfg, in_chans=3, num_classes=1000, output_stride=32, global_pool='avg', + drop_rate=0., drop_path_rate=0., zero_init_last=True): + super().__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + assert output_stride in (8, 16, 32) + + # Construct the stem + stem_width = cfg.stem_width + na_args = dict(act_layer=cfg.act_layer, norm_layer=cfg.norm_layer) + if cfg.preact: + self.stem = create_conv2d(in_chans, stem_width, 3, stride=2) + else: + self.stem = ConvNormAct(in_chans, stem_width, 3, stride=2, **na_args) + self.feature_info = [dict(num_chs=stem_width, reduction=2, module='stem')] + + # Construct the stages + prev_width = stem_width + curr_stride = 2 + per_stage_args, common_args = self._get_stage_args( + cfg, output_stride=output_stride, drop_path_rate=drop_path_rate) + assert len(per_stage_args) == 4 + block_fn = PreBottleneck if cfg.preact else Bottleneck + for i, stage_args in enumerate(per_stage_args): + stage_name = "s{}".format(i + 1) + self.add_module(stage_name, RegStage(in_chs=prev_width, block_fn=block_fn, **stage_args, **common_args)) + prev_width = stage_args['out_chs'] + curr_stride *= stage_args['stride'] + self.feature_info += [dict(num_chs=prev_width, reduction=curr_stride, module=stage_name)] + + # Construct the head + if cfg.num_features: + self.final_conv = ConvNormAct(prev_width, cfg.num_features, kernel_size=1, **na_args) + self.num_features = cfg.num_features + else: + final_act = cfg.linear_out or cfg.preact + self.final_conv = get_act_layer(cfg.act_layer)() if final_act else nn.Identity() + self.num_features = prev_width + self.head = ClassifierHead( + in_chs=self.num_features, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate) + + named_apply(partial(_init_weights, zero_init_last=zero_init_last), self) + + def _get_stage_args(self, cfg: RegNetCfg, default_stride=2, output_stride=32, drop_path_rate=0.): + # Generate RegNet ws per block + widths, num_stages, stage_gs = generate_regnet(cfg.wa, cfg.w0, cfg.wm, cfg.depth, cfg.group_size) + + # Convert to per stage format + stage_widths, stage_depths = np.unique(widths, return_counts=True) + stage_br = [cfg.bottle_ratio for _ in range(num_stages)] + stage_strides = [] + stage_dilations = [] + net_stride = 2 + dilation = 1 + for _ in range(num_stages): + if net_stride >= output_stride: + dilation *= default_stride + stride = 1 + else: + stride = default_stride + net_stride *= stride + stage_strides.append(stride) + stage_dilations.append(dilation) + stage_dpr = np.split(np.linspace(0, drop_path_rate, sum(stage_depths)), np.cumsum(stage_depths[:-1])) + + # Adjust the compatibility of ws and gws + stage_widths, stage_gs = adjust_widths_groups_comp(stage_widths, stage_br, stage_gs) + arg_names = ['out_chs', 'stride', 'dilation', 'depth', 'bottle_ratio', 'group_size', 'drop_path_rates'] + per_stage_args = [ + dict(zip(arg_names, params)) for params in 
+ zip(stage_widths, stage_strides, stage_dilations, stage_depths, stage_br, stage_gs, stage_dpr)] + common_args = dict( + downsample=cfg.downsample, se_ratio=cfg.se_ratio, linear_out=cfg.linear_out, + act_layer=cfg.act_layer, norm_layer=cfg.norm_layer) + return per_stage_args, common_args + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^stem', + blocks=r'^s(\d+)' if coarse else r'^s(\d+)\.b(\d+)', + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in list(self.children())[1:-1]: + s.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = self.stem(x) + x = self.s1(x) + x = self.s2(x) + x = self.s3(x) + x = self.s4(x) + x = self.final_conv(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _init_weights(module, name='', zero_init_last=False): + if isinstance(module, nn.Conv2d): + fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels + fan_out //= module.groups + module.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Linear): + nn.init.normal_(module.weight, mean=0.0, std=0.01) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif zero_init_last and hasattr(module, 'zero_init_last'): + module.zero_init_last() + + +def _filter_fn(state_dict): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + if 'model' in state_dict: + # For DeiT trained regnety_160 pretraiend model + state_dict = state_dict['model'] + return state_dict + + +def _create_regnet(variant, pretrained, **kwargs): + return build_model_with_cfg( + RegNet, variant, pretrained, + model_cfg=model_cfgs[variant], + pretrained_filter_fn=_filter_fn, + **kwargs) + + +@register_model +def regnetx_002(pretrained=False, **kwargs): + """RegNetX-200MF""" + return _create_regnet('regnetx_002', pretrained, **kwargs) + + +@register_model +def regnetx_004(pretrained=False, **kwargs): + """RegNetX-400MF""" + return _create_regnet('regnetx_004', pretrained, **kwargs) + + +@register_model +def regnetx_006(pretrained=False, **kwargs): + """RegNetX-600MF""" + return _create_regnet('regnetx_006', pretrained, **kwargs) + + +@register_model +def regnetx_008(pretrained=False, **kwargs): + """RegNetX-800MF""" + return _create_regnet('regnetx_008', pretrained, **kwargs) + + +@register_model +def regnetx_016(pretrained=False, **kwargs): + """RegNetX-1.6GF""" + return _create_regnet('regnetx_016', pretrained, **kwargs) + + +@register_model +def regnetx_032(pretrained=False, **kwargs): + """RegNetX-3.2GF""" + return _create_regnet('regnetx_032', pretrained, **kwargs) + + +@register_model +def regnetx_040(pretrained=False, **kwargs): + """RegNetX-4.0GF""" + return _create_regnet('regnetx_040', pretrained, **kwargs) + + +@register_model +def regnetx_064(pretrained=False, **kwargs): + """RegNetX-6.4GF""" + return _create_regnet('regnetx_064', pretrained, **kwargs) + + +@register_model +def regnetx_080(pretrained=False, **kwargs): + """RegNetX-8.0GF""" + return _create_regnet('regnetx_080', pretrained, **kwargs) + + 
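+# Usage sketch (illustrative): every @register_model entrypoint in this file is registered by
+# name at import time, so models can also be created through the registry, e.g.
+#
+#   from custom_timm.models.registry import list_models, model_entrypoint
+#   import custom_timm.models.regnet               # importing the module runs @register_model
+#   print(list_models('regnetx_*'))                # all RegNetX entrypoints defined here
+#   model = model_entrypoint('regnetx_032')(pretrained=False)
+#
+# Per the "Model FLOPS = three trailing digits * 10^8" note above model_cfgs, regnetx_032 is
+# roughly a 3.2 GFLOP model.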
+@register_model +def regnetx_120(pretrained=False, **kwargs): + """RegNetX-12GF""" + return _create_regnet('regnetx_120', pretrained, **kwargs) + + +@register_model +def regnetx_160(pretrained=False, **kwargs): + """RegNetX-16GF""" + return _create_regnet('regnetx_160', pretrained, **kwargs) + + +@register_model +def regnetx_320(pretrained=False, **kwargs): + """RegNetX-32GF""" + return _create_regnet('regnetx_320', pretrained, **kwargs) + + +@register_model +def regnety_002(pretrained=False, **kwargs): + """RegNetY-200MF""" + return _create_regnet('regnety_002', pretrained, **kwargs) + + +@register_model +def regnety_004(pretrained=False, **kwargs): + """RegNetY-400MF""" + return _create_regnet('regnety_004', pretrained, **kwargs) + + +@register_model +def regnety_006(pretrained=False, **kwargs): + """RegNetY-600MF""" + return _create_regnet('regnety_006', pretrained, **kwargs) + + +@register_model +def regnety_008(pretrained=False, **kwargs): + """RegNetY-800MF""" + return _create_regnet('regnety_008', pretrained, **kwargs) + + +@register_model +def regnety_016(pretrained=False, **kwargs): + """RegNetY-1.6GF""" + return _create_regnet('regnety_016', pretrained, **kwargs) + + +@register_model +def regnety_032(pretrained=False, **kwargs): + """RegNetY-3.2GF""" + return _create_regnet('regnety_032', pretrained, **kwargs) + + +@register_model +def regnety_040(pretrained=False, **kwargs): + """RegNetY-4.0GF""" + return _create_regnet('regnety_040', pretrained, **kwargs) + + +@register_model +def regnety_064(pretrained=False, **kwargs): + """RegNetY-6.4GF""" + return _create_regnet('regnety_064', pretrained, **kwargs) + + +@register_model +def regnety_080(pretrained=False, **kwargs): + """RegNetY-8.0GF""" + return _create_regnet('regnety_080', pretrained, **kwargs) + + +@register_model +def regnety_120(pretrained=False, **kwargs): + """RegNetY-12GF""" + return _create_regnet('regnety_120', pretrained, **kwargs) + + +@register_model +def regnety_160(pretrained=False, **kwargs): + """RegNetY-16GF""" + return _create_regnet('regnety_160', pretrained, **kwargs) + + +@register_model +def regnety_320(pretrained=False, **kwargs): + """RegNetY-32GF""" + return _create_regnet('regnety_320', pretrained, **kwargs) + + +@register_model +def regnety_040s_gn(pretrained=False, **kwargs): + """RegNetY-4.0GF w/ GroupNorm """ + return _create_regnet('regnety_040s_gn', pretrained, **kwargs) + + +@register_model +def regnetv_040(pretrained=False, **kwargs): + """""" + return _create_regnet('regnetv_040', pretrained, **kwargs) + + +@register_model +def regnetv_064(pretrained=False, **kwargs): + """""" + return _create_regnet('regnetv_064', pretrained, **kwargs) + + +@register_model +def regnetz_005(pretrained=False, **kwargs): + """RegNetZ-500MF + NOTE: config found in https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py + but it's not clear it is equivalent to paper model as not detailed in the paper. + """ + return _create_regnet('regnetz_005', pretrained, zero_init_last=False, **kwargs) + + +@register_model +def regnetz_040(pretrained=False, **kwargs): + """RegNetZ-4.0GF + NOTE: config found in https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py + but it's not clear it is equivalent to paper model as not detailed in the paper. 
+ """ + return _create_regnet('regnetz_040', pretrained, zero_init_last=False, **kwargs) + + +@register_model +def regnetz_040h(pretrained=False, **kwargs): + """RegNetZ-4.0GF + NOTE: config found in https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py + but it's not clear it is equivalent to paper model as not detailed in the paper. + """ + return _create_regnet('regnetz_040h', pretrained, zero_init_last=False, **kwargs) diff --git a/custom_timm/models/res2net.py b/custom_timm/models/res2net.py new file mode 100644 index 0000000000000000000000000000000000000000..01899c6438bb88e907fb879abf27895b7d9ca970 --- /dev/null +++ b/custom_timm/models/res2net.py @@ -0,0 +1,213 @@ +""" Res2Net and Res2NeXt +Adapted from Official Pytorch impl at: https://github.com/gasvn/Res2Net/ +Paper: `Res2Net: A New Multi-scale Backbone Architecture` - https://arxiv.org/abs/1904.01169 +""" +import math + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .registry import register_model +from .resnet import ResNet + +__all__ = [] + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv1', 'classifier': 'fc', + **kwargs + } + + +default_cfgs = { + 'res2net50_26w_4s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_26w_4s-06e79181.pth'), + 'res2net50_48w_2s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_48w_2s-afed724a.pth'), + 'res2net50_14w_8s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_14w_8s-6527dddc.pth'), + 'res2net50_26w_6s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_26w_6s-19041792.pth'), + 'res2net50_26w_8s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net50_26w_8s-2c7c9f12.pth'), + 'res2net101_26w_4s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2net101_26w_4s-02a759a1.pth'), + 'res2next50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-res2net/res2next50_4s-6ef7e7bf.pth'), +} + + +class Bottle2neck(nn.Module): + """ Res2Net/Res2NeXT Bottleneck + Adapted from https://github.com/gasvn/Res2Net/blob/master/res2net.py + """ + expansion = 4 + + def __init__( + self, inplanes, planes, stride=1, downsample=None, + cardinality=1, base_width=26, scale=4, dilation=1, first_dilation=None, + act_layer=nn.ReLU, norm_layer=None, attn_layer=None, **_): + super(Bottle2neck, self).__init__() + self.scale = scale + self.is_first = stride > 1 or downsample is not None + self.num_scales = max(1, scale - 1) + width = int(math.floor(planes * (base_width / 64.0))) * cardinality + self.width = width + outplanes = planes * self.expansion + first_dilation = first_dilation or dilation + + self.conv1 = nn.Conv2d(inplanes, width * scale, kernel_size=1, bias=False) + self.bn1 = norm_layer(width * scale) + + convs = [] + bns = [] + for i in range(self.num_scales): + convs.append(nn.Conv2d( + width, width, kernel_size=3, stride=stride, padding=first_dilation, + dilation=first_dilation, groups=cardinality, 
bias=False)) + bns.append(norm_layer(width)) + self.convs = nn.ModuleList(convs) + self.bns = nn.ModuleList(bns) + if self.is_first: + # FIXME this should probably have count_include_pad=False, but hurts original weights + self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1) + else: + self.pool = None + + self.conv3 = nn.Conv2d(width * scale, outplanes, kernel_size=1, bias=False) + self.bn3 = norm_layer(outplanes) + self.se = attn_layer(outplanes) if attn_layer is not None else None + + self.relu = act_layer(inplace=True) + self.downsample = downsample + + def zero_init_last(self): + nn.init.zeros_(self.bn3.weight) + + def forward(self, x): + shortcut = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + spx = torch.split(out, self.width, 1) + spo = [] + sp = spx[0] # redundant, for torchscript + for i, (conv, bn) in enumerate(zip(self.convs, self.bns)): + if i == 0 or self.is_first: + sp = spx[i] + else: + sp = sp + spx[i] + sp = conv(sp) + sp = bn(sp) + sp = self.relu(sp) + spo.append(sp) + if self.scale > 1: + if self.pool is not None: # self.is_first == True, None check for torchscript + spo.append(self.pool(spx[-1])) + else: + spo.append(spx[-1]) + out = torch.cat(spo, 1) + + out = self.conv3(out) + out = self.bn3(out) + + if self.se is not None: + out = self.se(out) + + if self.downsample is not None: + shortcut = self.downsample(x) + + out += shortcut + out = self.relu(out) + + return out + + +def _create_res2net(variant, pretrained=False, **kwargs): + return build_model_with_cfg(ResNet, variant, pretrained, **kwargs) + + +@register_model +def res2net50_26w_4s(pretrained=False, **kwargs): + """Constructs a Res2Net-50 26w4s model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model_args = dict( + block=Bottle2neck, layers=[3, 4, 6, 3], base_width=26, block_args=dict(scale=4), **kwargs) + return _create_res2net('res2net50_26w_4s', pretrained, **model_args) + + +@register_model +def res2net101_26w_4s(pretrained=False, **kwargs): + """Constructs a Res2Net-101 26w4s model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model_args = dict( + block=Bottle2neck, layers=[3, 4, 23, 3], base_width=26, block_args=dict(scale=4), **kwargs) + return _create_res2net('res2net101_26w_4s', pretrained, **model_args) + + +@register_model +def res2net50_26w_6s(pretrained=False, **kwargs): + """Constructs a Res2Net-50 26w6s model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model_args = dict( + block=Bottle2neck, layers=[3, 4, 6, 3], base_width=26, block_args=dict(scale=6), **kwargs) + return _create_res2net('res2net50_26w_6s', pretrained, **model_args) + + +@register_model +def res2net50_26w_8s(pretrained=False, **kwargs): + """Constructs a Res2Net-50 26w8s model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model_args = dict( + block=Bottle2neck, layers=[3, 4, 6, 3], base_width=26, block_args=dict(scale=8), **kwargs) + return _create_res2net('res2net50_26w_8s', pretrained, **model_args) + + +@register_model +def res2net50_48w_2s(pretrained=False, **kwargs): + """Constructs a Res2Net-50 48w2s model. 
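+    Here '48w_2s' refers to base_width=48 and scale=2 in each Bottle2neck block.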
+ Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model_args = dict( + block=Bottle2neck, layers=[3, 4, 6, 3], base_width=48, block_args=dict(scale=2), **kwargs) + return _create_res2net('res2net50_48w_2s', pretrained, **model_args) + + +@register_model +def res2net50_14w_8s(pretrained=False, **kwargs): + """Constructs a Res2Net-50 14w8s model. + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model_args = dict( + block=Bottle2neck, layers=[3, 4, 6, 3], base_width=14, block_args=dict(scale=8), **kwargs) + return _create_res2net('res2net50_14w_8s', pretrained, **model_args) + + +@register_model +def res2next50(pretrained=False, **kwargs): + """Construct Res2NeXt-50 4s + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model_args = dict( + block=Bottle2neck, layers=[3, 4, 6, 3], base_width=4, cardinality=8, block_args=dict(scale=4), **kwargs) + return _create_res2net('res2next50', pretrained, **model_args) diff --git a/custom_timm/models/resnest.py b/custom_timm/models/resnest.py new file mode 100644 index 0000000000000000000000000000000000000000..84f329d9551c600c321fea4e3858520466f334df --- /dev/null +++ b/custom_timm/models/resnest.py @@ -0,0 +1,231 @@ +""" ResNeSt Models + +Paper: `ResNeSt: Split-Attention Networks` - https://arxiv.org/abs/2004.08955 + +Adapted from original PyTorch impl w/ weights at https://github.com/zhanghang1989/ResNeSt by Hang Zhang + +Modified for torchscript compat, and consistency with timm by Ross Wightman +""" +import torch +from torch import nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import SplitAttn +from .registry import register_model +from .resnet import ResNet + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv1.0', 'classifier': 'fc', + **kwargs + } + +default_cfgs = { + 'resnest14d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_resnest14-9c8fe254.pth'), + 'resnest26d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/gluon_resnest26-50eb607c.pth'), + 'resnest50d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest50-528c19ca.pth'), + 'resnest101e': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest101-22405ba7.pth', + input_size=(3, 256, 256), pool_size=(8, 8)), + 'resnest200e': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest200-75117900.pth', + input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=0.909, interpolation='bicubic'), + 'resnest269e': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest269-0cc87c48.pth', + input_size=(3, 416, 416), pool_size=(13, 13), crop_pct=0.928, interpolation='bicubic'), + 'resnest50d_4s2x40d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest50_fast_4s2x40d-41d14ed0.pth', + interpolation='bicubic'), + 'resnest50d_1s4x24d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-resnest/resnest50_fast_1s4x24d-d4a4f76f.pth', + 
interpolation='bicubic') +} + + +class ResNestBottleneck(nn.Module): + """ResNet Bottleneck + """ + # pylint: disable=unused-argument + expansion = 4 + + def __init__( + self, inplanes, planes, stride=1, downsample=None, + radix=1, cardinality=1, base_width=64, avd=False, avd_first=False, is_first=False, + reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): + super(ResNestBottleneck, self).__init__() + assert reduce_first == 1 # not supported + assert attn_layer is None # not supported + assert aa_layer is None # TODO not yet supported + assert drop_path is None # TODO not yet supported + + group_width = int(planes * (base_width / 64.)) * cardinality + first_dilation = first_dilation or dilation + if avd and (stride > 1 or is_first): + avd_stride = stride + stride = 1 + else: + avd_stride = 0 + self.radix = radix + + self.conv1 = nn.Conv2d(inplanes, group_width, kernel_size=1, bias=False) + self.bn1 = norm_layer(group_width) + self.act1 = act_layer(inplace=True) + self.avd_first = nn.AvgPool2d(3, avd_stride, padding=1) if avd_stride > 0 and avd_first else None + + if self.radix >= 1: + self.conv2 = SplitAttn( + group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation, + dilation=first_dilation, groups=cardinality, radix=radix, norm_layer=norm_layer, drop_layer=drop_block) + self.bn2 = nn.Identity() + self.drop_block = nn.Identity() + self.act2 = nn.Identity() + else: + self.conv2 = nn.Conv2d( + group_width, group_width, kernel_size=3, stride=stride, padding=first_dilation, + dilation=first_dilation, groups=cardinality, bias=False) + self.bn2 = norm_layer(group_width) + self.drop_block = drop_block() if drop_block is not None else nn.Identity() + self.act2 = act_layer(inplace=True) + self.avd_last = nn.AvgPool2d(3, avd_stride, padding=1) if avd_stride > 0 and not avd_first else None + + self.conv3 = nn.Conv2d(group_width, planes * 4, kernel_size=1, bias=False) + self.bn3 = norm_layer(planes*4) + self.act3 = act_layer(inplace=True) + self.downsample = downsample + + def zero_init_last(self): + nn.init.zeros_(self.bn3.weight) + + def forward(self, x): + shortcut = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.act1(out) + + if self.avd_first is not None: + out = self.avd_first(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.drop_block(out) + out = self.act2(out) + + if self.avd_last is not None: + out = self.avd_last(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + shortcut = self.downsample(x) + + out += shortcut + out = self.act3(out) + return out + + +def _create_resnest(variant, pretrained=False, **kwargs): + return build_model_with_cfg(ResNet, variant, pretrained, **kwargs) + + +@register_model +def resnest14d(pretrained=False, **kwargs): + """ ResNeSt-14d model. Weights ported from GluonCV. + """ + model_kwargs = dict( + block=ResNestBottleneck, layers=[1, 1, 1, 1], + stem_type='deep', stem_width=32, avg_down=True, base_width=64, cardinality=1, + block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) + return _create_resnest('resnest14d', pretrained=pretrained, **model_kwargs) + + +@register_model +def resnest26d(pretrained=False, **kwargs): + """ ResNeSt-26d model. Weights ported from GluonCV. 
+ """ + model_kwargs = dict( + block=ResNestBottleneck, layers=[2, 2, 2, 2], + stem_type='deep', stem_width=32, avg_down=True, base_width=64, cardinality=1, + block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) + return _create_resnest('resnest26d', pretrained=pretrained, **model_kwargs) + + +@register_model +def resnest50d(pretrained=False, **kwargs): + """ ResNeSt-50d model. Matches paper ResNeSt-50 model, https://arxiv.org/abs/2004.08955 + Since this codebase supports all possible variations, 'd' for deep stem, stem_width 32, avg in downsample. + """ + model_kwargs = dict( + block=ResNestBottleneck, layers=[3, 4, 6, 3], + stem_type='deep', stem_width=32, avg_down=True, base_width=64, cardinality=1, + block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) + return _create_resnest('resnest50d', pretrained=pretrained, **model_kwargs) + + +@register_model +def resnest101e(pretrained=False, **kwargs): + """ ResNeSt-101e model. Matches paper ResNeSt-101 model, https://arxiv.org/abs/2004.08955 + Since this codebase supports all possible variations, 'e' for deep stem, stem_width 64, avg in downsample. + """ + model_kwargs = dict( + block=ResNestBottleneck, layers=[3, 4, 23, 3], + stem_type='deep', stem_width=64, avg_down=True, base_width=64, cardinality=1, + block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) + return _create_resnest('resnest101e', pretrained=pretrained, **model_kwargs) + + +@register_model +def resnest200e(pretrained=False, **kwargs): + """ ResNeSt-200e model. Matches paper ResNeSt-200 model, https://arxiv.org/abs/2004.08955 + Since this codebase supports all possible variations, 'e' for deep stem, stem_width 64, avg in downsample. + """ + model_kwargs = dict( + block=ResNestBottleneck, layers=[3, 24, 36, 3], + stem_type='deep', stem_width=64, avg_down=True, base_width=64, cardinality=1, + block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) + return _create_resnest('resnest200e', pretrained=pretrained, **model_kwargs) + + +@register_model +def resnest269e(pretrained=False, **kwargs): + """ ResNeSt-269e model. Matches paper ResNeSt-269 model, https://arxiv.org/abs/2004.08955 + Since this codebase supports all possible variations, 'e' for deep stem, stem_width 64, avg in downsample. 
+ """ + model_kwargs = dict( + block=ResNestBottleneck, layers=[3, 30, 48, 8], + stem_type='deep', stem_width=64, avg_down=True, base_width=64, cardinality=1, + block_args=dict(radix=2, avd=True, avd_first=False), **kwargs) + return _create_resnest('resnest269e', pretrained=pretrained, **model_kwargs) + + +@register_model +def resnest50d_4s2x40d(pretrained=False, **kwargs): + """ResNeSt-50 4s2x40d from https://github.com/zhanghang1989/ResNeSt/blob/master/ablation.md + """ + model_kwargs = dict( + block=ResNestBottleneck, layers=[3, 4, 6, 3], + stem_type='deep', stem_width=32, avg_down=True, base_width=40, cardinality=2, + block_args=dict(radix=4, avd=True, avd_first=True), **kwargs) + return _create_resnest('resnest50d_4s2x40d', pretrained=pretrained, **model_kwargs) + + +@register_model +def resnest50d_1s4x24d(pretrained=False, **kwargs): + """ResNeSt-50 1s4x24d from https://github.com/zhanghang1989/ResNeSt/blob/master/ablation.md + """ + model_kwargs = dict( + block=ResNestBottleneck, layers=[3, 4, 6, 3], + stem_type='deep', stem_width=32, avg_down=True, base_width=24, cardinality=4, + block_args=dict(radix=1, avd=True, avd_first=True), **kwargs) + return _create_resnest('resnest50d_1s4x24d', pretrained=pretrained, **model_kwargs) diff --git a/custom_timm/models/resnet.py b/custom_timm/models/resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..1c3b2a9ce02bd223a17be44765bc094390b32811 --- /dev/null +++ b/custom_timm/models/resnet.py @@ -0,0 +1,1608 @@ +"""PyTorch ResNet + +This started as a copy of https://github.com/pytorch/vision 'resnet.py' (BSD-3-Clause) with +additional dropout and dynamic global avg/max pool. + +ResNeXt, SE-ResNeXt, SENet, and MXNet Gluon stem/downsample variants, tiered stems added by Ross Wightman + +Copyright 2019, Ross Wightman +""" +import math +from functools import partial + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, checkpoint_seq +from .layers import DropBlock2d, DropPath, AvgPool2dSame, BlurPool2d, GroupNorm, create_attn, get_attn, create_classifier +from .registry import register_model + +__all__ = ['ResNet', 'BasicBlock', 'Bottleneck'] # model_registry will add each entrypoint fn to this + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv1', 'classifier': 'fc', + **kwargs + } + + +default_cfgs = { + # ResNet and Wide ResNet + 'resnet10t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet10t_176_c3-f3215ab1.pth', + input_size=(3, 176, 176), pool_size=(6, 6), + test_crop_pct=0.95, test_input_size=(3, 224, 224), + first_conv='conv1.0'), + 'resnet14t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet14t_176_c3-c4ed2c37.pth', + input_size=(3, 176, 176), pool_size=(6, 6), + test_crop_pct=0.95, test_input_size=(3, 224, 224), + first_conv='conv1.0'), + 'resnet18': _cfg(url='https://download.pytorch.org/models/resnet18-5c106cde.pth'), + 'resnet18d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet18d_ra2-48a79e06.pth', + interpolation='bicubic', first_conv='conv1.0'), + 'resnet34': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet34-43635321.pth'), + 'resnet34d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet34d_ra2-f8dcfcaf.pth', + interpolation='bicubic', first_conv='conv1.0'), + 'resnet26': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet26-9aa10e23.pth', + interpolation='bicubic'), + 'resnet26d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet26d-69e92c46.pth', + interpolation='bicubic', first_conv='conv1.0'), + 'resnet26t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-attn-weights/resnet26t_256_ra2-6f6fa748.pth', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=0.94), + 'resnet50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_a1_0-14fe96d1.pth', + interpolation='bicubic', crop_pct=0.95), + 'resnet50d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet50d_ra2-464e36ba.pth', + interpolation='bicubic', first_conv='conv1.0'), + 'resnet50t': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0'), + 'resnet101': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet101_a1h-36d3f2aa.pth', + interpolation='bicubic', crop_pct=0.95), + 'resnet101d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet101d_ra2-2803ffab.pth', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), + crop_pct=1.0, test_input_size=(3, 320, 320)), + 'resnet152': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet152_a1h-dc400468.pth', + interpolation='bicubic', crop_pct=0.95), + 'resnet152d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet152d_ra2-5cac0439.pth', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), + crop_pct=1.0, test_input_size=(3, 320, 320)), + 'resnet200': _cfg(url='', interpolation='bicubic'), + 'resnet200d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnet200d_ra2-bdba9bf9.pth', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), + crop_pct=1.0, test_input_size=(3, 320, 320)), + 'tv_resnet34': _cfg(url='https://download.pytorch.org/models/resnet34-333f7ec4.pth'), + 'tv_resnet50': _cfg(url='https://download.pytorch.org/models/resnet50-19c8e357.pth'), + 'tv_resnet101': _cfg(url='https://download.pytorch.org/models/resnet101-5d3b4d8f.pth'), + 'tv_resnet152': _cfg(url='https://download.pytorch.org/models/resnet152-b121ed2d.pth'), + 'wide_resnet50_2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/wide_resnet50_racm-8234f177.pth', + interpolation='bicubic'), + 'wide_resnet101_2': _cfg(url='https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth'), + + # ResNets w/ alternative norm layers + 'resnet50_gn': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnet50_gn_a1h2-8fe6c4d0.pth', + crop_pct=0.94, interpolation='bicubic'), + + # ResNeXt + 'resnext50_32x4d': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnext50_32x4d_a1h-0146ab0a.pth',
+        interpolation='bicubic', crop_pct=0.95),
+    'resnext50d_32x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnext50d_32x4d-103e99f8.pth',
+        interpolation='bicubic',
+        first_conv='conv1.0'),
+    'resnext101_32x4d': _cfg(url=''),
+    'resnext101_32x8d': _cfg(url='https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth'),
+    'resnext101_64x4d': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/resnext101_64x4d_c-0d0e0cc0.pth',
+        interpolation='bicubic', crop_pct=1.0, test_input_size=(3, 288, 288)),
+    'tv_resnext50_32x4d': _cfg(url='https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth'),
+
+    # ResNeXt models - Weakly Supervised Pretraining on Instagram Hashtags
+    # from https://github.com/facebookresearch/WSL-Images
+    # Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only.
+    'ig_resnext101_32x8d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x8-c38310e5.pth'),
+    'ig_resnext101_32x16d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x16-c6f796b0.pth'),
+    'ig_resnext101_32x32d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x32-e4b90b00.pth'),
+    'ig_resnext101_32x48d': _cfg(url='https://download.pytorch.org/models/ig_resnext101_32x48-3e41cc8a.pth'),
+
+    # Semi-Supervised ResNe*t models from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models
+    # Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only.
+    'ssl_resnet18': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnet18-d92f0530.pth'),
+    'ssl_resnet50': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnet50-08389792.pth'),
+    'ssl_resnext50_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext50_32x4-ddb3e555.pth'),
+    'ssl_resnext101_32x4d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x4-dc43570a.pth'),
+    'ssl_resnext101_32x8d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x8-2cfe2f8b.pth'),
+    'ssl_resnext101_32x16d': _cfg(
+        url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_supervised_resnext101_32x16-15fffa57.pth'),
+
+    # Semi-Weakly Supervised ResNe*t models from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models
+    # Please note the CC-BY-NC 4.0 license on these weights, non-commercial use only.
+ 'swsl_resnet18': _cfg( + url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnet18-118f1556.pth'), + 'swsl_resnet50': _cfg( + url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnet50-16a12f1b.pth'), + 'swsl_resnext50_32x4d': _cfg( + url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext50_32x4-72679e44.pth'), + 'swsl_resnext101_32x4d': _cfg( + url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x4-3f87e46b.pth'), + 'swsl_resnext101_32x8d': _cfg( + url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x8-b4712904.pth'), + 'swsl_resnext101_32x16d': _cfg( + url='https://dl.fbaipublicfiles.com/semiweaksupervision/model_files/semi_weakly_supervised_resnext101_32x16-f3559a9c.pth'), + + # Efficient Channel Attention ResNets + 'ecaresnet26t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecaresnet26t_ra2-46609757.pth', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), + crop_pct=0.95, test_input_size=(3, 320, 320)), + 'ecaresnetlight': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnetlight-75a9c627.pth', + interpolation='bicubic'), + 'ecaresnet50d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnet50d-93c81e3b.pth', + interpolation='bicubic', + first_conv='conv1.0'), + 'ecaresnet50d_pruned': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnet50d_p-e4fa23c2.pth', + interpolation='bicubic', + first_conv='conv1.0'), + 'ecaresnet50t': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecaresnet50t_ra2-f7ac63c4.pth', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), + crop_pct=0.95, test_input_size=(3, 320, 320)), + 'ecaresnet101d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnet101d-153dad65.pth', + interpolation='bicubic', first_conv='conv1.0'), + 'ecaresnet101d_pruned': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/ecaresnet101d_p-9e74cb91.pth', + interpolation='bicubic', + first_conv='conv1.0'), + 'ecaresnet200d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), crop_pct=0.94, pool_size=(8, 8)), + 'ecaresnet269d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ecaresnet269d_320_ra2-7baa55cb.pth', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 320, 320), pool_size=(10, 10), + crop_pct=1.0, test_input_size=(3, 352, 352)), + + # Efficient Channel Attention ResNeXts + 'ecaresnext26t_32x4d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0'), + 'ecaresnext50t_32x4d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0'), + + # Squeeze-Excitation ResNets, to eventually replace the models in senet.py + 'seresnet18': _cfg( + url='', + interpolation='bicubic'), + 'seresnet34': _cfg( + url='', + interpolation='bicubic'), + 'seresnet50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet50_ra_224-8efdb4bb.pth', + interpolation='bicubic'), + 'seresnet50t': _cfg( + 
url='', + interpolation='bicubic', + first_conv='conv1.0'), + 'seresnet101': _cfg( + url='', + interpolation='bicubic'), + 'seresnet152': _cfg( + url='', + interpolation='bicubic'), + 'seresnet152d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet152d_ra2-04464dd2.pth', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), pool_size=(8, 8), + crop_pct=1.0, test_input_size=(3, 320, 320) + ), + 'seresnet200d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), crop_pct=0.94, pool_size=(8, 8)), + 'seresnet269d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0', input_size=(3, 256, 256), crop_pct=0.94, pool_size=(8, 8)), + + # Squeeze-Excitation ResNeXts, to eventually replace the models in senet.py + 'seresnext26d_32x4d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26d_32x4d-80fa48a3.pth', + interpolation='bicubic', + first_conv='conv1.0'), + 'seresnext26t_32x4d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26tn_32x4d-569cb627.pth', + interpolation='bicubic', + first_conv='conv1.0'), + 'seresnext50_32x4d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext50_32x4d_racm-a304a460.pth', + interpolation='bicubic'), + 'seresnext101_32x4d': _cfg( + url='', + interpolation='bicubic'), + 'seresnext101_32x8d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/seresnext101_32x8d_ah-e6bc4c0a.pth', + interpolation='bicubic', test_input_size=(3, 288, 288), crop_pct=1.0), + 'seresnext101d_32x8d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/seresnext101d_32x8d_ah-191d7b94.pth', + interpolation='bicubic', first_conv='conv1.0', test_input_size=(3, 288, 288), crop_pct=1.0), + + 'senet154': _cfg( + url='', + interpolation='bicubic', + first_conv='conv1.0'), + + # ResNets with anti-aliasing / blur pool + 'resnetblur18': _cfg( + interpolation='bicubic'), + 'resnetblur50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/resnetblur50-84f4748f.pth', + interpolation='bicubic'), + 'resnetblur50d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0'), + 'resnetblur101d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0'), + 'resnetaa50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnetaa50_a1h-4cf422b3.pth', + test_input_size=(3, 288, 288), test_crop_pct=1.0, interpolation='bicubic'), + 'resnetaa50d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0'), + 'resnetaa101d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0'), + 'seresnetaa50d': _cfg( + url='', + interpolation='bicubic', first_conv='conv1.0'), + 'seresnextaa101d_32x8d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/seresnextaa101d_32x8d_ah-83c8ae12.pth', + interpolation='bicubic', first_conv='conv1.0', test_input_size=(3, 288, 288), crop_pct=1.0), + + # ResNet-RS models + 'resnetrs50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs50_ema-6b53758b.pth', + input_size=(3, 160, 160), pool_size=(5, 5), crop_pct=0.91, test_input_size=(3, 224, 224), + interpolation='bicubic', first_conv='conv1.0'), + 
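# NOTE (editorial comment, not in upstream timm): the ResNet-RS entries in this block appear to follow
+    # timm's convention of listing the smaller train-time resolution as 'input_size' and the larger
+    # evaluation resolution used for the published weights as 'test_input_size'.
+    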
'resnetrs101': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs101_i192_ema-1509bbf6.pth', + input_size=(3, 192, 192), pool_size=(6, 6), crop_pct=0.94, test_input_size=(3, 288, 288), + interpolation='bicubic', first_conv='conv1.0'), + 'resnetrs152': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs152_i256_ema-a9aff7f9.pth', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 320, 320), + interpolation='bicubic', first_conv='conv1.0'), + 'resnetrs200': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/resnetrs200_c-6b698b88.pth', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 320, 320), + interpolation='bicubic', first_conv='conv1.0'), + 'resnetrs270': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs270_ema-b40e674c.pth', + input_size=(3, 256, 256), pool_size=(8, 8), crop_pct=1.0, test_input_size=(3, 352, 352), + interpolation='bicubic', first_conv='conv1.0'), + 'resnetrs350': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs350_i256_ema-5a1aa8f1.pth', + input_size=(3, 288, 288), pool_size=(9, 9), crop_pct=1.0, test_input_size=(3, 384, 384), + interpolation='bicubic', first_conv='conv1.0'), + 'resnetrs420': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rs-weights/resnetrs420_ema-972dee69.pth', + input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0, test_input_size=(3, 416, 416), + interpolation='bicubic', first_conv='conv1.0'), +} + + +def get_padding(kernel_size, stride, dilation=1): + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +def create_aa(aa_layer, channels, stride=2, enable=True): + if not aa_layer or not enable: + return nn.Identity() + return aa_layer(stride) if issubclass(aa_layer, nn.AvgPool2d) else aa_layer(channels=channels, stride=stride) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__( + self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, + reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): + super(BasicBlock, self).__init__() + + assert cardinality == 1, 'BasicBlock only supports cardinality of 1' + assert base_width == 64, 'BasicBlock does not support changing base width' + first_planes = planes // reduce_first + outplanes = planes * self.expansion + first_dilation = first_dilation or dilation + use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation) + + self.conv1 = nn.Conv2d( + inplanes, first_planes, kernel_size=3, stride=1 if use_aa else stride, padding=first_dilation, + dilation=first_dilation, bias=False) + self.bn1 = norm_layer(first_planes) + self.drop_block = drop_block() if drop_block is not None else nn.Identity() + self.act1 = act_layer(inplace=True) + self.aa = create_aa(aa_layer, channels=first_planes, stride=stride, enable=use_aa) + + self.conv2 = nn.Conv2d( + first_planes, outplanes, kernel_size=3, padding=dilation, dilation=dilation, bias=False) + self.bn2 = norm_layer(outplanes) + + self.se = create_attn(attn_layer, outplanes) + + self.act2 = act_layer(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.drop_path = 
drop_path + + def zero_init_last(self): + nn.init.zeros_(self.bn2.weight) + + def forward(self, x): + shortcut = x + + x = self.conv1(x) + x = self.bn1(x) + x = self.drop_block(x) + x = self.act1(x) + x = self.aa(x) + + x = self.conv2(x) + x = self.bn2(x) + + if self.se is not None: + x = self.se(x) + + if self.drop_path is not None: + x = self.drop_path(x) + + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act2(x) + + return x + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__( + self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, + reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): + super(Bottleneck, self).__init__() + + width = int(math.floor(planes * (base_width / 64)) * cardinality) + first_planes = width // reduce_first + outplanes = planes * self.expansion + first_dilation = first_dilation or dilation + use_aa = aa_layer is not None and (stride == 2 or first_dilation != dilation) + + self.conv1 = nn.Conv2d(inplanes, first_planes, kernel_size=1, bias=False) + self.bn1 = norm_layer(first_planes) + self.act1 = act_layer(inplace=True) + + self.conv2 = nn.Conv2d( + first_planes, width, kernel_size=3, stride=1 if use_aa else stride, + padding=first_dilation, dilation=first_dilation, groups=cardinality, bias=False) + self.bn2 = norm_layer(width) + self.drop_block = drop_block() if drop_block is not None else nn.Identity() + self.act2 = act_layer(inplace=True) + self.aa = create_aa(aa_layer, channels=width, stride=stride, enable=use_aa) + + self.conv3 = nn.Conv2d(width, outplanes, kernel_size=1, bias=False) + self.bn3 = norm_layer(outplanes) + + self.se = create_attn(attn_layer, outplanes) + + self.act3 = act_layer(inplace=True) + self.downsample = downsample + self.stride = stride + self.dilation = dilation + self.drop_path = drop_path + + def zero_init_last(self): + nn.init.zeros_(self.bn3.weight) + + def forward(self, x): + shortcut = x + + x = self.conv1(x) + x = self.bn1(x) + x = self.act1(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.drop_block(x) + x = self.act2(x) + x = self.aa(x) + + x = self.conv3(x) + x = self.bn3(x) + + if self.se is not None: + x = self.se(x) + + if self.drop_path is not None: + x = self.drop_path(x) + + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act3(x) + + return x + + +def downsample_conv( + in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): + norm_layer = norm_layer or nn.BatchNorm2d + kernel_size = 1 if stride == 1 and dilation == 1 else kernel_size + first_dilation = (first_dilation or dilation) if kernel_size > 1 else 1 + p = get_padding(kernel_size, stride, first_dilation) + + return nn.Sequential(*[ + nn.Conv2d( + in_channels, out_channels, kernel_size, stride=stride, padding=p, dilation=first_dilation, bias=False), + norm_layer(out_channels) + ]) + + +def downsample_avg( + in_channels, out_channels, kernel_size, stride=1, dilation=1, first_dilation=None, norm_layer=None): + norm_layer = norm_layer or nn.BatchNorm2d + avg_stride = stride if dilation == 1 else 1 + if stride == 1 and dilation == 1: + pool = nn.Identity() + else: + avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d + pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) + + return nn.Sequential(*[ + pool, + 
nn.Conv2d(in_channels, out_channels, 1, stride=1, padding=0, bias=False), + norm_layer(out_channels) + ]) + + +def drop_blocks(drop_prob=0.): + return [ + None, None, + partial(DropBlock2d, drop_prob=drop_prob, block_size=5, gamma_scale=0.25) if drop_prob else None, + partial(DropBlock2d, drop_prob=drop_prob, block_size=3, gamma_scale=1.00) if drop_prob else None] + + +def make_blocks( + block_fn, channels, block_repeats, inplanes, reduce_first=1, output_stride=32, + down_kernel_size=1, avg_down=False, drop_block_rate=0., drop_path_rate=0., **kwargs): + stages = [] + feature_info = [] + net_num_blocks = sum(block_repeats) + net_block_idx = 0 + net_stride = 4 + dilation = prev_dilation = 1 + for stage_idx, (planes, num_blocks, db) in enumerate(zip(channels, block_repeats, drop_blocks(drop_block_rate))): + stage_name = f'layer{stage_idx + 1}' # never liked this name, but weight compat requires it + stride = 1 if stage_idx == 0 else 2 + if net_stride >= output_stride: + dilation *= stride + stride = 1 + else: + net_stride *= stride + + downsample = None + if stride != 1 or inplanes != planes * block_fn.expansion: + down_kwargs = dict( + in_channels=inplanes, out_channels=planes * block_fn.expansion, kernel_size=down_kernel_size, + stride=stride, dilation=dilation, first_dilation=prev_dilation, norm_layer=kwargs.get('norm_layer')) + downsample = downsample_avg(**down_kwargs) if avg_down else downsample_conv(**down_kwargs) + + block_kwargs = dict(reduce_first=reduce_first, dilation=dilation, drop_block=db, **kwargs) + blocks = [] + for block_idx in range(num_blocks): + downsample = downsample if block_idx == 0 else None + stride = stride if block_idx == 0 else 1 + block_dpr = drop_path_rate * net_block_idx / (net_num_blocks - 1) # stochastic depth linear decay rule + blocks.append(block_fn( + inplanes, planes, stride, downsample, first_dilation=prev_dilation, + drop_path=DropPath(block_dpr) if block_dpr > 0. else None, **block_kwargs)) + prev_dilation = dilation + inplanes = planes * block_fn.expansion + net_block_idx += 1 + + stages.append((stage_name, nn.Sequential(*blocks))) + feature_info.append(dict(num_chs=inplanes, reduction=net_stride, module=stage_name)) + + return stages, feature_info + + +class ResNet(nn.Module): + """ResNet / ResNeXt / SE-ResNeXt / SE-Net + + This class implements all variants of ResNet, ResNeXt, SE-ResNeXt, and SENet that + * have > 1 stride in the 3x3 conv layer of bottleneck + * have conv-bn-act ordering + + This ResNet impl supports a number of stem and downsample options based on the v1c, v1d, v1e, and v1s + variants included in the MXNet Gluon ResNetV1b model. The C and D variants are also discussed in the + 'Bag of Tricks' paper: https://arxiv.org/pdf/1812.01187. The B variant is equivalent to torchvision default. 
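+
+    Illustrative example (editorial addition, not part of the upstream timm docstring): a ResNet-50-D
+    style network corresponds to
+        ResNet(Bottleneck, [3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True)
+    which matches the `resnet50d` entrypoint registered later in this file.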
+
+    ResNet variants (the same modifications can be used in SE/ResNeXt models as well):
+      * normal, b - 7x7 stem, stem_width = 64, same as torchvision ResNet, NVIDIA ResNet 'v1.5', Gluon v1b
+      * c - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64)
+      * d - 3 layer deep 3x3 stem, stem_width = 32 (32, 32, 64), average pool in downsample
+      * e - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128), average pool in downsample
+      * s - 3 layer deep 3x3 stem, stem_width = 64 (64, 64, 128)
+      * t - 3 layer deep 3x3 stem, stem width = 32 (24, 48, 64), average pool in downsample
+      * tn - 3 layer deep 3x3 stem, stem width = 32 (24, 32, 64), average pool in downsample
+
+    ResNeXt
+      * normal - 7x7 stem, stem_width = 64, standard cardinality and base widths
+      * same c, d, e, s variants as ResNet can be enabled
+
+    SE-ResNeXt
+      * normal - 7x7 stem, stem_width = 64
+      * same c, d, e, s variants as ResNet can be enabled
+
+    SENet-154 - 3 layer deep 3x3 stem (same as v1c-v1s), stem_width = 64, cardinality=64,
+        reduction by 2 on width of first bottleneck convolution, 3x3 downsample convs after first block
+
+    Parameters
+    ----------
+    block : Block, class for the residual block. Options are BasicBlock, Bottleneck.
+    layers : list of int, number of layers in each block
+    num_classes : int, default 1000, number of classification classes.
+    in_chans : int, default 3, number of input (color) channels.
+    output_stride : int, default 32, output stride of the network, 32, 16, or 8.
+    global_pool : str, Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax'
+    cardinality : int, default 1, number of convolution groups for 3x3 conv in Bottleneck.
+    base_width : int, default 64, factor determining bottleneck channels. `planes * base_width / 64 * cardinality`
+    stem_width : int, default 64, number of channels in stem convolutions
+    stem_type : str, default ''
+        The type of stem:
+          * '', default - a single 7x7 conv with a width of stem_width
+          * 'deep' - three 3x3 convolution layers of widths stem_width, stem_width, stem_width * 2
+          * 'deep_tiered' - three 3x3 conv layers of widths stem_width//4 * 3, stem_width, stem_width * 2
+    block_reduce_first : int, default 1
+        Reduction factor for first convolution output width of residual blocks, 1 for all archs except SENets, where it is 2
+    down_kernel_size : int, default 1, kernel size of residual block downsample path, 1x1 for most, 3x3 for SENets
+    avg_down : bool, default False, use average pooling for projection skip connection between stages/downsample.
+    act_layer : nn.Module, activation layer
+    norm_layer : nn.Module, normalization layer
+    aa_layer : nn.Module, anti-aliasing layer
+    drop_rate : float, default 0.
+        
Dropout probability before classifier, for training + """ + + def __init__( + self, block, layers, num_classes=1000, in_chans=3, output_stride=32, global_pool='avg', + cardinality=1, base_width=64, stem_width=64, stem_type='', replace_stem_pool=False, block_reduce_first=1, + down_kernel_size=1, avg_down=False, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None, + drop_rate=0.0, drop_path_rate=0., drop_block_rate=0., zero_init_last=True, block_args=None): + super(ResNet, self).__init__() + block_args = block_args or dict() + assert output_stride in (8, 16, 32) + self.num_classes = num_classes + self.drop_rate = drop_rate + self.grad_checkpointing = False + + # Stem + deep_stem = 'deep' in stem_type + inplanes = stem_width * 2 if deep_stem else 64 + if deep_stem: + stem_chs = (stem_width, stem_width) + if 'tiered' in stem_type: + stem_chs = (3 * (stem_width // 4), stem_width) + self.conv1 = nn.Sequential(*[ + nn.Conv2d(in_chans, stem_chs[0], 3, stride=2, padding=1, bias=False), + norm_layer(stem_chs[0]), + act_layer(inplace=True), + nn.Conv2d(stem_chs[0], stem_chs[1], 3, stride=1, padding=1, bias=False), + norm_layer(stem_chs[1]), + act_layer(inplace=True), + nn.Conv2d(stem_chs[1], inplanes, 3, stride=1, padding=1, bias=False)]) + else: + self.conv1 = nn.Conv2d(in_chans, inplanes, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = norm_layer(inplanes) + self.act1 = act_layer(inplace=True) + self.feature_info = [dict(num_chs=inplanes, reduction=2, module='act1')] + + # Stem pooling. The name 'maxpool' remains for weight compatibility. + if replace_stem_pool: + self.maxpool = nn.Sequential(*filter(None, [ + nn.Conv2d(inplanes, inplanes, 3, stride=1 if aa_layer else 2, padding=1, bias=False), + create_aa(aa_layer, channels=inplanes, stride=2) if aa_layer is not None else None, + norm_layer(inplanes), + act_layer(inplace=True) + ])) + else: + if aa_layer is not None: + if issubclass(aa_layer, nn.AvgPool2d): + self.maxpool = aa_layer(2) + else: + self.maxpool = nn.Sequential(*[ + nn.MaxPool2d(kernel_size=3, stride=1, padding=1), + aa_layer(channels=inplanes, stride=2)]) + else: + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + # Feature Blocks + channels = [64, 128, 256, 512] + stage_modules, stage_feature_info = make_blocks( + block, channels, layers, inplanes, cardinality=cardinality, base_width=base_width, + output_stride=output_stride, reduce_first=block_reduce_first, avg_down=avg_down, + down_kernel_size=down_kernel_size, act_layer=act_layer, norm_layer=norm_layer, aa_layer=aa_layer, + drop_block_rate=drop_block_rate, drop_path_rate=drop_path_rate, **block_args) + for stage in stage_modules: + self.add_module(*stage) # layer1, layer2, etc + self.feature_info.extend(stage_feature_info) + + # Head (Pooling and Classifier) + self.num_features = 512 * block.expansion + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + self.init_weights(zero_init_last=zero_init_last) + + @torch.jit.ignore + def init_weights(self, zero_init_last=True): + for n, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + if zero_init_last: + for m in self.modules(): + if hasattr(m, 'zero_init_last'): + m.zero_init_last() + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict(stem=r'^conv1|bn1|maxpool', blocks=r'^layer(\d+)' if coarse 
else r'^layer(\d+)\.(\d+)') + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self, name_only=False): + return 'fc' if name_only else self.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.act1(x) + x = self.maxpool(x) + + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq([self.layer1, self.layer2, self.layer3, self.layer4], x, flatten=True) + else: + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate: + x = F.dropout(x, p=float(self.drop_rate), training=self.training) + return x if pre_logits else self.fc(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_resnet(variant, pretrained=False, **kwargs): + return build_model_with_cfg(ResNet, variant, pretrained, **kwargs) + + +@register_model +def resnet10t(pretrained=False, **kwargs): + """Constructs a ResNet-10-T model. + """ + model_args = dict( + block=BasicBlock, layers=[1, 1, 1, 1], stem_width=32, stem_type='deep_tiered', avg_down=True, **kwargs) + return _create_resnet('resnet10t', pretrained, **model_args) + + +@register_model +def resnet14t(pretrained=False, **kwargs): + """Constructs a ResNet-14-T model. + """ + model_args = dict( + block=Bottleneck, layers=[1, 1, 1, 1], stem_width=32, stem_type='deep_tiered', avg_down=True, **kwargs) + return _create_resnet('resnet14t', pretrained, **model_args) + + +@register_model +def resnet18(pretrained=False, **kwargs): + """Constructs a ResNet-18 model. + """ + model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs) + return _create_resnet('resnet18', pretrained, **model_args) + + +@register_model +def resnet18d(pretrained=False, **kwargs): + """Constructs a ResNet-18-D model. + """ + model_args = dict( + block=BasicBlock, layers=[2, 2, 2, 2], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnet18d', pretrained, **model_args) + + +@register_model +def resnet34(pretrained=False, **kwargs): + """Constructs a ResNet-34 model. + """ + model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], **kwargs) + return _create_resnet('resnet34', pretrained, **model_args) + + +@register_model +def resnet34d(pretrained=False, **kwargs): + """Constructs a ResNet-34-D model. + """ + model_args = dict( + block=BasicBlock, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnet34d', pretrained, **model_args) + + +@register_model +def resnet26(pretrained=False, **kwargs): + """Constructs a ResNet-26 model. + """ + model_args = dict(block=Bottleneck, layers=[2, 2, 2, 2], **kwargs) + return _create_resnet('resnet26', pretrained, **model_args) + + +@register_model +def resnet26t(pretrained=False, **kwargs): + """Constructs a ResNet-26-T model. 
+ """ + model_args = dict( + block=Bottleneck, layers=[2, 2, 2, 2], stem_width=32, stem_type='deep_tiered', avg_down=True, **kwargs) + return _create_resnet('resnet26t', pretrained, **model_args) + + +@register_model +def resnet26d(pretrained=False, **kwargs): + """Constructs a ResNet-26-D model. + """ + model_args = dict(block=Bottleneck, layers=[2, 2, 2, 2], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnet26d', pretrained, **model_args) + + +@register_model +def resnet50(pretrained=False, **kwargs): + """Constructs a ResNet-50 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) + return _create_resnet('resnet50', pretrained, **model_args) + + +@register_model +def resnet50d(pretrained=False, **kwargs): + """Constructs a ResNet-50-D model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnet50d', pretrained, **model_args) + + +@register_model +def resnet50t(pretrained=False, **kwargs): + """Constructs a ResNet-50-T model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep_tiered', avg_down=True, **kwargs) + return _create_resnet('resnet50t', pretrained, **model_args) + + +@register_model +def resnet101(pretrained=False, **kwargs): + """Constructs a ResNet-101 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], **kwargs) + return _create_resnet('resnet101', pretrained, **model_args) + + +@register_model +def resnet101d(pretrained=False, **kwargs): + """Constructs a ResNet-101-D model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnet101d', pretrained, **model_args) + + +@register_model +def resnet152(pretrained=False, **kwargs): + """Constructs a ResNet-152 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], **kwargs) + return _create_resnet('resnet152', pretrained, **model_args) + + +@register_model +def resnet152d(pretrained=False, **kwargs): + """Constructs a ResNet-152-D model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnet152d', pretrained, **model_args) + + +@register_model +def resnet200(pretrained=False, **kwargs): + """Constructs a ResNet-200 model. + """ + model_args = dict(block=Bottleneck, layers=[3, 24, 36, 3], **kwargs) + return _create_resnet('resnet200', pretrained, **model_args) + + +@register_model +def resnet200d(pretrained=False, **kwargs): + """Constructs a ResNet-200-D model. + """ + model_args = dict( + block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnet200d', pretrained, **model_args) + + +@register_model +def tv_resnet34(pretrained=False, **kwargs): + """Constructs a ResNet-34 model with original Torchvision weights. + """ + model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], **kwargs) + return _create_resnet('tv_resnet34', pretrained, **model_args) + + +@register_model +def tv_resnet50(pretrained=False, **kwargs): + """Constructs a ResNet-50 model with original Torchvision weights. 
+ """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) + return _create_resnet('tv_resnet50', pretrained, **model_args) + + +@register_model +def tv_resnet101(pretrained=False, **kwargs): + """Constructs a ResNet-101 model w/ Torchvision pretrained weights. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], **kwargs) + return _create_resnet('tv_resnet101', pretrained, **model_args) + + +@register_model +def tv_resnet152(pretrained=False, **kwargs): + """Constructs a ResNet-152 model w/ Torchvision pretrained weights. + """ + model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], **kwargs) + return _create_resnet('tv_resnet152', pretrained, **model_args) + + +@register_model +def wide_resnet50_2(pretrained=False, **kwargs): + """Constructs a Wide ResNet-50-2 model. + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048 + channels, and in Wide ResNet-50-2 has 2048-1024-2048. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], base_width=128, **kwargs) + return _create_resnet('wide_resnet50_2', pretrained, **model_args) + + +@register_model +def wide_resnet101_2(pretrained=False, **kwargs): + """Constructs a Wide ResNet-101-2 model. + The model is the same as ResNet except for the bottleneck number of channels + which is twice larger in every block. The number of channels in outer 1x1 + convolutions is the same. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], base_width=128, **kwargs) + return _create_resnet('wide_resnet101_2', pretrained, **model_args) + + +@register_model +def resnet50_gn(pretrained=False, **kwargs): + """Constructs a ResNet-50 model w/ GroupNorm + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) + return _create_resnet('resnet50_gn', pretrained, norm_layer=GroupNorm, **model_args) + + +@register_model +def resnext50_32x4d(pretrained=False, **kwargs): + """Constructs a ResNeXt50-32x4d model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) + return _create_resnet('resnext50_32x4d', pretrained, **model_args) + + +@register_model +def resnext50d_32x4d(pretrained=False, **kwargs): + """Constructs a ResNeXt50d-32x4d model. ResNext50 w/ deep stem & avg pool downsample + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, + stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnext50d_32x4d', pretrained, **model_args) + + +@register_model +def resnext101_32x4d(pretrained=False, **kwargs): + """Constructs a ResNeXt-101 32x4d model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs) + return _create_resnet('resnext101_32x4d', pretrained, **model_args) + + +@register_model +def resnext101_32x8d(pretrained=False, **kwargs): + """Constructs a ResNeXt-101 32x8d model. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs) + return _create_resnet('resnext101_32x8d', pretrained, **model_args) + + +@register_model +def resnext101_64x4d(pretrained=False, **kwargs): + """Constructs a ResNeXt101-64x4d model. 
+ """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=64, base_width=4, **kwargs) + return _create_resnet('resnext101_64x4d', pretrained, **model_args) + + +@register_model +def tv_resnext50_32x4d(pretrained=False, **kwargs): + """Constructs a ResNeXt50-32x4d model with original Torchvision weights. + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) + return _create_resnet('tv_resnext50_32x4d', pretrained, **model_args) + + +@register_model +def ig_resnext101_32x8d(pretrained=False, **kwargs): + """Constructs a ResNeXt-101 32x8 model pre-trained on weakly-supervised data + and finetuned on ImageNet from Figure 5 in + `"Exploring the Limits of Weakly Supervised Pretraining" `_ + Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs) + return _create_resnet('ig_resnext101_32x8d', pretrained, **model_args) + + +@register_model +def ig_resnext101_32x16d(pretrained=False, **kwargs): + """Constructs a ResNeXt-101 32x16 model pre-trained on weakly-supervised data + and finetuned on ImageNet from Figure 5 in + `"Exploring the Limits of Weakly Supervised Pretraining" `_ + Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=16, **kwargs) + return _create_resnet('ig_resnext101_32x16d', pretrained, **model_args) + + +@register_model +def ig_resnext101_32x32d(pretrained=False, **kwargs): + """Constructs a ResNeXt-101 32x32 model pre-trained on weakly-supervised data + and finetuned on ImageNet from Figure 5 in + `"Exploring the Limits of Weakly Supervised Pretraining" `_ + Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=32, **kwargs) + return _create_resnet('ig_resnext101_32x32d', pretrained, **model_args) + + +@register_model +def ig_resnext101_32x48d(pretrained=False, **kwargs): + """Constructs a ResNeXt-101 32x48 model pre-trained on weakly-supervised data + and finetuned on ImageNet from Figure 5 in + `"Exploring the Limits of Weakly Supervised Pretraining" `_ + Weights from https://pytorch.org/hub/facebookresearch_WSL-Images_resnext/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=48, **kwargs) + return _create_resnet('ig_resnext101_32x48d', pretrained, **model_args) + + +@register_model +def ssl_resnet18(pretrained=False, **kwargs): + """Constructs a semi-supervised ResNet-18 model pre-trained on YFCC100M dataset and finetuned on ImageNet + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs) + return _create_resnet('ssl_resnet18', pretrained, **model_args) + + +@register_model +def ssl_resnet50(pretrained=False, **kwargs): + """Constructs a semi-supervised ResNet-50 model pre-trained on YFCC100M dataset and finetuned on ImageNet + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) + return _create_resnet('ssl_resnet50', pretrained, **model_args) + + +@register_model +def 
ssl_resnext50_32x4d(pretrained=False, **kwargs): + """Constructs a semi-supervised ResNeXt-50 32x4 model pre-trained on YFCC100M dataset and finetuned on ImageNet + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) + return _create_resnet('ssl_resnext50_32x4d', pretrained, **model_args) + + +@register_model +def ssl_resnext101_32x4d(pretrained=False, **kwargs): + """Constructs a semi-supervised ResNeXt-101 32x4 model pre-trained on YFCC100M dataset and finetuned on ImageNet + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs) + return _create_resnet('ssl_resnext101_32x4d', pretrained, **model_args) + + +@register_model +def ssl_resnext101_32x8d(pretrained=False, **kwargs): + """Constructs a semi-supervised ResNeXt-101 32x8 model pre-trained on YFCC100M dataset and finetuned on ImageNet + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs) + return _create_resnet('ssl_resnext101_32x8d', pretrained, **model_args) + + +@register_model +def ssl_resnext101_32x16d(pretrained=False, **kwargs): + """Constructs a semi-supervised ResNeXt-101 32x16 model pre-trained on YFCC100M dataset and finetuned on ImageNet + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=16, **kwargs) + return _create_resnet('ssl_resnext101_32x16d', pretrained, **model_args) + + +@register_model +def swsl_resnet18(pretrained=False, **kwargs): + """Constructs a semi-weakly supervised Resnet-18 model pre-trained on 1B weakly supervised + image dataset and finetuned on ImageNet. + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], **kwargs) + return _create_resnet('swsl_resnet18', pretrained, **model_args) + + +@register_model +def swsl_resnet50(pretrained=False, **kwargs): + """Constructs a semi-weakly supervised ResNet-50 model pre-trained on 1B weakly supervised + image dataset and finetuned on ImageNet. + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], **kwargs) + return _create_resnet('swsl_resnet50', pretrained, **model_args) + + +@register_model +def swsl_resnext50_32x4d(pretrained=False, **kwargs): + """Constructs a semi-weakly supervised ResNeXt-50 32x4 model pre-trained on 1B weakly supervised + image dataset and finetuned on ImageNet. 
+ `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, **kwargs) + return _create_resnet('swsl_resnext50_32x4d', pretrained, **model_args) + + +@register_model +def swsl_resnext101_32x4d(pretrained=False, **kwargs): + """Constructs a semi-weakly supervised ResNeXt-101 32x4 model pre-trained on 1B weakly supervised + image dataset and finetuned on ImageNet. + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, **kwargs) + return _create_resnet('swsl_resnext101_32x4d', pretrained, **model_args) + + +@register_model +def swsl_resnext101_32x8d(pretrained=False, **kwargs): + """Constructs a semi-weakly supervised ResNeXt-101 32x8 model pre-trained on 1B weakly supervised + image dataset and finetuned on ImageNet. + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, **kwargs) + return _create_resnet('swsl_resnext101_32x8d', pretrained, **model_args) + + +@register_model +def swsl_resnext101_32x16d(pretrained=False, **kwargs): + """Constructs a semi-weakly supervised ResNeXt-101 32x16 model pre-trained on 1B weakly supervised + image dataset and finetuned on ImageNet. + `"Billion-scale Semi-Supervised Learning for Image Classification" `_ + Weights from https://github.com/facebookresearch/semi-supervised-ImageNet1K-models/ + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=16, **kwargs) + return _create_resnet('swsl_resnext101_32x16d', pretrained, **model_args) + + +@register_model +def ecaresnet26t(pretrained=False, **kwargs): + """Constructs an ECA-ResNeXt-26-T model. + This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels + in the deep stem and ECA attn. + """ + model_args = dict( + block=Bottleneck, layers=[2, 2, 2, 2], stem_width=32, + stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnet26t', pretrained, **model_args) + + +@register_model +def ecaresnet50d(pretrained=False, **kwargs): + """Constructs a ResNet-50-D model with eca. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnet50d', pretrained, **model_args) + + +@register_model +def ecaresnet50d_pruned(pretrained=False, **kwargs): + """Constructs a ResNet-50-D model pruned with eca. + The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnet50d_pruned', pretrained, pruned=True, **model_args) + + +@register_model +def ecaresnet50t(pretrained=False, **kwargs): + """Constructs an ECA-ResNet-50-T model. + Like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels in the deep stem and ECA attn. 
+ """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, + stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnet50t', pretrained, **model_args) + + +@register_model +def ecaresnetlight(pretrained=False, **kwargs): + """Constructs a ResNet-50-D light model with eca. + """ + model_args = dict( + block=Bottleneck, layers=[1, 1, 11, 3], stem_width=32, avg_down=True, + block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnetlight', pretrained, **model_args) + + +@register_model +def ecaresnet101d(pretrained=False, **kwargs): + """Constructs a ResNet-101-D model with eca. + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnet101d', pretrained, **model_args) + + +@register_model +def ecaresnet101d_pruned(pretrained=False, **kwargs): + """Constructs a ResNet-101-D model pruned with eca. + The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnet101d_pruned', pretrained, pruned=True, **model_args) + + +@register_model +def ecaresnet200d(pretrained=False, **kwargs): + """Constructs a ResNet-200-D model with ECA. + """ + model_args = dict( + block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnet200d', pretrained, **model_args) + + +@register_model +def ecaresnet269d(pretrained=False, **kwargs): + """Constructs a ResNet-269-D model with ECA. + """ + model_args = dict( + block=Bottleneck, layers=[3, 30, 48, 8], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnet269d', pretrained, **model_args) + + +@register_model +def ecaresnext26t_32x4d(pretrained=False, **kwargs): + """Constructs an ECA-ResNeXt-26-T model. + This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels + in the deep stem. This model replaces SE module with the ECA module + """ + model_args = dict( + block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, + stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnext26t_32x4d', pretrained, **model_args) + + +@register_model +def ecaresnext50t_32x4d(pretrained=False, **kwargs): + """Constructs an ECA-ResNeXt-50-T model. + This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels + in the deep stem. 
This model replaces SE module with the ECA module + """ + model_args = dict( + block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, + stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='eca'), **kwargs) + return _create_resnet('ecaresnext50t_32x4d', pretrained, **model_args) + + +@register_model +def seresnet18(pretrained=False, **kwargs): + model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnet18', pretrained, **model_args) + + +@register_model +def seresnet34(pretrained=False, **kwargs): + model_args = dict(block=BasicBlock, layers=[3, 4, 6, 3], block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnet34', pretrained, **model_args) + + +@register_model +def seresnet50(pretrained=False, **kwargs): + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnet50', pretrained, **model_args) + + +@register_model +def seresnet50t(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep_tiered', avg_down=True, + block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnet50t', pretrained, **model_args) + + +@register_model +def seresnet101(pretrained=False, **kwargs): + model_args = dict(block=Bottleneck, layers=[3, 4, 23, 3], block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnet101', pretrained, **model_args) + + +@register_model +def seresnet152(pretrained=False, **kwargs): + model_args = dict(block=Bottleneck, layers=[3, 8, 36, 3], block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnet152', pretrained, **model_args) + + +@register_model +def seresnet152d(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnet152d', pretrained, **model_args) + + +@register_model +def seresnet200d(pretrained=False, **kwargs): + """Constructs a ResNet-200-D model with SE attn. + """ + model_args = dict( + block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnet200d', pretrained, **model_args) + + +@register_model +def seresnet269d(pretrained=False, **kwargs): + """Constructs a ResNet-269-D model with SE attn. + """ + model_args = dict( + block=Bottleneck, layers=[3, 30, 48, 8], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnet269d', pretrained, **model_args) + + +@register_model +def seresnext26d_32x4d(pretrained=False, **kwargs): + """Constructs a SE-ResNeXt-26-D model.` + This is technically a 28 layer ResNet, using the 'D' modifier from Gluon / bag-of-tricks for + combination of deep stem and avg_pool in downsample. + """ + model_args = dict( + block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, + stem_type='deep', avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnext26d_32x4d', pretrained, **model_args) + + +@register_model +def seresnext26t_32x4d(pretrained=False, **kwargs): + """Constructs a SE-ResNet-26-T model. + This is technically a 28 layer ResNet, like a 'D' bag-of-tricks model but with tiered 24, 32, 64 channels + in the deep stem. 
+ """ + model_args = dict( + block=Bottleneck, layers=[2, 2, 2, 2], cardinality=32, base_width=4, stem_width=32, + stem_type='deep_tiered', avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnext26t_32x4d', pretrained, **model_args) + + +@register_model +def seresnext26tn_32x4d(pretrained=False, **kwargs): + """Constructs a SE-ResNeXt-26-T model. + NOTE I deprecated previous 't' model defs and replaced 't' with 'tn', this was the only tn model of note + so keeping this def for backwards compat with any uses out there. Old 't' model is lost. + """ + return seresnext26t_32x4d(pretrained=pretrained, **kwargs) + + +@register_model +def seresnext50_32x4d(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, + block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnext50_32x4d', pretrained, **model_args) + + +@register_model +def seresnext101_32x4d(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=4, + block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnext101_32x4d', pretrained, **model_args) + + +@register_model +def seresnext101_32x8d(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, + block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnext101_32x8d', pretrained, **model_args) + + +@register_model +def seresnext101d_32x8d(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, + stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnext101d_32x8d', pretrained, **model_args) + + +@register_model +def senet154(pretrained=False, **kwargs): + model_args = dict( + block=Bottleneck, layers=[3, 8, 36, 3], cardinality=64, base_width=4, stem_type='deep', + down_kernel_size=3, block_reduce_first=2, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('senet154', pretrained, **model_args) + + +@register_model +def resnetblur18(pretrained=False, **kwargs): + """Constructs a ResNet-18 model with blur anti-aliasing + """ + model_args = dict(block=BasicBlock, layers=[2, 2, 2, 2], aa_layer=BlurPool2d, **kwargs) + return _create_resnet('resnetblur18', pretrained, **model_args) + + +@register_model +def resnetblur50(pretrained=False, **kwargs): + """Constructs a ResNet-50 model with blur anti-aliasing + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=BlurPool2d, **kwargs) + return _create_resnet('resnetblur50', pretrained, **model_args) + + +@register_model +def resnetblur50d(pretrained=False, **kwargs): + """Constructs a ResNet-50-D model with blur anti-aliasing + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=BlurPool2d, + stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnetblur50d', pretrained, **model_args) + + +@register_model +def resnetblur101d(pretrained=False, **kwargs): + """Constructs a ResNet-101-D model with blur anti-aliasing + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], aa_layer=BlurPool2d, + stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnetblur101d', pretrained, **model_args) + + +@register_model +def resnetaa50(pretrained=False, **kwargs): + """Constructs a ResNet-50 model with avgpool 
anti-aliasing + """ + model_args = dict(block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=nn.AvgPool2d, **kwargs) + return _create_resnet('resnetaa50', pretrained, **model_args) + + +@register_model +def resnetaa50d(pretrained=False, **kwargs): + """Constructs a ResNet-50-D model with avgpool anti-aliasing + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=nn.AvgPool2d, + stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnetaa50d', pretrained, **model_args) + + +@register_model +def resnetaa101d(pretrained=False, **kwargs): + """Constructs a ResNet-101-D model with avgpool anti-aliasing + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], aa_layer=nn.AvgPool2d, + stem_width=32, stem_type='deep', avg_down=True, **kwargs) + return _create_resnet('resnetaa101d', pretrained, **model_args) + + +@register_model +def seresnetaa50d(pretrained=False, **kwargs): + """Constructs a SE=ResNet-50-D model with avgpool anti-aliasing + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], aa_layer=nn.AvgPool2d, + stem_width=32, stem_type='deep', avg_down=True, block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnetaa50d', pretrained, **model_args) + + +@register_model +def seresnextaa101d_32x8d(pretrained=False, **kwargs): + """Constructs a SE=ResNeXt-101-D 32x8d model with avgpool anti-aliasing + """ + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], cardinality=32, base_width=8, + stem_width=32, stem_type='deep', avg_down=True, aa_layer=nn.AvgPool2d, + block_args=dict(attn_layer='se'), **kwargs) + return _create_resnet('seresnextaa101d_32x8d', pretrained, **model_args) + + +@register_model +def resnetrs50(pretrained=False, **kwargs): + """Constructs a ResNet-RS-50 model. + Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 + Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs + """ + attn_layer = partial(get_attn('se'), rd_ratio=0.25) + model_args = dict( + block=Bottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', replace_stem_pool=True, + avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) + return _create_resnet('resnetrs50', pretrained, **model_args) + + +@register_model +def resnetrs101(pretrained=False, **kwargs): + """Constructs a ResNet-RS-101 model. + Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 + Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs + """ + attn_layer = partial(get_attn('se'), rd_ratio=0.25) + model_args = dict( + block=Bottleneck, layers=[3, 4, 23, 3], stem_width=32, stem_type='deep', replace_stem_pool=True, + avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) + return _create_resnet('resnetrs101', pretrained, **model_args) + + +@register_model +def resnetrs152(pretrained=False, **kwargs): + """Constructs a ResNet-RS-152 model. 
+ Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 + Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs + """ + attn_layer = partial(get_attn('se'), rd_ratio=0.25) + model_args = dict( + block=Bottleneck, layers=[3, 8, 36, 3], stem_width=32, stem_type='deep', replace_stem_pool=True, + avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) + return _create_resnet('resnetrs152', pretrained, **model_args) + + +@register_model +def resnetrs200(pretrained=False, **kwargs): + """Constructs a ResNet-RS-200 model. + Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 + Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs + """ + attn_layer = partial(get_attn('se'), rd_ratio=0.25) + model_args = dict( + block=Bottleneck, layers=[3, 24, 36, 3], stem_width=32, stem_type='deep', replace_stem_pool=True, + avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) + return _create_resnet('resnetrs200', pretrained, **model_args) + + +@register_model +def resnetrs270(pretrained=False, **kwargs): + """Constructs a ResNet-RS-270 model. + Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 + Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs + """ + attn_layer = partial(get_attn('se'), rd_ratio=0.25) + model_args = dict( + block=Bottleneck, layers=[4, 29, 53, 4], stem_width=32, stem_type='deep', replace_stem_pool=True, + avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) + return _create_resnet('resnetrs270', pretrained, **model_args) + + + +@register_model +def resnetrs350(pretrained=False, **kwargs): + """Constructs a ResNet-RS-350 model. + Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 + Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs + """ + attn_layer = partial(get_attn('se'), rd_ratio=0.25) + model_args = dict( + block=Bottleneck, layers=[4, 36, 72, 4], stem_width=32, stem_type='deep', replace_stem_pool=True, + avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) + return _create_resnet('resnetrs350', pretrained, **model_args) + + +@register_model +def resnetrs420(pretrained=False, **kwargs): + """Constructs a ResNet-RS-420 model + Paper: Revisiting ResNets - https://arxiv.org/abs/2103.07579 + Pretrained weights from https://github.com/tensorflow/tpu/tree/bee9c4f6/models/official/resnet/resnet_rs + """ + attn_layer = partial(get_attn('se'), rd_ratio=0.25) + model_args = dict( + block=Bottleneck, layers=[4, 44, 87, 4], stem_width=32, stem_type='deep', replace_stem_pool=True, + avg_down=True, block_args=dict(attn_layer=attn_layer), **kwargs) + return _create_resnet('resnetrs420', pretrained, **model_args) diff --git a/custom_timm/models/resnetv2.py b/custom_timm/models/resnetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..d85677a479f75779da8edb2d112a29fd744b6e7b --- /dev/null +++ b/custom_timm/models/resnetv2.py @@ -0,0 +1,708 @@ +"""Pre-Activation ResNet v2 with GroupNorm and Weight Standardization. + +A PyTorch implementation of ResNetV2 adapted from the Google Big-Transfoer (BiT) source code +at https://github.com/google-research/big_transfer to match timm interfaces. The BiT weights have +been included here as pretrained models from their original .NPZ checkpoints. 
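+The conversion from the .npz layout to PyTorch tensors (HWIO conv kernels transposed to
+OIHW) is handled by `_load_weights` / `t2p` further below.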
+ +Additionally, supports non pre-activation bottleneck for use as a backbone for Vision Transfomers (ViT) and +extra padding support to allow porting of official Hybrid ResNet pretrained weights from +https://github.com/google-research/vision_transformer + +Thanks to the Google team for the above two repositories and associated papers: +* Big Transfer (BiT): General Visual Representation Learning - https://arxiv.org/abs/1912.11370 +* An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale - https://arxiv.org/abs/2010.11929 +* Knowledge distillation: A good teacher is patient and consistent - https://arxiv.org/abs/2106.05237 + +Original copyright of Google code below, modifications by Ross Wightman, Copyright 2020. +""" +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from collections import OrderedDict # pylint: disable=g-importing-member + +import torch +import torch.nn as nn +from functools import partial + +from custom_timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .helpers import build_model_with_cfg, named_apply, adapt_input_conv, checkpoint_seq +from .registry import register_model +from .layers import GroupNormAct, BatchNormAct2d, EvoNorm2dB0, EvoNorm2dS0, EvoNorm2dS1, FilterResponseNormTlu2d,\ + ClassifierHead, DropPath, AvgPool2dSame, create_pool2d, StdConv2d, create_conv2d + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, + 'first_conv': 'stem.conv', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = { + # pretrained on imagenet21k, finetuned on imagenet1k + 'resnetv2_50x1_bitm': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R50x1-ILSVRC2012.npz', + input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0), + 'resnetv2_50x3_bitm': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R50x3-ILSVRC2012.npz', + input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0), + 'resnetv2_101x1_bitm': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R101x1-ILSVRC2012.npz', + input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0), + 'resnetv2_101x3_bitm': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R101x3-ILSVRC2012.npz', + input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0), + 'resnetv2_152x2_bitm': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R152x2-ILSVRC2012.npz', + input_size=(3, 448, 448), pool_size=(14, 14), crop_pct=1.0), + 'resnetv2_152x4_bitm': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R152x4-ILSVRC2012.npz', + input_size=(3, 480, 480), pool_size=(15, 15), crop_pct=1.0), # only one at 480x480? 
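+
+    # NOTE (added comment): these cfg fields drive timm's default eval preprocessing, e.g. for
+    # 'resnetv2_50x1_bitm' an image is resized to int(448 / crop_pct) = 448 and then center
+    # cropped to 448x448, i.e. crop_pct=1.0 removes the usual 0.875 resize margin.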
+ + # trained on imagenet-21k + 'resnetv2_50x1_bitm_in21k': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R50x1.npz', + num_classes=21843), + 'resnetv2_50x3_bitm_in21k': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R50x3.npz', + num_classes=21843), + 'resnetv2_101x1_bitm_in21k': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R101x1.npz', + num_classes=21843), + 'resnetv2_101x3_bitm_in21k': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R101x3.npz', + num_classes=21843), + 'resnetv2_152x2_bitm_in21k': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R152x2.npz', + num_classes=21843), + 'resnetv2_152x4_bitm_in21k': _cfg( + url='https://storage.googleapis.com/bit_models/BiT-M-R152x4.npz', + num_classes=21843), + + 'resnetv2_50x1_bit_distilled': _cfg( + url='https://storage.googleapis.com/bit_models/distill/R50x1_224.npz', + interpolation='bicubic'), + 'resnetv2_152x2_bit_teacher': _cfg( + url='https://storage.googleapis.com/bit_models/distill/R152x2_T_224.npz', + interpolation='bicubic'), + 'resnetv2_152x2_bit_teacher_384': _cfg( + url='https://storage.googleapis.com/bit_models/distill/R152x2_T_384.npz', + input_size=(3, 384, 384), pool_size=(12, 12), crop_pct=1.0, interpolation='bicubic'), + + 'resnetv2_50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnetv2_50_a1h-000cdf49.pth', + interpolation='bicubic', crop_pct=0.95), + 'resnetv2_50d': _cfg( + interpolation='bicubic', first_conv='stem.conv1'), + 'resnetv2_50t': _cfg( + interpolation='bicubic', first_conv='stem.conv1'), + 'resnetv2_101': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rsb-weights/resnetv2_101_a1h-5d01f016.pth', + interpolation='bicubic', crop_pct=0.95), + 'resnetv2_101d': _cfg( + interpolation='bicubic', first_conv='stem.conv1'), + 'resnetv2_152': _cfg( + interpolation='bicubic'), + 'resnetv2_152d': _cfg( + interpolation='bicubic', first_conv='stem.conv1'), + + 'resnetv2_50d_gn': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/resnetv2_50d_gn_ah-c415c11a.pth', + interpolation='bicubic', first_conv='stem.conv1', test_input_size=(3, 288, 288), crop_pct=0.95), + 'resnetv2_50d_evob': _cfg( + interpolation='bicubic', first_conv='stem.conv1'), + 'resnetv2_50d_evos': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/resnetv2_50d_evos_ah-7c4dd548.pth', + interpolation='bicubic', first_conv='stem.conv1', test_input_size=(3, 288, 288), crop_pct=0.95), + 'resnetv2_50d_frn': _cfg( + interpolation='bicubic', first_conv='stem.conv1'), +} + + +def make_div(v, divisor=8): + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +class PreActBottleneck(nn.Module): + """Pre-activation (v2) bottleneck block. + + Follows the implementation of "Identity Mappings in Deep Residual Networks": + https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua + + Except it puts the stride on 3x3 conv when available. 
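+
+    Residual branch (see `forward`): norm1(+act) -> conv1 -> norm2(+act) -> conv2 (strided)
+    -> norm3(+act) -> conv3, added to a shortcut that is either the raw input or, when a
+    projection is present, the downsample of the pre-activated input norm1(x).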
+ """ + + def __init__( + self, in_chs, out_chs=None, bottle_ratio=0.25, stride=1, dilation=1, first_dilation=None, groups=1, + act_layer=None, conv_layer=None, norm_layer=None, proj_layer=None, drop_path_rate=0.): + super().__init__() + first_dilation = first_dilation or dilation + conv_layer = conv_layer or StdConv2d + norm_layer = norm_layer or partial(GroupNormAct, num_groups=32) + out_chs = out_chs or in_chs + mid_chs = make_div(out_chs * bottle_ratio) + + if proj_layer is not None: + self.downsample = proj_layer( + in_chs, out_chs, stride=stride, dilation=dilation, first_dilation=first_dilation, preact=True, + conv_layer=conv_layer, norm_layer=norm_layer) + else: + self.downsample = None + + self.norm1 = norm_layer(in_chs) + self.conv1 = conv_layer(in_chs, mid_chs, 1) + self.norm2 = norm_layer(mid_chs) + self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) + self.norm3 = norm_layer(mid_chs) + self.conv3 = conv_layer(mid_chs, out_chs, 1) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + + def zero_init_last(self): + nn.init.zeros_(self.conv3.weight) + + def forward(self, x): + x_preact = self.norm1(x) + + # shortcut branch + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x_preact) + + # residual branch + x = self.conv1(x_preact) + x = self.conv2(self.norm2(x)) + x = self.conv3(self.norm3(x)) + x = self.drop_path(x) + return x + shortcut + + +class Bottleneck(nn.Module): + """Non Pre-activation bottleneck block, equiv to V1.5/V1b Bottleneck. Used for ViT. + """ + def __init__( + self, in_chs, out_chs=None, bottle_ratio=0.25, stride=1, dilation=1, first_dilation=None, groups=1, + act_layer=None, conv_layer=None, norm_layer=None, proj_layer=None, drop_path_rate=0.): + super().__init__() + first_dilation = first_dilation or dilation + act_layer = act_layer or nn.ReLU + conv_layer = conv_layer or StdConv2d + norm_layer = norm_layer or partial(GroupNormAct, num_groups=32) + out_chs = out_chs or in_chs + mid_chs = make_div(out_chs * bottle_ratio) + + if proj_layer is not None: + self.downsample = proj_layer( + in_chs, out_chs, stride=stride, dilation=dilation, preact=False, + conv_layer=conv_layer, norm_layer=norm_layer) + else: + self.downsample = None + + self.conv1 = conv_layer(in_chs, mid_chs, 1) + self.norm1 = norm_layer(mid_chs) + self.conv2 = conv_layer(mid_chs, mid_chs, 3, stride=stride, dilation=first_dilation, groups=groups) + self.norm2 = norm_layer(mid_chs) + self.conv3 = conv_layer(mid_chs, out_chs, 1) + self.norm3 = norm_layer(out_chs, apply_act=False) + self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + self.act3 = act_layer(inplace=True) + + def zero_init_last(self): + nn.init.zeros_(self.norm3.weight) + + def forward(self, x): + # shortcut branch + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x) + + # residual + x = self.conv1(x) + x = self.norm1(x) + x = self.conv2(x) + x = self.norm2(x) + x = self.conv3(x) + x = self.norm3(x) + x = self.drop_path(x) + x = self.act3(x + shortcut) + return x + + +class DownsampleConv(nn.Module): + def __init__( + self, in_chs, out_chs, stride=1, dilation=1, first_dilation=None, preact=True, + conv_layer=None, norm_layer=None): + super(DownsampleConv, self).__init__() + self.conv = conv_layer(in_chs, out_chs, 1, stride=stride) + self.norm = nn.Identity() if preact else norm_layer(out_chs, apply_act=False) + + def forward(self, x): + return self.norm(self.conv(x)) + + 
+class DownsampleAvg(nn.Module): + def __init__( + self, in_chs, out_chs, stride=1, dilation=1, first_dilation=None, + preact=True, conv_layer=None, norm_layer=None): + """ AvgPool Downsampling as in 'D' ResNet variants. This is not in RegNet space but I might experiment.""" + super(DownsampleAvg, self).__init__() + avg_stride = stride if dilation == 1 else 1 + if stride > 1 or dilation > 1: + avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d + self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False) + else: + self.pool = nn.Identity() + self.conv = conv_layer(in_chs, out_chs, 1, stride=1) + self.norm = nn.Identity() if preact else norm_layer(out_chs, apply_act=False) + + def forward(self, x): + return self.norm(self.conv(self.pool(x))) + + +class ResNetStage(nn.Module): + """ResNet Stage.""" + def __init__( + self, in_chs, out_chs, stride, dilation, depth, bottle_ratio=0.25, groups=1, + avg_down=False, block_dpr=None, block_fn=PreActBottleneck, + act_layer=None, conv_layer=None, norm_layer=None, **block_kwargs): + super(ResNetStage, self).__init__() + first_dilation = 1 if dilation in (1, 2) else 2 + layer_kwargs = dict(act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer) + proj_layer = DownsampleAvg if avg_down else DownsampleConv + prev_chs = in_chs + self.blocks = nn.Sequential() + for block_idx in range(depth): + drop_path_rate = block_dpr[block_idx] if block_dpr else 0. + stride = stride if block_idx == 0 else 1 + self.blocks.add_module(str(block_idx), block_fn( + prev_chs, out_chs, stride=stride, dilation=dilation, bottle_ratio=bottle_ratio, groups=groups, + first_dilation=first_dilation, proj_layer=proj_layer, drop_path_rate=drop_path_rate, + **layer_kwargs, **block_kwargs)) + prev_chs = out_chs + first_dilation = dilation + proj_layer = None + + def forward(self, x): + x = self.blocks(x) + return x + + +def is_stem_deep(stem_type): + return any([s in stem_type for s in ('deep', 'tiered')]) + + +def create_resnetv2_stem( + in_chs, out_chs=64, stem_type='', preact=True, + conv_layer=StdConv2d, norm_layer=partial(GroupNormAct, num_groups=32)): + stem = OrderedDict() + assert stem_type in ('', 'fixed', 'same', 'deep', 'deep_fixed', 'deep_same', 'tiered') + + # NOTE conv padding mode can be changed by overriding the conv_layer def + if is_stem_deep(stem_type): + # A 3 deep 3x3 conv stack as in ResNet V1D models + if 'tiered' in stem_type: + stem_chs = (3 * out_chs // 8, out_chs // 2) # 'T' resnets in resnet.py + else: + stem_chs = (out_chs // 2, out_chs // 2) # 'D' ResNets + stem['conv1'] = conv_layer(in_chs, stem_chs[0], kernel_size=3, stride=2) + stem['norm1'] = norm_layer(stem_chs[0]) + stem['conv2'] = conv_layer(stem_chs[0], stem_chs[1], kernel_size=3, stride=1) + stem['norm2'] = norm_layer(stem_chs[1]) + stem['conv3'] = conv_layer(stem_chs[1], out_chs, kernel_size=3, stride=1) + if not preact: + stem['norm3'] = norm_layer(out_chs) + else: + # The usual 7x7 stem conv + stem['conv'] = conv_layer(in_chs, out_chs, kernel_size=7, stride=2) + if not preact: + stem['norm'] = norm_layer(out_chs) + + if 'fixed' in stem_type: + # 'fixed' SAME padding approximation that is used in BiT models + stem['pad'] = nn.ConstantPad2d(1, 0.) 
+ stem['pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=0) + elif 'same' in stem_type: + # full, input size based 'SAME' padding, used in ViT Hybrid model + stem['pool'] = create_pool2d('max', kernel_size=3, stride=2, padding='same') + else: + # the usual PyTorch symmetric padding + stem['pool'] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + + return nn.Sequential(stem) + + +class ResNetV2(nn.Module): + """Implementation of Pre-activation (v2) ResNet mode. + """ + + def __init__( + self, layers, channels=(256, 512, 1024, 2048), + num_classes=1000, in_chans=3, global_pool='avg', output_stride=32, + width_factor=1, stem_chs=64, stem_type='', avg_down=False, preact=True, + act_layer=nn.ReLU, conv_layer=StdConv2d, norm_layer=partial(GroupNormAct, num_groups=32), + drop_rate=0., drop_path_rate=0., zero_init_last=False): + super().__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + wf = width_factor + + self.feature_info = [] + stem_chs = make_div(stem_chs * wf) + self.stem = create_resnetv2_stem( + in_chans, stem_chs, stem_type, preact, conv_layer=conv_layer, norm_layer=norm_layer) + stem_feat = ('stem.conv3' if is_stem_deep(stem_type) else 'stem.conv') if preact else 'stem.norm' + self.feature_info.append(dict(num_chs=stem_chs, reduction=2, module=stem_feat)) + + prev_chs = stem_chs + curr_stride = 4 + dilation = 1 + block_dprs = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(layers)).split(layers)] + block_fn = PreActBottleneck if preact else Bottleneck + self.stages = nn.Sequential() + for stage_idx, (d, c, bdpr) in enumerate(zip(layers, channels, block_dprs)): + out_chs = make_div(c * wf) + stride = 1 if stage_idx == 0 else 2 + if curr_stride >= output_stride: + dilation *= stride + stride = 1 + stage = ResNetStage( + prev_chs, out_chs, stride=stride, dilation=dilation, depth=d, avg_down=avg_down, + act_layer=act_layer, conv_layer=conv_layer, norm_layer=norm_layer, block_dpr=bdpr, block_fn=block_fn) + prev_chs = out_chs + curr_stride *= stride + self.feature_info += [dict(num_chs=prev_chs, reduction=curr_stride, module=f'stages.{stage_idx}')] + self.stages.add_module(str(stage_idx), stage) + + self.num_features = prev_chs + self.norm = norm_layer(self.num_features) if preact else nn.Identity() + self.head = ClassifierHead( + self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate, use_conv=True) + + self.init_weights(zero_init_last=zero_init_last) + self.grad_checkpointing = False + + @torch.jit.ignore + def init_weights(self, zero_init_last=True): + named_apply(partial(_init_weights, zero_init_last=zero_init_last), self) + + @torch.jit.ignore() + def load_pretrained(self, checkpoint_path, prefix='resnet/'): + _load_weights(self, checkpoint_path, prefix) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^stem', + blocks=r'^stages\.(\d+)' if coarse else [ + (r'^stages\.(\d+)\.blocks\.(\d+)', None), + (r'^norm', (99999,)) + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.head = ClassifierHead( + self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate, use_conv=True) + + def forward_features(self, x): + x = self.stem(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = 
checkpoint_seq(self.stages, x, flatten=True) + else: + x = self.stages(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _init_weights(module: nn.Module, name: str = '', zero_init_last=True): + if isinstance(module, nn.Linear) or ('head.fc' in name and isinstance(module, nn.Conv2d)): + nn.init.normal_(module.weight, mean=0.0, std=0.01) + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu') + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.BatchNorm2d, nn.LayerNorm, nn.GroupNorm)): + nn.init.ones_(module.weight) + nn.init.zeros_(module.bias) + elif zero_init_last and hasattr(module, 'zero_init_last'): + module.zero_init_last() + + +@torch.no_grad() +def _load_weights(model: nn.Module, checkpoint_path: str, prefix: str = 'resnet/'): + import numpy as np + + def t2p(conv_weights): + """Possibly convert HWIO to OIHW.""" + if conv_weights.ndim == 4: + conv_weights = conv_weights.transpose([3, 2, 0, 1]) + return torch.from_numpy(conv_weights) + + weights = np.load(checkpoint_path) + stem_conv_w = adapt_input_conv( + model.stem.conv.weight.shape[1], t2p(weights[f'{prefix}root_block/standardized_conv2d/kernel'])) + model.stem.conv.weight.copy_(stem_conv_w) + model.norm.weight.copy_(t2p(weights[f'{prefix}group_norm/gamma'])) + model.norm.bias.copy_(t2p(weights[f'{prefix}group_norm/beta'])) + if isinstance(getattr(model.head, 'fc', None), nn.Conv2d) and \ + model.head.fc.weight.shape[0] == weights[f'{prefix}head/conv2d/kernel'].shape[-1]: + model.head.fc.weight.copy_(t2p(weights[f'{prefix}head/conv2d/kernel'])) + model.head.fc.bias.copy_(t2p(weights[f'{prefix}head/conv2d/bias'])) + for i, (sname, stage) in enumerate(model.stages.named_children()): + for j, (bname, block) in enumerate(stage.blocks.named_children()): + cname = 'standardized_conv2d' + block_prefix = f'{prefix}block{i + 1}/unit{j + 1:02d}/' + block.conv1.weight.copy_(t2p(weights[f'{block_prefix}a/{cname}/kernel'])) + block.conv2.weight.copy_(t2p(weights[f'{block_prefix}b/{cname}/kernel'])) + block.conv3.weight.copy_(t2p(weights[f'{block_prefix}c/{cname}/kernel'])) + block.norm1.weight.copy_(t2p(weights[f'{block_prefix}a/group_norm/gamma'])) + block.norm2.weight.copy_(t2p(weights[f'{block_prefix}b/group_norm/gamma'])) + block.norm3.weight.copy_(t2p(weights[f'{block_prefix}c/group_norm/gamma'])) + block.norm1.bias.copy_(t2p(weights[f'{block_prefix}a/group_norm/beta'])) + block.norm2.bias.copy_(t2p(weights[f'{block_prefix}b/group_norm/beta'])) + block.norm3.bias.copy_(t2p(weights[f'{block_prefix}c/group_norm/beta'])) + if block.downsample is not None: + w = weights[f'{block_prefix}a/proj/{cname}/kernel'] + block.downsample.conv.weight.copy_(t2p(w)) + + +def _create_resnetv2(variant, pretrained=False, **kwargs): + feature_cfg = dict(flatten_sequential=True) + return build_model_with_cfg( + ResNetV2, variant, pretrained, + feature_cfg=feature_cfg, + pretrained_custom_load='_bit' in variant, + **kwargs) + + +def _create_resnetv2_bit(variant, pretrained=False, **kwargs): + return _create_resnetv2( + variant, pretrained=pretrained, stem_type='fixed', conv_layer=partial(StdConv2d, eps=1e-8), **kwargs) + + +@register_model +def resnetv2_50x1_bitm(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_50x1_bitm', 
pretrained=pretrained, layers=[3, 4, 6, 3], width_factor=1, **kwargs) + + +@register_model +def resnetv2_50x3_bitm(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_50x3_bitm', pretrained=pretrained, layers=[3, 4, 6, 3], width_factor=3, **kwargs) + + +@register_model +def resnetv2_101x1_bitm(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_101x1_bitm', pretrained=pretrained, layers=[3, 4, 23, 3], width_factor=1, **kwargs) + + +@register_model +def resnetv2_101x3_bitm(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_101x3_bitm', pretrained=pretrained, layers=[3, 4, 23, 3], width_factor=3, **kwargs) + + +@register_model +def resnetv2_152x2_bitm(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_152x2_bitm', pretrained=pretrained, layers=[3, 8, 36, 3], width_factor=2, **kwargs) + + +@register_model +def resnetv2_152x4_bitm(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_152x4_bitm', pretrained=pretrained, layers=[3, 8, 36, 3], width_factor=4, **kwargs) + + +@register_model +def resnetv2_50x1_bitm_in21k(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_50x1_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), + layers=[3, 4, 6, 3], width_factor=1, **kwargs) + + +@register_model +def resnetv2_50x3_bitm_in21k(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_50x3_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), + layers=[3, 4, 6, 3], width_factor=3, **kwargs) + + +@register_model +def resnetv2_101x1_bitm_in21k(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_101x1_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), + layers=[3, 4, 23, 3], width_factor=1, **kwargs) + + +@register_model +def resnetv2_101x3_bitm_in21k(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_101x3_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), + layers=[3, 4, 23, 3], width_factor=3, **kwargs) + + +@register_model +def resnetv2_152x2_bitm_in21k(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_152x2_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), + layers=[3, 8, 36, 3], width_factor=2, **kwargs) + + +@register_model +def resnetv2_152x4_bitm_in21k(pretrained=False, **kwargs): + return _create_resnetv2_bit( + 'resnetv2_152x4_bitm_in21k', pretrained=pretrained, num_classes=kwargs.pop('num_classes', 21843), + layers=[3, 8, 36, 3], width_factor=4, **kwargs) + + +@register_model +def resnetv2_50x1_bit_distilled(pretrained=False, **kwargs): + """ ResNetV2-50x1-BiT Distilled + Paper: Knowledge distillation: A good teacher is patient and consistent - https://arxiv.org/abs/2106.05237 + """ + return _create_resnetv2_bit( + 'resnetv2_50x1_bit_distilled', pretrained=pretrained, layers=[3, 4, 6, 3], width_factor=1, **kwargs) + + +@register_model +def resnetv2_152x2_bit_teacher(pretrained=False, **kwargs): + """ ResNetV2-152x2-BiT Teacher + Paper: Knowledge distillation: A good teacher is patient and consistent - https://arxiv.org/abs/2106.05237 + """ + return _create_resnetv2_bit( + 'resnetv2_152x2_bit_teacher', pretrained=pretrained, layers=[3, 8, 36, 3], width_factor=2, **kwargs) + + +@register_model +def resnetv2_152x2_bit_teacher_384(pretrained=False, **kwargs): + """ ResNetV2-152xx-BiT Teacher @ 384x384 + Paper: Knowledge distillation: A good teacher is 
patient and consistent - https://arxiv.org/abs/2106.05237 + """ + return _create_resnetv2_bit( + 'resnetv2_152x2_bit_teacher_384', pretrained=pretrained, layers=[3, 8, 36, 3], width_factor=2, **kwargs) + + +@register_model +def resnetv2_50(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_50', pretrained=pretrained, + layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, **kwargs) + + +@register_model +def resnetv2_50d(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_50d', pretrained=pretrained, + layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, + stem_type='deep', avg_down=True, **kwargs) + + +@register_model +def resnetv2_50t(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_50t', pretrained=pretrained, + layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, + stem_type='tiered', avg_down=True, **kwargs) + + +@register_model +def resnetv2_101(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_101', pretrained=pretrained, + layers=[3, 4, 23, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, **kwargs) + + +@register_model +def resnetv2_101d(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_101d', pretrained=pretrained, + layers=[3, 4, 23, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, + stem_type='deep', avg_down=True, **kwargs) + + +@register_model +def resnetv2_152(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_152', pretrained=pretrained, + layers=[3, 8, 36, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, **kwargs) + + +@register_model +def resnetv2_152d(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_152d', pretrained=pretrained, + layers=[3, 8, 36, 3], conv_layer=create_conv2d, norm_layer=BatchNormAct2d, + stem_type='deep', avg_down=True, **kwargs) + + +# Experimental configs (may change / be removed) + +@register_model +def resnetv2_50d_gn(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_50d_gn', pretrained=pretrained, + layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=GroupNormAct, + stem_type='deep', avg_down=True, **kwargs) + + +@register_model +def resnetv2_50d_evob(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_50d_evob', pretrained=pretrained, + layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=EvoNorm2dB0, + stem_type='deep', avg_down=True, zero_init_last=True, **kwargs) + + +@register_model +def resnetv2_50d_evos(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_50d_evos', pretrained=pretrained, + layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=EvoNorm2dS0, + stem_type='deep', avg_down=True, **kwargs) + + +@register_model +def resnetv2_50d_frn(pretrained=False, **kwargs): + return _create_resnetv2( + 'resnetv2_50d_frn', pretrained=pretrained, + layers=[3, 4, 6, 3], conv_layer=create_conv2d, norm_layer=FilterResponseNormTlu2d, + stem_type='deep', avg_down=True, **kwargs) diff --git a/custom_timm/models/rexnet.py b/custom_timm/models/rexnet.py new file mode 100644 index 0000000000000000000000000000000000000000..c7077ea6e996c624ef85052b1a6114ea681142b9 --- /dev/null +++ b/custom_timm/models/rexnet.py @@ -0,0 +1,261 @@ +""" ReXNet + +A PyTorch impl of `ReXNet: Diminishing Representational Bottleneck on Convolutional Neural Network` - +https://arxiv.org/abs/2007.00992 + +Adapted from original impl at https://github.com/clovaai/rexnet +Copyright (c) 2020-present NAVER Corp. 
MIT license + +Changes for timm, feature extraction, and rounded channel variant hacked together by Ross Wightman +Copyright 2020 Ross Wightman +""" + +import torch +import torch.nn as nn +from functools import partial +from math import ceil + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, checkpoint_seq +from .layers import ClassifierHead, create_act_layer, ConvNormAct, DropPath, make_divisible, SEModule +from .registry import register_model +from .efficientnet_builder import efficientnet_init_weights + + +def _cfg(url=''): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv', 'classifier': 'head.fc', + } + + +default_cfgs = dict( + rexnet_100=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_100-1b4dddf4.pth'), + rexnet_130=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_130-590d768e.pth'), + rexnet_150=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_150-bd1a6aa8.pth'), + rexnet_200=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_200-8c0b7f2d.pth'), + rexnetr_100=_cfg( + url=''), + rexnetr_130=_cfg( + url=''), + rexnetr_150=_cfg( + url=''), + rexnetr_200=_cfg( + url=''), +) + +SEWithNorm = partial(SEModule, norm_layer=nn.BatchNorm2d) + + +class LinearBottleneck(nn.Module): + def __init__( + self, in_chs, out_chs, stride, exp_ratio=1.0, se_ratio=0., ch_div=1, + act_layer='swish', dw_act_layer='relu6', drop_path=None): + super(LinearBottleneck, self).__init__() + self.use_shortcut = stride == 1 and in_chs <= out_chs + self.in_channels = in_chs + self.out_channels = out_chs + + if exp_ratio != 1.: + dw_chs = make_divisible(round(in_chs * exp_ratio), divisor=ch_div) + self.conv_exp = ConvNormAct(in_chs, dw_chs, act_layer=act_layer) + else: + dw_chs = in_chs + self.conv_exp = None + + self.conv_dw = ConvNormAct(dw_chs, dw_chs, 3, stride=stride, groups=dw_chs, apply_act=False) + if se_ratio > 0: + self.se = SEWithNorm(dw_chs, rd_channels=make_divisible(int(dw_chs * se_ratio), ch_div)) + else: + self.se = None + self.act_dw = create_act_layer(dw_act_layer) + + self.conv_pwl = ConvNormAct(dw_chs, out_chs, 1, apply_act=False) + self.drop_path = drop_path + + def feat_channels(self, exp=False): + return self.conv_dw.out_channels if exp else self.out_channels + + def forward(self, x): + shortcut = x + if self.conv_exp is not None: + x = self.conv_exp(x) + x = self.conv_dw(x) + if self.se is not None: + x = self.se(x) + x = self.act_dw(x) + x = self.conv_pwl(x) + if self.use_shortcut: + if self.drop_path is not None: + x = self.drop_path(x) + x = torch.cat([x[:, 0:self.in_channels] + shortcut, x[:, self.in_channels:]], dim=1) + return x + + +def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, se_ratio=0., ch_div=1): + layers = [1, 2, 2, 3, 3, 5] + strides = [1, 2, 2, 2, 1, 2] + layers = [ceil(element * depth_mult) for element in layers] + strides = sum([[element] + [1] * (layers[idx] - 1) for idx, element in enumerate(strides)], []) + exp_ratios = [1] * layers[0] + [6] * sum(layers[1:]) + depth = sum(layers[:]) * 3 + base_chs = initial_chs / width_mult if width_mult < 1.0 else initial_chs + + # The following channel 
configuration is a simple instance to make each layer become an expand layer. + out_chs_list = [] + for i in range(depth // 3): + out_chs_list.append(make_divisible(round(base_chs * width_mult), divisor=ch_div)) + base_chs += final_chs / (depth // 3 * 1.0) + + se_ratios = [0.] * (layers[0] + layers[1]) + [se_ratio] * sum(layers[2:]) + + return list(zip(out_chs_list, exp_ratios, strides, se_ratios)) + + +def _build_blocks( + block_cfg, prev_chs, width_mult, ch_div=1, act_layer='swish', dw_act_layer='relu6', drop_path_rate=0.): + feat_chs = [prev_chs] + feature_info = [] + curr_stride = 2 + features = [] + num_blocks = len(block_cfg) + for block_idx, (chs, exp_ratio, stride, se_ratio) in enumerate(block_cfg): + if stride > 1: + fname = 'stem' if block_idx == 0 else f'features.{block_idx - 1}' + feature_info += [dict(num_chs=feat_chs[-1], reduction=curr_stride, module=fname)] + curr_stride *= stride + block_dpr = drop_path_rate * block_idx / (num_blocks - 1) # stochastic depth linear decay rule + drop_path = DropPath(block_dpr) if block_dpr > 0. else None + features.append(LinearBottleneck( + in_chs=prev_chs, out_chs=chs, exp_ratio=exp_ratio, stride=stride, se_ratio=se_ratio, + ch_div=ch_div, act_layer=act_layer, dw_act_layer=dw_act_layer, drop_path=drop_path)) + prev_chs = chs + feat_chs += [features[-1].feat_channels()] + pen_chs = make_divisible(1280 * width_mult, divisor=ch_div) + feature_info += [dict(num_chs=feat_chs[-1], reduction=curr_stride, module=f'features.{len(features) - 1}')] + features.append(ConvNormAct(prev_chs, pen_chs, act_layer=act_layer)) + return features, feature_info + + +class ReXNetV1(nn.Module): + def __init__( + self, in_chans=3, num_classes=1000, global_pool='avg', output_stride=32, + initial_chs=16, final_chs=180, width_mult=1.0, depth_mult=1.0, se_ratio=1/12., + ch_div=1, act_layer='swish', dw_act_layer='relu6', drop_rate=0.2, drop_path_rate=0. 
+ ): + super(ReXNetV1, self).__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + self.grad_checkpointing = False + + assert output_stride == 32 # FIXME support dilation + stem_base_chs = 32 / width_mult if width_mult < 1.0 else 32 + stem_chs = make_divisible(round(stem_base_chs * width_mult), divisor=ch_div) + self.stem = ConvNormAct(in_chans, stem_chs, 3, stride=2, act_layer=act_layer) + + block_cfg = _block_cfg(width_mult, depth_mult, initial_chs, final_chs, se_ratio, ch_div) + features, self.feature_info = _build_blocks( + block_cfg, stem_chs, width_mult, ch_div, act_layer, dw_act_layer, drop_path_rate) + self.num_features = features[-1].out_channels + self.features = nn.Sequential(*features) + + self.head = ClassifierHead(self.num_features, num_classes, global_pool, drop_rate) + + efficientnet_init_weights(self) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^stem', + blocks=r'^features\.(\d+)', + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = self.stem(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.features, x, flatten=True) + else: + x = self.features(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_rexnet(variant, pretrained, **kwargs): + feature_cfg = dict(flatten_sequential=True) + return build_model_with_cfg( + ReXNetV1, variant, pretrained, + feature_cfg=feature_cfg, + **kwargs) + + +@register_model +def rexnet_100(pretrained=False, **kwargs): + """ReXNet V1 1.0x""" + return _create_rexnet('rexnet_100', pretrained, **kwargs) + + +@register_model +def rexnet_130(pretrained=False, **kwargs): + """ReXNet V1 1.3x""" + return _create_rexnet('rexnet_130', pretrained, width_mult=1.3, **kwargs) + + +@register_model +def rexnet_150(pretrained=False, **kwargs): + """ReXNet V1 1.5x""" + return _create_rexnet('rexnet_150', pretrained, width_mult=1.5, **kwargs) + + +@register_model +def rexnet_200(pretrained=False, **kwargs): + """ReXNet V1 2.0x""" + return _create_rexnet('rexnet_200', pretrained, width_mult=2.0, **kwargs) + + +@register_model +def rexnetr_100(pretrained=False, **kwargs): + """ReXNet V1 1.0x w/ rounded (mod 8) channels""" + return _create_rexnet('rexnetr_100', pretrained, ch_div=8, **kwargs) + + +@register_model +def rexnetr_130(pretrained=False, **kwargs): + """ReXNet V1 1.3x w/ rounded (mod 8) channels""" + return _create_rexnet('rexnetr_130', pretrained, width_mult=1.3, ch_div=8, **kwargs) + + +@register_model +def rexnetr_150(pretrained=False, **kwargs): + """ReXNet V1 1.5x w/ rounded (mod 8) channels""" + return _create_rexnet('rexnetr_150', pretrained, width_mult=1.5, ch_div=8, **kwargs) + + +@register_model +def rexnetr_200(pretrained=False, **kwargs): + """ReXNet V1 2.0x w/ rounded (mod 8) channels""" + return _create_rexnet('rexnetr_200', pretrained, width_mult=2.0, ch_div=8, **kwargs) diff --git a/custom_timm/models/selecsls.py b/custom_timm/models/selecsls.py new file mode 100644 index 
0000000000000000000000000000000000000000..2eb9e1f6dc9647e1c5071300ff030f760fba3984 --- /dev/null +++ b/custom_timm/models/selecsls.py @@ -0,0 +1,377 @@ +"""PyTorch SelecSLS Net example for ImageNet Classification +License: CC BY 4.0 (https://creativecommons.org/licenses/by/4.0/legalcode) +Author: Dushyant Mehta (@mehtadushy) + +SelecSLS (core) Network Architecture as proposed in "XNect: Real-time Multi-person 3D +Human Pose Estimation with a Single RGB Camera, Mehta et al." +https://arxiv.org/abs/1907.00837 + +Based on ResNet implementation in https://github.com/rwightman/pytorch-image-models +and SelecSLS Net implementation in https://github.com/mehtadushy/SelecSLS-Pytorch +""" +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import create_classifier +from .registry import register_model + +__all__ = ['SelecSLS'] # model_registry will add each entrypoint fn to this + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (4, 4), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.0', 'classifier': 'fc', + **kwargs + } + + +default_cfgs = { + 'selecsls42': _cfg( + url='', + interpolation='bicubic'), + 'selecsls42b': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-selecsls/selecsls42b-8af30141.pth', + interpolation='bicubic'), + 'selecsls60': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-selecsls/selecsls60-bbf87526.pth', + interpolation='bicubic'), + 'selecsls60b': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-selecsls/selecsls60b-94e619b5.pth', + interpolation='bicubic'), + 'selecsls84': _cfg( + url='', + interpolation='bicubic'), +} + + +class SequentialList(nn.Sequential): + + def __init__(self, *args): + super(SequentialList, self).__init__(*args) + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (List[torch.Tensor]) -> (List[torch.Tensor]) + pass + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (torch.Tensor) -> (List[torch.Tensor]) + pass + + def forward(self, x) -> List[torch.Tensor]: + for module in self: + x = module(x) + return x + + +class SelectSeq(nn.Module): + def __init__(self, mode='index', index=0): + super(SelectSeq, self).__init__() + self.mode = mode + self.index = index + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (List[torch.Tensor]) -> (torch.Tensor) + pass + + @torch.jit._overload_method # noqa: F811 + def forward(self, x): + # type: (Tuple[torch.Tensor]) -> (torch.Tensor) + pass + + def forward(self, x) -> torch.Tensor: + if self.mode == 'index': + return x[self.index] + else: + return torch.cat(x, dim=1) + + +def conv_bn(in_chs, out_chs, k=3, stride=1, padding=None, dilation=1): + if padding is None: + padding = ((stride - 1) + dilation * (k - 1)) // 2 + return nn.Sequential( + nn.Conv2d(in_chs, out_chs, k, stride, padding=padding, dilation=dilation, bias=False), + nn.BatchNorm2d(out_chs), + nn.ReLU(inplace=True) + ) + + +class SelecSLSBlock(nn.Module): + def __init__(self, in_chs, skip_chs, mid_chs, out_chs, is_first, stride, dilation=1): + super(SelecSLSBlock, self).__init__() + self.stride = stride + self.is_first = is_first + 
assert stride in [1, 2] + + # Process input with 4 conv blocks with the same number of input and output channels + self.conv1 = conv_bn(in_chs, mid_chs, 3, stride, dilation=dilation) + self.conv2 = conv_bn(mid_chs, mid_chs, 1) + self.conv3 = conv_bn(mid_chs, mid_chs // 2, 3) + self.conv4 = conv_bn(mid_chs // 2, mid_chs, 1) + self.conv5 = conv_bn(mid_chs, mid_chs // 2, 3) + self.conv6 = conv_bn(2 * mid_chs + (0 if is_first else skip_chs), out_chs, 1) + + def forward(self, x: List[torch.Tensor]) -> List[torch.Tensor]: + if not isinstance(x, list): + x = [x] + assert len(x) in [1, 2] + + d1 = self.conv1(x[0]) + d2 = self.conv3(self.conv2(d1)) + d3 = self.conv5(self.conv4(d2)) + if self.is_first: + out = self.conv6(torch.cat([d1, d2, d3], 1)) + return [out, out] + else: + return [self.conv6(torch.cat([d1, d2, d3, x[1]], 1)), x[1]] + + +class SelecSLS(nn.Module): + """SelecSLS42 / SelecSLS60 / SelecSLS84 + + Parameters + ---------- + cfg : network config dictionary specifying block type, feature, and head args + num_classes : int, default 1000 + Number of classification classes. + in_chans : int, default 3 + Number of input (color) channels. + drop_rate : float, default 0. + Dropout probability before classifier, for training + global_pool : str, default 'avg' + Global pooling type. One of 'avg', 'max', 'avgmax', 'catavgmax' + """ + + def __init__(self, cfg, num_classes=1000, in_chans=3, drop_rate=0.0, global_pool='avg'): + self.num_classes = num_classes + self.drop_rate = drop_rate + super(SelecSLS, self).__init__() + + self.stem = conv_bn(in_chans, 32, stride=2) + self.features = SequentialList(*[cfg['block'](*block_args) for block_args in cfg['features']]) + self.from_seq = SelectSeq() # from List[tensor] -> Tensor in module compatible way + self.head = nn.Sequential(*[conv_bn(*conv_args) for conv_args in cfg['head']]) + self.num_features = cfg['num_features'] + self.feature_info = cfg['feature_info'] + + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + for n, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1.) + nn.init.constant_(m.bias, 0.) 
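+
+    # Note (added comment): `self.features` threads a List[Tensor] through the SelecSLSBlocks,
+    # each of which returns [features, stage_skip]; `self.from_seq` (SelectSeq in 'index' mode)
+    # then picks element 0 so the plain nn.Sequential head sees a single Tensor again.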
+ + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^stem', + blocks=r'^features\.(\d+)', + blocks_head=r'^head' + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.stem(x) + x = self.features(x) + x = self.head(self.from_seq(x)) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + return x if pre_logits else self.fc(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_selecsls(variant, pretrained, **kwargs): + cfg = {} + feature_info = [dict(num_chs=32, reduction=2, module='stem.2')] + if variant.startswith('selecsls42'): + cfg['block'] = SelecSLSBlock + # Define configuration of the network after the initial neck + cfg['features'] = [ + # in_chs, skip_chs, mid_chs, out_chs, is_first, stride + (32, 0, 64, 64, True, 2), + (64, 64, 64, 128, False, 1), + (128, 0, 144, 144, True, 2), + (144, 144, 144, 288, False, 1), + (288, 0, 304, 304, True, 2), + (304, 304, 304, 480, False, 1), + ] + feature_info.extend([ + dict(num_chs=128, reduction=4, module='features.1'), + dict(num_chs=288, reduction=8, module='features.3'), + dict(num_chs=480, reduction=16, module='features.5'), + ]) + # Head can be replaced with alternative configurations depending on the problem + feature_info.append(dict(num_chs=1024, reduction=32, module='head.1')) + if variant == 'selecsls42b': + cfg['head'] = [ + (480, 960, 3, 2), + (960, 1024, 3, 1), + (1024, 1280, 3, 2), + (1280, 1024, 1, 1), + ] + feature_info.append(dict(num_chs=1024, reduction=64, module='head.3')) + cfg['num_features'] = 1024 + else: + cfg['head'] = [ + (480, 960, 3, 2), + (960, 1024, 3, 1), + (1024, 1024, 3, 2), + (1024, 1280, 1, 1), + ] + feature_info.append(dict(num_chs=1280, reduction=64, module='head.3')) + cfg['num_features'] = 1280 + + elif variant.startswith('selecsls60'): + cfg['block'] = SelecSLSBlock + # Define configuration of the network after the initial neck + cfg['features'] = [ + # in_chs, skip_chs, mid_chs, out_chs, is_first, stride + (32, 0, 64, 64, True, 2), + (64, 64, 64, 128, False, 1), + (128, 0, 128, 128, True, 2), + (128, 128, 128, 128, False, 1), + (128, 128, 128, 288, False, 1), + (288, 0, 288, 288, True, 2), + (288, 288, 288, 288, False, 1), + (288, 288, 288, 288, False, 1), + (288, 288, 288, 416, False, 1), + ] + feature_info.extend([ + dict(num_chs=128, reduction=4, module='features.1'), + dict(num_chs=288, reduction=8, module='features.4'), + dict(num_chs=416, reduction=16, module='features.8'), + ]) + # Head can be replaced with alternative configurations depending on the problem + feature_info.append(dict(num_chs=1024, reduction=32, module='head.1')) + if variant == 'selecsls60b': + cfg['head'] = [ + (416, 756, 3, 2), + (756, 1024, 3, 1), + (1024, 1280, 3, 2), + (1280, 1024, 1, 1), + ] + feature_info.append(dict(num_chs=1024, reduction=64, module='head.3')) + cfg['num_features'] = 1024 + else: + cfg['head'] = [ + (416, 756, 3, 2), + (756, 1024, 3, 1), + (1024, 1024, 3, 2), + (1024, 1280, 1, 1), + ] + 
feature_info.append(dict(num_chs=1280, reduction=64, module='head.3')) + cfg['num_features'] = 1280 + + elif variant == 'selecsls84': + cfg['block'] = SelecSLSBlock + # Define configuration of the network after the initial neck + cfg['features'] = [ + # in_chs, skip_chs, mid_chs, out_chs, is_first, stride + (32, 0, 64, 64, True, 2), + (64, 64, 64, 144, False, 1), + (144, 0, 144, 144, True, 2), + (144, 144, 144, 144, False, 1), + (144, 144, 144, 144, False, 1), + (144, 144, 144, 144, False, 1), + (144, 144, 144, 304, False, 1), + (304, 0, 304, 304, True, 2), + (304, 304, 304, 304, False, 1), + (304, 304, 304, 304, False, 1), + (304, 304, 304, 304, False, 1), + (304, 304, 304, 304, False, 1), + (304, 304, 304, 512, False, 1), + ] + feature_info.extend([ + dict(num_chs=144, reduction=4, module='features.1'), + dict(num_chs=304, reduction=8, module='features.6'), + dict(num_chs=512, reduction=16, module='features.12'), + ]) + # Head can be replaced with alternative configurations depending on the problem + cfg['head'] = [ + (512, 960, 3, 2), + (960, 1024, 3, 1), + (1024, 1024, 3, 2), + (1024, 1280, 3, 1), + ] + cfg['num_features'] = 1280 + feature_info.extend([ + dict(num_chs=1024, reduction=32, module='head.1'), + dict(num_chs=1280, reduction=64, module='head.3') + ]) + else: + raise ValueError('Invalid net configuration ' + variant + ' !!!') + cfg['feature_info'] = feature_info + + # this model can do 6 feature levels by default, unlike most others, leave as 0-4 to avoid surprises? + return build_model_with_cfg( + SelecSLS, variant, pretrained, + model_cfg=cfg, + feature_cfg=dict(out_indices=(0, 1, 2, 3, 4), flatten_sequential=True), + **kwargs) + + +@register_model +def selecsls42(pretrained=False, **kwargs): + """Constructs a SelecSLS42 model. + """ + return _create_selecsls('selecsls42', pretrained, **kwargs) + + +@register_model +def selecsls42b(pretrained=False, **kwargs): + """Constructs a SelecSLS42_B model. + """ + return _create_selecsls('selecsls42b', pretrained, **kwargs) + + +@register_model +def selecsls60(pretrained=False, **kwargs): + """Constructs a SelecSLS60 model. + """ + return _create_selecsls('selecsls60', pretrained, **kwargs) + + +@register_model +def selecsls60b(pretrained=False, **kwargs): + """Constructs a SelecSLS60_B model. + """ + return _create_selecsls('selecsls60b', pretrained, **kwargs) + + +@register_model +def selecsls84(pretrained=False, **kwargs): + """Constructs a SelecSLS84 model. + """ + return _create_selecsls('selecsls84', pretrained, **kwargs) diff --git a/custom_timm/models/senet.py b/custom_timm/models/senet.py new file mode 100644 index 0000000000000000000000000000000000000000..5611479f82bef79df4913c6bf0e56b35e0630651 --- /dev/null +++ b/custom_timm/models/senet.py @@ -0,0 +1,465 @@ +""" +SEResNet implementation from Cadene's pretrained models +https://github.com/Cadene/pretrained-models.pytorch/blob/master/pretrainedmodels/models/senet.py +Additional credit to https://github.com/creafz + +Original model: https://github.com/hujie-frank/SENet + +ResNet code gently borrowed from +https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py + +FIXME I'm deprecating this model and moving them to ResNet as I don't want to maintain duplicate +support for extras like dilation, switchable BN/activations, feature extraction, etc that don't exist here. 
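+
+Example (illustrative, added here; assumes these entrypoints behave like upstream timm):
+
+    from custom_timm.models.senet import legacy_seresnet18
+    model = legacy_seresnet18(pretrained=False)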
+""" +import math +from collections import OrderedDict + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import create_classifier +from .registry import register_model + +__all__ = ['SENet'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'layer0.conv1', 'classifier': 'last_linear', + **kwargs + } + + +default_cfgs = { + 'legacy_senet154': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/legacy_senet154-e9eb9fe6.pth'), + 'legacy_seresnet18': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet18-4bb0ce65.pth', + interpolation='bicubic'), + 'legacy_seresnet34': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnet34-a4004e63.pth'), + 'legacy_seresnet50': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/se_resnet50-ce0d4300.pth'), + 'legacy_seresnet101': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/se_resnet101-7e38fcc6.pth'), + 'legacy_seresnet152': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/se_resnet152-d17c99b7.pth'), + 'legacy_seresnext26_32x4d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/seresnext26_32x4d-65ebdb501.pth', + interpolation='bicubic'), + 'legacy_seresnext50_32x4d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/legacy_se_resnext50_32x4d-f3651bad.pth'), + 'legacy_seresnext101_32x4d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/legacy_se_resnext101_32x4d-37725eac.pth'), +} + + +def _weight_init(m): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1.) + nn.init.constant_(m.bias, 0.) + + +class SEModule(nn.Module): + + def __init__(self, channels, reduction): + super(SEModule, self).__init__() + self.fc1 = nn.Conv2d(channels, channels // reduction, kernel_size=1) + self.relu = nn.ReLU(inplace=True) + self.fc2 = nn.Conv2d(channels // reduction, channels, kernel_size=1) + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + module_input = x + x = x.mean((2, 3), keepdim=True) + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + x = self.sigmoid(x) + return module_input * x + + +class Bottleneck(nn.Module): + """ + Base class for bottlenecks that implements `forward()` method. + """ + + def forward(self, x): + shortcut = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + if self.downsample is not None: + shortcut = self.downsample(x) + + out = self.se_module(out) + shortcut + out = self.relu(out) + + return out + + +class SEBottleneck(Bottleneck): + """ + Bottleneck for SENet154. 
+ """ + expansion = 4 + + def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None): + super(SEBottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes * 2, kernel_size=1, bias=False) + self.bn1 = nn.BatchNorm2d(planes * 2) + self.conv2 = nn.Conv2d( + planes * 2, planes * 4, kernel_size=3, stride=stride, + padding=1, groups=groups, bias=False) + self.bn2 = nn.BatchNorm2d(planes * 4) + self.conv3 = nn.Conv2d(planes * 4, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.se_module = SEModule(planes * 4, reduction=reduction) + self.downsample = downsample + self.stride = stride + + +class SEResNetBottleneck(Bottleneck): + """ + ResNet bottleneck with a Squeeze-and-Excitation module. It follows Caffe + implementation and uses `stride=stride` in `conv1` and not in `conv2` + (the latter is used in the torchvision implementation of ResNet). + """ + expansion = 4 + + def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None): + super(SEResNetBottleneck, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False, stride=stride) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, groups=groups, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.se_module = SEModule(planes * 4, reduction=reduction) + self.downsample = downsample + self.stride = stride + + +class SEResNeXtBottleneck(Bottleneck): + """ + ResNeXt bottleneck type C with a Squeeze-and-Excitation module. + """ + expansion = 4 + + def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None, base_width=4): + super(SEResNeXtBottleneck, self).__init__() + width = math.floor(planes * (base_width / 64)) * groups + self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, bias=False, stride=1) + self.bn1 = nn.BatchNorm2d(width) + self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride, padding=1, groups=groups, bias=False) + self.bn2 = nn.BatchNorm2d(width) + self.conv3 = nn.Conv2d(width, planes * 4, kernel_size=1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * 4) + self.relu = nn.ReLU(inplace=True) + self.se_module = SEModule(planes * 4, reduction=reduction) + self.downsample = downsample + self.stride = stride + + +class SEResNetBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, groups, reduction, stride=1, downsample=None): + super(SEResNetBlock, self).__init__() + self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=3, padding=1, stride=stride, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, padding=1, groups=groups, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + self.relu = nn.ReLU(inplace=True) + self.se_module = SEModule(planes, reduction=reduction) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + shortcut = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + if self.downsample is not None: + shortcut = self.downsample(x) + + out = self.se_module(out) + shortcut + out = self.relu(out) + + return out + + +class SENet(nn.Module): + + def __init__( + self, block, layers, groups, reduction, drop_rate=0.2, + in_chans=3, inplanes=64, input_3x3=False, 
downsample_kernel_size=1, + downsample_padding=0, num_classes=1000, global_pool='avg'): + """ + Parameters + ---------- + block (nn.Module): Bottleneck class. + - For SENet154: SEBottleneck + - For SE-ResNet models: SEResNetBottleneck + - For SE-ResNeXt models: SEResNeXtBottleneck + layers (list of ints): Number of residual blocks for 4 layers of the + network (layer1...layer4). + groups (int): Number of groups for the 3x3 convolution in each + bottleneck block. + - For SENet154: 64 + - For SE-ResNet models: 1 + - For SE-ResNeXt models: 32 + reduction (int): Reduction ratio for Squeeze-and-Excitation modules. + - For all models: 16 + dropout_p (float or None): Drop probability for the Dropout layer. + If `None` the Dropout layer is not used. + - For SENet154: 0.2 + - For SE-ResNet models: None + - For SE-ResNeXt models: None + inplanes (int): Number of input channels for layer1. + - For SENet154: 128 + - For SE-ResNet models: 64 + - For SE-ResNeXt models: 64 + input_3x3 (bool): If `True`, use three 3x3 convolutions instead of + a single 7x7 convolution in layer0. + - For SENet154: True + - For SE-ResNet models: False + - For SE-ResNeXt models: False + downsample_kernel_size (int): Kernel size for downsampling convolutions + in layer2, layer3 and layer4. + - For SENet154: 3 + - For SE-ResNet models: 1 + - For SE-ResNeXt models: 1 + downsample_padding (int): Padding for downsampling convolutions in + layer2, layer3 and layer4. + - For SENet154: 1 + - For SE-ResNet models: 0 + - For SE-ResNeXt models: 0 + num_classes (int): Number of outputs in `last_linear` layer. + - For all models: 1000 + """ + super(SENet, self).__init__() + self.inplanes = inplanes + self.num_classes = num_classes + self.drop_rate = drop_rate + if input_3x3: + layer0_modules = [ + ('conv1', nn.Conv2d(in_chans, 64, 3, stride=2, padding=1, bias=False)), + ('bn1', nn.BatchNorm2d(64)), + ('relu1', nn.ReLU(inplace=True)), + ('conv2', nn.Conv2d(64, 64, 3, stride=1, padding=1, bias=False)), + ('bn2', nn.BatchNorm2d(64)), + ('relu2', nn.ReLU(inplace=True)), + ('conv3', nn.Conv2d(64, inplanes, 3, stride=1, padding=1, bias=False)), + ('bn3', nn.BatchNorm2d(inplanes)), + ('relu3', nn.ReLU(inplace=True)), + ] + else: + layer0_modules = [ + ('conv1', nn.Conv2d( + in_chans, inplanes, kernel_size=7, stride=2, padding=3, bias=False)), + ('bn1', nn.BatchNorm2d(inplanes)), + ('relu1', nn.ReLU(inplace=True)), + ] + self.layer0 = nn.Sequential(OrderedDict(layer0_modules)) + # To preserve compatibility with Caffe weights `ceil_mode=True` is used instead of `padding=1`. 
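+        # (`ceil_mode=True` rounds the pooled output size up rather than down, matching Caffe's
+        # pooling arithmetic so feature-map sizes line up with the original pretrained weights.)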
+ self.pool0 = nn.MaxPool2d(3, stride=2, ceil_mode=True) + self.feature_info = [dict(num_chs=inplanes, reduction=2, module='layer0')] + self.layer1 = self._make_layer( + block, + planes=64, + blocks=layers[0], + groups=groups, + reduction=reduction, + downsample_kernel_size=1, + downsample_padding=0 + ) + self.feature_info += [dict(num_chs=64 * block.expansion, reduction=4, module='layer1')] + self.layer2 = self._make_layer( + block, + planes=128, + blocks=layers[1], + stride=2, + groups=groups, + reduction=reduction, + downsample_kernel_size=downsample_kernel_size, + downsample_padding=downsample_padding + ) + self.feature_info += [dict(num_chs=128 * block.expansion, reduction=8, module='layer2')] + self.layer3 = self._make_layer( + block, + planes=256, + blocks=layers[2], + stride=2, + groups=groups, + reduction=reduction, + downsample_kernel_size=downsample_kernel_size, + downsample_padding=downsample_padding + ) + self.feature_info += [dict(num_chs=256 * block.expansion, reduction=16, module='layer3')] + self.layer4 = self._make_layer( + block, + planes=512, + blocks=layers[3], + stride=2, + groups=groups, + reduction=reduction, + downsample_kernel_size=downsample_kernel_size, + downsample_padding=downsample_padding + ) + self.feature_info += [dict(num_chs=512 * block.expansion, reduction=32, module='layer4')] + self.num_features = 512 * block.expansion + self.global_pool, self.last_linear = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + for m in self.modules(): + _weight_init(m) + + def _make_layer(self, block, planes, blocks, groups, reduction, stride=1, + downsample_kernel_size=1, downsample_padding=0): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + self.inplanes, planes * block.expansion, kernel_size=downsample_kernel_size, + stride=stride, padding=downsample_padding, bias=False), + nn.BatchNorm2d(planes * block.expansion), + ) + + layers = [block(self.inplanes, planes, groups, reduction, stride, downsample)] + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append(block(self.inplanes, planes, groups, reduction)) + + return nn.Sequential(*layers) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict(stem=r'^layer0', blocks=r'^layer(\d+)' if coarse else r'^layer(\d+)\.(\d+)') + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.last_linear + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.last_linear = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.layer0(x) + x = self.pool0(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + return x if pre_logits else self.last_linear(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_senet(variant, pretrained=False, **kwargs): + return build_model_with_cfg(SENet, variant, pretrained, **kwargs) + + +@register_model +def legacy_seresnet18(pretrained=False, **kwargs): + model_args = dict( + 
block=SEResNetBlock, layers=[2, 2, 2, 2], groups=1, reduction=16, **kwargs) + return _create_senet('legacy_seresnet18', pretrained, **model_args) + + +@register_model +def legacy_seresnet34(pretrained=False, **kwargs): + model_args = dict( + block=SEResNetBlock, layers=[3, 4, 6, 3], groups=1, reduction=16, **kwargs) + return _create_senet('legacy_seresnet34', pretrained, **model_args) + + +@register_model +def legacy_seresnet50(pretrained=False, **kwargs): + model_args = dict( + block=SEResNetBottleneck, layers=[3, 4, 6, 3], groups=1, reduction=16, **kwargs) + return _create_senet('legacy_seresnet50', pretrained, **model_args) + + +@register_model +def legacy_seresnet101(pretrained=False, **kwargs): + model_args = dict( + block=SEResNetBottleneck, layers=[3, 4, 23, 3], groups=1, reduction=16, **kwargs) + return _create_senet('legacy_seresnet101', pretrained, **model_args) + + +@register_model +def legacy_seresnet152(pretrained=False, **kwargs): + model_args = dict( + block=SEResNetBottleneck, layers=[3, 8, 36, 3], groups=1, reduction=16, **kwargs) + return _create_senet('legacy_seresnet152', pretrained, **model_args) + + +@register_model +def legacy_senet154(pretrained=False, **kwargs): + model_args = dict( + block=SEBottleneck, layers=[3, 8, 36, 3], groups=64, reduction=16, + downsample_kernel_size=3, downsample_padding=1, inplanes=128, input_3x3=True, **kwargs) + return _create_senet('legacy_senet154', pretrained, **model_args) + + +@register_model +def legacy_seresnext26_32x4d(pretrained=False, **kwargs): + model_args = dict( + block=SEResNeXtBottleneck, layers=[2, 2, 2, 2], groups=32, reduction=16, **kwargs) + return _create_senet('legacy_seresnext26_32x4d', pretrained, **model_args) + + +@register_model +def legacy_seresnext50_32x4d(pretrained=False, **kwargs): + model_args = dict( + block=SEResNeXtBottleneck, layers=[3, 4, 6, 3], groups=32, reduction=16, **kwargs) + return _create_senet('legacy_seresnext50_32x4d', pretrained, **model_args) + + +@register_model +def legacy_seresnext101_32x4d(pretrained=False, **kwargs): + model_args = dict( + block=SEResNeXtBottleneck, layers=[3, 4, 23, 3], groups=32, reduction=16, **kwargs) + return _create_senet('legacy_seresnext101_32x4d', pretrained, **model_args) diff --git a/custom_timm/models/sequencer.py b/custom_timm/models/sequencer.py new file mode 100644 index 0000000000000000000000000000000000000000..48240d1d8625f4c0cb3c497a5c49058d722c2549 --- /dev/null +++ b/custom_timm/models/sequencer.py @@ -0,0 +1,417 @@ +""" Sequencer + +Paper: `Sequencer: Deep LSTM for Image Classification` - https://arxiv.org/pdf/2205.01972.pdf + +""" +# Copyright (c) 2022. 
Yuki Tatsunami +# Licensed under the Apache License, Version 2.0 (the "License"); + + +import math +from functools import partial +from typing import Tuple + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, DEFAULT_CROP_PCT +from .helpers import build_model_with_cfg, named_apply +from .layers import lecun_normal_, DropPath, Mlp, PatchEmbed as TimmPatchEmbed +from .registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': DEFAULT_CROP_PCT, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = dict( + sequencer2d_s=_cfg(url="https://github.com/okojoalg/sequencer/releases/download/weights/sequencer2d_s.pth"), + sequencer2d_m=_cfg(url="https://github.com/okojoalg/sequencer/releases/download/weights/sequencer2d_m.pth"), + sequencer2d_l=_cfg(url="https://github.com/okojoalg/sequencer/releases/download/weights/sequencer2d_l.pth"), +) + + +def _init_weights(module: nn.Module, name: str, head_bias: float = 0., flax=False): + if isinstance(module, nn.Linear): + if name.startswith('head'): + nn.init.zeros_(module.weight) + nn.init.constant_(module.bias, head_bias) + else: + if flax: + # Flax defaults + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + else: + nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + if 'mlp' in name: + nn.init.normal_(module.bias, std=1e-6) + else: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.Conv2d): + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d, nn.GroupNorm)): + nn.init.ones_(module.weight) + nn.init.zeros_(module.bias) + elif isinstance(module, (nn.RNN, nn.GRU, nn.LSTM)): + stdv = 1.0 / math.sqrt(module.hidden_size) + for weight in module.parameters(): + nn.init.uniform_(weight, -stdv, stdv) + elif hasattr(module, 'init_weights'): + module.init_weights() + + +def get_stage( + index, layers, patch_sizes, embed_dims, hidden_sizes, mlp_ratios, block_layer, rnn_layer, mlp_layer, + norm_layer, act_layer, num_layers, bidirectional, union, + with_fc, drop=0., drop_path_rate=0., **kwargs): + assert len(layers) == len(patch_sizes) == len(embed_dims) == len(hidden_sizes) == len(mlp_ratios) + blocks = [] + for block_idx in range(layers[index]): + drop_path = drop_path_rate * (block_idx + sum(layers[:index])) / (sum(layers) - 1) + blocks.append(block_layer( + embed_dims[index], hidden_sizes[index], mlp_ratio=mlp_ratios[index], + rnn_layer=rnn_layer, mlp_layer=mlp_layer, norm_layer=norm_layer, act_layer=act_layer, + num_layers=num_layers, bidirectional=bidirectional, union=union, with_fc=with_fc, + drop=drop, drop_path=drop_path)) + + if index < len(embed_dims) - 1: + blocks.append(Downsample2D(embed_dims[index], embed_dims[index + 1], patch_sizes[index + 1])) + + blocks = nn.Sequential(*blocks) + return blocks + + +class RNNIdentity(nn.Module): + def __init__(self, *args, **kwargs): + super(RNNIdentity, self).__init__() + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, None]: + return x, None + + +class RNN2DBase(nn.Module): + + def __init__( + self, input_size: int, hidden_size: int, + num_layers: int = 1, bias: bool = True, bidirectional: bool = True, + union="cat", 
with_fc=True): + super().__init__() + + self.input_size = input_size + self.hidden_size = hidden_size + self.output_size = 2 * hidden_size if bidirectional else hidden_size + self.union = union + + self.with_vertical = True + self.with_horizontal = True + self.with_fc = with_fc + + self.fc = None + if with_fc: + if union == "cat": + self.fc = nn.Linear(2 * self.output_size, input_size) + elif union == "add": + self.fc = nn.Linear(self.output_size, input_size) + elif union == "vertical": + self.fc = nn.Linear(self.output_size, input_size) + self.with_horizontal = False + elif union == "horizontal": + self.fc = nn.Linear(self.output_size, input_size) + self.with_vertical = False + else: + raise ValueError("Unrecognized union: " + union) + elif union == "cat": + pass + if 2 * self.output_size != input_size: + raise ValueError(f"The output channel {2 * self.output_size} is different from the input channel {input_size}.") + elif union == "add": + pass + if self.output_size != input_size: + raise ValueError(f"The output channel {self.output_size} is different from the input channel {input_size}.") + elif union == "vertical": + if self.output_size != input_size: + raise ValueError(f"The output channel {self.output_size} is different from the input channel {input_size}.") + self.with_horizontal = False + elif union == "horizontal": + if self.output_size != input_size: + raise ValueError(f"The output channel {self.output_size} is different from the input channel {input_size}.") + self.with_vertical = False + else: + raise ValueError("Unrecognized union: " + union) + + self.rnn_v = RNNIdentity() + self.rnn_h = RNNIdentity() + + def forward(self, x): + B, H, W, C = x.shape + + if self.with_vertical: + v = x.permute(0, 2, 1, 3) + v = v.reshape(-1, H, C) + v, _ = self.rnn_v(v) + v = v.reshape(B, W, H, -1) + v = v.permute(0, 2, 1, 3) + else: + v = None + + if self.with_horizontal: + h = x.reshape(-1, W, C) + h, _ = self.rnn_h(h) + h = h.reshape(B, H, W, -1) + else: + h = None + + if v is not None and h is not None: + if self.union == "cat": + x = torch.cat([v, h], dim=-1) + else: + x = v + h + elif v is not None: + x = v + elif h is not None: + x = h + + if self.fc is not None: + x = self.fc(x) + + return x + + +class LSTM2D(RNN2DBase): + + def __init__( + self, input_size: int, hidden_size: int, + num_layers: int = 1, bias: bool = True, bidirectional: bool = True, + union="cat", with_fc=True): + super().__init__(input_size, hidden_size, num_layers, bias, bidirectional, union, with_fc) + if self.with_vertical: + self.rnn_v = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bias=bias, bidirectional=bidirectional) + if self.with_horizontal: + self.rnn_h = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bias=bias, bidirectional=bidirectional) + + +class Sequencer2DBlock(nn.Module): + def __init__( + self, dim, hidden_size, mlp_ratio=3.0, rnn_layer=LSTM2D, mlp_layer=Mlp, + norm_layer=partial(nn.LayerNorm, eps=1e-6), act_layer=nn.GELU, + num_layers=1, bidirectional=True, union="cat", with_fc=True, drop=0., drop_path=0.): + super().__init__() + channels_dim = int(mlp_ratio * dim) + self.norm1 = norm_layer(dim) + self.rnn_tokens = rnn_layer(dim, hidden_size, num_layers=num_layers, bidirectional=bidirectional, + union=union, with_fc=with_fc) + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp_channels = mlp_layer(dim, channels_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + x = x + self.drop_path(self.rnn_tokens(self.norm1(x))) + x = x + self.drop_path(self.mlp_channels(self.norm2(x))) + return x + + +class PatchEmbed(TimmPatchEmbed): + def forward(self, x): + x = self.proj(x) + if self.flatten: + x = x.flatten(2).transpose(1, 2) # BCHW -> BNC + else: + x = x.permute(0, 2, 3, 1) # BCHW -> BHWC + x = self.norm(x) + return x + + +class Shuffle(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + if self.training: + B, H, W, C = x.shape + r = torch.randperm(H * W) + x = x.reshape(B, -1, C) + x = x[:, r, :].reshape(B, H, W, -1) + return x + + +class Downsample2D(nn.Module): + def __init__(self, input_dim, output_dim, patch_size): + super().__init__() + self.down = nn.Conv2d(input_dim, output_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + x = x.permute(0, 3, 1, 2) + x = self.down(x) + x = x.permute(0, 2, 3, 1) + return x + + +class Sequencer2D(nn.Module): + def __init__( + self, + num_classes=1000, + img_size=224, + in_chans=3, + global_pool='avg', + layers=[4, 3, 8, 3], + patch_sizes=[7, 2, 1, 1], + embed_dims=[192, 384, 384, 384], + hidden_sizes=[48, 96, 96, 96], + mlp_ratios=[3.0, 3.0, 3.0, 3.0], + block_layer=Sequencer2DBlock, + rnn_layer=LSTM2D, + mlp_layer=Mlp, + norm_layer=partial(nn.LayerNorm, eps=1e-6), + act_layer=nn.GELU, + num_rnn_layers=1, + bidirectional=True, + union="cat", + with_fc=True, + drop_rate=0., + drop_path_rate=0., + nlhb=False, + stem_norm=False, + ): + super().__init__() + assert global_pool in ('', 'avg') + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = embed_dims[-1] # num_features for consistency with other models + self.feature_dim = -1 # channel dim index for feature outputs (rank 4, NHWC) + self.embed_dims = embed_dims + self.stem = PatchEmbed( + img_size=img_size, patch_size=patch_sizes[0], in_chans=in_chans, + embed_dim=embed_dims[0], norm_layer=norm_layer if stem_norm else None, + flatten=False) + + self.blocks = nn.Sequential(*[ + get_stage( + i, layers, patch_sizes, embed_dims, hidden_sizes, mlp_ratios, block_layer=block_layer, + rnn_layer=rnn_layer, mlp_layer=mlp_layer, norm_layer=norm_layer, act_layer=act_layer, + num_layers=num_rnn_layers, bidirectional=bidirectional, + union=union, with_fc=with_fc, drop=drop_rate, drop_path_rate=drop_path_rate, + ) + for i, _ in enumerate(embed_dims)]) + + self.norm = norm_layer(embed_dims[-1]) + self.head = nn.Linear(embed_dims[-1], self.num_classes) if num_classes > 0 else nn.Identity() + + self.init_weights(nlhb=nlhb) + + def init_weights(self, nlhb=False): + head_bias = -math.log(self.num_classes) if nlhb else 0. 
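+        # nlhb = "negative log head bias": starting the classifier bias at -log(num_classes) keeps the
+        # initial predicted class probabilities near 1/num_classes, so the untrained head is roughly uniform.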
+ named_apply(partial(_init_weights, head_bias=head_bias), module=self) # depth-first + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^stem', + blocks=[ + (r'^blocks\.(\d+)\..*\.down', (99999,)), + (r'^blocks\.(\d+)', None) if coarse else (r'^blocks\.(\d+)\.(\d+)', None), + (r'^norm', (99999,)) + ] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'avg') + self.global_pool = global_pool + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.stem(x) + x = self.blocks(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool == 'avg': + x = x.mean(dim=(1, 2)) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_sequencer2d(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Sequencer2D models.') + + model = build_model_with_cfg(Sequencer2D, variant, pretrained, **kwargs) + return model + + +# main + +@register_model +def sequencer2d_s(pretrained=False, **kwargs): + model_args = dict( + layers=[4, 3, 8, 3], + patch_sizes=[7, 2, 1, 1], + embed_dims=[192, 384, 384, 384], + hidden_sizes=[48, 96, 96, 96], + mlp_ratios=[3.0, 3.0, 3.0, 3.0], + rnn_layer=LSTM2D, + bidirectional=True, + union="cat", + with_fc=True, + **kwargs) + model = _create_sequencer2d('sequencer2d_s', pretrained=pretrained, **model_args) + return model + + +@register_model +def sequencer2d_m(pretrained=False, **kwargs): + model_args = dict( + layers=[4, 3, 14, 3], + patch_sizes=[7, 2, 1, 1], + embed_dims=[192, 384, 384, 384], + hidden_sizes=[48, 96, 96, 96], + mlp_ratios=[3.0, 3.0, 3.0, 3.0], + rnn_layer=LSTM2D, + bidirectional=True, + union="cat", + with_fc=True, + **kwargs) + model = _create_sequencer2d('sequencer2d_m', pretrained=pretrained, **model_args) + return model + + +@register_model +def sequencer2d_l(pretrained=False, **kwargs): + model_args = dict( + layers=[8, 8, 16, 4], + patch_sizes=[7, 2, 1, 1], + embed_dims=[192, 384, 384, 384], + hidden_sizes=[48, 96, 96, 96], + mlp_ratios=[3.0, 3.0, 3.0, 3.0], + rnn_layer=LSTM2D, + bidirectional=True, + union="cat", + with_fc=True, + **kwargs) + model = _create_sequencer2d('sequencer2d_l', pretrained=pretrained, **model_args) + return model diff --git a/custom_timm/models/sknet.py b/custom_timm/models/sknet.py new file mode 100644 index 0000000000000000000000000000000000000000..342a7901325780809a3213d6188e87ea111a9a11 --- /dev/null +++ b/custom_timm/models/sknet.py @@ -0,0 +1,206 @@ +""" Selective Kernel Networks (ResNet base) + +Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586) + +This was inspired by reading 'Compounding the Performance Improvements...' (https://arxiv.org/abs/2001.06268) +and a streamlined impl at https://github.com/clovaai/assembled-cnn but I ended up building something closer +to the original paper with some modifications of my own to better balance param count vs accuracy. 
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +import math + +from torch import nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .layers import SelectiveKernel, ConvNormAct, ConvNormActAa, create_attn +from .registry import register_model +from .resnet import ResNet + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv1', 'classifier': 'fc', + **kwargs + } + + +default_cfgs = { + 'skresnet18': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet18_ra-4eec2804.pth'), + 'skresnet34': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnet34_ra-bdc0ccde.pth'), + 'skresnet50': _cfg(), + 'skresnet50d': _cfg( + first_conv='conv1.0'), + 'skresnext50_32x4d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/skresnext50_ra-f40e40bf.pth'), +} + + +class SelectiveKernelBasic(nn.Module): + expansion = 1 + + def __init__( + self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, + sk_kwargs=None, reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): + super(SelectiveKernelBasic, self).__init__() + + sk_kwargs = sk_kwargs or {} + conv_kwargs = dict(act_layer=act_layer, norm_layer=norm_layer) + assert cardinality == 1, 'BasicBlock only supports cardinality of 1' + assert base_width == 64, 'BasicBlock doest not support changing base width' + first_planes = planes // reduce_first + outplanes = planes * self.expansion + first_dilation = first_dilation or dilation + + self.conv1 = SelectiveKernel( + inplanes, first_planes, stride=stride, dilation=first_dilation, + aa_layer=aa_layer, drop_layer=drop_block, **conv_kwargs, **sk_kwargs) + self.conv2 = ConvNormAct( + first_planes, outplanes, kernel_size=3, dilation=dilation, apply_act=False, **conv_kwargs) + self.se = create_attn(attn_layer, outplanes) + self.act = act_layer(inplace=True) + self.downsample = downsample + self.drop_path = drop_path + + def zero_init_last(self): + nn.init.zeros_(self.conv2.bn.weight) + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.conv2(x) + if self.se is not None: + x = self.se(x) + if self.drop_path is not None: + x = self.drop_path(x) + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act(x) + return x + + +class SelectiveKernelBottleneck(nn.Module): + expansion = 4 + + def __init__( + self, inplanes, planes, stride=1, downsample=None, cardinality=1, base_width=64, sk_kwargs=None, + reduce_first=1, dilation=1, first_dilation=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, + attn_layer=None, aa_layer=None, drop_block=None, drop_path=None): + super(SelectiveKernelBottleneck, self).__init__() + + sk_kwargs = sk_kwargs or {} + conv_kwargs = dict(act_layer=act_layer, norm_layer=norm_layer) + width = int(math.floor(planes * (base_width / 64)) * cardinality) + first_planes = width // reduce_first + outplanes = planes * self.expansion + first_dilation = first_dilation or dilation + + self.conv1 = ConvNormAct(inplanes, first_planes, kernel_size=1, **conv_kwargs) + self.conv2 = 
SelectiveKernel( + first_planes, width, stride=stride, dilation=first_dilation, groups=cardinality, + aa_layer=aa_layer, drop_layer=drop_block, **conv_kwargs, **sk_kwargs) + self.conv3 = ConvNormAct(width, outplanes, kernel_size=1, apply_act=False, **conv_kwargs) + self.se = create_attn(attn_layer, outplanes) + self.act = act_layer(inplace=True) + self.downsample = downsample + self.drop_path = drop_path + + def zero_init_last(self): + nn.init.zeros_(self.conv3.bn.weight) + + def forward(self, x): + shortcut = x + x = self.conv1(x) + x = self.conv2(x) + x = self.conv3(x) + if self.se is not None: + x = self.se(x) + if self.drop_path is not None: + x = self.drop_path(x) + if self.downsample is not None: + shortcut = self.downsample(shortcut) + x += shortcut + x = self.act(x) + return x + + +def _create_skresnet(variant, pretrained=False, **kwargs): + return build_model_with_cfg(ResNet, variant, pretrained, **kwargs) + + +@register_model +def skresnet18(pretrained=False, **kwargs): + """Constructs a Selective Kernel ResNet-18 model. + + Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this + variation splits the input channels to the selective convolutions to keep param count down. + """ + sk_kwargs = dict(rd_ratio=1 / 8, rd_divisor=16, split_input=True) + model_args = dict( + block=SelectiveKernelBasic, layers=[2, 2, 2, 2], block_args=dict(sk_kwargs=sk_kwargs), + zero_init_last=False, **kwargs) + return _create_skresnet('skresnet18', pretrained, **model_args) + + +@register_model +def skresnet34(pretrained=False, **kwargs): + """Constructs a Selective Kernel ResNet-34 model. + + Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this + variation splits the input channels to the selective convolutions to keep param count down. + """ + sk_kwargs = dict(rd_ratio=1 / 8, rd_divisor=16, split_input=True) + model_args = dict( + block=SelectiveKernelBasic, layers=[3, 4, 6, 3], block_args=dict(sk_kwargs=sk_kwargs), + zero_init_last=False, **kwargs) + return _create_skresnet('skresnet34', pretrained, **model_args) + + +@register_model +def skresnet50(pretrained=False, **kwargs): + """Constructs a Select Kernel ResNet-50 model. + + Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this + variation splits the input channels to the selective convolutions to keep param count down. + """ + sk_kwargs = dict(split_input=True) + model_args = dict( + block=SelectiveKernelBottleneck, layers=[3, 4, 6, 3], block_args=dict(sk_kwargs=sk_kwargs), + zero_init_last=False, **kwargs) + return _create_skresnet('skresnet50', pretrained, **model_args) + + +@register_model +def skresnet50d(pretrained=False, **kwargs): + """Constructs a Select Kernel ResNet-50-D model. + + Different from configs in Select Kernel paper or "Compounding the Performance Improvements..." this + variation splits the input channels to the selective convolutions to keep param count down. + """ + sk_kwargs = dict(split_input=True) + model_args = dict( + block=SelectiveKernelBottleneck, layers=[3, 4, 6, 3], stem_width=32, stem_type='deep', avg_down=True, + block_args=dict(sk_kwargs=sk_kwargs), zero_init_last=False, **kwargs) + return _create_skresnet('skresnet50d', pretrained, **model_args) + + +@register_model +def skresnext50_32x4d(pretrained=False, **kwargs): + """Constructs a Select Kernel ResNeXt50-32x4d model. 
This should be equivalent to + the SKNet-50 model in the Select Kernel Paper + """ + sk_kwargs = dict(rd_ratio=1/16, rd_divisor=32, split_input=False) + model_args = dict( + block=SelectiveKernelBottleneck, layers=[3, 4, 6, 3], cardinality=32, base_width=4, + block_args=dict(sk_kwargs=sk_kwargs), zero_init_last=False, **kwargs) + return _create_skresnet('skresnext50_32x4d', pretrained, **model_args) + diff --git a/custom_timm/models/swin_transformer.py b/custom_timm/models/swin_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..9b2e215dc8d98ba91ced0f381096c2be8c3f8163 --- /dev/null +++ b/custom_timm/models/swin_transformer.py @@ -0,0 +1,700 @@ +""" Swin Transformer +A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` + - https://arxiv.org/pdf/2103.14030 + +Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below + +S3 (AutoFormerV2, https://arxiv.org/abs/2111.14725) Swin weights from + - https://github.com/microsoft/Cream/tree/main/AutoFormerV2 + +Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman +""" +# -------------------------------------------------------- +# Swin Transformer +# Copyright (c) 2021 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- +import logging +import math +from functools import partial +from typing import Optional + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .fx_features import register_notrace_function +from .helpers import build_model_with_cfg, named_apply, checkpoint_seq +from .layers import PatchEmbed, Mlp, DropPath, to_2tuple, to_ntuple, trunc_normal_, _assert +from .registry import register_model +from .vision_transformer import checkpoint_filter_fn, get_init_weights_vit + + +_logger = logging.getLogger(__name__) + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + 'swin_base_patch4_window12_384': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22kto1k.pth', + input_size=(3, 384, 384), crop_pct=1.0), + + 'swin_base_patch4_window7_224': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22kto1k.pth', + ), + + 'swin_large_patch4_window12_384': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22kto1k.pth', + input_size=(3, 384, 384), crop_pct=1.0), + + 'swin_large_patch4_window7_224': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22kto1k.pth', + ), + + 'swin_small_patch4_window7_224': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_small_patch4_window7_224.pth', + ), + + 'swin_tiny_patch4_window7_224': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_tiny_patch4_window7_224.pth', + ), + + 'swin_base_patch4_window12_384_in22k': _cfg( + 
url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth', + input_size=(3, 384, 384), crop_pct=1.0, num_classes=21841), + + 'swin_base_patch4_window7_224_in22k': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22k.pth', + num_classes=21841), + + 'swin_large_patch4_window12_384_in22k': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth', + input_size=(3, 384, 384), crop_pct=1.0, num_classes=21841), + + 'swin_large_patch4_window7_224_in22k': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window7_224_22k.pth', + num_classes=21841), + + 'swin_s3_tiny_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_t-1d53f6a8.pth' + ), + 'swin_s3_small_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_s-3bb4c69d.pth' + ), + 'swin_s3_base_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/s3_b-a1e95db4.pth' + ) +} + + +def window_partition(x, window_size: int): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows + + +@register_notrace_function # reason: int argument is a Proxy +def window_reverse(windows, window_size: int, H: int, W: int): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +def get_relative_position_index(win_h, win_w): + # get pair-wise relative position index for each token inside the window + coords = torch.stack(torch.meshgrid([torch.arange(win_h), torch.arange(win_w)])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += win_h - 1 # shift to start from 0 + relative_coords[:, :, 1] += win_w - 1 + relative_coords[:, :, 0] *= 2 * win_w - 1 + return relative_coords.sum(-1) # Wh*Ww, Wh*Ww + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + head_dim (int): Number of channels per head (dim // num_heads if not set) + window_size (tuple[int]): The height and width of the window. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__(self, dim, num_heads, head_dim=None, window_size=7, qkv_bias=True, attn_drop=0., proj_drop=0.): + + super().__init__() + self.dim = dim + self.window_size = to_2tuple(window_size) # Wh, Ww + win_h, win_w = self.window_size + self.window_area = win_h * win_w + self.num_heads = num_heads + head_dim = head_dim or dim // num_heads + attn_dim = head_dim * num_heads + self.scale = head_dim ** -0.5 + + # define a parameter table of relative position bias, shape: 2*Wh-1 * 2*Ww-1, nH + self.relative_position_bias_table = nn.Parameter(torch.zeros((2 * win_h - 1) * (2 * win_w - 1), num_heads)) + + # get pair-wise relative position index for each token inside the window + self.register_buffer("relative_position_index", get_relative_position_index(win_h, win_w)) + + self.qkv = nn.Linear(dim, attn_dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(attn_dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + trunc_normal_(self.relative_position_bias_table, std=.02) + self.softmax = nn.Softmax(dim=-1) + + def _get_rel_pos_bias(self) -> torch.Tensor: + relative_position_bias = self.relative_position_bias_table[ + self.relative_position_index.view(-1)].view(self.window_area, self.window_area, -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + return relative_position_bias.unsqueeze(0) + + def forward(self, x, mask: Optional[torch.Tensor] = None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv = self.qkv(x).reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + attn = attn + self._get_rel_pos_bias() + + if mask is not None: + num_win = mask.shape[0] + attn = attn.view(B_ // num_win, num_win, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + window_size (int): Window size. + num_heads (int): Number of attention heads. + head_dim (int): Enforce the number of channels per head + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm + """ + + def __init__( + self, dim, input_resolution, num_heads=4, head_dim=None, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size" + + self.norm1 = norm_layer(dim) + self.attn = WindowAttention( + dim, num_heads=num_heads, head_dim=head_dim, window_size=to_2tuple(self.window_size), + qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + + if self.shift_size > 0: + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + cnt = 0 + for h in ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)): + for w in ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None)): + img_mask[:, h, w, :] = cnt + cnt += 1 + mask_windows = window_partition(img_mask, self.window_size) # num_win, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + _assert(L == H * W, "input feature has wrong size") + + shortcut = x + x = self.norm1(x) + x = x.view(B, H, W, C) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll(x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # num_win*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size * self.window_size, C) # num_win*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=self.attn_mask) # num_win*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll(shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2)) + else: + x = shifted_x + x = x.view(B, H * W, C) + + # FFN + x = shortcut + self.drop_path(x) + x = x + self.drop_path(self.mlp(self.norm2(x))) + + return x + + +class PatchMerging(nn.Module): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, out_dim=None, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.out_dim = out_dim or 2 * dim + self.norm = norm_layer(4 * dim) + self.reduction = nn.Linear(4 * dim, self.out_dim, bias=False) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + _assert(L == H * W, "input feature has wrong size") + _assert(H % 2 == 0 and W % 2 == 0, f"x size ({H}*{W}) are not even.") + + x = x.view(B, H, W, C) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.norm(x) + x = self.reduction(x) + + return x + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + head_dim (int): Channels per head (dim // num_heads if not set) + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + """ + + def __init__( + self, dim, out_dim, input_resolution, depth, num_heads=4, head_dim=None, + window_size=7, mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., + drop_path=0., norm_layer=nn.LayerNorm, downsample=None): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.grad_checkpointing = False + + # build blocks + self.blocks = nn.Sequential(*[ + SwinTransformerBlock( + dim=dim, input_resolution=input_resolution, num_heads=num_heads, head_dim=head_dim, + window_size=window_size, shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, norm_layer=norm_layer) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, out_dim=out_dim, norm_layer=norm_layer) + else: + self.downsample = None + + def forward(self, x): + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + +class SwinTransformer(nn.Module): + r""" Swin Transformer + A PyTorch impl of : `Swin Transformer: Hierarchical Vision Transformer using Shifted Windows` - + https://arxiv.org/pdf/2103.14030 + + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. 
Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + head_dim (int, tuple(int)): + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + """ + + def __init__( + self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, global_pool='avg', + embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), head_dim=None, + window_size=7, mlp_ratio=4., qkv_bias=True, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, weight_init='', **kwargs): + super().__init__() + assert global_pool in ('', 'avg') + self.num_classes = num_classes + self.global_pool = global_pool + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if patch_norm else None) + num_patches = self.patch_embed.num_patches + self.patch_grid = self.patch_embed.grid_size + + # absolute position embedding + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) if ape else None + self.pos_drop = nn.Dropout(p=drop_rate) + + # build layers + if not isinstance(embed_dim, (tuple, list)): + embed_dim = [int(embed_dim * 2 ** i) for i in range(self.num_layers)] + embed_out_dim = embed_dim[1:] + [None] + head_dim = to_ntuple(self.num_layers)(head_dim) + window_size = to_ntuple(self.num_layers)(window_size) + mlp_ratio = to_ntuple(self.num_layers)(mlp_ratio) + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + layers = [] + for i in range(self.num_layers): + layers += [BasicLayer( + dim=embed_dim[i], + out_dim=embed_out_dim[i], + input_resolution=(self.patch_grid[0] // (2 ** i), self.patch_grid[1] // (2 ** i)), + depth=depths[i], + num_heads=num_heads[i], + head_dim=head_dim[i], + window_size=window_size[i], + mlp_ratio=mlp_ratio[i], + qkv_bias=qkv_bias, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i]):sum(depths[:i + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i < self.num_layers - 1) else None + )] + self.layers = nn.Sequential(*layers) + + self.norm = norm_layer(self.num_features) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + if weight_init != 'skip': + self.init_weights(weight_init) + + @torch.jit.ignore + def init_weights(self, mode=''): + assert mode in ('jax', 'jax_nlhb', 'moco', '') + if self.absolute_pos_embed is not None: + trunc_normal_(self.absolute_pos_embed, std=.02) + head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0. 
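+        # Weight init is delegated to the shared ViT helpers; the '*_nlhb' modes additionally push the
+        # classifier bias toward -log(num_classes) for near-uniform initial predictions.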
+ named_apply(get_init_weights_vit(mode, head_bias=head_bias), self) + + @torch.jit.ignore + def no_weight_decay(self): + nwd = {'absolute_pos_embed'} + for n, _ in self.named_parameters(): + if 'relative_position_bias_table' in n: + nwd.add(n) + return nwd + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^absolute_pos_embed|patch_embed', # stem and embed + blocks=r'^layers\.(\d+)' if coarse else [ + (r'^layers\.(\d+).downsample', (0,)), + (r'^layers\.(\d+)\.\w+\.(\d+)', None), + (r'^norm', (99999,)), + ] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for l in self.layers: + l.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'avg') + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + if self.absolute_pos_embed is not None: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + x = self.layers(x) + x = self.norm(x) # B L C + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool == 'avg': + x = x.mean(dim=1) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_swin_transformer(variant, pretrained=False, **kwargs): + model = build_model_with_cfg( + SwinTransformer, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) + + return model + + +@register_model +def swin_base_patch4_window12_384(pretrained=False, **kwargs): + """ Swin-B @ 384x384, pretrained ImageNet-22k, fine tune 1k + """ + model_kwargs = dict( + patch_size=4, window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) + return _create_swin_transformer('swin_base_patch4_window12_384', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_base_patch4_window7_224(pretrained=False, **kwargs): + """ Swin-B @ 224x224, pretrained ImageNet-22k, fine tune 1k + """ + model_kwargs = dict( + patch_size=4, window_size=7, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) + return _create_swin_transformer('swin_base_patch4_window7_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_large_patch4_window12_384(pretrained=False, **kwargs): + """ Swin-L @ 384x384, pretrained ImageNet-22k, fine tune 1k + """ + model_kwargs = dict( + patch_size=4, window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs) + return _create_swin_transformer('swin_large_patch4_window12_384', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_large_patch4_window7_224(pretrained=False, **kwargs): + """ Swin-L @ 224x224, pretrained ImageNet-22k, fine tune 1k + """ + model_kwargs = dict( + patch_size=4, window_size=7, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs) + return _create_swin_transformer('swin_large_patch4_window7_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_small_patch4_window7_224(pretrained=False, **kwargs): + """ Swin-S @ 224x224, trained ImageNet-1k + """ + model_kwargs = dict( + patch_size=4, window_size=7, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs) + return 
_create_swin_transformer('swin_small_patch4_window7_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_tiny_patch4_window7_224(pretrained=False, **kwargs): + """ Swin-T @ 224x224, trained ImageNet-1k + """ + model_kwargs = dict( + patch_size=4, window_size=7, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs) + return _create_swin_transformer('swin_tiny_patch4_window7_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_base_patch4_window12_384_in22k(pretrained=False, **kwargs): + """ Swin-B @ 384x384, trained ImageNet-22k + """ + model_kwargs = dict( + patch_size=4, window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) + return _create_swin_transformer('swin_base_patch4_window12_384_in22k', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_base_patch4_window7_224_in22k(pretrained=False, **kwargs): + """ Swin-B @ 224x224, trained ImageNet-22k + """ + model_kwargs = dict( + patch_size=4, window_size=7, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) + return _create_swin_transformer('swin_base_patch4_window7_224_in22k', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_large_patch4_window12_384_in22k(pretrained=False, **kwargs): + """ Swin-L @ 384x384, trained ImageNet-22k + """ + model_kwargs = dict( + patch_size=4, window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs) + return _create_swin_transformer('swin_large_patch4_window12_384_in22k', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_large_patch4_window7_224_in22k(pretrained=False, **kwargs): + """ Swin-L @ 224x224, trained ImageNet-22k + """ + model_kwargs = dict( + patch_size=4, window_size=7, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs) + return _create_swin_transformer('swin_large_patch4_window7_224_in22k', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_s3_tiny_224(pretrained=False, **kwargs): + """ Swin-S3-T @ 224x224, ImageNet-1k. https://arxiv.org/abs/2111.14725 + """ + model_kwargs = dict( + patch_size=4, window_size=(7, 7, 14, 7), embed_dim=96, depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), **kwargs) + return _create_swin_transformer('swin_s3_tiny_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_s3_small_224(pretrained=False, **kwargs): + """ Swin-S3-S @ 224x224, trained ImageNet-1k. https://arxiv.org/abs/2111.14725 + """ + model_kwargs = dict( + patch_size=4, window_size=(14, 14, 14, 7), embed_dim=96, depths=(2, 2, 18, 2), + num_heads=(3, 6, 12, 24), **kwargs) + return _create_swin_transformer('swin_s3_small_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swin_s3_base_224(pretrained=False, **kwargs): + """ Swin-S3-B @ 224x224, trained ImageNet-1k. 
https://arxiv.org/abs/2111.14725 + """ + model_kwargs = dict( + patch_size=4, window_size=(7, 7, 14, 7), embed_dim=96, depths=(2, 2, 30, 2), + num_heads=(3, 6, 12, 24), **kwargs) + return _create_swin_transformer('swin_s3_base_224', pretrained=pretrained, **model_kwargs) + diff --git a/custom_timm/models/swin_transformer_v2.py b/custom_timm/models/swin_transformer_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..ade2b050a956fe6f30811736d196d3f33e4dcc7c --- /dev/null +++ b/custom_timm/models/swin_transformer_v2.py @@ -0,0 +1,753 @@ +""" Swin Transformer V2 +A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution` + - https://arxiv.org/abs/2111.09883 + +Code/weights from https://github.com/microsoft/Swin-Transformer, original copyright/license info below + +Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman +""" +# -------------------------------------------------------- +# Swin Transformer V2 +# Copyright (c) 2022 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Written by Ze Liu +# -------------------------------------------------------- +import math +from typing import Tuple, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .fx_features import register_notrace_function +from .helpers import build_model_with_cfg, named_apply +from .layers import PatchEmbed, Mlp, DropPath, to_2tuple, to_ntuple, trunc_normal_, _assert +from .registry import register_model + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + 'swinv2_tiny_window8_256': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window8_256.pth', + input_size=(3, 256, 256) + ), + 'swinv2_tiny_window16_256': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_tiny_patch4_window16_256.pth', + input_size=(3, 256, 256) + ), + 'swinv2_small_window8_256': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window8_256.pth', + input_size=(3, 256, 256) + ), + 'swinv2_small_window16_256': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_small_patch4_window16_256.pth', + input_size=(3, 256, 256) + ), + 'swinv2_base_window8_256': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window8_256.pth', + input_size=(3, 256, 256) + ), + 'swinv2_base_window16_256': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window16_256.pth', + input_size=(3, 256, 256) + ), + + 'swinv2_base_window12_192_22k': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12_192_22k.pth', + num_classes=21841, input_size=(3, 192, 192) + ), + 'swinv2_base_window12to16_192to256_22kft1k': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to16_192to256_22kto1k_ft.pth', + input_size=(3, 256, 256) + ), + 
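+    # Naming convention: 'AtoB' marks a change between 22k pretraining and 1k fine-tuning,
+    # e.g. window 12 -> 16 and input resolution 192 -> 256 for the checkpoint above.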
'swinv2_base_window12to24_192to384_22kft1k': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_base_patch4_window12to24_192to384_22kto1k_ft.pth', + input_size=(3, 384, 384), crop_pct=1.0, + ), + 'swinv2_large_window12_192_22k': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12_192_22k.pth', + num_classes=21841, input_size=(3, 192, 192) + ), + 'swinv2_large_window12to16_192to256_22kft1k': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to16_192to256_22kto1k_ft.pth', + input_size=(3, 256, 256) + ), + 'swinv2_large_window12to24_192to384_22kft1k': _cfg( + url='https://github.com/SwinTransformer/storage/releases/download/v2.0.0/swinv2_large_patch4_window12to24_192to384_22kto1k_ft.pth', + input_size=(3, 384, 384), crop_pct=1.0, + ), +} + + +def window_partition(x, window_size: Tuple[int, int]): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0], window_size[1], C) + return windows + + +@register_notrace_function # reason: int argument is a Proxy +def window_reverse(windows, window_size: Tuple[int, int], img_size: Tuple[int, int]): + """ + Args: + windows: (num_windows * B, window_size[0], window_size[1], C) + window_size (Tuple[int, int]): Window size + img_size (Tuple[int, int]): Image size + + Returns: + x: (B, H, W, C) + """ + H, W = img_size + B = int(windows.shape[0] / (H * W / window_size[0] / window_size[1])) + x = windows.view(B, H // window_size[0], W // window_size[1], window_size[0], window_size[1], -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowAttention(nn.Module): + r""" Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. Default: 0.0 + pretrained_window_size (tuple[int]): The height and width of the window in pre-training. 
+ """ + + def __init__( + self, dim, window_size, num_heads, qkv_bias=True, attn_drop=0., proj_drop=0., + pretrained_window_size=[0, 0]): + + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.pretrained_window_size = pretrained_window_size + self.num_heads = num_heads + + self.logit_scale = nn.Parameter(torch.log(10 * torch.ones((num_heads, 1, 1)))) + + # mlp to generate continuous relative position bias + self.cpb_mlp = nn.Sequential( + nn.Linear(2, 512, bias=True), + nn.ReLU(inplace=True), + nn.Linear(512, num_heads, bias=False) + ) + + # get relative_coords_table + relative_coords_h = torch.arange(-(self.window_size[0] - 1), self.window_size[0], dtype=torch.float32) + relative_coords_w = torch.arange(-(self.window_size[1] - 1), self.window_size[1], dtype=torch.float32) + relative_coords_table = torch.stack(torch.meshgrid([ + relative_coords_h, + relative_coords_w])).permute(1, 2, 0).contiguous().unsqueeze(0) # 1, 2*Wh-1, 2*Ww-1, 2 + if pretrained_window_size[0] > 0: + relative_coords_table[:, :, :, 0] /= (pretrained_window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= (pretrained_window_size[1] - 1) + else: + relative_coords_table[:, :, :, 0] /= (self.window_size[0] - 1) + relative_coords_table[:, :, :, 1] /= (self.window_size[1] - 1) + relative_coords_table *= 8 # normalize to -8, 8 + relative_coords_table = torch.sign(relative_coords_table) * torch.log2( + torch.abs(relative_coords_table) + 1.0) / math.log2(8) + + self.register_buffer("relative_coords_table", relative_coords_table, persistent=False) + + # get pair-wise relative position index for each token inside the window + coords_h = torch.arange(self.window_size[0]) + coords_w = torch.arange(self.window_size[1]) + coords = torch.stack(torch.meshgrid([coords_h, coords_w])) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += self.window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + self.register_buffer("relative_position_index", relative_position_index, persistent=False) + + self.qkv = nn.Linear(dim, dim * 3, bias=False) + if qkv_bias: + self.q_bias = nn.Parameter(torch.zeros(dim)) + self.register_buffer('k_bias', torch.zeros(dim), persistent=False) + self.v_bias = nn.Parameter(torch.zeros(dim)) + else: + self.q_bias = None + self.k_bias = None + self.v_bias = None + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask: Optional[torch.Tensor] = None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + qkv_bias = None + if self.q_bias is not None: + qkv_bias = torch.cat((self.q_bias, self.k_bias, self.v_bias)) + qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias) + qkv = qkv.reshape(B_, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + + # cosine attention + attn = (F.normalize(q, dim=-1) @ F.normalize(k, dim=-1).transpose(-2, -1)) + logit_scale = torch.clamp(self.logit_scale, max=math.log(1. 
/ 0.01)).exp() + attn = attn * logit_scale + + relative_position_bias_table = self.cpb_mlp(self.relative_coords_table).view(-1, self.num_heads) + relative_position_bias = relative_position_bias_table[self.relative_position_index.view(-1)].view( + self.window_size[0] * self.window_size[1], self.window_size[0] * self.window_size[1], -1) # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() # nH, Wh*Ww, Wh*Ww + relative_position_bias = 16 * torch.sigmoid(relative_position_bias) + attn = attn + relative_position_bias.unsqueeze(0) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + else: + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class SwinTransformerBlock(nn.Module): + r""" Swin Transformer Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + pretrained_window_size (int): Window size in pretraining. + """ + + def __init__( + self, dim, input_resolution, num_heads, window_size=7, shift_size=0, + mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm, pretrained_window_size=0): + super().__init__() + self.dim = dim + self.input_resolution = to_2tuple(input_resolution) + self.num_heads = num_heads + ws, ss = self._calc_window_shift(window_size, shift_size) + self.window_size: Tuple[int, int] = ws + self.shift_size: Tuple[int, int] = ss + self.window_area = self.window_size[0] * self.window_size[1] + self.mlp_ratio = mlp_ratio + + self.attn = WindowAttention( + dim, window_size=to_2tuple(self.window_size), num_heads=num_heads, + qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop, + pretrained_window_size=to_2tuple(pretrained_window_size)) + self.norm1 = norm_layer(dim) + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + self.norm2 = norm_layer(dim) + self.drop_path2 = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + if any(self.shift_size): + # calculate attention mask for SW-MSA + H, W = self.input_resolution + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + cnt = 0 + for h in ( + slice(0, -self.window_size[0]), + slice(-self.window_size[0], -self.shift_size[0]), + slice(-self.shift_size[0], None)): + for w in ( + slice(0, -self.window_size[1]), + slice(-self.window_size[1], -self.shift_size[1]), + slice(-self.shift_size[1], None)): + img_mask[:, h, w, :] = cnt + cnt += 1 + mask_windows = window_partition(img_mask, self.window_size) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_area) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def _calc_window_shift(self, target_window_size, target_shift_size) -> Tuple[Tuple[int, int], Tuple[int, int]]: + target_window_size = to_2tuple(target_window_size) + target_shift_size = to_2tuple(target_shift_size) + window_size = [r if r <= w else w for r, w in zip(self.input_resolution, target_window_size)] + shift_size = [0 if r <= w else s for r, w, s in zip(self.input_resolution, window_size, target_shift_size)] + return tuple(window_size), tuple(shift_size) + + def _attn(self, x): + H, W = self.input_resolution + B, L, C = x.shape + _assert(L == H * W, "input feature has wrong size") + x = x.view(B, H, W, C) + + # cyclic shift + has_shift = any(self.shift_size) + if has_shift: + shifted_x = torch.roll(x, shifts=(-self.shift_size[0], -self.shift_size[1]), dims=(1, 2)) + else: + shifted_x = x + + # partition windows + x_windows = window_partition(shifted_x, self.window_size) # nW*B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_area, C) # nW*B, window_size*window_size, C + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=self.attn_mask) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size[0], self.window_size[1], C) + shifted_x = window_reverse(attn_windows, self.window_size, self.input_resolution) # B H' W' C + + # reverse cyclic shift + if has_shift: + x = torch.roll(shifted_x, shifts=self.shift_size, dims=(1, 2)) + else: + x = shifted_x + x = x.view(B, H * W, C) + return x + + def forward(self, x): + x = x + self.drop_path1(self.norm1(self._attn(x))) + x = x + self.drop_path2(self.norm2(self.mlp(x))) + return x + + +class PatchMerging(nn.Module): + r""" Patch Merging Layer. + + Args: + input_resolution (tuple[int]): Resolution of input feature. + dim (int): Number of input channels. + norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm + """ + + def __init__(self, input_resolution, dim, norm_layer=nn.LayerNorm): + super().__init__() + self.input_resolution = input_resolution + self.dim = dim + self.reduction = nn.Linear(4 * dim, 2 * dim, bias=False) + self.norm = norm_layer(2 * dim) + + def forward(self, x): + """ + x: B, H*W, C + """ + H, W = self.input_resolution + B, L, C = x.shape + _assert(L == H * W, "input feature has wrong size") + _assert(H % 2 == 0, f"x size ({H}*{W}) are not even.") + _assert(W % 2 == 0, f"x size ({H}*{W}) are not even.") + + x = x.view(B, H, W, C) + + x0 = x[:, 0::2, 0::2, :] # B H/2 W/2 C + x1 = x[:, 1::2, 0::2, :] # B H/2 W/2 C + x2 = x[:, 0::2, 1::2, :] # B H/2 W/2 C + x3 = x[:, 1::2, 1::2, :] # B H/2 W/2 C + x = torch.cat([x0, x1, x2, x3], -1) # B H/2 W/2 4*C + x = x.view(B, -1, 4 * C) # B H/2*W/2 4*C + + x = self.reduction(x) + x = self.norm(x) + + return x + + +class BasicLayer(nn.Module): + """ A basic Swin Transformer layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + pretrained_window_size (int): Local window size in pre-training. + """ + + def __init__( + self, dim, input_resolution, depth, num_heads, window_size, + mlp_ratio=4., qkv_bias=True, drop=0., attn_drop=0., drop_path=0., + norm_layer=nn.LayerNorm, downsample=None, pretrained_window_size=0): + + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.grad_checkpointing = False + + # build blocks + self.blocks = nn.ModuleList([ + SwinTransformerBlock( + dim=dim, input_resolution=input_resolution, + num_heads=num_heads, window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop, attn_drop=attn_drop, + drop_path=drop_path[i] if isinstance(drop_path, list) else drop_path, + norm_layer=norm_layer, + pretrained_window_size=pretrained_window_size) + for i in range(depth)]) + + # patch merging layer + if downsample is not None: + self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + else: + self.downsample = nn.Identity() + + def forward(self, x): + for blk in self.blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + x = self.downsample(x) + return x + + def _init_respostnorm(self): + for blk in self.blocks: + nn.init.constant_(blk.norm1.bias, 0) + nn.init.constant_(blk.norm1.weight, 0) + nn.init.constant_(blk.norm2.bias, 0) + nn.init.constant_(blk.norm2.weight, 0) + + +class SwinTransformerV2(nn.Module): + r""" Swin Transformer V2 + A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution` + - https://arxiv.org/abs/2111.09883 + Args: + img_size (int | tuple(int)): Input image size. Default 224 + patch_size (int | tuple(int)): Patch size. 
Default: 4 + in_chans (int): Number of input image channels. Default: 3 + num_classes (int): Number of classes for classification head. Default: 1000 + embed_dim (int): Patch embedding dimension. Default: 96 + depths (tuple(int)): Depth of each Swin Transformer layer. + num_heads (tuple(int)): Number of attention heads in different layers. + window_size (int): Window size. Default: 7 + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. Default: 4 + qkv_bias (bool): If True, add a learnable bias to query, key, value. Default: True + drop_rate (float): Dropout rate. Default: 0 + attn_drop_rate (float): Attention dropout rate. Default: 0 + drop_path_rate (float): Stochastic depth rate. Default: 0.1 + norm_layer (nn.Module): Normalization layer. Default: nn.LayerNorm. + ape (bool): If True, add absolute position embedding to the patch embedding. Default: False + patch_norm (bool): If True, add normalization after patch embedding. Default: True + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False + pretrained_window_sizes (tuple(int)): Pretrained window sizes of each layer. + """ + + def __init__( + self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, global_pool='avg', + embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), + window_size=7, mlp_ratio=4., qkv_bias=True, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0.1, + norm_layer=nn.LayerNorm, ape=False, patch_norm=True, + pretrained_window_sizes=(0, 0, 0, 0), **kwargs): + super().__init__() + + self.num_classes = num_classes + assert global_pool in ('', 'avg') + self.global_pool = global_pool + self.num_layers = len(depths) + self.embed_dim = embed_dim + self.patch_norm = patch_norm + self.num_features = int(embed_dim * 2 ** (self.num_layers - 1)) + + # split image into non-overlapping patches + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, + norm_layer=norm_layer if self.patch_norm else None) + num_patches = self.patch_embed.num_patches + + # absolute position embedding + if ape: + self.absolute_pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim)) + trunc_normal_(self.absolute_pos_embed, std=.02) + else: + self.absolute_pos_embed = None + + self.pos_drop = nn.Dropout(p=drop_rate) + + # stochastic depth + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=int(embed_dim * 2 ** i_layer), + input_resolution=( + self.patch_embed.grid_size[0] // (2 ** i_layer), + self.patch_embed.grid_size[1] // (2 ** i_layer)), + depth=depths[i_layer], + num_heads=num_heads[i_layer], + window_size=window_size, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])], + norm_layer=norm_layer, + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + pretrained_window_size=pretrained_window_sizes[i_layer] + ) + self.layers.append(layer) + + self.norm = norm_layer(self.num_features) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + self.apply(self._init_weights) + for bly in self.layers: + bly._init_respostnorm() + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + 
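For reference, the stage geometry implied by the constructor above can be worked out by hand. The following is a minimal, standalone sketch (plain Python/PyTorch, editorial and not part of the diffed file), assuming the module's default arguments `img_size=224, patch_size=4, embed_dim=96, depths=(2, 2, 6, 2), drop_path_rate=0.1`; it shows how the per-stage channel width, feature-map resolution, and stochastic-depth schedule are derived before being handed to each `BasicLayer`.

```python
import torch

# Defaults mirroring SwinTransformerV2(img_size=224, patch_size=4, embed_dim=96,
# depths=(2, 2, 6, 2), drop_path_rate=0.1) from the constructor above.
img_size, patch_size, embed_dim = 224, 4, 96
depths, drop_path_rate = (2, 2, 6, 2), 0.1

grid = img_size // patch_size                      # 56x56 patch grid after PatchEmbed
dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))]

for i, depth in enumerate(depths):
    dim = embed_dim * 2 ** i                       # channels double at each PatchMerging
    res = grid // 2 ** i                           # spatial resolution halves per stage
    stage_dpr = dpr[sum(depths[:i]):sum(depths[:i + 1])]  # per-block drop-path rates
    print(f"stage {i}: dim={dim}, resolution={res}x{res}, drop_path={stage_dpr}")

num_features = embed_dim * 2 ** (len(depths) - 1)  # 768: width seen by self.norm / self.head
print("num_features:", num_features)
```

This is also why `drop_path=dpr[sum(depths[:i_layer]):sum(depths[:i_layer + 1])]` hands each stage its own slice of the linearly increasing schedule, and why the classifier head is built on `num_features = embed_dim * 2 ** (num_layers - 1)`.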
@torch.jit.ignore + def no_weight_decay(self): + nod = {'absolute_pos_embed'} + for n, m in self.named_modules(): + if any([kw in n for kw in ("cpb_mlp", "logit_scale", 'relative_position_bias_table')]): + nod.add(n) + return nod + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^absolute_pos_embed|patch_embed', # stem and embed + blocks=r'^layers\.(\d+)' if coarse else [ + (r'^layers\.(\d+).downsample', (0,)), + (r'^layers\.(\d+)\.\w+\.(\d+)', None), + (r'^norm', (99999,)), + ] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for l in self.layers: + l.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'avg') + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + if self.absolute_pos_embed is not None: + x = x + self.absolute_pos_embed + x = self.pos_drop(x) + + for layer in self.layers: + x = layer(x) + + x = self.norm(x) # B L C + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool == 'avg': + x = x.mean(dim=1) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def checkpoint_filter_fn(state_dict, model): + out_dict = {} + if 'model' in state_dict: + # For deit models + state_dict = state_dict['model'] + for k, v in state_dict.items(): + if any([n in k for n in ('relative_position_index', 'relative_coords_table')]): + continue # skip buffers that should not be persistent + out_dict[k] = v + return out_dict + + +def _create_swin_transformer_v2(variant, pretrained=False, **kwargs): + model = build_model_with_cfg( + SwinTransformerV2, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) + return model + + +@register_model +def swinv2_tiny_window16_256(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=16, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs) + return _create_swin_transformer_v2('swinv2_tiny_window16_256', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_tiny_window8_256(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=8, embed_dim=96, depths=(2, 2, 6, 2), num_heads=(3, 6, 12, 24), **kwargs) + return _create_swin_transformer_v2('swinv2_tiny_window8_256', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_small_window16_256(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=16, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs) + return _create_swin_transformer_v2('swinv2_small_window16_256', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_small_window8_256(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=8, embed_dim=96, depths=(2, 2, 18, 2), num_heads=(3, 6, 12, 24), **kwargs) + return _create_swin_transformer_v2('swinv2_small_window8_256', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_base_window16_256(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=16, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) + return 
_create_swin_transformer_v2('swinv2_base_window16_256', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_base_window8_256(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=8, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) + return _create_swin_transformer_v2('swinv2_base_window8_256', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_base_window12_192_22k(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=12, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), **kwargs) + return _create_swin_transformer_v2('swinv2_base_window12_192_22k', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_base_window12to16_192to256_22kft1k(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=16, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), + pretrained_window_sizes=(12, 12, 12, 6), **kwargs) + return _create_swin_transformer_v2( + 'swinv2_base_window12to16_192to256_22kft1k', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_base_window12to24_192to384_22kft1k(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=24, embed_dim=128, depths=(2, 2, 18, 2), num_heads=(4, 8, 16, 32), + pretrained_window_sizes=(12, 12, 12, 6), **kwargs) + return _create_swin_transformer_v2( + 'swinv2_base_window12to24_192to384_22kft1k', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_large_window12_192_22k(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=12, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), **kwargs) + return _create_swin_transformer_v2('swinv2_large_window12_192_22k', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_large_window12to16_192to256_22kft1k(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=16, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), + pretrained_window_sizes=(12, 12, 12, 6), **kwargs) + return _create_swin_transformer_v2( + 'swinv2_large_window12to16_192to256_22kft1k', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_large_window12to24_192to384_22kft1k(pretrained=False, **kwargs): + """ + """ + model_kwargs = dict( + window_size=24, embed_dim=192, depths=(2, 2, 18, 2), num_heads=(6, 12, 24, 48), + pretrained_window_sizes=(12, 12, 12, 6), **kwargs) + return _create_swin_transformer_v2( + 'swinv2_large_window12to24_192to384_22kft1k', pretrained=pretrained, **model_kwargs) diff --git a/custom_timm/models/swin_transformer_v2_cr.py b/custom_timm/models/swin_transformer_v2_cr.py new file mode 100644 index 0000000000000000000000000000000000000000..d3ac4ac572d0b55bc1abf278f34fa9e3bd7bcb7a --- /dev/null +++ b/custom_timm/models/swin_transformer_v2_cr.py @@ -0,0 +1,1029 @@ +""" Swin Transformer V2 + +A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution` + - https://arxiv.org/pdf/2111.09883 + +Code adapted from https://github.com/ChristophReich1996/Swin-Transformer-V2, original copyright/license info below + +This implementation is experimental and subject to change in manners that will break weight compat: +* Size of the pos embed MLP are not spelled out in paper in terms of dim, fixed for all models? vary with num_heads? 
+ * currently dim is fixed, I feel it may make sense to scale with num_heads (dim per head) +* The specifics of the memory saving 'sequential attention' are not detailed, Christoph Reich has an impl at + GitHub link above. It needs further investigation as throughput vs mem tradeoff doesn't appear beneficial. +* num_heads per stage is not detailed for Huge and Giant model variants +* 'Giant' is 3B params in paper but ~2.6B here despite matching paper dim + block counts +* experiments are ongoing wrt to 'main branch' norm layer use and weight init scheme + +Noteworthy additions over official Swin v1: +* MLP relative position embedding is looking promising and adapts to different image/window sizes +* This impl has been designed to allow easy change of image size with matching window size changes +* Non-square image size and window size are supported + +Modifications and additions for timm hacked together by / Copyright 2022, Ross Wightman +""" +# -------------------------------------------------------- +# Swin Transformer V2 reimplementation +# Copyright (c) 2021 Christoph Reich +# Licensed under The MIT License [see LICENSE for details] +# Written by Christoph Reich +# -------------------------------------------------------- +import logging +import math +from copy import deepcopy +from typing import Tuple, Optional, List, Union, Any, Type + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .fx_features import register_notrace_function +from .helpers import build_model_with_cfg, named_apply +from .layers import DropPath, Mlp, to_2tuple, _assert +from .registry import register_model + + +_logger = logging.getLogger(__name__) + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, + 'input_size': (3, 224, 224), + 'pool_size': (7, 7), + 'crop_pct': 0.9, + 'interpolation': 'bicubic', + 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, + 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj', + 'classifier': 'head', + **kwargs, + } + + +default_cfgs = { + 'swinv2_cr_tiny_384': _cfg( + url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)), + 'swinv2_cr_tiny_224': _cfg( + url="", input_size=(3, 224, 224), crop_pct=0.9), + 'swinv2_cr_tiny_ns_224': _cfg( + url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-swinv2/swin_v2_cr_tiny_ns_224-ba8166c6.pth", + input_size=(3, 224, 224), crop_pct=0.9), + 'swinv2_cr_small_384': _cfg( + url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)), + 'swinv2_cr_small_224': _cfg( + url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-swinv2/swin_v2_cr_small_224-0813c165.pth", + input_size=(3, 224, 224), crop_pct=0.9), + 'swinv2_cr_small_ns_224': _cfg( + url="https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights-swinv2/swin_v2_cr_small_ns_224_iv-2ce90f8e.pth", + input_size=(3, 224, 224), crop_pct=0.9), + 'swinv2_cr_base_384': _cfg( + url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)), + 'swinv2_cr_base_224': _cfg( + url="", input_size=(3, 224, 224), crop_pct=0.9), + 'swinv2_cr_base_ns_224': _cfg( + url="", input_size=(3, 224, 224), crop_pct=0.9), + 'swinv2_cr_large_384': _cfg( + url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)), + 'swinv2_cr_large_224': _cfg( + url="", input_size=(3, 224, 224), crop_pct=0.9), + 
'swinv2_cr_huge_384': _cfg( + url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)), + 'swinv2_cr_huge_224': _cfg( + url="", input_size=(3, 224, 224), crop_pct=0.9), + 'swinv2_cr_giant_384': _cfg( + url="", input_size=(3, 384, 384), crop_pct=1.0, pool_size=(12, 12)), + 'swinv2_cr_giant_224': _cfg( + url="", input_size=(3, 224, 224), crop_pct=0.9), +} + + +def bchw_to_bhwc(x: torch.Tensor) -> torch.Tensor: + """Permutes a tensor from the shape (B, C, H, W) to (B, H, W, C). """ + return x.permute(0, 2, 3, 1) + + +def bhwc_to_bchw(x: torch.Tensor) -> torch.Tensor: + """Permutes a tensor from the shape (B, H, W, C) to (B, C, H, W). """ + return x.permute(0, 3, 1, 2) + + +def window_partition(x, window_size: Tuple[int, int]): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size[0], window_size[0], W // window_size[1], window_size[1], C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size[0], window_size[1], C) + return windows + + +@register_notrace_function # reason: int argument is a Proxy +def window_reverse(windows, window_size: Tuple[int, int], img_size: Tuple[int, int]): + """ + Args: + windows: (num_windows * B, window_size[0], window_size[1], C) + window_size (Tuple[int, int]): Window size + img_size (Tuple[int, int]): Image size + + Returns: + x: (B, H, W, C) + """ + H, W = img_size + B = int(windows.shape[0] / (H * W / window_size[0] / window_size[1])) + x = windows.view(B, H // window_size[0], W // window_size[1], window_size[0], window_size[1], -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class WindowMultiHeadAttention(nn.Module): + r"""This class implements window-based Multi-Head-Attention with log-spaced continuous position bias. + + Args: + dim (int): Number of input features + window_size (int): Window size + num_heads (int): Number of attention heads + drop_attn (float): Dropout rate of attention map + drop_proj (float): Dropout rate after projection + meta_hidden_dim (int): Number of hidden features in the two layer MLP meta network + sequential_attn (bool): If true sequential self-attention is performed + """ + + def __init__( + self, + dim: int, + num_heads: int, + window_size: Tuple[int, int], + drop_attn: float = 0.0, + drop_proj: float = 0.0, + meta_hidden_dim: int = 384, # FIXME what's the optimal value? + sequential_attn: bool = False, + ) -> None: + super(WindowMultiHeadAttention, self).__init__() + assert dim % num_heads == 0, \ + "The number of input features (in_features) are not divisible by the number of heads (num_heads)." + self.in_features: int = dim + self.window_size: Tuple[int, int] = window_size + self.num_heads: int = num_heads + self.sequential_attn: bool = sequential_attn + + self.qkv = nn.Linear(in_features=dim, out_features=dim * 3, bias=True) + self.attn_drop = nn.Dropout(drop_attn) + self.proj = nn.Linear(in_features=dim, out_features=dim, bias=True) + self.proj_drop = nn.Dropout(drop_proj) + # meta network for positional encodings + self.meta_mlp = Mlp( + 2, # x, y + hidden_features=meta_hidden_dim, + out_features=num_heads, + act_layer=nn.ReLU, + drop=(0.125, 0.) # FIXME should there be stochasticity, appears to 'overfit' without? 
+ ) + # NOTE old checkpoints used inverse of logit_scale ('tau') following the paper, see conversion fn + self.logit_scale = nn.Parameter(torch.log(10 * torch.ones(num_heads))) + self._make_pair_wise_relative_positions() + + def _make_pair_wise_relative_positions(self) -> None: + """Method initializes the pair-wise relative positions to compute the positional biases.""" + device = self.logit_scale.device + coordinates = torch.stack(torch.meshgrid([ + torch.arange(self.window_size[0], device=device), + torch.arange(self.window_size[1], device=device)]), dim=0).flatten(1) + relative_coordinates = coordinates[:, :, None] - coordinates[:, None, :] + relative_coordinates = relative_coordinates.permute(1, 2, 0).reshape(-1, 2).float() + relative_coordinates_log = torch.sign(relative_coordinates) * torch.log( + 1.0 + relative_coordinates.abs()) + self.register_buffer("relative_coordinates_log", relative_coordinates_log, persistent=False) + + def update_input_size(self, new_window_size: int, **kwargs: Any) -> None: + """Method updates the window size and so the pair-wise relative positions + + Args: + new_window_size (int): New window size + kwargs (Any): Unused + """ + # Set new window size and new pair-wise relative positions + self.window_size: int = new_window_size + self._make_pair_wise_relative_positions() + + def _relative_positional_encodings(self) -> torch.Tensor: + """Method computes the relative positional encodings + + Returns: + relative_position_bias (torch.Tensor): Relative positional encodings + (1, number of heads, window size ** 2, window size ** 2) + """ + window_area = self.window_size[0] * self.window_size[1] + relative_position_bias = self.meta_mlp(self.relative_coordinates_log) + relative_position_bias = relative_position_bias.transpose(1, 0).reshape( + self.num_heads, window_area, window_area + ) + relative_position_bias = relative_position_bias.unsqueeze(0) + return relative_position_bias + + def _forward_sequential( + self, + x: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """ + """ + # FIXME TODO figure out 'sequential' attention mentioned in paper (should reduce GPU memory) + assert False, "not implemented" + + def _forward_batch( + self, + x: torch.Tensor, + mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + """This function performs standard (non-sequential) scaled cosine self-attention. + """ + Bw, L, C = x.shape + + qkv = self.qkv(x).view(Bw, L, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + query, key, value = qkv.unbind(0) + + # compute attention map with scaled cosine attention + attn = (F.normalize(query, dim=-1) @ F.normalize(key, dim=-1).transpose(-2, -1)) + logit_scale = torch.clamp(self.logit_scale.reshape(1, self.num_heads, 1, 1), max=math.log(1. / 0.01)).exp() + attn = attn * logit_scale + attn = attn + self._relative_positional_encodings() + + if mask is not None: + # Apply mask if utilized + num_win: int = mask.shape[0] + attn = attn.view(Bw // num_win, num_win, self.num_heads, L, L) + attn = attn + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, L, L) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ value).transpose(1, 2).reshape(Bw, L, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + def forward(self, x: torch.Tensor, mask: Optional[torch.Tensor] = None) -> torch.Tensor: + """ Forward pass. 
+ Args: + x (torch.Tensor): Input tensor of the shape (B * windows, N, C) + mask (Optional[torch.Tensor]): Attention mask for the shift case + + Returns: + Output tensor of the shape [B * windows, N, C] + """ + if self.sequential_attn: + return self._forward_sequential(x, mask) + else: + return self._forward_batch(x, mask) + + +class SwinTransformerBlock(nn.Module): + r"""This class implements the Swin transformer block. + + Args: + dim (int): Number of input channels + num_heads (int): Number of attention heads to be utilized + feat_size (Tuple[int, int]): Input resolution + window_size (Tuple[int, int]): Window size to be utilized + shift_size (int): Shifting size to be used + mlp_ratio (int): Ratio of the hidden dimension in the FFN to the input channels + drop (float): Dropout in input mapping + drop_attn (float): Dropout rate of attention map + drop_path (float): Dropout in main path + extra_norm (bool): Insert extra norm on 'main' branch if True + sequential_attn (bool): If true sequential self-attention is performed + norm_layer (Type[nn.Module]): Type of normalization layer to be utilized + """ + + def __init__( + self, + dim: int, + num_heads: int, + feat_size: Tuple[int, int], + window_size: Tuple[int, int], + shift_size: Tuple[int, int] = (0, 0), + mlp_ratio: float = 4.0, + init_values: Optional[float] = 0, + drop: float = 0.0, + drop_attn: float = 0.0, + drop_path: float = 0.0, + extra_norm: bool = False, + sequential_attn: bool = False, + norm_layer: Type[nn.Module] = nn.LayerNorm, + ) -> None: + super(SwinTransformerBlock, self).__init__() + self.dim: int = dim + self.feat_size: Tuple[int, int] = feat_size + self.target_shift_size: Tuple[int, int] = to_2tuple(shift_size) + self.window_size, self.shift_size = self._calc_window_shift(to_2tuple(window_size)) + self.window_area = self.window_size[0] * self.window_size[1] + self.init_values: Optional[float] = init_values + + # attn branch + self.attn = WindowMultiHeadAttention( + dim=dim, + num_heads=num_heads, + window_size=self.window_size, + drop_attn=drop_attn, + drop_proj=drop, + sequential_attn=sequential_attn, + ) + self.norm1 = norm_layer(dim) + self.drop_path1 = DropPath(drop_prob=drop_path) if drop_path > 0.0 else nn.Identity() + + # mlp branch + self.mlp = Mlp( + in_features=dim, + hidden_features=int(dim * mlp_ratio), + drop=drop, + out_features=dim, + ) + self.norm2 = norm_layer(dim) + self.drop_path2 = DropPath(drop_prob=drop_path) if drop_path > 0.0 else nn.Identity() + + # Extra main branch norm layer mentioned for Huge/Giant models in V2 paper. + # Also being used as final network norm and optional stage ending norm while still in a C-last format. 
+ self.norm3 = norm_layer(dim) if extra_norm else nn.Identity() + + self._make_attention_mask() + self.init_weights() + + def _calc_window_shift(self, target_window_size): + window_size = [f if f <= w else w for f, w in zip(self.feat_size, target_window_size)] + shift_size = [0 if f <= w else s for f, w, s in zip(self.feat_size, window_size, self.target_shift_size)] + return tuple(window_size), tuple(shift_size) + + def _make_attention_mask(self) -> None: + """Method generates the attention mask used in shift case.""" + # Make masks for shift case + if any(self.shift_size): + # calculate attention mask for SW-MSA + H, W = self.feat_size + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + cnt = 0 + for h in ( + slice(0, -self.window_size[0]), + slice(-self.window_size[0], -self.shift_size[0]), + slice(-self.shift_size[0], None)): + for w in ( + slice(0, -self.window_size[1]), + slice(-self.window_size[1], -self.shift_size[1]), + slice(-self.shift_size[1], None)): + img_mask[:, h, w, :] = cnt + cnt += 1 + mask_windows = window_partition(img_mask, self.window_size) # num_windows, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_area) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill(attn_mask == 0, float(0.0)) + else: + attn_mask = None + self.register_buffer("attn_mask", attn_mask, persistent=False) + + def init_weights(self): + # extra, module specific weight init + if self.init_values is not None: + nn.init.constant_(self.norm1.weight, self.init_values) + nn.init.constant_(self.norm2.weight, self.init_values) + + def update_input_size(self, new_window_size: Tuple[int, int], new_feat_size: Tuple[int, int]) -> None: + """Method updates the image resolution to be processed and window size and so the pair-wise relative positions. 
+ + Args: + new_window_size (int): New window size + new_feat_size (Tuple[int, int]): New input resolution + """ + # Update input resolution + self.feat_size: Tuple[int, int] = new_feat_size + self.window_size, self.shift_size = self._calc_window_shift(to_2tuple(new_window_size)) + self.window_area = self.window_size[0] * self.window_size[1] + self.attn.update_input_size(new_window_size=self.window_size) + self._make_attention_mask() + + def _shifted_window_attn(self, x): + H, W = self.feat_size + B, L, C = x.shape + x = x.view(B, H, W, C) + + # cyclic shift + sh, sw = self.shift_size + do_shift: bool = any(self.shift_size) + if do_shift: + # FIXME PyTorch XLA needs cat impl, roll not lowered + # x = torch.cat([x[:, sh:], x[:, :sh]], dim=1) + # x = torch.cat([x[:, :, sw:], x[:, :, :sw]], dim=2) + x = torch.roll(x, shifts=(-sh, -sw), dims=(1, 2)) + + # partition windows + x_windows = window_partition(x, self.window_size) # num_windows * B, window_size, window_size, C + x_windows = x_windows.view(-1, self.window_size[0] * self.window_size[1], C) + + # W-MSA/SW-MSA + attn_windows = self.attn(x_windows, mask=self.attn_mask) # num_windows * B, window_size * window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size[0], self.window_size[1], C) + x = window_reverse(attn_windows, self.window_size, self.feat_size) # B H' W' C + + # reverse cyclic shift + if do_shift: + # FIXME PyTorch XLA needs cat impl, roll not lowered + # x = torch.cat([x[:, -sh:], x[:, :-sh]], dim=1) + # x = torch.cat([x[:, :, -sw:], x[:, :, :-sw]], dim=2) + x = torch.roll(x, shifts=(sh, sw), dims=(1, 2)) + + x = x.view(B, L, C) + return x + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward pass. + + Args: + x (torch.Tensor): Input tensor of the shape [B, C, H, W] + + Returns: + output (torch.Tensor): Output tensor of the shape [B, C, H, W] + """ + # post-norm branches (op -> norm -> drop) + x = x + self.drop_path1(self.norm1(self._shifted_window_attn(x))) + x = x + self.drop_path2(self.norm2(self.mlp(x))) + x = self.norm3(x) # main-branch norm enabled for some blocks / stages (every 6 for Huge/Giant) + return x + + +class PatchMerging(nn.Module): + """ This class implements the patch merging as a strided convolution with a normalization before. + Args: + dim (int): Number of input channels + norm_layer (Type[nn.Module]): Type of normalization layer to be utilized. + """ + + def __init__(self, dim: int, norm_layer: Type[nn.Module] = nn.LayerNorm) -> None: + super(PatchMerging, self).__init__() + self.norm = norm_layer(4 * dim) + self.reduction = nn.Linear(in_features=4 * dim, out_features=2 * dim, bias=False) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ Forward pass. 
+ Args: + x (torch.Tensor): Input tensor of the shape [B, C, H, W] + Returns: + output (torch.Tensor): Output tensor of the shape [B, 2 * C, H // 2, W // 2] + """ + B, C, H, W = x.shape + # unfold + BCHW -> BHWC together + # ordering, 5, 3, 1 instead of 3, 5, 1 maintains compat with original swin v1 merge + x = x.reshape(B, C, H // 2, 2, W // 2, 2).permute(0, 2, 4, 5, 3, 1).flatten(3) + x = self.norm(x) + x = bhwc_to_bchw(self.reduction(x)) + return x + + +class PatchEmbed(nn.Module): + """ 2D Image to Patch Embedding """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, norm_layer=None): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + B, C, H, W = x.shape + _assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).") + _assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).") + x = self.proj(x) + x = self.norm(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2) + return x + + +class SwinTransformerStage(nn.Module): + r"""This class implements a stage of the Swin transformer including multiple layers. + + Args: + embed_dim (int): Number of input channels + depth (int): Depth of the stage (number of layers) + downscale (bool): If true input is downsampled (see Fig. 3 or V1 paper) + feat_size (Tuple[int, int]): input feature map size (H, W) + num_heads (int): Number of attention heads to be utilized + window_size (int): Window size to be utilized + mlp_ratio (int): Ratio of the hidden dimension in the FFN to the input channels + drop (float): Dropout in input mapping + drop_attn (float): Dropout rate of attention map + drop_path (float): Dropout in main path + norm_layer (Type[nn.Module]): Type of normalization layer to be utilized. 
Default: nn.LayerNorm + extra_norm_period (int): Insert extra norm layer on main branch every N (period) blocks + extra_norm_stage (bool): End each stage with an extra norm layer in main branch + sequential_attn (bool): If true sequential self-attention is performed + """ + + def __init__( + self, + embed_dim: int, + depth: int, + downscale: bool, + num_heads: int, + feat_size: Tuple[int, int], + window_size: Tuple[int, int], + mlp_ratio: float = 4.0, + init_values: Optional[float] = 0.0, + drop: float = 0.0, + drop_attn: float = 0.0, + drop_path: Union[List[float], float] = 0.0, + norm_layer: Type[nn.Module] = nn.LayerNorm, + extra_norm_period: int = 0, + extra_norm_stage: bool = False, + sequential_attn: bool = False, + ) -> None: + super(SwinTransformerStage, self).__init__() + self.downscale: bool = downscale + self.grad_checkpointing: bool = False + self.feat_size: Tuple[int, int] = (feat_size[0] // 2, feat_size[1] // 2) if downscale else feat_size + + self.downsample = PatchMerging(embed_dim, norm_layer=norm_layer) if downscale else nn.Identity() + + def _extra_norm(index): + i = index + 1 + if extra_norm_period and i % extra_norm_period == 0: + return True + return i == depth if extra_norm_stage else False + + embed_dim = embed_dim * 2 if downscale else embed_dim + self.blocks = nn.Sequential(*[ + SwinTransformerBlock( + dim=embed_dim, + num_heads=num_heads, + feat_size=self.feat_size, + window_size=window_size, + shift_size=tuple([0 if ((index % 2) == 0) else w // 2 for w in window_size]), + mlp_ratio=mlp_ratio, + init_values=init_values, + drop=drop, + drop_attn=drop_attn, + drop_path=drop_path[index] if isinstance(drop_path, list) else drop_path, + extra_norm=_extra_norm(index), + sequential_attn=sequential_attn, + norm_layer=norm_layer, + ) + for index in range(depth)] + ) + + def update_input_size(self, new_window_size: int, new_feat_size: Tuple[int, int]) -> None: + """Method updates the resolution to utilize and the window size and so the pair-wise relative positions. + + Args: + new_window_size (int): New window size + new_feat_size (Tuple[int, int]): New input resolution + """ + self.feat_size: Tuple[int, int] = ( + (new_feat_size[0] // 2, new_feat_size[1] // 2) if self.downscale else new_feat_size + ) + for block in self.blocks: + block.update_input_size(new_window_size=new_window_size, new_feat_size=self.feat_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """Forward pass. + Args: + x (torch.Tensor): Input tensor of the shape [B, C, H, W] or [B, L, C] + Returns: + output (torch.Tensor): Output tensor of the shape [B, 2 * C, H // 2, W // 2] + """ + x = self.downsample(x) + B, C, H, W = x.shape + L = H * W + + x = bchw_to_bhwc(x).reshape(B, L, C) + for block in self.blocks: + # Perform checkpointing if utilized + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint.checkpoint(block, x) + else: + x = block(x) + x = bhwc_to_bchw(x.reshape(B, H, W, -1)) + return x + + +class SwinTransformerV2Cr(nn.Module): + r""" Swin Transformer V2 + A PyTorch impl of : `Swin Transformer V2: Scaling Up Capacity and Resolution` - + https://arxiv.org/pdf/2111.09883 + + Args: + img_size (Tuple[int, int]): Input resolution. + window_size (Optional[int]): Window size. If None, img_size // window_div. Default: None + img_window_ratio (int): Window size to image size ratio. Default: 32 + patch_size (int | tuple(int)): Patch size. Default: 4 + in_chans (int): Number of input channels. + depths (int): Depth of the stage (number of layers). 
+ num_heads (int): Number of attention heads to be utilized. + embed_dim (int): Patch embedding dimension. Default: 96 + num_classes (int): Number of output classes. Default: 1000 + mlp_ratio (int): Ratio of the hidden dimension in the FFN to the input channels. Default: 4 + drop_rate (float): Dropout rate. Default: 0.0 + attn_drop_rate (float): Dropout rate of attention map. Default: 0.0 + drop_path_rate (float): Stochastic depth rate. Default: 0.0 + norm_layer (Type[nn.Module]): Type of normalization layer to be utilized. Default: nn.LayerNorm + extra_norm_period (int): Insert extra norm layer on main branch every N (period) blocks in stage + extra_norm_stage (bool): End each stage with an extra norm layer in main branch + sequential_attn (bool): If true sequential self-attention is performed. Default: False + """ + + def __init__( + self, + img_size: Tuple[int, int] = (224, 224), + patch_size: int = 4, + window_size: Optional[int] = None, + img_window_ratio: int = 32, + in_chans: int = 3, + num_classes: int = 1000, + embed_dim: int = 96, + depths: Tuple[int, ...] = (2, 2, 6, 2), + num_heads: Tuple[int, ...] = (3, 6, 12, 24), + mlp_ratio: float = 4.0, + init_values: Optional[float] = 0., + drop_rate: float = 0.0, + attn_drop_rate: float = 0.0, + drop_path_rate: float = 0.0, + norm_layer: Type[nn.Module] = nn.LayerNorm, + extra_norm_period: int = 0, + extra_norm_stage: bool = False, + sequential_attn: bool = False, + global_pool: str = 'avg', + weight_init='skip', + **kwargs: Any + ) -> None: + super(SwinTransformerV2Cr, self).__init__() + img_size = to_2tuple(img_size) + window_size = tuple([ + s // img_window_ratio for s in img_size]) if window_size is None else to_2tuple(window_size) + + self.num_classes: int = num_classes + self.patch_size: int = patch_size + self.img_size: Tuple[int, int] = img_size + self.window_size: int = window_size + self.num_features: int = int(embed_dim * 2 ** (len(depths) - 1)) + + self.patch_embed = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, + embed_dim=embed_dim, norm_layer=norm_layer) + patch_grid_size: Tuple[int, int] = self.patch_embed.grid_size + + drop_path_rate = torch.linspace(0.0, drop_path_rate, sum(depths)).tolist() + stages = [] + for index, (depth, num_heads) in enumerate(zip(depths, num_heads)): + stage_scale = 2 ** max(index - 1, 0) + stages.append( + SwinTransformerStage( + embed_dim=embed_dim * stage_scale, + depth=depth, + downscale=index != 0, + feat_size=(patch_grid_size[0] // stage_scale, patch_grid_size[1] // stage_scale), + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + init_values=init_values, + drop=drop_rate, + drop_attn=attn_drop_rate, + drop_path=drop_path_rate[sum(depths[:index]):sum(depths[:index + 1])], + extra_norm_period=extra_norm_period, + extra_norm_stage=extra_norm_stage or (index + 1) == len(depths), # last stage ends w/ norm + sequential_attn=sequential_attn, + norm_layer=norm_layer, + ) + ) + self.stages = nn.Sequential(*stages) + + self.global_pool: str = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes else nn.Identity() + + # current weight init skips custom init and uses pytorch layer defaults, seems to work well + # FIXME more experiments needed + if weight_init != 'skip': + named_apply(init_weights, self) + + def update_input_size( + self, + new_img_size: Optional[Tuple[int, int]] = None, + new_window_size: Optional[int] = None, + img_window_ratio: int = 32, + ) -> None: + """Method updates the image resolution to be 
processed and window size and so the pair-wise relative positions. + + Args: + new_window_size (Optional[int]): New window size, if None based on new_img_size // window_div + new_img_size (Optional[Tuple[int, int]]): New input resolution, if None current resolution is used + img_window_ratio (int): divisor for calculating window size from image size + """ + # Check parameters + if new_img_size is None: + new_img_size = self.img_size + else: + new_img_size = to_2tuple(new_img_size) + if new_window_size is None: + new_window_size = tuple([s // img_window_ratio for s in new_img_size]) + # Compute new patch resolution & update resolution of each stage + new_patch_grid_size = (new_img_size[0] // self.patch_size, new_img_size[1] // self.patch_size) + for index, stage in enumerate(self.stages): + stage_scale = 2 ** max(index - 1, 0) + stage.update_input_size( + new_window_size=new_window_size, + new_img_size=(new_patch_grid_size[0] // stage_scale, new_patch_grid_size[1] // stage_scale), + ) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^patch_embed', # stem and embed + blocks=r'^stages\.(\d+)' if coarse else [ + (r'^stages\.(\d+).downsample', (0,)), + (r'^stages\.(\d+)\.\w+\.(\d+)', None), + ] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + @torch.jit.ignore() + def get_classifier(self) -> nn.Module: + """Method returns the classification head of the model. + Returns: + head (nn.Module): Current classification head + """ + return self.head + + def reset_classifier(self, num_classes: int, global_pool: Optional[str] = None) -> None: + """Method results the classification head + + Args: + num_classes (int): Number of classes to be predicted + global_pool (str): Unused + """ + self.num_classes: int = num_classes + if global_pool is not None: + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed(x) + x = self.stages(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool == 'avg': + x = x.mean(dim=(2, 3)) + return x if pre_logits else self.head(x) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def init_weights(module: nn.Module, name: str = ''): + # FIXME WIP determining if there's a better weight init + if isinstance(module, nn.Linear): + if 'qkv' in name: + # treat the weights of Q, K, V separately + val = math.sqrt(6. 
/ float(module.weight.shape[0] // 3 + module.weight.shape[1])) + nn.init.uniform_(module.weight, -val, val) + elif 'head' in name: + nn.init.zeros_(module.weight) + else: + nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif hasattr(module, 'init_weights'): + module.init_weights() + + +def checkpoint_filter_fn(state_dict, model): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + if 'model' in state_dict: + # For deit models + state_dict = state_dict['model'] + for k, v in state_dict.items(): + if 'tau' in k: + # convert old tau based checkpoints -> logit_scale (inverse) + v = torch.log(1 / v) + k = k.replace('tau', 'logit_scale') + out_dict[k] = v + return out_dict + + +def _create_swin_transformer_v2_cr(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + model = build_model_with_cfg( + SwinTransformerV2Cr, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs + ) + return model + + +@register_model +def swinv2_cr_tiny_384(pretrained=False, **kwargs): + """Swin-T V2 CR @ 384x384, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=96, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_tiny_384', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_tiny_224(pretrained=False, **kwargs): + """Swin-T V2 CR @ 224x224, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=96, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_tiny_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_tiny_ns_224(pretrained=False, **kwargs): + """Swin-T V2 CR @ 224x224, trained ImageNet-1k w/ extra stage norms. + ** Experimental, may make default if results are improved. 
** + """ + model_kwargs = dict( + embed_dim=96, + depths=(2, 2, 6, 2), + num_heads=(3, 6, 12, 24), + extra_norm_stage=True, + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_tiny_ns_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_small_384(pretrained=False, **kwargs): + """Swin-S V2 CR @ 384x384, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=96, + depths=(2, 2, 18, 2), + num_heads=(3, 6, 12, 24), + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_small_384', pretrained=pretrained, **model_kwargs + ) + + +@register_model +def swinv2_cr_small_224(pretrained=False, **kwargs): + """Swin-S V2 CR @ 224x224, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=96, + depths=(2, 2, 18, 2), + num_heads=(3, 6, 12, 24), + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_small_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_small_ns_224(pretrained=False, **kwargs): + """Swin-S V2 CR @ 224x224, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=96, + depths=(2, 2, 18, 2), + num_heads=(3, 6, 12, 24), + extra_norm_stage=True, + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_small_ns_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_base_384(pretrained=False, **kwargs): + """Swin-B V2 CR @ 384x384, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=128, + depths=(2, 2, 18, 2), + num_heads=(4, 8, 16, 32), + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_base_384', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_base_224(pretrained=False, **kwargs): + """Swin-B V2 CR @ 224x224, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=128, + depths=(2, 2, 18, 2), + num_heads=(4, 8, 16, 32), + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_base_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_base_ns_224(pretrained=False, **kwargs): + """Swin-B V2 CR @ 224x224, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=128, + depths=(2, 2, 18, 2), + num_heads=(4, 8, 16, 32), + extra_norm_stage=True, + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_base_ns_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_large_384(pretrained=False, **kwargs): + """Swin-L V2 CR @ 384x384, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=192, + depths=(2, 2, 18, 2), + num_heads=(6, 12, 24, 48), + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_large_384', pretrained=pretrained, **model_kwargs + ) + + +@register_model +def swinv2_cr_large_224(pretrained=False, **kwargs): + """Swin-L V2 CR @ 224x224, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=192, + depths=(2, 2, 18, 2), + num_heads=(6, 12, 24, 48), + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_large_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_huge_384(pretrained=False, **kwargs): + """Swin-H V2 CR @ 384x384, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=352, + depths=(2, 2, 18, 2), + num_heads=(11, 22, 44, 88), # head count not certain for Huge, 384 & 224 trying diff values + extra_norm_period=6, + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_huge_384', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_huge_224(pretrained=False, **kwargs): + """Swin-H V2 CR @ 224x224, trained ImageNet-1k""" + 
model_kwargs = dict( + embed_dim=352, + depths=(2, 2, 18, 2), + num_heads=(8, 16, 32, 64), # head count not certain for Huge, 384 & 224 trying diff values + extra_norm_period=6, + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_huge_224', pretrained=pretrained, **model_kwargs) + + +@register_model +def swinv2_cr_giant_384(pretrained=False, **kwargs): + """Swin-G V2 CR @ 384x384, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=512, + depths=(2, 2, 42, 2), + num_heads=(16, 32, 64, 128), + extra_norm_period=6, + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_giant_384', pretrained=pretrained, **model_kwargs + ) + + +@register_model +def swinv2_cr_giant_224(pretrained=False, **kwargs): + """Swin-G V2 CR @ 224x224, trained ImageNet-1k""" + model_kwargs = dict( + embed_dim=512, + depths=(2, 2, 42, 2), + num_heads=(16, 32, 64, 128), + extra_norm_period=6, + **kwargs + ) + return _create_swin_transformer_v2_cr('swinv2_cr_giant_224', pretrained=pretrained, **model_kwargs) diff --git a/custom_timm/models/tnt.py b/custom_timm/models/tnt.py new file mode 100644 index 0000000000000000000000000000000000000000..c73bb4b252c47158177d0fb8345fa38c1104542a --- /dev/null +++ b/custom_timm/models/tnt.py @@ -0,0 +1,304 @@ +""" Transformer in Transformer (TNT) in PyTorch + +A PyTorch implement of TNT as described in +'Transformer in Transformer' - https://arxiv.org/abs/2103.00112 + +The official mindspore code is released and available at +https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/cv/TNT +""" +import math +import torch +import torch.nn as nn +from torch.utils.checkpoint import checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from custom_timm.models.helpers import build_model_with_cfg +from custom_timm.models.layers import Mlp, DropPath, trunc_normal_ +from custom_timm.models.layers.helpers import to_2tuple +from custom_timm.models.layers import _assert +from custom_timm.models.registry import register_model +from custom_timm.models.vision_transformer import resize_pos_embed + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'pixel_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + 'tnt_s_patch16_224': _cfg( + url='https://github.com/contrastive/pytorch-image-models/releases/download/TNT/tnt_s_patch16_224.pth.tar', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + ), + 'tnt_b_patch16_224': _cfg( + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + ), +} + + +class Attention(nn.Module): + """ Multi-Head Attention + """ + def __init__(self, dim, hidden_dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.hidden_dim = hidden_dim + self.num_heads = num_heads + head_dim = hidden_dim // num_heads + self.head_dim = head_dim + self.scale = head_dim ** -0.5 + + self.qk = nn.Linear(dim, hidden_dim * 2, bias=qkv_bias) + self.v = nn.Linear(dim, dim, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop, inplace=True) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop, inplace=True) + + def forward(self, x): + B, N, C = x.shape + qk = self.qk(x).reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + q, k = qk.unbind(0) # make torchscript happy (cannot use tensor as tuple) + v = self.v(x).reshape(B, N, 
self.num_heads, -1).permute(0, 2, 1, 3) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, -1) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + """ TNT Block + """ + def __init__( + self, dim, in_dim, num_pixel, num_heads=12, in_num_head=4, mlp_ratio=4., + qkv_bias=False, drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + # Inner transformer + self.norm_in = norm_layer(in_dim) + self.attn_in = Attention( + in_dim, in_dim, num_heads=in_num_head, qkv_bias=qkv_bias, + attn_drop=attn_drop, proj_drop=drop) + + self.norm_mlp_in = norm_layer(in_dim) + self.mlp_in = Mlp(in_features=in_dim, hidden_features=int(in_dim * 4), + out_features=in_dim, act_layer=act_layer, drop=drop) + + self.norm1_proj = norm_layer(in_dim) + self.proj = nn.Linear(in_dim * num_pixel, dim, bias=True) + # Outer transformer + self.norm_out = norm_layer(dim) + self.attn_out = Attention( + dim, dim, num_heads=num_heads, qkv_bias=qkv_bias, + attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm_mlp = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), + out_features=dim, act_layer=act_layer, drop=drop) + + def forward(self, pixel_embed, patch_embed): + # inner + pixel_embed = pixel_embed + self.drop_path(self.attn_in(self.norm_in(pixel_embed))) + pixel_embed = pixel_embed + self.drop_path(self.mlp_in(self.norm_mlp_in(pixel_embed))) + # outer + B, N, C = patch_embed.size() + patch_embed = torch.cat( + [patch_embed[:, 0:1], patch_embed[:, 1:] + self.proj(self.norm1_proj(pixel_embed).reshape(B, N - 1, -1))], + dim=1) + patch_embed = patch_embed + self.drop_path(self.attn_out(self.norm_out(patch_embed))) + patch_embed = patch_embed + self.drop_path(self.mlp(self.norm_mlp(patch_embed))) + return pixel_embed, patch_embed + + +class PixelEmbed(nn.Module): + """ Image to Pixel Embedding + """ + def __init__(self, img_size=224, patch_size=16, in_chans=3, in_dim=48, stride=4): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + # grid_size property necessary for resizing positional embedding + self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1]) + num_patches = (self.grid_size[0]) * (self.grid_size[1]) + self.img_size = img_size + self.num_patches = num_patches + self.in_dim = in_dim + new_patch_size = [math.ceil(ps / stride) for ps in patch_size] + self.new_patch_size = new_patch_size + + self.proj = nn.Conv2d(in_chans, self.in_dim, kernel_size=7, padding=3, stride=stride) + self.unfold = nn.Unfold(kernel_size=new_patch_size, stride=new_patch_size) + + def forward(self, x, pixel_pos): + B, C, H, W = x.shape + _assert(H == self.img_size[0], + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).") + _assert(W == self.img_size[1], + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]}).") + x = self.proj(x) + x = self.unfold(x) + x = x.transpose(1, 2).reshape(B * self.num_patches, self.in_dim, self.new_patch_size[0], self.new_patch_size[1]) + x = x + pixel_pos + x = x.reshape(B * self.num_patches, self.in_dim, -1).transpose(1, 2) + return x + + +class TNT(nn.Module): + """ Transformer in Transformer - https://arxiv.org/abs/2103.00112 + """ + def __init__( + self, img_size=224, patch_size=16, in_chans=3, 
num_classes=1000, global_pool='token', + embed_dim=768, in_dim=48, depth=12, num_heads=12, in_num_head=4, mlp_ratio=4., qkv_bias=False, + drop_rate=0., attn_drop_rate=0., drop_path_rate=0., norm_layer=nn.LayerNorm, first_stride=4): + super().__init__() + assert global_pool in ('', 'token', 'avg') + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.grad_checkpointing = False + + self.pixel_embed = PixelEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, in_dim=in_dim, stride=first_stride) + num_patches = self.pixel_embed.num_patches + self.num_patches = num_patches + new_patch_size = self.pixel_embed.new_patch_size + num_pixel = new_patch_size[0] * new_patch_size[1] + + self.norm1_proj = norm_layer(num_pixel * in_dim) + self.proj = nn.Linear(num_pixel * in_dim, embed_dim) + self.norm2_proj = norm_layer(embed_dim) + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.patch_pos = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim)) + self.pixel_pos = nn.Parameter(torch.zeros(1, in_dim, new_patch_size[0], new_patch_size[1])) + self.pos_drop = nn.Dropout(p=drop_rate) + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + blocks = [] + for i in range(depth): + blocks.append(Block( + dim=embed_dim, in_dim=in_dim, num_pixel=num_pixel, num_heads=num_heads, in_num_head=in_num_head, + mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate, attn_drop=attn_drop_rate, + drop_path=dpr[i], norm_layer=norm_layer)) + self.blocks = nn.ModuleList(blocks) + self.norm = norm_layer(embed_dim) + + self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.cls_token, std=.02) + trunc_normal_(self.patch_pos, std=.02) + trunc_normal_(self.pixel_pos, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'patch_pos', 'pixel_pos', 'cls_token'} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^cls_token|patch_pos|pixel_pos|pixel_embed|norm[12]_proj|proj', # stem and embed / pos + blocks=[ + (r'^blocks\.(\d+)', None), + (r'^norm', (99999,)), + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'token', 'avg') + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + pixel_embed = self.pixel_embed(x, self.pixel_pos) + + patch_embed = self.norm2_proj(self.proj(self.norm1_proj(pixel_embed.reshape(B, self.num_patches, -1)))) + patch_embed = torch.cat((self.cls_token.expand(B, -1, -1), patch_embed), dim=1) + patch_embed = patch_embed + self.patch_pos + patch_embed = self.pos_drop(patch_embed) + + if self.grad_checkpointing and not torch.jit.is_scripting(): + for blk in self.blocks: + pixel_embed, patch_embed = checkpoint(blk, 
pixel_embed, patch_embed) + else: + for blk in self.blocks: + pixel_embed, patch_embed = blk(pixel_embed, patch_embed) + + patch_embed = self.norm(patch_embed) + return patch_embed + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool: + x = x[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0] + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def checkpoint_filter_fn(state_dict, model): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + if state_dict['patch_pos'].shape != model.patch_pos.shape: + state_dict['patch_pos'] = resize_pos_embed(state_dict['patch_pos'], + model.patch_pos, getattr(model, 'num_tokens', 1), model.pixel_embed.grid_size) + return state_dict + + +def _create_tnt(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + model = build_model_with_cfg( + TNT, variant, pretrained, + pretrained_filter_fn=checkpoint_filter_fn, + **kwargs) + return model + + +@register_model +def tnt_s_patch16_224(pretrained=False, **kwargs): + model_cfg = dict( + patch_size=16, embed_dim=384, in_dim=24, depth=12, num_heads=6, in_num_head=4, + qkv_bias=False, **kwargs) + model = _create_tnt('tnt_s_patch16_224', pretrained=pretrained, **model_cfg) + return model + + +@register_model +def tnt_b_patch16_224(pretrained=False, **kwargs): + model_cfg = dict( + patch_size=16, embed_dim=640, in_dim=40, depth=12, num_heads=10, in_num_head=4, + qkv_bias=False, **kwargs) + model = _create_tnt('tnt_b_patch16_224', pretrained=pretrained, **model_cfg) + return model diff --git a/custom_timm/models/tresnet.py b/custom_timm/models/tresnet.py new file mode 100644 index 0000000000000000000000000000000000000000..2469acd265aaff10c9d3b04a5b9db090f3939a7b --- /dev/null +++ b/custom_timm/models/tresnet.py @@ -0,0 +1,331 @@ +""" +TResNet: High Performance GPU-Dedicated Architecture +https://arxiv.org/pdf/2003.13630.pdf + +Original model: https://github.com/mrT23/TResNet + +""" +from collections import OrderedDict + +import torch +import torch.nn as nn + +from .helpers import build_model_with_cfg +from .layers import SpaceToDepthModule, BlurPool2d, InplaceAbn, ClassifierHead, SEModule +from .registry import register_model + +__all__ = ['tresnet_m', 'tresnet_l', 'tresnet_xl'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': (0., 0., 0.), 'std': (1., 1., 1.), + 'first_conv': 'body.conv1.0', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = { + 'tresnet_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_m_1k_miil_83_1-d236afcb.pth'), + 'tresnet_m_miil_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_m_miil_in21k-901b6ed4.pth', num_classes=11221), + 'tresnet_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_l_81_5-235b486c.pth'), + 'tresnet_xl': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_xl_82_0-a2d51b00.pth'), + 'tresnet_m_448': _cfg( + input_size=(3, 448, 448), pool_size=(14, 14), + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_m_448-bc359d10.pth'), + 'tresnet_l_448': _cfg( + input_size=(3, 448, 448), pool_size=(14, 14), + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_l_448-940d0cd1.pth'), + 'tresnet_xl_448': _cfg( + input_size=(3, 448, 448), pool_size=(14, 14), + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_xl_448-8c1815de.pth'), + + 'tresnet_v2_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/tresnet_l_v2_83_9-f36e4445.pth'), +} + + +def IABN2Float(module: nn.Module) -> nn.Module: + """If `module` is IABN don't use half precision.""" + if isinstance(module, InplaceAbn): + module.float() + for child in module.children(): + IABN2Float(child) + return module + + +def conv2d_iabn(ni, nf, stride, kernel_size=3, groups=1, act_layer="leaky_relu", act_param=1e-2): + return nn.Sequential( + nn.Conv2d( + ni, nf, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, groups=groups, bias=False), + InplaceAbn(nf, act_layer=act_layer, act_param=act_param) + ) + + +class BasicBlock(nn.Module): + expansion = 1 + + def __init__(self, inplanes, planes, stride=1, downsample=None, use_se=True, aa_layer=None): + super(BasicBlock, self).__init__() + if stride == 1: + self.conv1 = conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3) + else: + if aa_layer is None: + self.conv1 = conv2d_iabn(inplanes, planes, stride=2, act_param=1e-3) + else: + self.conv1 = nn.Sequential( + conv2d_iabn(inplanes, planes, stride=1, act_param=1e-3), + aa_layer(channels=planes, filt_size=3, stride=2)) + + self.conv2 = conv2d_iabn(planes, planes, stride=1, act_layer="identity") + self.relu = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + rd_chs = max(planes * self.expansion // 4, 64) + self.se = SEModule(planes * self.expansion, rd_channels=rd_chs) if use_se else None + + def forward(self, x): + if self.downsample is not None: + shortcut = self.downsample(x) + else: + shortcut = x + + out = self.conv1(x) + out = self.conv2(out) + + if self.se is not None: + out = self.se(out) + + out = out + shortcut + out = self.relu(out) + return out + + +class Bottleneck(nn.Module): + expansion = 4 + + def __init__( + self, inplanes, planes, stride=1, downsample=None, use_se=True, + act_layer="leaky_relu", aa_layer=None): + super(Bottleneck, self).__init__() + self.conv1 = conv2d_iabn( + inplanes, planes, kernel_size=1, stride=1, act_layer=act_layer, act_param=1e-3) + if stride == 1: + self.conv2 = conv2d_iabn( + planes, planes, kernel_size=3, stride=1, act_layer=act_layer, act_param=1e-3) + else: + if aa_layer is None: + self.conv2 = conv2d_iabn( + planes, planes, kernel_size=3, stride=2, act_layer=act_layer, act_param=1e-3) + else: + self.conv2 = nn.Sequential( + conv2d_iabn(planes, planes, kernel_size=3, stride=1, act_layer=act_layer, act_param=1e-3), + aa_layer(channels=planes, filt_size=3, stride=2)) + + reduction_chs = max(planes * self.expansion // 8, 64) + self.se = SEModule(planes, rd_channels=reduction_chs) if use_se else None + + self.conv3 = conv2d_iabn( + planes, planes * self.expansion, kernel_size=1, stride=1, act_layer="identity") + + self.act = nn.ReLU(inplace=True) + self.downsample = downsample + self.stride = stride + + def forward(self, x): + if self.downsample is not None: + shortcut = self.downsample(x) + else: + shortcut = x + + out = self.conv1(x) + out = 
self.conv2(out) + if self.se is not None: + out = self.se(out) + out = self.conv3(out) + out = out + shortcut # no inplace + out = self.act(out) + + return out + + +class TResNet(nn.Module): + def __init__( + self, + layers, + in_chans=3, + num_classes=1000, + width_factor=1.0, + v2=False, + global_pool='fast', + drop_rate=0., + ): + self.num_classes = num_classes + self.drop_rate = drop_rate + super(TResNet, self).__init__() + + aa_layer = BlurPool2d + + # TResnet stages + self.inplanes = int(64 * width_factor) + self.planes = int(64 * width_factor) + if v2: + self.inplanes = self.inplanes // 8 * 8 + self.planes = self.planes // 8 * 8 + + conv1 = conv2d_iabn(in_chans * 16, self.planes, stride=1, kernel_size=3) + layer1 = self._make_layer( + Bottleneck if v2 else BasicBlock, self.planes, layers[0], stride=1, use_se=True, aa_layer=aa_layer) + layer2 = self._make_layer( + Bottleneck if v2 else BasicBlock, self.planes * 2, layers[1], stride=2, use_se=True, aa_layer=aa_layer) + layer3 = self._make_layer( + Bottleneck, self.planes * 4, layers[2], stride=2, use_se=True, aa_layer=aa_layer) + layer4 = self._make_layer( + Bottleneck, self.planes * 8, layers[3], stride=2, use_se=False, aa_layer=aa_layer) + + # body + self.body = nn.Sequential(OrderedDict([ + ('SpaceToDepth', SpaceToDepthModule()), + ('conv1', conv1), + ('layer1', layer1), + ('layer2', layer2), + ('layer3', layer3), + ('layer4', layer4)])) + + self.feature_info = [ + dict(num_chs=self.planes, reduction=2, module=''), # Not with S2D? + dict(num_chs=self.planes * (Bottleneck.expansion if v2 else 1), reduction=4, module='body.layer1'), + dict(num_chs=self.planes * 2 * (Bottleneck.expansion if v2 else 1), reduction=8, module='body.layer2'), + dict(num_chs=self.planes * 4 * Bottleneck.expansion, reduction=16, module='body.layer3'), + dict(num_chs=self.planes * 8 * Bottleneck.expansion, reduction=32, module='body.layer4'), + ] + + # head + self.num_features = (self.planes * 8) * Bottleneck.expansion + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) + + # model initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='leaky_relu') + elif isinstance(m, nn.BatchNorm2d) or isinstance(m, InplaceAbn): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + # residual connections special initialization + for m in self.modules(): + if isinstance(m, BasicBlock): + m.conv2[1].weight = nn.Parameter(torch.zeros_like(m.conv2[1].weight)) # BN to zero + if isinstance(m, Bottleneck): + m.conv3[1].weight = nn.Parameter(torch.zeros_like(m.conv3[1].weight)) # BN to zero + if isinstance(m, nn.Linear): + m.weight.data.normal_(0, 0.01) + + def _make_layer(self, block, planes, blocks, stride=1, use_se=True, aa_layer=None): + downsample = None + if stride != 1 or self.inplanes != planes * block.expansion: + layers = [] + if stride == 2: + # avg pooling before 1x1 conv + layers.append(nn.AvgPool2d(kernel_size=2, stride=2, ceil_mode=True, count_include_pad=False)) + layers += [conv2d_iabn( + self.inplanes, planes * block.expansion, kernel_size=1, stride=1, act_layer="identity")] + downsample = nn.Sequential(*layers) + + layers = [] + layers.append(block( + self.inplanes, planes, stride, downsample, use_se=use_se, aa_layer=aa_layer)) + self.inplanes = planes * block.expansion + for i in range(1, blocks): + layers.append( + block(self.inplanes, planes, use_se=use_se, aa_layer=aa_layer)) + return nn.Sequential(*layers) + + 
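+    # The regex patterns returned below are consumed by timm's parameter-grouping
+    # utilities (e.g. for layer-wise learning-rate decay); with coarse=True each
+    # residual stage becomes a single group instead of one group per block.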
@torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict(stem=r'^body\.conv1', blocks=r'^body\.layer(\d+)' if coarse else r'^body\.layer(\d+)\.(\d+)') + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='fast'): + self.head = ClassifierHead( + self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + return self.body(x) + + def forward_head(self, x, pre_logits: bool = False): + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_tresnet(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + TResNet, variant, pretrained, + feature_cfg=dict(out_indices=(1, 2, 3, 4), flatten_sequential=True), + **kwargs) + + +@register_model +def tresnet_m(pretrained=False, **kwargs): + model_kwargs = dict(layers=[3, 4, 11, 3], **kwargs) + return _create_tresnet('tresnet_m', pretrained=pretrained, **model_kwargs) + + +@register_model +def tresnet_m_miil_in21k(pretrained=False, **kwargs): + model_kwargs = dict(layers=[3, 4, 11, 3], **kwargs) + return _create_tresnet('tresnet_m_miil_in21k', pretrained=pretrained, **model_kwargs) + + +@register_model +def tresnet_l(pretrained=False, **kwargs): + model_kwargs = dict(layers=[4, 5, 18, 3], width_factor=1.2, **kwargs) + return _create_tresnet('tresnet_l', pretrained=pretrained, **model_kwargs) + + +@register_model +def tresnet_v2_l(pretrained=False, **kwargs): + model_kwargs = dict(layers=[3, 4, 23, 3], width_factor=1.0, v2=True, **kwargs) + return _create_tresnet('tresnet_v2_l', pretrained=pretrained, **model_kwargs) + + +@register_model +def tresnet_xl(pretrained=False, **kwargs): + model_kwargs = dict(layers=[4, 5, 24, 3], width_factor=1.3, **kwargs) + return _create_tresnet('tresnet_xl', pretrained=pretrained, **model_kwargs) + + +@register_model +def tresnet_m_448(pretrained=False, **kwargs): + model_kwargs = dict(layers=[3, 4, 11, 3], **kwargs) + return _create_tresnet('tresnet_m_448', pretrained=pretrained, **model_kwargs) + + +@register_model +def tresnet_l_448(pretrained=False, **kwargs): + model_kwargs = dict(layers=[4, 5, 18, 3], width_factor=1.2, **kwargs) + return _create_tresnet('tresnet_l_448', pretrained=pretrained, **model_kwargs) + + +@register_model +def tresnet_xl_448(pretrained=False, **kwargs): + model_kwargs = dict(layers=[4, 5, 24, 3], width_factor=1.3, **kwargs) + return _create_tresnet('tresnet_xl_448', pretrained=pretrained, **model_kwargs) diff --git a/custom_timm/models/twins.py b/custom_timm/models/twins.py new file mode 100644 index 0000000000000000000000000000000000000000..dfde68ca6e85558e2b094d138fe7e522395404f8 --- /dev/null +++ b/custom_timm/models/twins.py @@ -0,0 +1,449 @@ +""" Twins +A PyTorch impl of : `Twins: Revisiting the Design of Spatial Attention in Vision Transformers` + - https://arxiv.org/pdf/2104.13840.pdf + +Code/weights from https://github.com/Meituan-AutoML/Twins, original copyright/license info below + +""" +# -------------------------------------------------------- +# Twins +# Copyright (c) 2021 Meituan +# Licensed under The Apache 2.0 License [see LICENSE for details] +# Written by Xinjie Li, Xiangxiang Chu +# -------------------------------------------------------- +import math +from copy import 
deepcopy +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .layers import Mlp, DropPath, to_2tuple, trunc_normal_ +from .fx_features import register_notrace_module +from .registry import register_model +from .vision_transformer import Attention +from .helpers import build_model_with_cfg + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embeds.0.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + 'twins_pcpvt_small': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_small-e70e7e7a.pth', + ), + 'twins_pcpvt_base': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_base-e5ecb09b.pth', + ), + 'twins_pcpvt_large': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_pcpvt_large-d273f802.pth', + ), + 'twins_svt_small': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_svt_small-42e5f78c.pth', + ), + 'twins_svt_base': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_svt_base-c2265010.pth', + ), + 'twins_svt_large': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/twins_svt_large-90f6aaa9.pth', + ), +} + +Size_ = Tuple[int, int] + + +@register_notrace_module # reason: FX can't symbolically trace control flow in forward method +class LocallyGroupedAttn(nn.Module): + """ LSA: self attention within a group + """ + def __init__(self, dim, num_heads=8, attn_drop=0., proj_drop=0., ws=1): + assert ws != 1 + super(LocallyGroupedAttn, self).__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." + + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=True) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.ws = ws + + def forward(self, x, size: Size_): + # There are two implementations for this function, zero padding or mask. We don't observe obvious difference for + # both. You can choose any one, we recommend forward_padding because it's neat. However, + # the masking implementation is more reasonable and accurate. 
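+        # Zero-padding implementation (used here): pad H and W up to multiples of the
+        # window size `ws`, partition the feature map into _h * _w non-overlapping
+        # ws x ws windows, run multi-head self-attention independently within each
+        # window, then merge the windows back and crop off the padding.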
+ B, N, C = x.shape + H, W = size + x = x.view(B, H, W, C) + pad_l = pad_t = 0 + pad_r = (self.ws - W % self.ws) % self.ws + pad_b = (self.ws - H % self.ws) % self.ws + x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + _, Hp, Wp, _ = x.shape + _h, _w = Hp // self.ws, Wp // self.ws + x = x.reshape(B, _h, self.ws, _w, self.ws, C).transpose(2, 3) + qkv = self.qkv(x).reshape( + B, _h * _w, self.ws * self.ws, 3, self.num_heads, C // self.num_heads).permute(3, 0, 1, 4, 2, 5) + q, k, v = qkv[0], qkv[1], qkv[2] + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + attn = (attn @ v).transpose(2, 3).reshape(B, _h, _w, self.ws, self.ws, C) + x = attn.transpose(2, 3).reshape(B, _h * self.ws, _w * self.ws, C) + if pad_r > 0 or pad_b > 0: + x = x[:, :H, :W, :].contiguous() + x = x.reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + # def forward_mask(self, x, size: Size_): + # B, N, C = x.shape + # H, W = size + # x = x.view(B, H, W, C) + # pad_l = pad_t = 0 + # pad_r = (self.ws - W % self.ws) % self.ws + # pad_b = (self.ws - H % self.ws) % self.ws + # x = F.pad(x, (0, 0, pad_l, pad_r, pad_t, pad_b)) + # _, Hp, Wp, _ = x.shape + # _h, _w = Hp // self.ws, Wp // self.ws + # mask = torch.zeros((1, Hp, Wp), device=x.device) + # mask[:, -pad_b:, :].fill_(1) + # mask[:, :, -pad_r:].fill_(1) + # + # x = x.reshape(B, _h, self.ws, _w, self.ws, C).transpose(2, 3) # B, _h, _w, ws, ws, C + # mask = mask.reshape(1, _h, self.ws, _w, self.ws).transpose(2, 3).reshape(1, _h * _w, self.ws * self.ws) + # attn_mask = mask.unsqueeze(2) - mask.unsqueeze(3) # 1, _h*_w, ws*ws, ws*ws + # attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-1000.0)).masked_fill(attn_mask == 0, float(0.0)) + # qkv = self.qkv(x).reshape( + # B, _h * _w, self.ws * self.ws, 3, self.num_heads, C // self.num_heads).permute(3, 0, 1, 4, 2, 5) + # # n_h, B, _w*_h, nhead, ws*ws, dim + # q, k, v = qkv[0], qkv[1], qkv[2] # B, _h*_w, n_head, ws*ws, dim_head + # attn = (q @ k.transpose(-2, -1)) * self.scale # B, _h*_w, n_head, ws*ws, ws*ws + # attn = attn + attn_mask.unsqueeze(2) + # attn = attn.softmax(dim=-1) + # attn = self.attn_drop(attn) # attn @v -> B, _h*_w, n_head, ws*ws, dim_head + # attn = (attn @ v).transpose(2, 3).reshape(B, _h, _w, self.ws, self.ws, C) + # x = attn.transpose(2, 3).reshape(B, _h * self.ws, _w * self.ws, C) + # if pad_r > 0 or pad_b > 0: + # x = x[:, :H, :W, :].contiguous() + # x = x.reshape(B, N, C) + # x = self.proj(x) + # x = self.proj_drop(x) + # return x + + +class GlobalSubSampleAttn(nn.Module): + """ GSA: using a key to summarize the information for a group to be efficient. + """ + def __init__(self, dim, num_heads=8, attn_drop=0., proj_drop=0., sr_ratio=1): + super().__init__() + assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}." 
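+        # Queries are computed at full resolution; when sr_ratio > 1, keys and values
+        # come from a feature map spatially reduced by a strided sr_ratio x sr_ratio
+        # convolution (`self.sr` below), shrinking the attention matrix by roughly
+        # a factor of sr_ratio ** 2.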
+ + self.dim = dim + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.q = nn.Linear(dim, dim, bias=True) + self.kv = nn.Linear(dim, dim * 2, bias=True) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.sr_ratio = sr_ratio + if sr_ratio > 1: + self.sr = nn.Conv2d(dim, dim, kernel_size=sr_ratio, stride=sr_ratio) + self.norm = nn.LayerNorm(dim) + else: + self.sr = None + self.norm = None + + def forward(self, x, size: Size_): + B, N, C = x.shape + q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3) + + if self.sr is not None: + x = x.permute(0, 2, 1).reshape(B, C, *size) + x = self.sr(x).reshape(B, C, -1).permute(0, 2, 1) + x = self.norm(x) + kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + k, v = kv[0], kv[1] + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Block(nn.Module): + + def __init__( + self, dim, num_heads, mlp_ratio=4., drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, ws=None): + super().__init__() + self.norm1 = norm_layer(dim) + if ws is None: + self.attn = Attention(dim, num_heads, False, None, attn_drop, drop) + elif ws == 1: + self.attn = GlobalSubSampleAttn(dim, num_heads, attn_drop, drop, sr_ratio) + else: + self.attn = LocallyGroupedAttn(dim, num_heads, attn_drop, drop, ws) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x, size: Size_): + x = x + self.drop_path(self.attn(self.norm1(x), size)) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PosConv(nn.Module): + # PEG from https://arxiv.org/abs/2102.10882 + def __init__(self, in_chans, embed_dim=768, stride=1): + super(PosConv, self).__init__() + self.proj = nn.Sequential(nn.Conv2d(in_chans, embed_dim, 3, stride, 1, bias=True, groups=embed_dim), ) + self.stride = stride + + def forward(self, x, size: Size_): + B, N, C = x.shape + cnn_feat_token = x.transpose(1, 2).view(B, C, *size) + x = self.proj(cnn_feat_token) + if self.stride == 1: + x += cnn_feat_token + x = x.flatten(2).transpose(1, 2) + return x + + def no_weight_decay(self): + return ['proj.%d.weight' % i for i in range(4)] + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768): + super().__init__() + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + + self.img_size = img_size + self.patch_size = patch_size + assert img_size[0] % patch_size[0] == 0 and img_size[1] % patch_size[1] == 0, \ + f"img_size {img_size} should be divided by patch_size {patch_size}." 
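+        # (H, W) is the resulting patch-grid resolution; each patch becomes one token,
+        # so this embedding produces a sequence of length H * W.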
+ self.H, self.W = img_size[0] // patch_size[0], img_size[1] // patch_size[1] + self.num_patches = self.H * self.W + self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size) + self.norm = nn.LayerNorm(embed_dim) + + def forward(self, x) -> Tuple[torch.Tensor, Size_]: + B, C, H, W = x.shape + + x = self.proj(x).flatten(2).transpose(1, 2) + x = self.norm(x) + out_size = (H // self.patch_size[0], W // self.patch_size[1]) + + return x, out_size + + +class Twins(nn.Module): + """ Twins Vision Transfomer (Revisiting Spatial Attention) + + Adapted from PVT (PyramidVisionTransformer) class at https://github.com/whai362/PVT.git + """ + def __init__( + self, img_size=224, patch_size=4, in_chans=3, num_classes=1000, global_pool='avg', + embed_dims=(64, 128, 256, 512), num_heads=(1, 2, 4, 8), mlp_ratios=(4, 4, 4, 4), depths=(3, 4, 6, 3), + sr_ratios=(8, 4, 2, 1), wss=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + norm_layer=partial(nn.LayerNorm, eps=1e-6), block_cls=Block): + super().__init__() + self.num_classes = num_classes + self.global_pool = global_pool + self.depths = depths + self.embed_dims = embed_dims + self.num_features = embed_dims[-1] + self.grad_checkpointing = False + + img_size = to_2tuple(img_size) + prev_chs = in_chans + self.patch_embeds = nn.ModuleList() + self.pos_drops = nn.ModuleList() + for i in range(len(depths)): + self.patch_embeds.append(PatchEmbed(img_size, patch_size, prev_chs, embed_dims[i])) + self.pos_drops.append(nn.Dropout(p=drop_rate)) + prev_chs = embed_dims[i] + img_size = tuple(t // patch_size for t in img_size) + patch_size = 2 + + self.blocks = nn.ModuleList() + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] # stochastic depth decay rule + cur = 0 + for k in range(len(depths)): + _block = nn.ModuleList([block_cls( + dim=embed_dims[k], num_heads=num_heads[k], mlp_ratio=mlp_ratios[k], drop=drop_rate, + attn_drop=attn_drop_rate, drop_path=dpr[cur + i], norm_layer=norm_layer, sr_ratio=sr_ratios[k], + ws=1 if wss is None or i % 2 == 1 else wss[k]) for i in range(depths[k])]) + self.blocks.append(_block) + cur += depths[k] + + self.pos_block = nn.ModuleList([PosConv(embed_dim, embed_dim) for embed_dim in embed_dims]) + + self.norm = norm_layer(self.num_features) + + # classification head + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + # init weights + self.apply(self._init_weights) + + @torch.jit.ignore + def no_weight_decay(self): + return set(['pos_block.' 
+ n for n, p in self.pos_block.named_parameters()]) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + matcher = dict( + stem=r'^patch_embeds.0', # stem and embed + blocks=[ + (r'^(?:blocks|patch_embeds|pos_block)\.(\d+)', None), + ('^norm', (99999,)) + ] if coarse else [ + (r'^blocks\.(\d+)\.(\d+)', None), + (r'^(?:patch_embeds|pos_block)\.(\d+)', (0,)), + (r'^norm', (99999,)) + ] + ) + return matcher + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'avg') + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + + def forward_features(self, x): + B = x.shape[0] + for i, (embed, drop, blocks, pos_blk) in enumerate( + zip(self.patch_embeds, self.pos_drops, self.blocks, self.pos_block)): + x, size = embed(x) + x = drop(x) + for j, blk in enumerate(blocks): + x = blk(x, size) + if j == 0: + x = pos_blk(x, size) # PEG here + if i < len(self.depths) - 1: + x = x.reshape(B, *size, -1).permute(0, 3, 1, 2).contiguous() + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool == 'avg': + x = x.mean(dim=1) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_twins(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + model = build_model_with_cfg(Twins, variant, pretrained, **kwargs) + return model + + +@register_model +def twins_pcpvt_small(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], + depths=[3, 4, 6, 3], sr_ratios=[8, 4, 2, 1], **kwargs) + return _create_twins('twins_pcpvt_small', pretrained=pretrained, **model_kwargs) + + +@register_model +def twins_pcpvt_base(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], + depths=[3, 4, 18, 3], sr_ratios=[8, 4, 2, 1], **kwargs) + return _create_twins('twins_pcpvt_base', pretrained=pretrained, **model_kwargs) + + +@register_model +def twins_pcpvt_large(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=4, embed_dims=[64, 128, 320, 512], num_heads=[1, 2, 5, 8], mlp_ratios=[8, 8, 4, 4], + depths=[3, 8, 27, 3], sr_ratios=[8, 4, 2, 1], **kwargs) + return _create_twins('twins_pcpvt_large', pretrained=pretrained, **model_kwargs) + + +@register_model +def twins_svt_small(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=4, embed_dims=[64, 128, 256, 512], num_heads=[2, 4, 8, 16], mlp_ratios=[4, 
4, 4, 4], + depths=[2, 2, 10, 4], wss=[7, 7, 7, 7], sr_ratios=[8, 4, 2, 1], **kwargs) + return _create_twins('twins_svt_small', pretrained=pretrained, **model_kwargs) + + +@register_model +def twins_svt_base(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=4, embed_dims=[96, 192, 384, 768], num_heads=[3, 6, 12, 24], mlp_ratios=[4, 4, 4, 4], + depths=[2, 2, 18, 2], wss=[7, 7, 7, 7], sr_ratios=[8, 4, 2, 1], **kwargs) + return _create_twins('twins_svt_base', pretrained=pretrained, **model_kwargs) + + +@register_model +def twins_svt_large(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=4, embed_dims=[128, 256, 512, 1024], num_heads=[4, 8, 16, 32], mlp_ratios=[4, 4, 4, 4], + depths=[2, 2, 18, 2], wss=[7, 7, 7, 7], sr_ratios=[8, 4, 2, 1], **kwargs) + return _create_twins('twins_svt_large', pretrained=pretrained, **model_kwargs) diff --git a/custom_timm/models/vgg.py b/custom_timm/models/vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..11cf08bd5426f58d4a831849b6780d4b05b1b592 --- /dev/null +++ b/custom_timm/models/vgg.py @@ -0,0 +1,279 @@ +"""VGG + +Adapted from https://github.com/pytorch/vision 'vgg.py' (BSD-3-Clause) with a few changes for +timm functionality. + +Copyright 2021 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Union, List, Dict, Any, cast + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, checkpoint_seq +from .fx_features import register_notrace_module +from .layers import ClassifierHead +from .registry import register_model + +__all__ = [ + 'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn', + 'vgg19_bn', 'vgg19', +] + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'features.0', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = { + 'vgg11': _cfg(url='https://download.pytorch.org/models/vgg11-bbd30ac9.pth'), + 'vgg13': _cfg(url='https://download.pytorch.org/models/vgg13-c768596a.pth'), + 'vgg16': _cfg(url='https://download.pytorch.org/models/vgg16-397923af.pth'), + 'vgg19': _cfg(url='https://download.pytorch.org/models/vgg19-dcbb9e9d.pth'), + 'vgg11_bn': _cfg(url='https://download.pytorch.org/models/vgg11_bn-6002323d.pth'), + 'vgg13_bn': _cfg(url='https://download.pytorch.org/models/vgg13_bn-abd245e5.pth'), + 'vgg16_bn': _cfg(url='https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'), + 'vgg19_bn': _cfg(url='https://download.pytorch.org/models/vgg19_bn-c79401a0.pth'), +} + + +cfgs: Dict[str, List[Union[str, int]]] = { + 'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'], + 'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'], + 'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'], +} + + +@register_notrace_module # reason: FX can't symbolically trace control flow in forward method +class ConvMlp(nn.Module): + + def __init__( + self, in_features=512, out_features=4096, kernel_size=7, mlp_ratio=1.0, + drop_rate: float = 0.2, act_layer: nn.Module = None, conv_layer: nn.Module = None): + super(ConvMlp, self).__init__() + self.input_kernel_size = 
kernel_size + mid_features = int(out_features * mlp_ratio) + self.fc1 = conv_layer(in_features, mid_features, kernel_size, bias=True) + self.act1 = act_layer(True) + self.drop = nn.Dropout(drop_rate) + self.fc2 = conv_layer(mid_features, out_features, 1, bias=True) + self.act2 = act_layer(True) + + def forward(self, x): + if x.shape[-2] < self.input_kernel_size or x.shape[-1] < self.input_kernel_size: + # keep the input size >= 7x7 + output_size = (max(self.input_kernel_size, x.shape[-2]), max(self.input_kernel_size, x.shape[-1])) + x = F.adaptive_avg_pool2d(x, output_size) + x = self.fc1(x) + x = self.act1(x) + x = self.drop(x) + x = self.fc2(x) + x = self.act2(x) + return x + + +class VGG(nn.Module): + + def __init__( + self, + cfg: List[Any], + num_classes: int = 1000, + in_chans: int = 3, + output_stride: int = 32, + mlp_ratio: float = 1.0, + act_layer: nn.Module = nn.ReLU, + conv_layer: nn.Module = nn.Conv2d, + norm_layer: nn.Module = None, + global_pool: str = 'avg', + drop_rate: float = 0., + ) -> None: + super(VGG, self).__init__() + assert output_stride == 32 + self.num_classes = num_classes + self.num_features = 4096 + self.drop_rate = drop_rate + self.grad_checkpointing = False + self.use_norm = norm_layer is not None + self.feature_info = [] + prev_chs = in_chans + net_stride = 1 + pool_layer = nn.MaxPool2d + layers: List[nn.Module] = [] + for v in cfg: + last_idx = len(layers) - 1 + if v == 'M': + self.feature_info.append(dict(num_chs=prev_chs, reduction=net_stride, module=f'features.{last_idx}')) + layers += [pool_layer(kernel_size=2, stride=2)] + net_stride *= 2 + else: + v = cast(int, v) + conv2d = conv_layer(prev_chs, v, kernel_size=3, padding=1) + if norm_layer is not None: + layers += [conv2d, norm_layer(v), act_layer(inplace=True)] + else: + layers += [conv2d, act_layer(inplace=True)] + prev_chs = v + self.features = nn.Sequential(*layers) + self.feature_info.append(dict(num_chs=prev_chs, reduction=net_stride, module=f'features.{len(layers) - 1}')) + + self.pre_logits = ConvMlp( + prev_chs, self.num_features, 7, mlp_ratio=mlp_ratio, + drop_rate=drop_rate, act_layer=act_layer, conv_layer=conv_layer) + self.head = ClassifierHead( + self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) + + self._initialize_weights() + + @torch.jit.ignore + def group_matcher(self, coarse=False): + # this treats BN layers as separate groups for bn variants, a lot of effort to fix that + return dict(stem=r'^features\.0', blocks=r'^features\.(\d+)') + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, 'gradient checkpointing not supported' + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.head = ClassifierHead( + self.num_features, self.num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x: torch.Tensor) -> torch.Tensor: + x = self.features(x) + return x + + def forward_head(self, x: torch.Tensor, pre_logits: bool = False): + x = self.pre_logits(x) + return x if pre_logits else self.head(x) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.forward_features(x) + x = self.forward_head(x) + return x + + def _initialize_weights(self) -> None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, 
nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + nn.init.constant_(m.bias, 0) + + +def _filter_fn(state_dict): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + out_dict = {} + for k, v in state_dict.items(): + k_r = k + k_r = k_r.replace('classifier.0', 'pre_logits.fc1') + k_r = k_r.replace('classifier.3', 'pre_logits.fc2') + k_r = k_r.replace('classifier.6', 'head.fc') + if 'classifier.0.weight' in k: + v = v.reshape(-1, 512, 7, 7) + if 'classifier.3.weight' in k: + v = v.reshape(-1, 4096, 1, 1) + out_dict[k_r] = v + return out_dict + + +def _create_vgg(variant: str, pretrained: bool, **kwargs: Any) -> VGG: + cfg = variant.split('_')[0] + # NOTE: VGG is one of few models with stride==1 features w/ 6 out_indices [0..5] + out_indices = kwargs.pop('out_indices', (0, 1, 2, 3, 4, 5)) + model = build_model_with_cfg( + VGG, variant, pretrained, + model_cfg=cfgs[cfg], + feature_cfg=dict(flatten_sequential=True, out_indices=out_indices), + pretrained_filter_fn=_filter_fn, + **kwargs) + return model + + +@register_model +def vgg11(pretrained: bool = False, **kwargs: Any) -> VGG: + r"""VGG 11-layer model (configuration "A") from + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `._ + """ + model_args = dict(**kwargs) + return _create_vgg('vgg11', pretrained=pretrained, **model_args) + + +@register_model +def vgg11_bn(pretrained: bool = False, **kwargs: Any) -> VGG: + r"""VGG 11-layer model (configuration "A") with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `._ + """ + model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs) + return _create_vgg('vgg11_bn', pretrained=pretrained, **model_args) + + +@register_model +def vgg13(pretrained: bool = False, **kwargs: Any) -> VGG: + r"""VGG 13-layer model (configuration "B") + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `._ + """ + model_args = dict(**kwargs) + return _create_vgg('vgg13', pretrained=pretrained, **model_args) + + +@register_model +def vgg13_bn(pretrained: bool = False, **kwargs: Any) -> VGG: + r"""VGG 13-layer model (configuration "B") with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `._ + """ + model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs) + return _create_vgg('vgg13_bn', pretrained=pretrained, **model_args) + + +@register_model +def vgg16(pretrained: bool = False, **kwargs: Any) -> VGG: + r"""VGG 16-layer model (configuration "D") + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `._ + """ + model_args = dict(**kwargs) + return _create_vgg('vgg16', pretrained=pretrained, **model_args) + + +@register_model +def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG: + r"""VGG 16-layer model (configuration "D") with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `._ + """ + model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs) + return _create_vgg('vgg16_bn', pretrained=pretrained, **model_args) + + +@register_model +def vgg19(pretrained: bool = False, **kwargs: Any) -> VGG: + r"""VGG 19-layer model (configuration "E") + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `._ + """ + model_args = dict(**kwargs) + return _create_vgg('vgg19', pretrained=pretrained, **model_args) + + +@register_model +def vgg19_bn(pretrained: bool = False, **kwargs: Any) -> VGG: + r"""VGG 
19-layer model (configuration 'E') with batch normalization + `"Very Deep Convolutional Networks For Large-Scale Image Recognition" `._ + """ + model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs) + return _create_vgg('vgg19_bn', pretrained=pretrained, **model_args) \ No newline at end of file diff --git a/custom_timm/models/visformer.py b/custom_timm/models/visformer.py new file mode 100644 index 0000000000000000000000000000000000000000..0a95be8cbc7c92c6242cb3c3e762949f6f6be8f4 --- /dev/null +++ b/custom_timm/models/visformer.py @@ -0,0 +1,429 @@ +""" Visformer + +Paper: Visformer: The Vision-friendly Transformer - https://arxiv.org/abs/2104.12533 + +From original at https://github.com/danczs/Visformer + +Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman +""" +from copy import deepcopy + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg, checkpoint_seq +from .layers import to_2tuple, trunc_normal_, DropPath, PatchEmbed, LayerNorm2d, create_classifier +from .registry import register_model + + +__all__ = ['Visformer'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.0', 'classifier': 'head', + **kwargs + } + + +default_cfgs = dict( + visformer_tiny=_cfg(), + visformer_small=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vt3p-weights/visformer_small-839e1f5b.pth' + ), +) + + +class SpatialMlp(nn.Module): + def __init__( + self, in_features, hidden_features=None, out_features=None, + act_layer=nn.GELU, drop=0., group=8, spatial_conv=False): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + drop_probs = to_2tuple(drop) + + self.in_features = in_features + self.out_features = out_features + self.spatial_conv = spatial_conv + if self.spatial_conv: + if group < 2: # net setting + hidden_features = in_features * 5 // 6 + else: + hidden_features = in_features * 2 + self.hidden_features = hidden_features + self.group = group + self.conv1 = nn.Conv2d(in_features, hidden_features, 1, stride=1, padding=0, bias=False) + self.act1 = act_layer() + self.drop1 = nn.Dropout(drop_probs[0]) + if self.spatial_conv: + self.conv2 = nn.Conv2d( + hidden_features, hidden_features, 3, stride=1, padding=1, groups=self.group, bias=False) + self.act2 = act_layer() + else: + self.conv2 = None + self.act2 = None + self.conv3 = nn.Conv2d(hidden_features, out_features, 1, stride=1, padding=0, bias=False) + self.drop3 = nn.Dropout(drop_probs[1]) + + def forward(self, x): + x = self.conv1(x) + x = self.act1(x) + x = self.drop1(x) + if self.conv2 is not None: + x = self.conv2(x) + x = self.act2(x) + x = self.conv3(x) + x = self.drop3(x) + return x + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, head_dim_ratio=1., attn_drop=0., proj_drop=0.): + super().__init__() + self.dim = dim + self.num_heads = num_heads + head_dim = round(dim // num_heads * head_dim_ratio) + self.head_dim = head_dim + self.scale = head_dim ** -0.5 + self.qkv = nn.Conv2d(dim, head_dim * num_heads * 3, 1, stride=1, padding=0, bias=False) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Conv2d(self.head_dim * 
self.num_heads, dim, 1, stride=1, padding=0, bias=False) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, C, H, W = x.shape + x = self.qkv(x).reshape(B, 3, self.num_heads, self.head_dim, -1).permute(1, 0, 2, 4, 3) + q, k, v = x.unbind(0) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + x = attn @ v + + x = x.permute(0, 1, 3, 2).reshape(B, -1, H, W) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + def __init__( + self, dim, num_heads, head_dim_ratio=1., mlp_ratio=4., + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=LayerNorm2d, + group=8, attn_disabled=False, spatial_conv=False): + super().__init__() + self.spatial_conv = spatial_conv + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + if attn_disabled: + self.norm1 = None + self.attn = None + else: + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, num_heads=num_heads, head_dim_ratio=head_dim_ratio, attn_drop=attn_drop, proj_drop=drop) + + self.norm2 = norm_layer(dim) + self.mlp = SpatialMlp( + in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop, + group=group, spatial_conv=spatial_conv) # new setting + + def forward(self, x): + if self.attn is not None: + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Visformer(nn.Module): + def __init__( + self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, init_channels=32, embed_dim=384, + depth=12, num_heads=6, mlp_ratio=4., drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + norm_layer=LayerNorm2d, attn_stage='111', pos_embed=True, spatial_conv='111', + vit_stem=False, group=8, global_pool='avg', conv_init=False, embed_norm=None): + super().__init__() + img_size = to_2tuple(img_size) + self.num_classes = num_classes + self.embed_dim = embed_dim + self.init_channels = init_channels + self.img_size = img_size + self.vit_stem = vit_stem + self.conv_init = conv_init + if isinstance(depth, (list, tuple)): + self.stage_num1, self.stage_num2, self.stage_num3 = depth + depth = sum(depth) + else: + self.stage_num1 = self.stage_num3 = depth // 3 + self.stage_num2 = depth - self.stage_num1 - self.stage_num3 + self.pos_embed = pos_embed + self.grad_checkpointing = False + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] + # stage 1 + if self.vit_stem: + self.stem = None + self.patch_embed1 = PatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, + embed_dim=embed_dim, norm_layer=embed_norm, flatten=False) + img_size = [x // patch_size for x in img_size] + else: + if self.init_channels is None: + self.stem = None + self.patch_embed1 = PatchEmbed( + img_size=img_size, patch_size=patch_size // 2, in_chans=in_chans, + embed_dim=embed_dim // 2, norm_layer=embed_norm, flatten=False) + img_size = [x // (patch_size // 2) for x in img_size] + else: + self.stem = nn.Sequential( + nn.Conv2d(in_chans, self.init_channels, 7, stride=2, padding=3, bias=False), + nn.BatchNorm2d(self.init_channels), + nn.ReLU(inplace=True) + ) + img_size = [x // 2 for x in img_size] + self.patch_embed1 = PatchEmbed( + img_size=img_size, patch_size=patch_size // 4, in_chans=self.init_channels, + embed_dim=embed_dim // 2, norm_layer=embed_norm, flatten=False) + img_size = [x // (patch_size // 4) for x in img_size] + + if self.pos_embed: + if self.vit_stem: + self.pos_embed1 = nn.Parameter(torch.zeros(1, 
embed_dim, *img_size)) + else: + self.pos_embed1 = nn.Parameter(torch.zeros(1, embed_dim//2, *img_size)) + self.pos_drop = nn.Dropout(p=drop_rate) + self.stage1 = nn.Sequential(*[ + Block( + dim=embed_dim//2, num_heads=num_heads, head_dim_ratio=0.5, mlp_ratio=mlp_ratio, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + group=group, attn_disabled=(attn_stage[0] == '0'), spatial_conv=(spatial_conv[0] == '1') + ) + for i in range(self.stage_num1) + ]) + + # stage2 + if not self.vit_stem: + self.patch_embed2 = PatchEmbed( + img_size=img_size, patch_size=patch_size // 8, in_chans=embed_dim // 2, + embed_dim=embed_dim, norm_layer=embed_norm, flatten=False) + img_size = [x // (patch_size // 8) for x in img_size] + if self.pos_embed: + self.pos_embed2 = nn.Parameter(torch.zeros(1, embed_dim, *img_size)) + self.stage2 = nn.Sequential(*[ + Block( + dim=embed_dim, num_heads=num_heads, head_dim_ratio=1.0, mlp_ratio=mlp_ratio, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + group=group, attn_disabled=(attn_stage[1] == '0'), spatial_conv=(spatial_conv[1] == '1') + ) + for i in range(self.stage_num1, self.stage_num1+self.stage_num2) + ]) + + # stage 3 + if not self.vit_stem: + self.patch_embed3 = PatchEmbed( + img_size=img_size, patch_size=patch_size // 8, in_chans=embed_dim, + embed_dim=embed_dim * 2, norm_layer=embed_norm, flatten=False) + img_size = [x // (patch_size // 8) for x in img_size] + if self.pos_embed: + self.pos_embed3 = nn.Parameter(torch.zeros(1, embed_dim*2, *img_size)) + self.stage3 = nn.Sequential(*[ + Block( + dim=embed_dim*2, num_heads=num_heads, head_dim_ratio=1.0, mlp_ratio=mlp_ratio, + drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer, + group=group, attn_disabled=(attn_stage[2] == '0'), spatial_conv=(spatial_conv[2] == '1') + ) + for i in range(self.stage_num1+self.stage_num2, depth) + ]) + + # head + self.num_features = embed_dim if self.vit_stem else embed_dim * 2 + self.norm = norm_layer(self.num_features) + self.global_pool, self.head = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + # weights init + if self.pos_embed: + trunc_normal_(self.pos_embed1, std=0.02) + if not self.vit_stem: + trunc_normal_(self.pos_embed2, std=0.02) + trunc_normal_(self.pos_embed3, std=0.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.Conv2d): + if self.conv_init: + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + else: + trunc_normal_(m.weight, std=0.02) + if m.bias is not None: + nn.init.constant_(m.bias, 0.) 
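+    # NOTE on the stage flags used above (descriptive sketch): `attn_stage` and `spatial_conv`
+    # are three-character strings, one character per stage. A '0' in attn_stage[i] builds
+    # stage i+1 with self-attention disabled, while a '1' in spatial_conv[i] switches that
+    # stage's SpatialMlp to the grouped 3x3 convolution branch. E.g. visformer_small uses
+    # attn_stage='011', spatial_conv='100': conv-only stage 1, attention in stages 2 and 3.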
+ + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^patch_embed1|pos_embed1|stem', # stem and embed + blocks=[ + (r'^stage(\d+)\.(\d+)' if coarse else r'^stage(\d+)\.(\d+)', None), + (r'^(?:patch_embed|pos_embed)(\d+)', (0,)), + (r'^norm', (99999,)) + ] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.head = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + if self.stem is not None: + x = self.stem(x) + + # stage 1 + x = self.patch_embed1(x) + if self.pos_embed: + x = self.pos_drop(x + self.pos_embed1) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.stage1, x) + else: + x = self.stage1(x) + + # stage 2 + if not self.vit_stem: + x = self.patch_embed2(x) + if self.pos_embed: + x = self.pos_drop(x + self.pos_embed2) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.stage2, x) + else: + x = self.stage2(x) + + # stage3 + if not self.vit_stem: + x = self.patch_embed3(x) + if self.pos_embed: + x = self.pos_drop(x + self.pos_embed3) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.stage3, x) + else: + x = self.stage3(x) + + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_visformer(variant, pretrained=False, default_cfg=None, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + model = build_model_with_cfg(Visformer, variant, pretrained, **kwargs) + return model + + +@register_model +def visformer_tiny(pretrained=False, **kwargs): + model_cfg = dict( + init_channels=16, embed_dim=192, depth=(7, 4, 4), num_heads=3, mlp_ratio=4., group=8, + attn_stage='011', spatial_conv='100', norm_layer=nn.BatchNorm2d, conv_init=True, + embed_norm=nn.BatchNorm2d, **kwargs) + model = _create_visformer('visformer_tiny', pretrained=pretrained, **model_cfg) + return model + + +@register_model +def visformer_small(pretrained=False, **kwargs): + model_cfg = dict( + init_channels=32, embed_dim=384, depth=(7, 4, 4), num_heads=6, mlp_ratio=4., group=8, + attn_stage='011', spatial_conv='100', norm_layer=nn.BatchNorm2d, conv_init=True, + embed_norm=nn.BatchNorm2d, **kwargs) + model = _create_visformer('visformer_small', pretrained=pretrained, **model_cfg) + return model + + +# @register_model +# def visformer_net1(pretrained=False, **kwargs): +# model = Visformer( +# init_channels=None, embed_dim=384, depth=(0, 12, 0), num_heads=6, mlp_ratio=4., attn_stage='111', +# spatial_conv='000', vit_stem=True, conv_init=True, **kwargs) +# model.default_cfg = _cfg() +# return model +# +# +# @register_model +# def visformer_net2(pretrained=False, **kwargs): +# model = Visformer( +# init_channels=32, embed_dim=384, depth=(0, 12, 0), num_heads=6, mlp_ratio=4., attn_stage='111', +# spatial_conv='000', vit_stem=False, conv_init=True, **kwargs) +# model.default_cfg = _cfg() +# return model +# +# +# @register_model +# def visformer_net3(pretrained=False, **kwargs): +# model = 
Visformer( +# init_channels=32, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4., attn_stage='111', +# spatial_conv='000', vit_stem=False, conv_init=True, **kwargs) +# model.default_cfg = _cfg() +# return model +# +# +# @register_model +# def visformer_net4(pretrained=False, **kwargs): +# model = Visformer( +# init_channels=32, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4., attn_stage='111', +# spatial_conv='000', vit_stem=False, conv_init=True, **kwargs) +# model.default_cfg = _cfg() +# return model +# +# +# @register_model +# def visformer_net5(pretrained=False, **kwargs): +# model = Visformer( +# init_channels=32, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4., group=1, attn_stage='111', +# spatial_conv='111', vit_stem=False, conv_init=True, **kwargs) +# model.default_cfg = _cfg() +# return model +# +# +# @register_model +# def visformer_net6(pretrained=False, **kwargs): +# model = Visformer( +# init_channels=32, embed_dim=384, depth=12, num_heads=6, mlp_ratio=4., group=1, attn_stage='111', +# pos_embed=False, spatial_conv='111', conv_init=True, **kwargs) +# model.default_cfg = _cfg() +# return model +# +# +# @register_model +# def visformer_net7(pretrained=False, **kwargs): +# model = Visformer( +# init_channels=32, embed_dim=384, depth=(6, 7, 7), num_heads=6, group=1, attn_stage='000', +# pos_embed=False, spatial_conv='111', conv_init=True, **kwargs) +# model.default_cfg = _cfg() +# return model + + + + diff --git a/custom_timm/models/vision_transformer.py b/custom_timm/models/vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..52c406b23b7dc1aace4e955febe59964b666894b --- /dev/null +++ b/custom_timm/models/vision_transformer.py @@ -0,0 +1,1256 @@ +""" Vision Transformer (ViT) in PyTorch + +A PyTorch implement of Vision Transformers as described in: + +'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' + - https://arxiv.org/abs/2010.11929 + +`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers` + - https://arxiv.org/abs/2106.10270 + +The official jax code is released and available at https://github.com/google-research/vision_transformer + +Acknowledgments: +* The paper authors for releasing code and weights, thanks! +* I fixed my class token impl based on Phil Wang's https://github.com/lucidrains/vit-pytorch ... 
check it out +for some einops/einsum fun +* Simple transformer style inspired by Andrej Karpathy's https://github.com/karpathy/minGPT +* Bert reference code checks against Huggingface Transformers and Tensorflow Bert + +Hacked together by / Copyright 2020, Ross Wightman +""" +import math +import logging +from functools import partial +from collections import OrderedDict +from typing import Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD,\ + OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +from .helpers import build_model_with_cfg, resolve_pretrained_cfg, named_apply, adapt_input_conv, checkpoint_seq +from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_ +from .registry import register_model + +_logger = logging.getLogger(__name__) + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # patch models (weights from official Google JAX impl) + 'vit_tiny_patch16_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz'), + 'vit_tiny_patch16_384': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', + input_size=(3, 384, 384), crop_pct=1.0), + 'vit_small_patch32_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz'), + 'vit_small_patch32_384': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', + input_size=(3, 384, 384), crop_pct=1.0), + 'vit_small_patch16_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz'), + 'vit_small_patch16_384': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', + input_size=(3, 384, 384), crop_pct=1.0), + 'vit_base_patch32_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz'), + 'vit_base_patch32_384': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'B_32-i21k-300ep-lr_0.001-aug_light1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', + input_size=(3, 384, 384), crop_pct=1.0), + 'vit_base_patch16_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz'), + 'vit_base_patch16_384': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz', + input_size=(3, 384, 384), crop_pct=1.0), + 
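+    # NOTE (sketch of how these entries are consumed): the storage.googleapis.com .npz URLs
+    # above and below are the AugReg checkpoints from `How to train your ViT?`
+    # (https://arxiv.org/abs/2106.10270). With pretrained=True, _create_vision_transformer
+    # checks for 'npz' in the weight URL and loads these via the numpy-based _load_weights()
+    # path defined later in this file, rather than a regular torch state_dict.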
'vit_base_patch8_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_224.npz'), + 'vit_large_patch32_224': _cfg( + url='', # no official model weights for this combo, only for in21k + ), + 'vit_large_patch32_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p32_384-9b920ba8.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'vit_large_patch16_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz'), + 'vit_large_patch16_384': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz', + input_size=(3, 384, 384), crop_pct=1.0), + + 'vit_large_patch14_224': _cfg(url=''), + 'vit_huge_patch14_224': _cfg(url=''), + 'vit_giant_patch14_224': _cfg(url=''), + 'vit_gigantic_patch14_224': _cfg(url=''), + + + # patch models, imagenet21k (weights from official Google JAX impl) + 'vit_tiny_patch16_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz', + num_classes=21843), + 'vit_small_patch32_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/S_32-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz', + num_classes=21843), + 'vit_small_patch16_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0.npz', + num_classes=21843), + 'vit_base_patch32_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/B_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.03-do_0.0-sd_0.0.npz', + num_classes=21843), + 'vit_base_patch16_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz', + num_classes=21843), + 'vit_base_patch8_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/B_8-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0.npz', + num_classes=21843), + 'vit_large_patch32_224_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_patch32_224_in21k-9046d2e7.pth', + num_classes=21843), + 'vit_large_patch16_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1.npz', + num_classes=21843), + 'vit_huge_patch14_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/imagenet21k/ViT-H_14.npz', + hf_hub_id='timm/vit_huge_patch14_224_in21k', + num_classes=21843), + + # SAM trained models (https://arxiv.org/abs/2106.01548) + 'vit_base_patch32_224_sam': _cfg( + url='https://storage.googleapis.com/vit_models/sam/ViT-B_32.npz'), + 'vit_base_patch16_224_sam': _cfg( + url='https://storage.googleapis.com/vit_models/sam/ViT-B_16.npz'), + + # DINO pretrained - https://arxiv.org/abs/2104.14294 (no classifier head, for fine-tune only) + 'vit_small_patch16_224_dino': _cfg( + url='https://dl.fbaipublicfiles.com/dino/dino_deitsmall16_pretrain/dino_deitsmall16_pretrain.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0), + 'vit_small_patch8_224_dino': _cfg( + url='https://dl.fbaipublicfiles.com/dino/dino_deitsmall8_pretrain/dino_deitsmall8_pretrain.pth', + 
mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0), + 'vit_base_patch16_224_dino': _cfg( + url='https://dl.fbaipublicfiles.com/dino/dino_vitbase16_pretrain/dino_vitbase16_pretrain.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0), + 'vit_base_patch8_224_dino': _cfg( + url='https://dl.fbaipublicfiles.com/dino/dino_vitbase8_pretrain/dino_vitbase8_pretrain.pth', + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, num_classes=0), + + + # ViT ImageNet-21K-P pretraining by MILL + 'vit_base_patch16_224_miil_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_in21k_miil-887286df.pth', + mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear', num_classes=11221), + 'vit_base_patch16_224_miil': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tresnet/vit_base_patch16_224_1k_miil_84_4-2deb18e3.pth', + mean=(0., 0., 0.), std=(1., 1., 1.), crop_pct=0.875, interpolation='bilinear'), + + 'vit_base_patch16_rpn_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_base_patch16_rpn_224-sw-3b07e89d.pth'), + + # experimental (may be removed) + 'vit_base_patch32_plus_256': _cfg(url='', input_size=(3, 256, 256), crop_pct=0.95), + 'vit_base_patch16_plus_240': _cfg(url='', input_size=(3, 240, 240), crop_pct=0.95), + 'vit_small_patch16_36x1_224': _cfg(url=''), + 'vit_small_patch16_18x2_224': _cfg(url=''), + 'vit_base_patch16_18x2_224': _cfg(url=''), + + 'vit_base_patch32_224_clip_laion2b': _cfg( + hf_hub_id='laion/CLIP-ViT-B-32-laion2B-s34B-b79K', + hf_hub_filename='open_clip_pytorch_model.bin', + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=512), + 'vit_large_patch14_224_clip_laion2b': _cfg( + hf_hub_id='laion/CLIP-ViT-L-14-laion2B-s32B-b82K', + hf_hub_filename='open_clip_pytorch_model.bin', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, num_classes=768), + 'vit_huge_patch14_224_clip_laion2b': _cfg( + hf_hub_id='laion/CLIP-ViT-H-14-laion2B-s32B-b79K', + hf_hub_filename='open_clip_pytorch_model.bin', + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=1024), + 'vit_giant_patch14_224_clip_laion2b': _cfg( + hf_hub_id='laion/CLIP-ViT-g-14-laion2B-s12B-b42K', + hf_hub_filename='open_clip_pytorch_model.bin', + mean=OPENAI_CLIP_MEAN, std=OPENAI_CLIP_STD, num_classes=1024), + +} + + +class Attention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + assert dim % num_heads == 0, 'dim should be divisible by num_heads' + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class LayerScale(nn.Module): + def __init__(self, dim, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * 
torch.ones(dim)) + + def forward(self, x): + return x.mul_(self.gamma) if self.inplace else x * self.gamma + + +class Block(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + init_values=None, + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x): + x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x)))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + + +class ResPostBlock(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + init_values=None, + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.init_values = init_values + + self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.norm1 = norm_layer(dim) + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + self.norm2 = norm_layer(dim) + self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.init_weights() + + def init_weights(self): + # NOTE this init overrides that base model init with specific changes for the block type + if self.init_values is not None: + nn.init.constant_(self.norm1.weight, self.init_values) + nn.init.constant_(self.norm2.weight, self.init_values) + + def forward(self, x): + x = x + self.drop_path1(self.norm1(self.attn(x))) + x = x + self.drop_path2(self.norm2(self.mlp(x))) + return x + + +class ParallelBlock(nn.Module): + + def __init__( + self, + dim, + num_heads, + num_parallel=2, + mlp_ratio=4., + qkv_bias=False, + init_values=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer=nn.LayerNorm + ): + super().__init__() + self.num_parallel = num_parallel + self.attns = nn.ModuleList() + self.ffns = nn.ModuleList() + for _ in range(num_parallel): + self.attns.append(nn.Sequential(OrderedDict([ + ('norm', norm_layer(dim)), + ('attn', Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)), + ('ls', LayerScale(dim, init_values=init_values) if init_values else nn.Identity()), + ('drop_path', DropPath(drop_path) if drop_path > 0. else nn.Identity()) + ]))) + self.ffns.append(nn.Sequential(OrderedDict([ + ('norm', norm_layer(dim)), + ('mlp', Mlp(dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop)), + ('ls', LayerScale(dim, init_values=init_values) if init_values else nn.Identity()), + ('drop_path', DropPath(drop_path) if drop_path > 0. 
else nn.Identity()) + ]))) + + def _forward_jit(self, x): + x = x + torch.stack([attn(x) for attn in self.attns]).sum(dim=0) + x = x + torch.stack([ffn(x) for ffn in self.ffns]).sum(dim=0) + return x + + @torch.jit.ignore + def _forward(self, x): + x = x + sum(attn(x) for attn in self.attns) + x = x + sum(ffn(x) for ffn in self.ffns) + return x + + def forward(self, x): + if torch.jit.is_scripting() or torch.jit.is_tracing(): + return self._forward_jit(x) + else: + return self._forward(x) + + +class VisionTransformer(nn.Module): + """ Vision Transformer + + A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale` + - https://arxiv.org/abs/2010.11929 + """ + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + global_pool='token', + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + init_values=None, + class_token=True, + no_embed_class=False, + pre_norm=False, + fc_norm=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + weight_init='', + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + block_fn=Block, + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + global_pool (str): type of global pooling for final sequence (default: 'token') + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + init_values: (float): layer-scale init values + class_token (bool): use class token + fc_norm (Optional[bool]): pre-fc norm after pool, set if global_pool == 'avg' if None (default: None) + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + weight_init (str): weight init scheme + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + act_layer: (nn.Module): MLP activation layer + """ + super().__init__() + assert global_pool in ('', 'avg', 'token') + assert class_token or global_pool != 'token' + use_fc_norm = global_pool == 'avg' if fc_norm is None else fc_norm + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_prefix_tokens = 1 if class_token else 0 + self.no_embed_class = no_embed_class + self.grad_checkpointing = False + + self.patch_embed = embed_layer( + img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embed_dim=embed_dim, + bias=not pre_norm, # disable bias if pre-norm is used (e.g. 
CLIP) + ) + num_patches = self.patch_embed.num_patches + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None + embed_len = num_patches if no_embed_class else num_patches + self.num_prefix_tokens + self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * .02) + self.pos_drop = nn.Dropout(p=drop_rate) + self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity() + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.Sequential(*[ + block_fn( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + init_values=init_values, + drop=drop_rate, + attn_drop=attn_drop_rate, + drop_path=dpr[i], + norm_layer=norm_layer, + act_layer=act_layer + ) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity() + + # Classifier Head + self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity() + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + if weight_init != 'skip': + self.init_weights(weight_init) + + def init_weights(self, mode=''): + assert mode in ('jax', 'jax_nlhb', 'moco', '') + head_bias = -math.log(self.num_classes) if 'nlhb' in mode else 0. + trunc_normal_(self.pos_embed, std=.02) + if self.cls_token is not None: + nn.init.normal_(self.cls_token, std=1e-6) + named_apply(get_init_weights_vit(mode, head_bias), self) + + def _init_weights(self, m): + # this fn left here for compat with downstream users + init_weights_vit_timm(m) + + @torch.jit.ignore() + def load_pretrained(self, checkpoint_path, prefix=''): + _load_weights(self, checkpoint_path, prefix) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token', 'dist_token'} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^cls_token|pos_embed|patch_embed', # stem and embed + blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes: int, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'avg', 'token') + self.global_pool = global_pool + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def _pos_embed(self, x): + if self.no_embed_class: + # deit-3, updated JAX (big vision) + # position embedding does not overlap with class token, add then concat + x = x + self.pos_embed + if self.cls_token is not None: + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + else: + # original timm, JAX, and deit vit impl + # pos_embed has entry for class token, concat then add + if self.cls_token is not None: + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + x = x + self.pos_embed + return self.pos_drop(x) + + def forward_features(self, x): + x = self.patch_embed(x) + x = self._pos_embed(x) + x = self.norm_pre(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool: + x = x[:, self.num_prefix_tokens:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0] + x = self.fc_norm(x) + return x if pre_logits else self.head(x) + 
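+    # Pooling behaviour of forward_head above: with global_pool='avg' the prefix (class)
+    # tokens are skipped and the patch tokens are mean-pooled; with 'token' the class token
+    # at index 0 is used. Minimal usage sketch (assuming the factory functions defined later
+    # in this file are importable):
+    #   m = vit_tiny_patch16_224(num_classes=10)
+    #   logits = m(torch.randn(1, 3, 224, 224))   # -> torch.Size([1, 10])
+    #   feats = m.forward_head(m.forward_features(torch.randn(1, 3, 224, 224)), pre_logits=True)
+    #   # feats: (1, embed_dim) pooled features; pre_logits=True bypasses the classifier head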
+ def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def init_weights_vit_timm(module: nn.Module, name: str = ''): + """ ViT weight initialization, original timm impl (for reproducibility) """ + if isinstance(module, nn.Linear): + trunc_normal_(module.weight, std=.02) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif hasattr(module, 'init_weights'): + module.init_weights() + + +def init_weights_vit_jax(module: nn.Module, name: str = '', head_bias: float = 0.): + """ ViT weight initialization, matching JAX (Flax) impl """ + if isinstance(module, nn.Linear): + if name.startswith('head'): + nn.init.zeros_(module.weight) + nn.init.constant_(module.bias, head_bias) + else: + nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.normal_(module.bias, std=1e-6) if 'mlp' in name else nn.init.zeros_(module.bias) + elif isinstance(module, nn.Conv2d): + lecun_normal_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif hasattr(module, 'init_weights'): + module.init_weights() + + +def init_weights_vit_moco(module: nn.Module, name: str = ''): + """ ViT weight initialization, matching moco-v3 impl minus fixed PatchEmbed """ + if isinstance(module, nn.Linear): + if 'qkv' in name: + # treat the weights of Q, K, V separately + val = math.sqrt(6. / float(module.weight.shape[0] // 3 + module.weight.shape[1])) + nn.init.uniform_(module.weight, -val, val) + else: + nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif hasattr(module, 'init_weights'): + module.init_weights() + + +def get_init_weights_vit(mode='jax', head_bias: float = 0.): + if 'jax' in mode: + return partial(init_weights_vit_jax, head_bias=head_bias) + elif 'moco' in mode: + return init_weights_vit_moco + else: + return init_weights_vit_timm + + +@torch.no_grad() +def _load_weights(model: VisionTransformer, checkpoint_path: str, prefix: str = ''): + """ Load weights from .npz checkpoints for official Google Brain Flax implementation + """ + import numpy as np + + def _n2p(w, t=True): + if w.ndim == 4 and w.shape[0] == w.shape[1] == w.shape[2] == 1: + w = w.flatten() + if t: + if w.ndim == 4: + w = w.transpose([3, 2, 0, 1]) + elif w.ndim == 3: + w = w.transpose([2, 0, 1]) + elif w.ndim == 2: + w = w.transpose([1, 0]) + return torch.from_numpy(w) + + w = np.load(checkpoint_path) + if not prefix and 'opt/target/embedding/kernel' in w: + prefix = 'opt/target/' + + if hasattr(model.patch_embed, 'backbone'): + # hybrid + backbone = model.patch_embed.backbone + stem_only = not hasattr(backbone, 'stem') + stem = backbone if stem_only else backbone.stem + stem.conv.weight.copy_(adapt_input_conv(stem.conv.weight.shape[1], _n2p(w[f'{prefix}conv_root/kernel']))) + stem.norm.weight.copy_(_n2p(w[f'{prefix}gn_root/scale'])) + stem.norm.bias.copy_(_n2p(w[f'{prefix}gn_root/bias'])) + if not stem_only: + for i, stage in enumerate(backbone.stages): + for j, block in enumerate(stage.blocks): + bp = f'{prefix}block{i + 1}/unit{j + 1}/' + for r in range(3): + getattr(block, f'conv{r + 1}').weight.copy_(_n2p(w[f'{bp}conv{r + 1}/kernel'])) + getattr(block, f'norm{r + 1}').weight.copy_(_n2p(w[f'{bp}gn{r + 1}/scale'])) + getattr(block, f'norm{r + 1}').bias.copy_(_n2p(w[f'{bp}gn{r + 1}/bias'])) + if block.downsample is not None: + block.downsample.conv.weight.copy_(_n2p(w[f'{bp}conv_proj/kernel'])) + block.downsample.norm.weight.copy_(_n2p(w[f'{bp}gn_proj/scale'])) + 
block.downsample.norm.bias.copy_(_n2p(w[f'{bp}gn_proj/bias'])) + embed_conv_w = _n2p(w[f'{prefix}embedding/kernel']) + else: + embed_conv_w = adapt_input_conv( + model.patch_embed.proj.weight.shape[1], _n2p(w[f'{prefix}embedding/kernel'])) + model.patch_embed.proj.weight.copy_(embed_conv_w) + model.patch_embed.proj.bias.copy_(_n2p(w[f'{prefix}embedding/bias'])) + model.cls_token.copy_(_n2p(w[f'{prefix}cls'], t=False)) + pos_embed_w = _n2p(w[f'{prefix}Transformer/posembed_input/pos_embedding'], t=False) + if pos_embed_w.shape != model.pos_embed.shape: + pos_embed_w = resize_pos_embed( # resize pos embedding when different size from pretrained weights + pos_embed_w, + model.pos_embed, + getattr(model, 'num_prefix_tokens', 1), + model.patch_embed.grid_size + ) + model.pos_embed.copy_(pos_embed_w) + model.norm.weight.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/scale'])) + model.norm.bias.copy_(_n2p(w[f'{prefix}Transformer/encoder_norm/bias'])) + if isinstance(model.head, nn.Linear) and model.head.bias.shape[0] == w[f'{prefix}head/bias'].shape[-1]: + model.head.weight.copy_(_n2p(w[f'{prefix}head/kernel'])) + model.head.bias.copy_(_n2p(w[f'{prefix}head/bias'])) + # NOTE representation layer has been removed, not used in latest 21k/1k pretrained weights + # if isinstance(getattr(model.pre_logits, 'fc', None), nn.Linear) and f'{prefix}pre_logits/bias' in w: + # model.pre_logits.fc.weight.copy_(_n2p(w[f'{prefix}pre_logits/kernel'])) + # model.pre_logits.fc.bias.copy_(_n2p(w[f'{prefix}pre_logits/bias'])) + for i, block in enumerate(model.blocks.children()): + block_prefix = f'{prefix}Transformer/encoderblock_{i}/' + mha_prefix = block_prefix + 'MultiHeadDotProductAttention_1/' + block.norm1.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/scale'])) + block.norm1.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_0/bias'])) + block.attn.qkv.weight.copy_(torch.cat([ + _n2p(w[f'{mha_prefix}{n}/kernel'], t=False).flatten(1).T for n in ('query', 'key', 'value')])) + block.attn.qkv.bias.copy_(torch.cat([ + _n2p(w[f'{mha_prefix}{n}/bias'], t=False).reshape(-1) for n in ('query', 'key', 'value')])) + block.attn.proj.weight.copy_(_n2p(w[f'{mha_prefix}out/kernel']).flatten(1)) + block.attn.proj.bias.copy_(_n2p(w[f'{mha_prefix}out/bias'])) + for r in range(2): + getattr(block.mlp, f'fc{r + 1}').weight.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/kernel'])) + getattr(block.mlp, f'fc{r + 1}').bias.copy_(_n2p(w[f'{block_prefix}MlpBlock_3/Dense_{r}/bias'])) + block.norm2.weight.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/scale'])) + block.norm2.bias.copy_(_n2p(w[f'{block_prefix}LayerNorm_2/bias'])) + + +def resize_pos_embed(posemb, posemb_new, num_prefix_tokens=1, gs_new=()): + # Rescale the grid of position embeddings when loading from state_dict. 
Adapted from + # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 + _logger.info('Resized position embedding: %s to %s', posemb.shape, posemb_new.shape) + ntok_new = posemb_new.shape[1] + if num_prefix_tokens: + posemb_prefix, posemb_grid = posemb[:, :num_prefix_tokens], posemb[0, num_prefix_tokens:] + ntok_new -= num_prefix_tokens + else: + posemb_prefix, posemb_grid = posemb[:, :0], posemb[0] + gs_old = int(math.sqrt(len(posemb_grid))) + if not len(gs_new): # backwards compatibility + gs_new = [int(math.sqrt(ntok_new))] * 2 + assert len(gs_new) >= 2 + _logger.info('Position embedding grid-size from %s to %s', [gs_old, gs_old], gs_new) + posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) + posemb_grid = F.interpolate(posemb_grid, size=gs_new, mode='bicubic', align_corners=False) + posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new[0] * gs_new[1], -1) + posemb = torch.cat([posemb_prefix, posemb_grid], dim=1) + return posemb + + +def _convert_openai_clip(state_dict, model): + out_dict = {} + swaps = [ + ('visual.', ''), ('conv1', 'patch_embed.proj'), ('positional_embedding', 'pos_embed'), + ('transformer.resblocks.', 'blocks.'), ('ln_pre', 'norm_pre'), ('ln_post', 'norm'), ('ln_', 'norm'), + ('in_proj_', 'qkv.'), ('out_proj', 'proj'), ('mlp.c_fc', 'mlp.fc1'), ('mlp.c_proj', 'mlp.fc2'), + ] + for k, v in state_dict.items(): + if not k.startswith('visual.'): + continue + for sp in swaps: + k = k.replace(sp[0], sp[1]) + + if k == 'proj': + k = 'head.weight' + v = v.transpose(0, 1) + out_dict['head.bias'] = torch.zeros(v.shape[0]) + elif k == 'class_embedding': + k = 'cls_token' + v = v.unsqueeze(0).unsqueeze(1) + elif k == 'pos_embed': + v = v.unsqueeze(0) + if v.shape[1] != model.pos_embed.shape[1]: + # To resize pos embedding when using model at different size from pretrained weights + v = resize_pos_embed( + v, + model.pos_embed, + 0 if getattr(model, 'no_embed_class') else getattr(model, 'num_prefix_tokens', 1), + model.patch_embed.grid_size + ) + out_dict[k] = v + return out_dict + + +def checkpoint_filter_fn(state_dict, model, adapt_layer_scale=False): + """ convert patch embedding weight from manual patchify + linear proj to conv""" + import re + out_dict = {} + if 'model' in state_dict: + # For deit models + state_dict = state_dict['model'] + + if 'visual.class_embedding' in state_dict: + return _convert_openai_clip(state_dict, model) + + for k, v in state_dict.items(): + if 'patch_embed.proj.weight' in k and len(v.shape) < 4: + # For old models that I trained prior to conv based patchification + O, I, H, W = model.patch_embed.proj.weight.shape + v = v.reshape(O, -1, H, W) + elif k == 'pos_embed' and v.shape[1] != model.pos_embed.shape[1]: + # To resize pos embedding when using model at different size from pretrained weights + v = resize_pos_embed( + v, + model.pos_embed, + 0 if getattr(model, 'no_embed_class') else getattr(model, 'num_prefix_tokens', 1), + model.patch_embed.grid_size + ) + elif adapt_layer_scale and 'gamma_' in k: + # remap layer-scale gamma into sub-module (deit3 models) + k = re.sub(r'gamma_([0-9])', r'ls\1.gamma', k) + elif 'pre_logits' in k: + # NOTE representation layer removed as not used in latest 21k/1k pretrained weights + continue + out_dict[k] = v + return out_dict + + +def _create_vision_transformer(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision 
Transformer models.') + + pretrained_cfg = resolve_pretrained_cfg(variant, pretrained_cfg=kwargs.pop('pretrained_cfg', None)) + model = build_model_with_cfg( + VisionTransformer, variant, pretrained, + pretrained_cfg=pretrained_cfg, + pretrained_filter_fn=checkpoint_filter_fn, + pretrained_custom_load='npz' in pretrained_cfg['url'], + **kwargs) + return model + + +@register_model +def vit_tiny_patch16_224(pretrained=False, **kwargs): + """ ViT-Tiny (Vit-Ti/16) + """ + model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer('vit_tiny_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_tiny_patch16_384(pretrained=False, **kwargs): + """ ViT-Tiny (Vit-Ti/16) @ 384x384. + """ + model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer('vit_tiny_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch32_224(pretrained=False, **kwargs): + """ ViT-Small (ViT-S/32) + """ + model_kwargs = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_small_patch32_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch32_384(pretrained=False, **kwargs): + """ ViT-Small (ViT-S/32) at 384x384. + """ + model_kwargs = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_small_patch32_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch16_224(pretrained=False, **kwargs): + """ ViT-Small (ViT-S/16) + NOTE I've replaced my previous 'small' model definition and weights with the small variant from the DeiT paper + """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_small_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch16_384(pretrained=False, **kwargs): + """ ViT-Small (ViT-S/16) + NOTE I've replaced my previous 'small' model definition and weights with the small variant from the DeiT paper + """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_small_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch32_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_384(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. 
+ """ + model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch32_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_384(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch8_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch8_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch32_224(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). No pretrained weights. + """ + model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch32_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch32_384(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch32_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch16_224(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 224x224, source https://github.com/google-research/vision_transformer. + """ + model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch16_384(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. 
+ """ + model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch16_384', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch14_224(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/14) + """ + model_kwargs = dict(patch_size=14, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch14_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_huge_patch14_224(pretrained=False, **kwargs): + """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929). + """ + model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_huge_patch14_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_giant_patch14_224(pretrained=False, **kwargs): + """ ViT-Giant (little-g) model (ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560 + """ + model_kwargs = dict(patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_giant_patch14_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_gigantic_patch14_224(pretrained=False, **kwargs): + """ ViT-Gigantic (big-G) model (ViT-G/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560 + """ + model_kwargs = dict(patch_size=14, embed_dim=1664, mlp_ratio=64/13, depth=48, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_gigantic_patch14_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_tiny_patch16_224_in21k(pretrained=False, **kwargs): + """ ViT-Tiny (Vit-Ti/16). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer + """ + model_kwargs = dict(patch_size=16, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer('vit_tiny_patch16_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch32_224_in21k(pretrained=False, **kwargs): + """ ViT-Small (ViT-S/16) + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer + """ + model_kwargs = dict(patch_size=32, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_small_patch32_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch16_224_in21k(pretrained=False, **kwargs): + """ ViT-Small (ViT-S/16) + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer + """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_small_patch16_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_224_in21k(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. 
+ NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer + """ + model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch32_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_in21k(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch8_224_in21k(pretrained=False, **kwargs): + """ ViT-Base model (ViT-B/8) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer + """ + model_kwargs = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch8_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch32_224_in21k(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/32) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: this model has a representation layer but the 21k classifier head is zero'd out in original weights + """ + model_kwargs = dict(patch_size=32, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch32_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch16_224_in21k(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/16) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: this model has valid 21k classifier head and no representation (pre-logits) layer + """ + model_kwargs = dict(patch_size=16, embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_large_patch16_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_huge_patch14_224_in21k(pretrained=False, **kwargs): + """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + NOTE: this model has a representation layer but the 21k classifier head is zero'd out in original weights + """ + model_kwargs = dict(patch_size=14, embed_dim=1280, depth=32, num_heads=16, **kwargs) + model = _create_vision_transformer('vit_huge_patch14_224_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_sam(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ SAM pretrained weights. 
Paper: https://arxiv.org/abs/2106.01548 + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_sam', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_224_sam(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/32) w/ SAM pretrained weights. Paper: https://arxiv.org/abs/2106.01548 + """ + model_kwargs = dict(patch_size=32, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch32_224_sam', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch16_224_dino(pretrained=False, **kwargs): + """ ViT-Small (ViT-S/16) w/ DINO pretrained weights (no head) - https://arxiv.org/abs/2104.14294 + """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_small_patch16_224_dino', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch8_224_dino(pretrained=False, **kwargs): + """ ViT-Small (ViT-S/8) w/ DINO pretrained weights (no head) - https://arxiv.org/abs/2104.14294 + """ + model_kwargs = dict(patch_size=8, embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer('vit_small_patch8_224_dino', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_dino(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) /w DINO pretrained weights (no head) - https://arxiv.org/abs/2104.14294 + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_dino', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch8_224_dino(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/8) w/ DINO pretrained weights (no head) - https://arxiv.org/abs/2104.14294 + """ + model_kwargs = dict(patch_size=8, embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer('vit_base_patch8_224_dino', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_miil_in21k(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). + Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_miil_in21k', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_224_miil(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) from original paper (https://arxiv.org/abs/2010.11929). 
+ Weights taken from: https://github.com/Alibaba-MIIL/ImageNet21K + """ + model_kwargs = dict(patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, **kwargs) + model = _create_vision_transformer('vit_base_patch16_224_miil', pretrained=pretrained, **model_kwargs) + return model + + +# Experimental models below + +@register_model +def vit_base_patch32_plus_256(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/32+) + """ + model_kwargs = dict(patch_size=32, embed_dim=896, depth=12, num_heads=14, init_values=1e-5, **kwargs) + model = _create_vision_transformer('vit_base_patch32_plus_256', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_plus_240(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16+) + """ + model_kwargs = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, init_values=1e-5, **kwargs) + model = _create_vision_transformer('vit_base_patch16_plus_240', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_rpn_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ residual post-norm + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, init_values=1e-5, class_token=False, + block_fn=ResPostBlock, global_pool=kwargs.pop('global_pool', 'avg'), **kwargs) + model = _create_vision_transformer('vit_base_patch16_rpn_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch16_36x1_224(pretrained=False, **kwargs): + """ ViT-Base w/ LayerScale + 36 x 1 (36 block serial) config. Experimental, may remove. + Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795 + Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow. + """ + model_kwargs = dict(patch_size=16, embed_dim=384, depth=36, num_heads=6, init_values=1e-5, **kwargs) + model = _create_vision_transformer('vit_small_patch16_36x1_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_patch16_18x2_224(pretrained=False, **kwargs): + """ ViT-Small w/ LayerScale + 18 x 2 (36 block parallel) config. Experimental, may remove. + Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795 + Paper focuses on 24x2 + 48x1 for 'Small' width but those are extremely slow. + """ + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=18, num_heads=6, init_values=1e-5, block_fn=ParallelBlock, **kwargs) + model = _create_vision_transformer('vit_small_patch16_18x2_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch16_18x2_224(pretrained=False, **kwargs): + """ ViT-Base w/ LayerScale + 18 x 2 (36 block parallel) config. Experimental, may remove. + Based on `Three things everyone should know about Vision Transformers` - https://arxiv.org/abs/2203.09795 + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=18, num_heads=12, init_values=1e-5, block_fn=ParallelBlock, **kwargs) + model = _create_vision_transformer('vit_base_patch16_18x2_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_patch32_224_clip_laion2b(pretrained=False, **kwargs): + """ ViT-B/32 + Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs. 
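+    The kwargs below follow the CLIP image-tower layout (pre-norm transformer with
+    nn.LayerNorm). A minimal usage sketch, assuming `torch` is imported:
+        model = vit_base_patch32_224_clip_laion2b(pretrained=False)
+        out = model(torch.randn(1, 3, 224, 224))  # forward pass at the native 224x224 resolution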
+ """ + model_kwargs = dict( + patch_size=32, embed_dim=768, depth=12, num_heads=12, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs) + model = _create_vision_transformer('vit_base_patch32_224_clip_laion2b', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_patch14_224_clip_laion2b(pretrained=False, **kwargs): + """ ViT-Large model (ViT-L/14) + Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs. + """ + model_kwargs = dict( + patch_size=14, embed_dim=1024, depth=24, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs) + model = _create_vision_transformer('vit_large_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_huge_patch14_224_clip_laion2b(pretrained=False, **kwargs): + """ ViT-Huge model (ViT-H/14) from original paper (https://arxiv.org/abs/2010.11929). + Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs. + """ + model_kwargs = dict( + patch_size=14, embed_dim=1280, depth=32, num_heads=16, pre_norm=True, norm_layer=nn.LayerNorm, **kwargs) + model = _create_vision_transformer('vit_huge_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_giant_patch14_224_clip_laion2b(pretrained=False, **kwargs): + """ ViT-Giant (little-g) model (ViT-g/14) from `Scaling Vision Transformers` - https://arxiv.org/abs/2106.04560 + Pretrained weights from CLIP image tower trained on LAION-2B image-text pairs. + """ + model_kwargs = dict( + patch_size=14, embed_dim=1408, mlp_ratio=48/11, depth=40, num_heads=16, + pre_norm=True, norm_layer=nn.LayerNorm, **kwargs) + model = _create_vision_transformer('vit_giant_patch14_224_clip_laion2b', pretrained=pretrained, **model_kwargs) + return model diff --git a/custom_timm/models/vision_transformer_hybrid.py b/custom_timm/models/vision_transformer_hybrid.py new file mode 100644 index 0000000000000000000000000000000000000000..1e8a2b1354094fd5d73e4e3c4a6231ed3f44b64b --- /dev/null +++ b/custom_timm/models/vision_transformer_hybrid.py @@ -0,0 +1,371 @@ +""" Hybrid Vision Transformer (ViT) in PyTorch + +A PyTorch implement of the Hybrid Vision Transformers as described in: + +'An Image Is Worth 16 x 16 Words: Transformers for Image Recognition at Scale' + - https://arxiv.org/abs/2010.11929 + +`How to train your ViT? Data, Augmentation, and Regularization in Vision Transformers` + - https://arxiv.org/abs/2106.10270 + +NOTE These hybrid model definitions depend on code in vision_transformer.py. +They were moved here to keep file sizes sane. 
+ +Hacked together by / Copyright 2020, Ross Wightman +""" +from copy import deepcopy +from functools import partial + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .layers import StdConv2dSame, StdConv2d, to_2tuple +from .resnet import resnet26d, resnet50d +from .resnetv2 import ResNetV2, create_resnetv2_stem +from .registry import register_model +from custom_timm.models.vision_transformer import _create_vision_transformer + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5), + 'first_conv': 'patch_embed.backbone.stem.conv', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # hybrid in-1k models (weights from official JAX impl where they exist) + 'vit_tiny_r_s16_p8_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_224.npz', + first_conv='patch_embed.backbone.conv'), + 'vit_tiny_r_s16_p8_384': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', + first_conv='patch_embed.backbone.conv', input_size=(3, 384, 384), crop_pct=1.0), + 'vit_small_r26_s32_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'R26_S_32-i21k-300ep-lr_0.001-aug_light0-wd_0.03-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.03-res_224.npz', + ), + 'vit_small_r26_s32_384': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz', + input_size=(3, 384, 384), crop_pct=1.0), + 'vit_base_r26_s32_224': _cfg(), + 'vit_base_r50_s16_224': _cfg(), + 'vit_base_r50_s16_384': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_384-9fd3c705.pth', + input_size=(3, 384, 384), crop_pct=1.0), + 'vit_large_r50_s32_224': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'R50_L_32-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_224.npz' + ), + 'vit_large_r50_s32_384': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/' + 'R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz', + input_size=(3, 384, 384), crop_pct=1.0 + ), + + # hybrid in-21k models (weights from official Google JAX impl where they exist) + 'vit_tiny_r_s16_p8_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/R_Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0.npz', + num_classes=21843, crop_pct=0.9, first_conv='patch_embed.backbone.conv'), + 'vit_small_r26_s32_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/R26_S_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.03-do_0.0-sd_0.0.npz', + num_classes=21843, crop_pct=0.9), + 'vit_base_r50_s16_224_in21k': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_base_resnet50_224_in21k-6f7c7740.pth', + num_classes=21843, crop_pct=0.9), + 'vit_large_r50_s32_224_in21k': _cfg( + url='https://storage.googleapis.com/vit_models/augreg/R50_L_32-i21k-300ep-lr_0.001-aug_medium2-wd_0.1-do_0.0-sd_0.0.npz', + num_classes=21843, 
crop_pct=0.9), + + # hybrid models (using timm resnet backbones) + 'vit_small_resnet26d_224': _cfg( + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, first_conv='patch_embed.backbone.conv1.0'), + 'vit_small_resnet50d_s16_224': _cfg( + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, first_conv='patch_embed.backbone.conv1.0'), + 'vit_base_resnet26d_224': _cfg( + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, first_conv='patch_embed.backbone.conv1.0'), + 'vit_base_resnet50d_224': _cfg( + mean=IMAGENET_DEFAULT_MEAN, std=IMAGENET_DEFAULT_STD, first_conv='patch_embed.backbone.conv1.0'), +} + + +class HybridEmbed(nn.Module): + """ CNN Feature Map Embedding + Extract feature map from CNN, flatten, project to embedding dim. + """ + def __init__( + self, + backbone, + img_size=224, + patch_size=1, + feature_size=None, + in_chans=3, + embed_dim=768, + bias=True, + ): + super().__init__() + assert isinstance(backbone, nn.Module) + img_size = to_2tuple(img_size) + patch_size = to_2tuple(patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.backbone = backbone + if feature_size is None: + with torch.no_grad(): + # NOTE Most reliable way of determining output dims is to run forward pass + training = backbone.training + if training: + backbone.eval() + o = self.backbone(torch.zeros(1, in_chans, img_size[0], img_size[1])) + if isinstance(o, (list, tuple)): + o = o[-1] # last feature if backbone outputs list/tuple of features + feature_size = o.shape[-2:] + feature_dim = o.shape[1] + backbone.train(training) + else: + feature_size = to_2tuple(feature_size) + if hasattr(self.backbone, 'feature_info'): + feature_dim = self.backbone.feature_info.channels()[-1] + else: + feature_dim = self.backbone.num_features + assert feature_size[0] % patch_size[0] == 0 and feature_size[1] % patch_size[1] == 0 + self.grid_size = (feature_size[0] // patch_size[0], feature_size[1] // patch_size[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.proj = nn.Conv2d(feature_dim, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias) + + def forward(self, x): + x = self.backbone(x) + if isinstance(x, (list, tuple)): + x = x[-1] # last feature if backbone outputs list/tuple of features + x = self.proj(x).flatten(2).transpose(1, 2) + return x + + +def _create_vision_transformer_hybrid(variant, backbone, pretrained=False, **kwargs): + embed_layer = partial(HybridEmbed, backbone=backbone) + kwargs.setdefault('patch_size', 1) # default patch size for hybrid models if not set + return _create_vision_transformer(variant, pretrained=pretrained, embed_layer=embed_layer, **kwargs) + + +def _resnetv2(layers=(3, 4, 9), **kwargs): + """ ResNet-V2 backbone helper""" + padding_same = kwargs.get('padding_same', True) + stem_type = 'same' if padding_same else '' + conv_layer = partial(StdConv2dSame, eps=1e-8) if padding_same else partial(StdConv2d, eps=1e-8) + if len(layers): + backbone = ResNetV2( + layers=layers, num_classes=0, global_pool='', in_chans=kwargs.get('in_chans', 3), + preact=False, stem_type=stem_type, conv_layer=conv_layer) + else: + backbone = create_resnetv2_stem( + kwargs.get('in_chans', 3), stem_type=stem_type, preact=False, conv_layer=conv_layer) + return backbone + + +@register_model +def vit_tiny_r_s16_p8_224(pretrained=False, **kwargs): + """ R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 224 x 224. 
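+    With `layers=()` the `_resnetv2` helper above returns just the stand-alone
+    ResNetV2 stem (`create_resnetv2_stem`), so the ViT-Ti encoder runs on 8x8
+    patches of the stem's feature map rather than on raw image patches.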
+ """ + backbone = _resnetv2(layers=(), **kwargs) + model_kwargs = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_tiny_r_s16_p8_224', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_tiny_r_s16_p8_384(pretrained=False, **kwargs): + """ R+ViT-Ti/S16 w/ 8x8 patch hybrid @ 384 x 384. + """ + backbone = _resnetv2(layers=(), **kwargs) + model_kwargs = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_tiny_r_s16_p8_384', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_r26_s32_224(pretrained=False, **kwargs): + """ R26+ViT-S/S32 hybrid. + """ + backbone = _resnetv2((2, 2, 2, 2), **kwargs) + model_kwargs = dict(embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_small_r26_s32_224', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_r26_s32_384(pretrained=False, **kwargs): + """ R26+ViT-S/S32 hybrid. + """ + backbone = _resnetv2((2, 2, 2, 2), **kwargs) + model_kwargs = dict(embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_small_r26_s32_384', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_r26_s32_224(pretrained=False, **kwargs): + """ R26+ViT-B/S32 hybrid. + """ + backbone = _resnetv2((2, 2, 2, 2), **kwargs) + model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_base_r26_s32_224', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_r50_s16_224(pretrained=False, **kwargs): + """ R50+ViT-B/S16 hybrid from original paper (https://arxiv.org/abs/2010.11929). + """ + backbone = _resnetv2((3, 4, 9), **kwargs) + model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_base_r50_s16_224', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_r50_s16_384(pretrained=False, **kwargs): + """ R50+ViT-B/16 hybrid from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-1k weights fine-tuned from in21k @ 384x384, source https://github.com/google-research/vision_transformer. + """ + backbone = _resnetv2((3, 4, 9), **kwargs) + model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_base_r50_s16_384', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_resnet50_384(pretrained=False, **kwargs): + # DEPRECATED this is forwarding to model def above for backwards compatibility + return vit_base_r50_s16_384(pretrained=pretrained, **kwargs) + + +@register_model +def vit_large_r50_s32_224(pretrained=False, **kwargs): + """ R50+ViT-L/S32 hybrid. + """ + backbone = _resnetv2((3, 4, 6, 3), **kwargs) + model_kwargs = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_large_r50_s32_224', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_large_r50_s32_384(pretrained=False, **kwargs): + """ R50+ViT-L/S32 hybrid. 
+ """ + backbone = _resnetv2((3, 4, 6, 3), **kwargs) + model_kwargs = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_large_r50_s32_384', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_tiny_r_s16_p8_224_in21k(pretrained=False, **kwargs): + """ R+ViT-Ti/S16 w/ 8x8 patch hybrid. ImageNet-21k. + """ + backbone = _resnetv2(layers=(), **kwargs) + model_kwargs = dict(patch_size=8, embed_dim=192, depth=12, num_heads=3, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_tiny_r_s16_p8_224_in21k', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_r26_s32_224_in21k(pretrained=False, **kwargs): + """ R26+ViT-S/S32 hybrid. ImageNet-21k. + """ + backbone = _resnetv2((2, 2, 2, 2), **kwargs) + model_kwargs = dict(embed_dim=384, depth=12, num_heads=6, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_small_r26_s32_224_in21k', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_r50_s16_224_in21k(pretrained=False, **kwargs): + """ R50+ViT-B/16 hybrid model from original paper (https://arxiv.org/abs/2010.11929). + ImageNet-21k weights @ 224x224, source https://github.com/google-research/vision_transformer. + """ + backbone = _resnetv2(layers=(3, 4, 9), **kwargs) + model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_base_r50_s16_224_in21k', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_resnet50_224_in21k(pretrained=False, **kwargs): + # DEPRECATED this is forwarding to model def above for backwards compatibility + return vit_base_r50_s16_224_in21k(pretrained=pretrained, **kwargs) + + +@register_model +def vit_large_r50_s32_224_in21k(pretrained=False, **kwargs): + """ R50+ViT-L/S32 hybrid. ImageNet-21k. + """ + backbone = _resnetv2((3, 4, 6, 3), **kwargs) + model_kwargs = dict(embed_dim=1024, depth=24, num_heads=16, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_large_r50_s32_224_in21k', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_resnet26d_224(pretrained=False, **kwargs): + """ Custom ViT small hybrid w/ ResNet26D stride 32. No pretrained weights. + """ + backbone = resnet26d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[4]) + model_kwargs = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_small_resnet26d_224', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_small_resnet50d_s16_224(pretrained=False, **kwargs): + """ Custom ViT small hybrid w/ ResNet50D 3-stages, stride 16. No pretrained weights. + """ + backbone = resnet50d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[3]) + model_kwargs = dict(embed_dim=768, depth=8, num_heads=8, mlp_ratio=3, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_small_resnet50d_s16_224', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_resnet26d_224(pretrained=False, **kwargs): + """ Custom ViT base hybrid w/ ResNet26D stride 32. No pretrained weights. 
+ """ + backbone = resnet26d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[4]) + model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_base_resnet26d_224', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_base_resnet50d_224(pretrained=False, **kwargs): + """ Custom ViT base hybrid w/ ResNet50D stride 32. No pretrained weights. + """ + backbone = resnet50d(pretrained=pretrained, in_chans=kwargs.get('in_chans', 3), features_only=True, out_indices=[4]) + model_kwargs = dict(embed_dim=768, depth=12, num_heads=12, **kwargs) + model = _create_vision_transformer_hybrid( + 'vit_base_resnet50d_224', backbone=backbone, pretrained=pretrained, **model_kwargs) + return model diff --git a/custom_timm/models/vision_transformer_relpos.py b/custom_timm/models/vision_transformer_relpos.py new file mode 100644 index 0000000000000000000000000000000000000000..288195adf4dde547efc7fc6af2b4350b6ea114e2 --- /dev/null +++ b/custom_timm/models/vision_transformer_relpos.py @@ -0,0 +1,654 @@ +""" Relative Position Vision Transformer (ViT) in PyTorch + +NOTE: these models are experimental / WIP, expect changes + +Hacked together by / Copyright 2022, Ross Wightman +""" +import math +import logging +from functools import partial +from collections import OrderedDict +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.checkpoint import checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .helpers import build_model_with_cfg, resolve_pretrained_cfg, named_apply +from .layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_, to_2tuple +from .registry import register_model + +_logger = logging.getLogger(__name__) + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .9, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, + 'first_conv': 'patch_embed.proj', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + 'vit_relpos_base_patch32_plus_rpn_256': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_replos_base_patch32_plus_rpn_256-sw-dd486f51.pth', + input_size=(3, 256, 256)), + 'vit_relpos_base_patch16_plus_240': _cfg(url='', input_size=(3, 240, 240)), + + 'vit_relpos_small_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_small_patch16_224-sw-ec2778b4.pth'), + 'vit_relpos_medium_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_medium_patch16_224-sw-11c174af.pth'), + 'vit_relpos_base_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_base_patch16_224-sw-49049aed.pth'), + + 'vit_srelpos_small_patch16_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_srelpos_small_patch16_224-sw-6cdb8849.pth'), + 'vit_srelpos_medium_patch16_224': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_srelpos_medium_patch16_224-sw-ad702b8c.pth'), + + 'vit_relpos_medium_patch16_cls_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_medium_patch16_cls_224-sw-cfe8e259.pth'), + 'vit_relpos_base_patch16_cls_224': _cfg( + url=''), + 'vit_relpos_base_patch16_clsgap_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_base_patch16_gapcls_224-sw-1a341d6c.pth'), + + 'vit_relpos_small_patch16_rpn_224': _cfg(url=''), + 'vit_relpos_medium_patch16_rpn_224': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/vit_relpos_medium_patch16_rpn_224-sw-5d2befd8.pth'), + 'vit_relpos_base_patch16_rpn_224': _cfg(url=''), +} + + +def gen_relative_position_index( + q_size: Tuple[int, int], + k_size: Tuple[int, int] = None, + class_token: bool = False) -> torch.Tensor: + # Adapted with significant modifications from Swin / BeiT codebases + # get pair-wise relative position index for each token inside the window + q_coords = torch.stack(torch.meshgrid([torch.arange(q_size[0]), torch.arange(q_size[1])])).flatten(1) # 2, Wh, Ww + if k_size is None: + k_coords = q_coords + k_size = q_size + else: + # different q vs k sizes is a WIP + k_coords = torch.stack(torch.meshgrid([torch.arange(k_size[0]), torch.arange(k_size[1])])).flatten(1) + relative_coords = q_coords[:, :, None] - k_coords[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0) # Wh*Ww, Wh*Ww, 2 + _, relative_position_index = torch.unique(relative_coords.view(-1, 2), return_inverse=True, dim=0) + + if class_token: + # handle cls to token & token 2 cls & cls to cls as per beit for rel pos bias + # NOTE not intended or tested with MLP log-coords + max_size = (max(q_size[0], k_size[0]), max(q_size[1], k_size[1])) + num_relative_distance = (2 * max_size[0] - 1) * (2 * max_size[1] - 1) + 3 + relative_position_index = F.pad(relative_position_index, [1, 0, 1, 0]) + relative_position_index[0, 0:] = num_relative_distance - 3 + relative_position_index[0:, 0] = num_relative_distance - 2 + relative_position_index[0, 0] = num_relative_distance - 1 + + return relative_position_index.contiguous() + + +def gen_relative_log_coords( + win_size: Tuple[int, int], + pretrained_win_size: Tuple[int, int] = (0, 0), + mode='swin', +): + assert mode in ('swin', 'cr', 'rw') + # as per official swin-v2 impl, supporting timm specific 'cr' and 'rw' log coords as well + relative_coords_h = torch.arange(-(win_size[0] - 1), win_size[0], dtype=torch.float32) + relative_coords_w = torch.arange(-(win_size[1] - 1), win_size[1], dtype=torch.float32) + relative_coords_table = torch.stack(torch.meshgrid([relative_coords_h, relative_coords_w])) + relative_coords_table = relative_coords_table.permute(1, 2, 0).contiguous() # 2*Wh-1, 2*Ww-1, 2 + if mode == 'swin': + if pretrained_win_size[0] > 0: + relative_coords_table[:, :, 0] /= (pretrained_win_size[0] - 1) + relative_coords_table[:, :, 1] /= (pretrained_win_size[1] - 1) + else: + relative_coords_table[:, :, 0] /= (win_size[0] - 1) + relative_coords_table[:, :, 1] /= (win_size[1] - 1) + relative_coords_table *= 8 # normalize to -8, 8 + relative_coords_table = torch.sign(relative_coords_table) * torch.log2( + 1.0 + relative_coords_table.abs()) / math.log2(8) + else: + if mode == 'rw': + # cr w/ window size normalization -> [-1,1] log coords + 
relative_coords_table[:, :, 0] /= (win_size[0] - 1) + relative_coords_table[:, :, 1] /= (win_size[1] - 1) + relative_coords_table *= 8 # scale to -8, 8 + relative_coords_table = torch.sign(relative_coords_table) * torch.log2( + 1.0 + relative_coords_table.abs()) + relative_coords_table /= math.log2(9) # -> [-1, 1] + else: + # mode == 'cr' + relative_coords_table = torch.sign(relative_coords_table) * torch.log( + 1.0 + relative_coords_table.abs()) + + return relative_coords_table + + +class RelPosMlp(nn.Module): + def __init__( + self, + window_size, + num_heads=8, + hidden_dim=128, + prefix_tokens=0, + mode='cr', + pretrained_window_size=(0, 0) + ): + super().__init__() + self.window_size = window_size + self.window_area = self.window_size[0] * self.window_size[1] + self.prefix_tokens = prefix_tokens + self.num_heads = num_heads + self.bias_shape = (self.window_area,) * 2 + (num_heads,) + if mode == 'swin': + self.bias_act = nn.Sigmoid() + self.bias_gain = 16 + mlp_bias = (True, False) + elif mode == 'rw': + self.bias_act = nn.Tanh() + self.bias_gain = 4 + mlp_bias = True + else: + self.bias_act = nn.Identity() + self.bias_gain = None + mlp_bias = True + + self.mlp = Mlp( + 2, # x, y + hidden_features=hidden_dim, + out_features=num_heads, + act_layer=nn.ReLU, + bias=mlp_bias, + drop=(0.125, 0.) + ) + + self.register_buffer( + "relative_position_index", + gen_relative_position_index(window_size), + persistent=False) + + # get relative_coords_table + self.register_buffer( + "rel_coords_log", + gen_relative_log_coords(window_size, pretrained_window_size, mode=mode), + persistent=False) + + def get_bias(self) -> torch.Tensor: + relative_position_bias = self.mlp(self.rel_coords_log) + if self.relative_position_index is not None: + relative_position_bias = relative_position_bias.view(-1, self.num_heads)[ + self.relative_position_index.view(-1)] # Wh*Ww,Wh*Ww,nH + relative_position_bias = relative_position_bias.view(self.bias_shape) + relative_position_bias = relative_position_bias.permute(2, 0, 1) + relative_position_bias = self.bias_act(relative_position_bias) + if self.bias_gain is not None: + relative_position_bias = self.bias_gain * relative_position_bias + if self.prefix_tokens: + relative_position_bias = F.pad(relative_position_bias, [self.prefix_tokens, 0, self.prefix_tokens, 0]) + return relative_position_bias.unsqueeze(0).contiguous() + + def forward(self, attn, shared_rel_pos: Optional[torch.Tensor] = None): + return attn + self.get_bias() + + +class RelPosBias(nn.Module): + + def __init__(self, window_size, num_heads, prefix_tokens=0): + super().__init__() + assert prefix_tokens <= 1 + self.window_size = window_size + self.window_area = window_size[0] * window_size[1] + self.bias_shape = (self.window_area + prefix_tokens,) * 2 + (num_heads,) + + num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 * prefix_tokens + self.relative_position_bias_table = nn.Parameter(torch.zeros(num_relative_distance, num_heads)) + self.register_buffer( + "relative_position_index", + gen_relative_position_index(self.window_size, class_token=prefix_tokens > 0), + persistent=False, + ) + + self.init_weights() + + def init_weights(self): + trunc_normal_(self.relative_position_bias_table, std=.02) + + def get_bias(self) -> torch.Tensor: + relative_position_bias = self.relative_position_bias_table[self.relative_position_index.view(-1)] + # win_h * win_w, win_h * win_w, num_heads + relative_position_bias = relative_position_bias.view(self.bias_shape).permute(2, 0, 1) + return 
relative_position_bias.unsqueeze(0).contiguous() + + def forward(self, attn, shared_rel_pos: Optional[torch.Tensor] = None): + return attn + self.get_bias() + + +class RelPosAttention(nn.Module): + def __init__(self, dim, num_heads=8, qkv_bias=False, rel_pos_cls=None, attn_drop=0., proj_drop=0.): + super().__init__() + assert dim % num_heads == 0, 'dim should be divisible by num_heads' + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.rel_pos = rel_pos_cls(num_heads=num_heads) if rel_pos_cls else None + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + attn = (q @ k.transpose(-2, -1)) * self.scale + if self.rel_pos is not None: + attn = self.rel_pos(attn, shared_rel_pos=shared_rel_pos) + elif shared_rel_pos is not None: + attn = attn + shared_rel_pos + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class LayerScale(nn.Module): + def __init__(self, dim, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x): + return x.mul_(self.gamma) if self.inplace else x * self.gamma + + +class RelPosBlock(nn.Module): + + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, rel_pos_cls=None, init_values=None, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = RelPosAttention( + dim, num_heads, qkv_bias=qkv_bias, rel_pos_cls=rel_pos_cls, attn_drop=attn_drop, proj_drop=drop) + self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path1 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity() + self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None): + x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x), shared_rel_pos=shared_rel_pos))) + x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x)))) + return x + + +class ResPostRelPosBlock(nn.Module): + + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, rel_pos_cls=None, init_values=None, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.init_values = init_values + + self.attn = RelPosAttention( + dim, num_heads, qkv_bias=qkv_bias, rel_pos_cls=rel_pos_cls, attn_drop=attn_drop, proj_drop=drop) + self.norm1 = norm_layer(dim) + self.drop_path1 = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + self.norm2 = norm_layer(dim) + self.drop_path2 = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.init_weights() + + def init_weights(self): + # NOTE this init overrides that base model init with specific changes for the block type + if self.init_values is not None: + nn.init.constant_(self.norm1.weight, self.init_values) + nn.init.constant_(self.norm2.weight, self.init_values) + + def forward(self, x, shared_rel_pos: Optional[torch.Tensor] = None): + x = x + self.drop_path1(self.norm1(self.attn(x, shared_rel_pos=shared_rel_pos))) + x = x + self.drop_path2(self.norm2(self.mlp(x))) + return x + + +class VisionTransformerRelPos(nn.Module): + """ Vision Transformer w/ Relative Position Bias + + Differing from classic vit, this impl + * uses relative position index (swin v1 / beit) or relative log coord + mlp (swin v2) pos embed + * defaults to no class token (can be enabled) + * defaults to global avg pool for head (can be changed) + * layer-scale (residual branch gain) enabled + """ + + def __init__( + self, + img_size=224, + patch_size=16, + in_chans=3, + num_classes=1000, + global_pool='avg', + embed_dim=768, + depth=12, + num_heads=12, + mlp_ratio=4., + qkv_bias=True, + init_values=1e-6, + class_token=False, + fc_norm=False, + rel_pos_type='mlp', + rel_pos_dim=None, + shared_rel_pos=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + weight_init='skip', + embed_layer=PatchEmbed, + norm_layer=None, + act_layer=None, + block_fn=RelPosBlock + ): + """ + Args: + img_size (int, tuple): input image size + patch_size (int, tuple): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + global_pool (str): type of global pooling for final sequence (default: 'avg') + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + init_values: (float): layer-scale init values + class_token (bool): use class token (default: False) + fc_norm (bool): use pre classifier norm instead of pre-pool + rel_pos_ty pe (str): type of relative position + shared_rel_pos (bool): share relative pos across all blocks + drop_rate (float): dropout rate + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate + weight_init (str): weight init scheme + embed_layer (nn.Module): patch embedding layer + norm_layer: (nn.Module): normalization layer + act_layer: (nn.Module): MLP activation layer + """ + super().__init__() + assert global_pool in ('', 'avg', 'token') + assert class_token or global_pool != 'token' + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.num_classes = num_classes + self.global_pool = global_pool + self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models + self.num_prefix_tokens = 1 if class_token else 0 + self.grad_checkpointing = False + + self.patch_embed = embed_layer( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim) + feat_size = self.patch_embed.grid_size + + rel_pos_args = dict(window_size=feat_size, prefix_tokens=self.num_prefix_tokens) + if rel_pos_type.startswith('mlp'): + if rel_pos_dim: + rel_pos_args['hidden_dim'] = rel_pos_dim + # FIXME 
experimenting with different relpos log coord configs + if 'swin' in rel_pos_type: + rel_pos_args['mode'] = 'swin' + elif 'rw' in rel_pos_type: + rel_pos_args['mode'] = 'rw' + rel_pos_cls = partial(RelPosMlp, **rel_pos_args) + else: + rel_pos_cls = partial(RelPosBias, **rel_pos_args) + self.shared_rel_pos = None + if shared_rel_pos: + self.shared_rel_pos = rel_pos_cls(num_heads=num_heads) + # NOTE shared rel pos currently mutually exclusive w/ per-block, but could support both... + rel_pos_cls = None + + self.cls_token = nn.Parameter(torch.zeros(1, self.num_prefix_tokens, embed_dim)) if class_token else None + + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule + self.blocks = nn.ModuleList([ + block_fn( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, rel_pos_cls=rel_pos_cls, + init_values=init_values, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], + norm_layer=norm_layer, act_layer=act_layer) + for i in range(depth)]) + self.norm = norm_layer(embed_dim) if not fc_norm else nn.Identity() + + # Classifier Head + self.fc_norm = norm_layer(embed_dim) if fc_norm else nn.Identity() + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + if weight_init != 'skip': + self.init_weights(weight_init) + + def init_weights(self, mode=''): + assert mode in ('jax', 'moco', '') + if self.cls_token is not None: + nn.init.normal_(self.cls_token, std=1e-6) + # FIXME weight init scheme using PyTorch defaults curently + #named_apply(get_init_weights_vit(mode, head_bias), self) + + @torch.jit.ignore + def no_weight_decay(self): + return {'cls_token'} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^cls_token|patch_embed', # stem and embed + blocks=[(r'^blocks\.(\d+)', None), (r'^norm', (99999,))] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes: int, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'avg', 'token') + self.global_pool = global_pool + self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.patch_embed(x) + if self.cls_token is not None: + x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1) + + shared_rel_pos = self.shared_rel_pos.get_bias() if self.shared_rel_pos is not None else None + for blk in self.blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint(blk, x, shared_rel_pos=shared_rel_pos) + else: + x = blk(x, shared_rel_pos=shared_rel_pos) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool: + x = x[:, self.num_prefix_tokens:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0] + x = self.fc_norm(x) + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_vision_transformer_relpos(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + + model = build_model_with_cfg(VisionTransformerRelPos, variant, pretrained, **kwargs) + return model + + +@register_model +def 
vit_relpos_base_patch32_plus_rpn_256(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/32+) w/ relative log-coord position and residual post-norm, no class token + """ + model_kwargs = dict( + patch_size=32, embed_dim=896, depth=12, num_heads=14, block_fn=ResPostRelPosBlock, **kwargs) + model = _create_vision_transformer_relpos( + 'vit_relpos_base_patch32_plus_rpn_256', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_base_patch16_plus_240(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16+) w/ relative log-coord position, no class token + """ + model_kwargs = dict(patch_size=16, embed_dim=896, depth=12, num_heads=14, **kwargs) + model = _create_vision_transformer_relpos('vit_relpos_base_patch16_plus_240', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_small_patch16_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ relative log-coord position, no class token + """ + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, fc_norm=True, **kwargs) + model = _create_vision_transformer_relpos('vit_relpos_small_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_medium_patch16_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ relative log-coord position, no class token + """ + model_kwargs = dict( + patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=True, **kwargs) + model = _create_vision_transformer_relpos('vit_relpos_medium_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_base_patch16_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ relative log-coord position, no class token + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, fc_norm=True, **kwargs) + model = _create_vision_transformer_relpos('vit_relpos_base_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_srelpos_small_patch16_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ shared relative log-coord position, no class token + """ + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, fc_norm=False, + rel_pos_dim=384, shared_rel_pos=True, **kwargs) + model = _create_vision_transformer_relpos('vit_srelpos_small_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_srelpos_medium_patch16_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ shared relative log-coord position, no class token + """ + model_kwargs = dict( + patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=False, + rel_pos_dim=512, shared_rel_pos=True, **kwargs) + model = _create_vision_transformer_relpos( + 'vit_srelpos_medium_patch16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_medium_patch16_cls_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-M/16) w/ relative log-coord position, class token present + """ + model_kwargs = dict( + patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, fc_norm=False, + rel_pos_dim=256, class_token=True, global_pool='token', **kwargs) + model = _create_vision_transformer_relpos( + 'vit_relpos_medium_patch16_cls_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_base_patch16_cls_224(pretrained=False, **kwargs): + 
""" ViT-Base (ViT-B/16) w/ relative log-coord position, class token present + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, + class_token=True, global_pool='token', **kwargs) + model = _create_vision_transformer_relpos('vit_relpos_base_patch16_cls_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_base_patch16_clsgap_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ relative log-coord position, class token present + NOTE this config is a bit of a mistake, class token was enabled but global avg-pool w/ fc-norm was not disabled + Leaving here for comparisons w/ a future re-train as it performs quite well. + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, fc_norm=True, class_token=True, **kwargs) + model = _create_vision_transformer_relpos('vit_relpos_base_patch16_clsgap_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_small_patch16_rpn_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ relative log-coord position and residual post-norm, no class token + """ + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=6, qkv_bias=False, block_fn=ResPostRelPosBlock, **kwargs) + model = _create_vision_transformer_relpos( + 'vit_relpos_small_patch16_rpn_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_medium_patch16_rpn_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ relative log-coord position and residual post-norm, no class token + """ + model_kwargs = dict( + patch_size=16, embed_dim=512, depth=12, num_heads=8, qkv_bias=False, block_fn=ResPostRelPosBlock, **kwargs) + model = _create_vision_transformer_relpos( + 'vit_relpos_medium_patch16_rpn_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def vit_relpos_base_patch16_rpn_224(pretrained=False, **kwargs): + """ ViT-Base (ViT-B/16) w/ relative log-coord position and residual post-norm, no class token + """ + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=12, num_heads=12, qkv_bias=False, block_fn=ResPostRelPosBlock, **kwargs) + model = _create_vision_transformer_relpos( + 'vit_relpos_base_patch16_rpn_224', pretrained=pretrained, **model_kwargs) + return model diff --git a/custom_timm/models/volo.py b/custom_timm/models/volo.py new file mode 100644 index 0000000000000000000000000000000000000000..2c2886af59a29bc8bd7493a85a8158eecce70914 --- /dev/null +++ b/custom_timm/models/volo.py @@ -0,0 +1,750 @@ +""" Vision OutLOoker (VOLO) implementation + +Paper: `VOLO: Vision Outlooker for Visual Recognition` - https://arxiv.org/abs/2106.13112 + +Code adapted from official impl at https://github.com/sail-sg/volo, original copyright in comment below + +Modifications and additions for timm by / Copyright 2022, Ross Wightman +""" +# Copyright 2021 Sea Limited. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import math +import numpy as np + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.checkpoint import checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from custom_timm.models.layers import DropPath, Mlp, to_2tuple, to_ntuple, trunc_normal_ +from custom_timm.models.registry import register_model +from custom_timm.models.helpers import build_model_with_cfg + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': .96, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.conv.0', 'classifier': ('head', 'aux_head'), + **kwargs + } + + +default_cfgs = { + 'volo_d1_224': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d1_224_84.2.pth.tar', + crop_pct=0.96), + 'volo_d1_384': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d1_384_85.2.pth.tar', + crop_pct=1.0, input_size=(3, 384, 384)), + 'volo_d2_224': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d2_224_85.2.pth.tar', + crop_pct=0.96), + 'volo_d2_384': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d2_384_86.0.pth.tar', + crop_pct=1.0, input_size=(3, 384, 384)), + 'volo_d3_224': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d3_224_85.4.pth.tar', + crop_pct=0.96), + 'volo_d3_448': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d3_448_86.3.pth.tar', + crop_pct=1.0, input_size=(3, 448, 448)), + 'volo_d4_224': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d4_224_85.7.pth.tar', + crop_pct=0.96), + 'volo_d4_448': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d4_448_86.79.pth.tar', + crop_pct=1.15, input_size=(3, 448, 448)), + 'volo_d5_224': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d5_224_86.10.pth.tar', + crop_pct=0.96), + 'volo_d5_448': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d5_448_87.0.pth.tar', + crop_pct=1.15, input_size=(3, 448, 448)), + 'volo_d5_512': _cfg( + url='https://github.com/sail-sg/volo/releases/download/volo_1/d5_512_87.07.pth.tar', + crop_pct=1.15, input_size=(3, 512, 512)), +} + + +class OutlookAttention(nn.Module): + + def __init__(self, dim, num_heads, kernel_size=3, padding=1, stride=1, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + head_dim = dim // num_heads + self.num_heads = num_heads + self.kernel_size = kernel_size + self.padding = padding + self.stride = stride + self.scale = head_dim ** -0.5 + + self.v = nn.Linear(dim, dim, bias=qkv_bias) + self.attn = nn.Linear(dim, kernel_size ** 4 * num_heads) + + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + self.unfold = nn.Unfold(kernel_size=kernel_size, padding=padding, stride=stride) + self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride, ceil_mode=True) + + def forward(self, x): + B, H, W, C = x.shape + + v = self.v(x).permute(0, 3, 1, 2) # B, C, H, W + + h, w = math.ceil(H / self.stride), math.ceil(W / self.stride) + v = self.unfold(v).reshape( + B, self.num_heads, C // self.num_heads, + self.kernel_size * self.kernel_size, h * w).permute(0, 1, 4, 3, 2) # B,H,N,kxk,C/H + + attn = self.pool(x.permute(0, 3, 1, 2)).permute(0, 2, 3, 1) + attn = self.attn(attn).reshape( + B, h * w, 
self.num_heads, self.kernel_size * self.kernel_size, + self.kernel_size * self.kernel_size).permute(0, 2, 1, 3, 4) # B,H,N,kxk,kxk + attn = attn * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).permute(0, 1, 4, 3, 2).reshape(B, C * self.kernel_size * self.kernel_size, h * w) + x = F.fold(x, output_size=(H, W), kernel_size=self.kernel_size, padding=self.padding, stride=self.stride) + + x = self.proj(x.permute(0, 2, 3, 1)) + x = self.proj_drop(x) + + return x + + +class Outlooker(nn.Module): + def __init__( + self, dim, kernel_size, padding, stride=1, num_heads=1, mlp_ratio=3., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, qkv_bias=False + ): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = OutlookAttention( + dim, num_heads, kernel_size=kernel_size, + padding=padding, stride=stride, + qkv_bias=qkv_bias, attn_drop=attn_drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Attention(nn.Module): + + def __init__( + self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim ** -0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, H, W, C = x.shape + + qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind(0) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, H, W, C) + x = self.proj(x) + x = self.proj_drop(x) + + return x + + +class Transformer(nn.Module): + + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, + attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop) + + # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here + self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() + + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer) + + def forward(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class ClassAttention(nn.Module): + + def __init__( + self, dim, num_heads=8, head_dim=None, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + if head_dim is not None: + self.head_dim = head_dim + else: + head_dim = dim // num_heads + self.head_dim = head_dim + self.scale = head_dim ** -0.5 + + self.kv = nn.Linear(dim, self.head_dim * self.num_heads * 2, bias=qkv_bias) + self.q = nn.Linear(dim, self.head_dim * self.num_heads, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(self.head_dim * self.num_heads, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + + kv = self.kv(x).reshape(B, N, 2, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4) + k, v = kv.unbind(0) + q = self.q(x[:, :1, :]).reshape(B, self.num_heads, 1, self.head_dim) + attn = ((q * self.scale) @ k.transpose(-2, -1)) + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + cls_embed = (attn @ v).transpose(1, 2).reshape(B, 1, self.head_dim * self.num_heads) + cls_embed = self.proj(cls_embed) + cls_embed = self.proj_drop(cls_embed) + return cls_embed + + +class ClassBlock(nn.Module): + + def __init__( + self, dim, num_heads, head_dim=None, mlp_ratio=4., qkv_bias=False, + drop=0., attn_drop=0., drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = ClassAttention( + dim, num_heads=num_heads, head_dim=head_dim, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + # NOTE: drop path for stochastic depth + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop) + + def forward(self, x): + cls_embed = x[:, :1] + cls_embed = cls_embed + self.drop_path(self.attn(self.norm1(x))) + cls_embed = cls_embed + self.drop_path(self.mlp(self.norm2(cls_embed))) + return torch.cat([cls_embed, x[:, 1:]], dim=1) + + +def get_block(block_type, **kargs): + if block_type == 'ca': + return ClassBlock(**kargs) + + +def rand_bbox(size, lam, scale=1): + """ + get bounding box as token labeling (https://github.com/zihangJiang/TokenLabeling) + return: bounding box + """ + W = size[1] // scale + H = size[2] // scale + cut_rat = np.sqrt(1. - lam) + cut_w = np.int(W * cut_rat) + cut_h = np.int(H * cut_rat) + + # uniform + cx = np.random.randint(W) + cy = np.random.randint(H) + + bbx1 = np.clip(cx - cut_w // 2, 0, W) + bby1 = np.clip(cy - cut_h // 2, 0, H) + bbx2 = np.clip(cx + cut_w // 2, 0, W) + bby2 = np.clip(cy + cut_h // 2, 0, H) + + return bbx1, bby1, bbx2, bby2 + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding. 
+ Different with ViT use 1 conv layer, we use 4 conv layers to do patch embedding + """ + + def __init__( + self, img_size=224, stem_conv=False, stem_stride=1, + patch_size=8, in_chans=3, hidden_dim=64, embed_dim=384): + super().__init__() + assert patch_size in [4, 8, 16] + if stem_conv: + self.conv = nn.Sequential( + nn.Conv2d(in_chans, hidden_dim, kernel_size=7, stride=stem_stride, padding=3, bias=False), # 112x112 + nn.BatchNorm2d(hidden_dim), + nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1, bias=False), # 112x112 + nn.BatchNorm2d(hidden_dim), + nn.ReLU(inplace=True), + nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=1, padding=1, bias=False), # 112x112 + nn.BatchNorm2d(hidden_dim), + nn.ReLU(inplace=True), + ) + else: + self.conv = None + + self.proj = nn.Conv2d( + hidden_dim, embed_dim, kernel_size=patch_size // stem_stride, stride=patch_size // stem_stride) + self.num_patches = (img_size // patch_size) * (img_size // patch_size) + + def forward(self, x): + if self.conv is not None: + x = self.conv(x) + x = self.proj(x) # B, C, H, W + return x + + +class Downsample(nn.Module): + """ Image to Patch Embedding, downsampling between stage1 and stage2 + """ + + def __init__(self, in_embed_dim, out_embed_dim, patch_size=2): + super().__init__() + self.proj = nn.Conv2d(in_embed_dim, out_embed_dim, kernel_size=patch_size, stride=patch_size) + + def forward(self, x): + x = x.permute(0, 3, 1, 2) + x = self.proj(x) # B, C, H, W + x = x.permute(0, 2, 3, 1) + return x + + +def outlooker_blocks( + block_fn, index, dim, layers, num_heads=1, kernel_size=3, padding=1, stride=2, + mlp_ratio=3., qkv_bias=False, attn_drop=0, drop_path_rate=0., **kwargs): + """ + generate outlooker layer in stage1 + return: outlooker layers + """ + blocks = [] + for block_idx in range(layers[index]): + block_dpr = drop_path_rate * (block_idx + sum(layers[:index])) / (sum(layers) - 1) + blocks.append( + block_fn( + dim, kernel_size=kernel_size, padding=padding, + stride=stride, num_heads=num_heads, mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, attn_drop=attn_drop, drop_path=block_dpr)) + blocks = nn.Sequential(*blocks) + return blocks + + +def transformer_blocks( + block_fn, index, dim, layers, num_heads, mlp_ratio=3., + qkv_bias=False, attn_drop=0, drop_path_rate=0., **kwargs): + """ + generate transformer layers in stage2 + return: transformer layers + """ + blocks = [] + for block_idx in range(layers[index]): + block_dpr = drop_path_rate * (block_idx + sum(layers[:index])) / (sum(layers) - 1) + blocks.append( + block_fn( + dim, num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + drop_path=block_dpr)) + blocks = nn.Sequential(*blocks) + return blocks + + +class VOLO(nn.Module): + """ + Vision Outlooker, the main class of our model + """ + + def __init__( + self, + layers, + img_size=224, + in_chans=3, + num_classes=1000, + global_pool='token', + patch_size=8, + stem_hidden_dim=64, + embed_dims=None, + num_heads=None, + downsamples=(True, False, False, False), + outlook_attention=(True, False, False, False), + mlp_ratio=3.0, + qkv_bias=False, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + norm_layer=nn.LayerNorm, + post_layers=('ca', 'ca'), + use_aux_head=True, + use_mix_token=False, + pooling_scale=2, + ): + super().__init__() + num_layers = len(layers) + mlp_ratio = to_ntuple(num_layers)(mlp_ratio) + img_size = to_2tuple(img_size) + + self.num_classes = num_classes + self.global_pool = global_pool + self.mix_token = 
use_mix_token + self.pooling_scale = pooling_scale + self.num_features = embed_dims[-1] + if use_mix_token: # enable token mixing, see token labeling for details. + self.beta = 1.0 + assert global_pool == 'token', "return all tokens if mix_token is enabled" + self.grad_checkpointing = False + + self.patch_embed = PatchEmbed( + stem_conv=True, stem_stride=2, patch_size=patch_size, + in_chans=in_chans, hidden_dim=stem_hidden_dim, + embed_dim=embed_dims[0]) + + # inital positional encoding, we add positional encoding after outlooker blocks + patch_grid = (img_size[0] // patch_size // pooling_scale, img_size[1] // patch_size // pooling_scale) + self.pos_embed = nn.Parameter(torch.zeros(1, patch_grid[0], patch_grid[1], embed_dims[-1])) + self.pos_drop = nn.Dropout(p=drop_rate) + + # set the main block in network + network = [] + for i in range(len(layers)): + if outlook_attention[i]: + # stage 1 + stage = outlooker_blocks( + Outlooker, i, embed_dims[i], layers, num_heads[i], mlp_ratio=mlp_ratio[i], + qkv_bias=qkv_bias, attn_drop=attn_drop_rate, norm_layer=norm_layer) + network.append(stage) + else: + # stage 2 + stage = transformer_blocks( + Transformer, i, embed_dims[i], layers, num_heads[i], mlp_ratio=mlp_ratio[i], qkv_bias=qkv_bias, + drop_path_rate=drop_path_rate, attn_drop=attn_drop_rate, norm_layer=norm_layer) + network.append(stage) + + if downsamples[i]: + # downsampling between two stages + network.append(Downsample(embed_dims[i], embed_dims[i + 1], 2)) + + self.network = nn.ModuleList(network) + + # set post block, for example, class attention layers + self.post_network = None + if post_layers is not None: + self.post_network = nn.ModuleList( + [ + get_block( + post_layers[i], + dim=embed_dims[-1], + num_heads=num_heads[-1], + mlp_ratio=mlp_ratio[-1], + qkv_bias=qkv_bias, + attn_drop=attn_drop_rate, + drop_path=0., + norm_layer=norm_layer) + for i in range(len(post_layers)) + ]) + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dims[-1])) + trunc_normal_(self.cls_token, std=.02) + + # set output type + if use_aux_head: + self.aux_head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + else: + self.aux_head = None + self.norm = norm_layer(self.num_features) + + # Classifier head + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + trunc_normal_(self.pos_embed, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^cls_token|pos_embed|patch_embed', # stem and embed + blocks=[ + (r'^network\.(\d+)\.(\d+)', None), + (r'^network\.(\d+)', (0,)), + ], + blocks2=[ + (r'^cls_token', (0,)), + (r'^post_network\.(\d+)', None), + (r'^norm', (99999,)) + ], + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=None): + self.num_classes = num_classes + if global_pool is not None: + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + if self.aux_head is not None: + self.aux_head = nn.Linear(self.num_features, num_classes) if 
num_classes > 0 else nn.Identity() + + def forward_tokens(self, x): + for idx, block in enumerate(self.network): + if idx == 2: + # add positional encoding after outlooker blocks + x = x + self.pos_embed + x = self.pos_drop(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint(block, x) + else: + x = block(x) + + B, H, W, C = x.shape + x = x.reshape(B, -1, C) + return x + + def forward_cls(self, x): + B, N, C = x.shape + cls_tokens = self.cls_token.expand(B, -1, -1) + x = torch.cat([cls_tokens, x], dim=1) + for block in self.post_network: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint(block, x) + else: + x = block(x) + return x + + def forward_train(self, x): + """ A separate forward fn for training with mix_token (if a train script supports). + Combining multiple modes in as single forward with different return types is torchscript hell. + """ + x = self.patch_embed(x) + x = x.permute(0, 2, 3, 1) # B,C,H,W-> B,H,W,C + + # mix token, see token labeling for details. + if self.mix_token and self.training: + lam = np.random.beta(self.beta, self.beta) + patch_h, patch_w = x.shape[1] // self.pooling_scale, x.shape[2] // self.pooling_scale + bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam, scale=self.pooling_scale) + temp_x = x.clone() + sbbx1, sbby1 = self.pooling_scale * bbx1, self.pooling_scale * bby1 + sbbx2, sbby2 = self.pooling_scale * bbx2, self.pooling_scale * bby2 + temp_x[:, sbbx1:sbbx2, sbby1:sbby2, :] = x.flip(0)[:, sbbx1:sbbx2, sbby1:sbby2, :] + x = temp_x + else: + bbx1, bby1, bbx2, bby2 = 0, 0, 0, 0 + + # step2: tokens learning in the two stages + x = self.forward_tokens(x) + + # step3: post network, apply class attention or not + if self.post_network is not None: + x = self.forward_cls(x) + x = self.norm(x) + + if self.global_pool == 'avg': + x_cls = x.mean(dim=1) + elif self.global_pool == 'token': + x_cls = x[:, 0] + else: + x_cls = x + + if self.aux_head is None: + return x_cls + + x_aux = self.aux_head(x[:, 1:]) # generate classes in all feature tokens, see token labeling + if not self.training: + return x_cls + 0.5 * x_aux.max(1)[0] + + if self.mix_token and self.training: # reverse "mix token", see token labeling for details. + x_aux = x_aux.reshape(x_aux.shape[0], patch_h, patch_w, x_aux.shape[-1]) + temp_x = x_aux.clone() + temp_x[:, bbx1:bbx2, bby1:bby2, :] = x_aux.flip(0)[:, bbx1:bbx2, bby1:bby2, :] + x_aux = temp_x + x_aux = x_aux.reshape(x_aux.shape[0], patch_h * patch_w, x_aux.shape[-1]) + + # return these: 1. class token, 2. classes from all feature tokens, 3. 
bounding box + return x_cls, x_aux, (bbx1, bby1, bbx2, bby2) + + def forward_features(self, x): + x = self.patch_embed(x).permute(0, 2, 3, 1) # B,C,H,W-> B,H,W,C + + # step2: tokens learning in the two stages + x = self.forward_tokens(x) + + # step3: post network, apply class attention or not + if self.post_network is not None: + x = self.forward_cls(x) + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool == 'avg': + out = x.mean(dim=1) + elif self.global_pool == 'token': + out = x[:, 0] + else: + out = x + if pre_logits: + return out + out = self.head(out) + if self.aux_head is not None: + # generate classes in all feature tokens, see token labeling + aux = self.aux_head(x[:, 1:]) + out = out + 0.5 * aux.max(1)[0] + return out + + def forward(self, x): + """ simplified forward (without mix token training) """ + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_volo(variant, pretrained=False, **kwargs): + if kwargs.get('features_only', None): + raise RuntimeError('features_only not implemented for Vision Transformer models.') + return build_model_with_cfg(VOLO, variant, pretrained, **kwargs) + + +@register_model +def volo_d1_224(pretrained=False, **kwargs): + """ VOLO-D1 model, Params: 27M """ + model_args = dict(layers=(4, 4, 8, 2), embed_dims=(192, 384, 384, 384), num_heads=(6, 12, 12, 12), **kwargs) + model = _create_volo('volo_d1_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d1_384(pretrained=False, **kwargs): + """ VOLO-D1 model, Params: 27M """ + model_args = dict(layers=(4, 4, 8, 2), embed_dims=(192, 384, 384, 384), num_heads=(6, 12, 12, 12), **kwargs) + model = _create_volo('volo_d1_384', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d2_224(pretrained=False, **kwargs): + """ VOLO-D2 model, Params: 59M """ + model_args = dict(layers=(6, 4, 10, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs) + model = _create_volo('volo_d2_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d2_384(pretrained=False, **kwargs): + """ VOLO-D2 model, Params: 59M """ + model_args = dict(layers=(6, 4, 10, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs) + model = _create_volo('volo_d2_384', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d3_224(pretrained=False, **kwargs): + """ VOLO-D3 model, Params: 86M """ + model_args = dict(layers=(8, 8, 16, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs) + model = _create_volo('volo_d3_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d3_448(pretrained=False, **kwargs): + """ VOLO-D3 model, Params: 86M """ + model_args = dict(layers=(8, 8, 16, 4), embed_dims=(256, 512, 512, 512), num_heads=(8, 16, 16, 16), **kwargs) + model = _create_volo('volo_d3_448', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d4_224(pretrained=False, **kwargs): + """ VOLO-D4 model, Params: 193M """ + model_args = dict(layers=(8, 8, 16, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16), **kwargs) + model = _create_volo('volo_d4_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d4_448(pretrained=False, **kwargs): + """ VOLO-D4 model, Params: 193M """ + model_args = dict(layers=(8, 8, 16, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16), 
**kwargs) + model = _create_volo('volo_d4_448', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d5_224(pretrained=False, **kwargs): + """ VOLO-D5 model, Params: 296M + stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5 + """ + model_args = dict( + layers=(12, 12, 20, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16), + mlp_ratio=4, stem_hidden_dim=128, **kwargs) + model = _create_volo('volo_d5_224', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d5_448(pretrained=False, **kwargs): + """ VOLO-D5 model, Params: 296M + stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5 + """ + model_args = dict( + layers=(12, 12, 20, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16), + mlp_ratio=4, stem_hidden_dim=128, **kwargs) + model = _create_volo('volo_d5_448', pretrained=pretrained, **model_args) + return model + + +@register_model +def volo_d5_512(pretrained=False, **kwargs): + """ VOLO-D5 model, Params: 296M + stem_hidden_dim=128, the dim in patch embedding is 128 for VOLO-D5 + """ + model_args = dict( + layers=(12, 12, 20, 4), embed_dims=(384, 768, 768, 768), num_heads=(12, 16, 16, 16), + mlp_ratio=4, stem_hidden_dim=128, **kwargs) + model = _create_volo('volo_d5_512', pretrained=pretrained, **model_args) + return model diff --git a/custom_timm/models/vovnet.py b/custom_timm/models/vovnet.py new file mode 100644 index 0000000000000000000000000000000000000000..8e80ffc66c432f6e174c70f5d33bb0dbcde50409 --- /dev/null +++ b/custom_timm/models/vovnet.py @@ -0,0 +1,424 @@ +""" VoVNet (V1 & V2) + +Papers: +* `An Energy and GPU-Computation Efficient Backbone Network` - https://arxiv.org/abs/1904.09730 +* `CenterMask : Real-Time Anchor-Free Instance Segmentation` - https://arxiv.org/abs/1911.06667 + +Looked at https://github.com/youngwanLEE/vovnet-detectron2 & +https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py +for some reference, rewrote most of the code. 
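For orientation, the VOLO constructors registered above only assemble keyword arguments and defer to `build_model_with_cfg`. Below is a minimal usage sketch, assuming the vendored package is importable as `custom_timm` and mirrors timm's top-level `create_model` factory (an assumption, not shown in this diff):

```python
# Hedged sketch: assumes `custom_timm` exposes a timm-style create_model()
# factory for the VOLO variants registered above. Not part of the diff itself.
import torch
import custom_timm

model = custom_timm.create_model('volo_d1_224', pretrained=False, num_classes=1000)
model.eval()

x = torch.randn(1, 3, 224, 224)   # VOLO-D1 expects 224x224 inputs
with torch.no_grad():
    logits = model(x)             # forward() is the simplified path without mix-token training
print(logits.shape)               # expected: torch.Size([1, 1000])
```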
+ +Hacked together by / Copyright 2020 Ross Wightman +""" + +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .registry import register_model +from .helpers import build_model_with_cfg, checkpoint_seq +from .layers import ConvNormAct, SeparableConvNormAct, BatchNormAct2d, ClassifierHead, DropPath,\ + create_attn, create_norm_act_layer, get_norm_act_layer + + +# model cfgs adapted from https://github.com/youngwanLEE/vovnet-detectron2 & +# https://github.com/stigma0617/VoVNet.pytorch/blob/master/models_vovnet/vovnet.py +model_cfgs = dict( + vovnet39a=dict( + stem_chs=[64, 64, 128], + stage_conv_chs=[128, 160, 192, 224], + stage_out_chs=[256, 512, 768, 1024], + layer_per_block=5, + block_per_stage=[1, 1, 2, 2], + residual=False, + depthwise=False, + attn='', + ), + vovnet57a=dict( + stem_chs=[64, 64, 128], + stage_conv_chs=[128, 160, 192, 224], + stage_out_chs=[256, 512, 768, 1024], + layer_per_block=5, + block_per_stage=[1, 1, 4, 3], + residual=False, + depthwise=False, + attn='', + + ), + ese_vovnet19b_slim_dw=dict( + stem_chs=[64, 64, 64], + stage_conv_chs=[64, 80, 96, 112], + stage_out_chs=[112, 256, 384, 512], + layer_per_block=3, + block_per_stage=[1, 1, 1, 1], + residual=True, + depthwise=True, + attn='ese', + + ), + ese_vovnet19b_dw=dict( + stem_chs=[64, 64, 64], + stage_conv_chs=[128, 160, 192, 224], + stage_out_chs=[256, 512, 768, 1024], + layer_per_block=3, + block_per_stage=[1, 1, 1, 1], + residual=True, + depthwise=True, + attn='ese', + ), + ese_vovnet19b_slim=dict( + stem_chs=[64, 64, 128], + stage_conv_chs=[64, 80, 96, 112], + stage_out_chs=[112, 256, 384, 512], + layer_per_block=3, + block_per_stage=[1, 1, 1, 1], + residual=True, + depthwise=False, + attn='ese', + ), + ese_vovnet19b=dict( + stem_chs=[64, 64, 128], + stage_conv_chs=[128, 160, 192, 224], + stage_out_chs=[256, 512, 768, 1024], + layer_per_block=3, + block_per_stage=[1, 1, 1, 1], + residual=True, + depthwise=False, + attn='ese', + + ), + ese_vovnet39b=dict( + stem_chs=[64, 64, 128], + stage_conv_chs=[128, 160, 192, 224], + stage_out_chs=[256, 512, 768, 1024], + layer_per_block=5, + block_per_stage=[1, 1, 2, 2], + residual=True, + depthwise=False, + attn='ese', + ), + ese_vovnet57b=dict( + stem_chs=[64, 64, 128], + stage_conv_chs=[128, 160, 192, 224], + stage_out_chs=[256, 512, 768, 1024], + layer_per_block=5, + block_per_stage=[1, 1, 4, 3], + residual=True, + depthwise=False, + attn='ese', + + ), + ese_vovnet99b=dict( + stem_chs=[64, 64, 128], + stage_conv_chs=[128, 160, 192, 224], + stage_out_chs=[256, 512, 768, 1024], + layer_per_block=5, + block_per_stage=[1, 3, 9, 3], + residual=True, + depthwise=False, + attn='ese', + ), + eca_vovnet39b=dict( + stem_chs=[64, 64, 128], + stage_conv_chs=[128, 160, 192, 224], + stage_out_chs=[256, 512, 768, 1024], + layer_per_block=5, + block_per_stage=[1, 1, 2, 2], + residual=True, + depthwise=False, + attn='eca', + ), +) +model_cfgs['ese_vovnet39b_evos'] = model_cfgs['ese_vovnet39b'] +model_cfgs['ese_vovnet99b_iabn'] = model_cfgs['ese_vovnet99b'] + + +def _cfg(url=''): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.0.conv', 'classifier': 'head.fc', + } + + +default_cfgs = dict( + vovnet39a=_cfg(url=''), + vovnet57a=_cfg(url=''), + ese_vovnet19b_slim_dw=_cfg(url=''), + 
ese_vovnet19b_dw=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ese_vovnet19b_dw-a8741004.pth'), + ese_vovnet19b_slim=_cfg(url=''), + ese_vovnet39b=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/ese_vovnet39b-f912fe73.pth'), + ese_vovnet57b=_cfg(url=''), + ese_vovnet99b=_cfg(url=''), + eca_vovnet39b=_cfg(url=''), + ese_vovnet39b_evos=_cfg(url=''), + ese_vovnet99b_iabn=_cfg(url=''), +) + + +class SequentialAppendList(nn.Sequential): + def __init__(self, *args): + super(SequentialAppendList, self).__init__(*args) + + def forward(self, x: torch.Tensor, concat_list: List[torch.Tensor]) -> torch.Tensor: + for i, module in enumerate(self): + if i == 0: + concat_list.append(module(x)) + else: + concat_list.append(module(concat_list[-1])) + x = torch.cat(concat_list, dim=1) + return x + + +class OsaBlock(nn.Module): + + def __init__( + self, in_chs, mid_chs, out_chs, layer_per_block, residual=False, + depthwise=False, attn='', norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path=None): + super(OsaBlock, self).__init__() + + self.residual = residual + self.depthwise = depthwise + conv_kwargs = dict(norm_layer=norm_layer, act_layer=act_layer) + + next_in_chs = in_chs + if self.depthwise and next_in_chs != mid_chs: + assert not residual + self.conv_reduction = ConvNormAct(next_in_chs, mid_chs, 1, **conv_kwargs) + else: + self.conv_reduction = None + + mid_convs = [] + for i in range(layer_per_block): + if self.depthwise: + conv = SeparableConvNormAct(mid_chs, mid_chs, **conv_kwargs) + else: + conv = ConvNormAct(next_in_chs, mid_chs, 3, **conv_kwargs) + next_in_chs = mid_chs + mid_convs.append(conv) + self.conv_mid = SequentialAppendList(*mid_convs) + + # feature aggregation + next_in_chs = in_chs + layer_per_block * mid_chs + self.conv_concat = ConvNormAct(next_in_chs, out_chs, **conv_kwargs) + + self.attn = create_attn(attn, out_chs) if attn else None + + self.drop_path = drop_path + + def forward(self, x): + output = [x] + if self.conv_reduction is not None: + x = self.conv_reduction(x) + x = self.conv_mid(x, output) + x = self.conv_concat(x) + if self.attn is not None: + x = self.attn(x) + if self.drop_path is not None: + x = self.drop_path(x) + if self.residual: + x = x + output[0] + return x + + +class OsaStage(nn.Module): + + def __init__( + self, in_chs, mid_chs, out_chs, block_per_stage, layer_per_block, downsample=True, + residual=True, depthwise=False, attn='ese', norm_layer=BatchNormAct2d, act_layer=nn.ReLU, + drop_path_rates=None): + super(OsaStage, self).__init__() + self.grad_checkpointing = False + + if downsample: + self.pool = nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True) + else: + self.pool = None + + blocks = [] + for i in range(block_per_stage): + last_block = i == block_per_stage - 1 + if drop_path_rates is not None and drop_path_rates[i] > 0.: + drop_path = DropPath(drop_path_rates[i]) + else: + drop_path = None + blocks += [OsaBlock( + in_chs, mid_chs, out_chs, layer_per_block, residual=residual and i > 0, depthwise=depthwise, + attn=attn if last_block else '', norm_layer=norm_layer, act_layer=act_layer, drop_path=drop_path) + ] + in_chs = out_chs + self.blocks = nn.Sequential(*blocks) + + def forward(self, x): + if self.pool is not None: + x = self.pool(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + return x + + +class VovNet(nn.Module): + + def __init__( + self, cfg, in_chans=3, 
num_classes=1000, global_pool='avg', drop_rate=0., stem_stride=4, + output_stride=32, norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path_rate=0.): + """ VovNet (v2) + """ + super(VovNet, self).__init__() + self.num_classes = num_classes + self.drop_rate = drop_rate + assert stem_stride in (4, 2) + assert output_stride == 32 # FIXME support dilation + + stem_chs = cfg["stem_chs"] + stage_conv_chs = cfg["stage_conv_chs"] + stage_out_chs = cfg["stage_out_chs"] + block_per_stage = cfg["block_per_stage"] + layer_per_block = cfg["layer_per_block"] + conv_kwargs = dict(norm_layer=norm_layer, act_layer=act_layer) + + # Stem module + last_stem_stride = stem_stride // 2 + conv_type = SeparableConvNormAct if cfg["depthwise"] else ConvNormAct + self.stem = nn.Sequential(*[ + ConvNormAct(in_chans, stem_chs[0], 3, stride=2, **conv_kwargs), + conv_type(stem_chs[0], stem_chs[1], 3, stride=1, **conv_kwargs), + conv_type(stem_chs[1], stem_chs[2], 3, stride=last_stem_stride, **conv_kwargs), + ]) + self.feature_info = [dict( + num_chs=stem_chs[1], reduction=2, module=f'stem.{1 if stem_stride == 4 else 2}')] + current_stride = stem_stride + + # OSA stages + stage_dpr = torch.split(torch.linspace(0, drop_path_rate, sum(block_per_stage)), block_per_stage) + in_ch_list = stem_chs[-1:] + stage_out_chs[:-1] + stage_args = dict(residual=cfg["residual"], depthwise=cfg["depthwise"], attn=cfg["attn"], **conv_kwargs) + stages = [] + for i in range(4): # num_stages + downsample = stem_stride == 2 or i > 0 # first stage has no stride/downsample if stem_stride is 4 + stages += [OsaStage( + in_ch_list[i], stage_conv_chs[i], stage_out_chs[i], block_per_stage[i], layer_per_block, + downsample=downsample, drop_path_rates=stage_dpr[i], **stage_args) + ] + self.num_features = stage_out_chs[i] + current_stride *= 2 if downsample else 1 + self.feature_info += [dict(num_chs=self.num_features, reduction=current_stride, module=f'stages.{i}')] + + self.stages = nn.Sequential(*stages) + + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate) + + for n, m in self.named_modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.Linear): + nn.init.zeros_(m.bias) + + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^stem', + blocks=r'^stages\.(\d+)' if coarse else r'^stages\.(\d+).blocks\.(\d+)', + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + for s in self.stages: + s.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = self.stem(x) + return self.stages(x) + + def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _create_vovnet(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + VovNet, variant, pretrained, + model_cfg=model_cfgs[variant], + feature_cfg=dict(flatten_sequential=True), + **kwargs) + + +@register_model +def vovnet39a(pretrained=False, **kwargs): + return _create_vovnet('vovnet39a', pretrained=pretrained, **kwargs) + + +@register_model +def vovnet57a(pretrained=False, **kwargs): + return 
_create_vovnet('vovnet57a', pretrained=pretrained, **kwargs) + + +@register_model +def ese_vovnet19b_slim_dw(pretrained=False, **kwargs): + return _create_vovnet('ese_vovnet19b_slim_dw', pretrained=pretrained, **kwargs) + + +@register_model +def ese_vovnet19b_dw(pretrained=False, **kwargs): + return _create_vovnet('ese_vovnet19b_dw', pretrained=pretrained, **kwargs) + + +@register_model +def ese_vovnet19b_slim(pretrained=False, **kwargs): + return _create_vovnet('ese_vovnet19b_slim', pretrained=pretrained, **kwargs) + + +@register_model +def ese_vovnet39b(pretrained=False, **kwargs): + return _create_vovnet('ese_vovnet39b', pretrained=pretrained, **kwargs) + + +@register_model +def ese_vovnet57b(pretrained=False, **kwargs): + return _create_vovnet('ese_vovnet57b', pretrained=pretrained, **kwargs) + + +@register_model +def ese_vovnet99b(pretrained=False, **kwargs): + return _create_vovnet('ese_vovnet99b', pretrained=pretrained, **kwargs) + + +@register_model +def eca_vovnet39b(pretrained=False, **kwargs): + return _create_vovnet('eca_vovnet39b', pretrained=pretrained, **kwargs) + + +# Experimental Models + +@register_model +def ese_vovnet39b_evos(pretrained=False, **kwargs): + def norm_act_fn(num_features, **nkwargs): + return create_norm_act_layer('evonorms0', num_features, jit=False, **nkwargs) + return _create_vovnet('ese_vovnet39b_evos', pretrained=pretrained, norm_layer=norm_act_fn, **kwargs) + + +@register_model +def ese_vovnet99b_iabn(pretrained=False, **kwargs): + norm_layer = get_norm_act_layer('iabn', act_layer='leaky_relu') + return _create_vovnet( + 'ese_vovnet99b_iabn', pretrained=pretrained, norm_layer=norm_layer, act_layer=nn.LeakyReLU, **kwargs) diff --git a/custom_timm/models/xception.py b/custom_timm/models/xception.py new file mode 100644 index 0000000000000000000000000000000000000000..99d02c467b5b40944fb00eed7f40f6bd62c66839 --- /dev/null +++ b/custom_timm/models/xception.py @@ -0,0 +1,249 @@ +""" +Ported to pytorch thanks to [tstandley](https://github.com/tstandley/Xception-PyTorch) + +@author: tstandley +Adapted by cadene + +Creates an Xception Model as defined in: + +Francois Chollet +Xception: Deep Learning with Depthwise Separable Convolutions +https://arxiv.org/pdf/1610.02357.pdf + +This weights ported from the Keras implementation. 
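To make the one-shot aggregation (OSA) pattern of the `OsaBlock`/`SequentialAppendList` pair above concrete: each intermediate 3x3 conv output is kept and concatenated once at the end, which is why `conv_concat` receives `in_chs + layer_per_block * mid_chs` channels. A shape-only sketch (illustrative, not the module itself):

```python
# Illustrative shape check of one-shot aggregation (OSA); not the OsaBlock itself.
import torch
import torch.nn as nn

in_chs, mid_chs, layer_per_block = 64, 32, 3
convs = nn.ModuleList([
    nn.Conv2d(in_chs if i == 0 else mid_chs, mid_chs, 3, padding=1)
    for i in range(layer_per_block)
])

x = torch.randn(1, in_chs, 56, 56)
outputs = [x]                              # keep the block input for the final concat
for conv in convs:
    outputs.append(conv(outputs[-1]))      # each conv only consumes the previous output
aggregated = torch.cat(outputs, dim=1)     # 64 + 3 * 32 = 160 channels
print(aggregated.shape)                    # torch.Size([1, 160, 56, 56])
```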
Achieves the following performance on the validation set: + +Loss:0.9173 Prec@1:78.892 Prec@5:94.292 + +REMEMBER to set your image size to 3x299x299 for both test and validation + +normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5]) + +The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299 +""" +import torch.jit +import torch.nn as nn +import torch.nn.functional as F + +from .helpers import build_model_with_cfg +from .layers import create_classifier +from .registry import register_model + +__all__ = ['Xception'] + +default_cfgs = { + 'xception': { + 'url': 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-cadene/xception-43020ad28.pth', + 'input_size': (3, 299, 299), + 'pool_size': (10, 10), + 'crop_pct': 0.8975, + 'interpolation': 'bicubic', + 'mean': (0.5, 0.5, 0.5), + 'std': (0.5, 0.5, 0.5), + 'num_classes': 1000, + 'first_conv': 'conv1', + 'classifier': 'fc' + # The resize parameter of the validation transform should be 333, and make sure to center crop at 299x299 + } +} + + +class SeparableConv2d(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding=0, dilation=1): + super(SeparableConv2d, self).__init__() + + self.conv1 = nn.Conv2d( + in_channels, in_channels, kernel_size, stride, padding, dilation, groups=in_channels, bias=False) + self.pointwise = nn.Conv2d(in_channels, out_channels, 1, 1, 0, 1, 1, bias=False) + + def forward(self, x): + x = self.conv1(x) + x = self.pointwise(x) + return x + + +class Block(nn.Module): + def __init__(self, in_channels, out_channels, reps, strides=1, start_with_relu=True, grow_first=True): + super(Block, self).__init__() + + if out_channels != in_channels or strides != 1: + self.skip = nn.Conv2d(in_channels, out_channels, 1, stride=strides, bias=False) + self.skipbn = nn.BatchNorm2d(out_channels) + else: + self.skip = None + + rep = [] + for i in range(reps): + if grow_first: + inc = in_channels if i == 0 else out_channels + outc = out_channels + else: + inc = in_channels + outc = in_channels if i < (reps - 1) else out_channels + rep.append(nn.ReLU(inplace=True)) + rep.append(SeparableConv2d(inc, outc, 3, stride=1, padding=1)) + rep.append(nn.BatchNorm2d(outc)) + + if not start_with_relu: + rep = rep[1:] + else: + rep[0] = nn.ReLU(inplace=False) + + if strides != 1: + rep.append(nn.MaxPool2d(3, strides, 1)) + self.rep = nn.Sequential(*rep) + + def forward(self, inp): + x = self.rep(inp) + + if self.skip is not None: + skip = self.skip(inp) + skip = self.skipbn(skip) + else: + skip = inp + + x += skip + return x + + +class Xception(nn.Module): + """ + Xception optimized for the ImageNet dataset, as specified in + https://arxiv.org/pdf/1610.02357.pdf + """ + + def __init__(self, num_classes=1000, in_chans=3, drop_rate=0., global_pool='avg'): + """ Constructor + Args: + num_classes: number of classes + """ + super(Xception, self).__init__() + self.drop_rate = drop_rate + self.global_pool = global_pool + self.num_classes = num_classes + self.num_features = 2048 + + self.conv1 = nn.Conv2d(in_chans, 32, 3, 2, 0, bias=False) + self.bn1 = nn.BatchNorm2d(32) + self.act1 = nn.ReLU(inplace=True) + + self.conv2 = nn.Conv2d(32, 64, 3, bias=False) + self.bn2 = nn.BatchNorm2d(64) + self.act2 = nn.ReLU(inplace=True) + + self.block1 = Block(64, 128, 2, 2, start_with_relu=False) + self.block2 = Block(128, 256, 2, 2) + self.block3 = Block(256, 728, 2, 2) + + self.block4 = Block(728, 728, 3, 1) + self.block5 = Block(728, 728, 3, 1) + 
self.block6 = Block(728, 728, 3, 1) + self.block7 = Block(728, 728, 3, 1) + + self.block8 = Block(728, 728, 3, 1) + self.block9 = Block(728, 728, 3, 1) + self.block10 = Block(728, 728, 3, 1) + self.block11 = Block(728, 728, 3, 1) + + self.block12 = Block(728, 1024, 2, 2, grow_first=False) + + self.conv3 = SeparableConv2d(1024, 1536, 3, 1, 1) + self.bn3 = nn.BatchNorm2d(1536) + self.act3 = nn.ReLU(inplace=True) + + self.conv4 = SeparableConv2d(1536, self.num_features, 3, 1, 1) + self.bn4 = nn.BatchNorm2d(self.num_features) + self.act4 = nn.ReLU(inplace=True) + self.feature_info = [ + dict(num_chs=64, reduction=2, module='act2'), + dict(num_chs=128, reduction=4, module='block2.rep.0'), + dict(num_chs=256, reduction=8, module='block3.rep.0'), + dict(num_chs=728, reduction=16, module='block12.rep.0'), + dict(num_chs=2048, reduction=32, module='act4'), + ] + + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + # #------- init weights -------- + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^conv[12]|bn[12]', + blocks=[ + (r'^block(\d+)', None), + (r'^conv[34]|bn[34]', (99,)), + ], + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + assert not enable, "gradient checkpointing not supported" + + @torch.jit.ignore + def get_classifier(self): + return self.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.fc = create_classifier(self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.conv1(x) + x = self.bn1(x) + x = self.act1(x) + + x = self.conv2(x) + x = self.bn2(x) + x = self.act2(x) + + x = self.block1(x) + x = self.block2(x) + x = self.block3(x) + x = self.block4(x) + x = self.block5(x) + x = self.block6(x) + x = self.block7(x) + x = self.block8(x) + x = self.block9(x) + x = self.block10(x) + x = self.block11(x) + x = self.block12(x) + + x = self.conv3(x) + x = self.bn3(x) + x = self.act3(x) + + x = self.conv4(x) + x = self.bn4(x) + x = self.act4(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + x = self.global_pool(x) + if self.drop_rate: + F.dropout(x, self.drop_rate, training=self.training) + return x if pre_logits else self.fc(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _xception(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + Xception, variant, pretrained, + feature_cfg=dict(feature_cls='hook'), + **kwargs) + + +@register_model +def xception(pretrained=False, **kwargs): + return _xception('xception', pretrained=pretrained, **kwargs) diff --git a/custom_timm/models/xception_aligned.py b/custom_timm/models/xception_aligned.py new file mode 100644 index 0000000000000000000000000000000000000000..7ac75ff05e53279b72cfaea2809f78a757f8e540 --- /dev/null +++ b/custom_timm/models/xception_aligned.py @@ -0,0 +1,358 @@ +"""Pytorch impl of Aligned Xception 41, 65, 71 + +This is a correct, from scratch impl of Aligned Xception (Deeplab) models compatible with TF weights at +https://github.com/tensorflow/models/blob/master/research/deeplab/g3doc/model_zoo.md + +Hacked together by / Copyright 2020 Ross Wightman +""" +from functools import 
partial + +import torch +import torch.nn as nn + +from custom_timm.data import IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .helpers import build_model_with_cfg, checkpoint_seq +from .layers import ClassifierHead, ConvNormAct, create_conv2d, get_norm_act_layer +from .layers.helpers import to_3tuple +from .registry import register_model + +__all__ = ['XceptionAligned'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 299, 299), 'pool_size': (10, 10), + 'crop_pct': 0.903, 'interpolation': 'bicubic', + 'mean': IMAGENET_INCEPTION_MEAN, 'std': IMAGENET_INCEPTION_STD, + 'first_conv': 'stem.0.conv', 'classifier': 'head.fc', + **kwargs + } + + +default_cfgs = dict( + xception41=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_xception_41-e6439c97.pth'), + xception65=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/xception65_ra3-1447db8d.pth', + crop_pct=0.94, + ), + xception71=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_xception_71-8eec7df1.pth'), + + xception41p=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/xception41p_ra3-33195bc8.pth', + crop_pct=0.94, + ), + xception65p=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-tpu-weights/xception65p_ra3-3c6114e4.pth', + crop_pct=0.94, + ), +) + + +class SeparableConv2d(nn.Module): + def __init__( + self, in_chs, out_chs, kernel_size=3, stride=1, dilation=1, padding='', + act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): + super(SeparableConv2d, self).__init__() + self.kernel_size = kernel_size + self.dilation = dilation + + # depthwise convolution + self.conv_dw = create_conv2d( + in_chs, in_chs, kernel_size, stride=stride, + padding=padding, dilation=dilation, depthwise=True) + self.bn_dw = norm_layer(in_chs) + self.act_dw = act_layer(inplace=True) if act_layer is not None else nn.Identity() + + # pointwise convolution + self.conv_pw = create_conv2d(in_chs, out_chs, kernel_size=1) + self.bn_pw = norm_layer(out_chs) + self.act_pw = act_layer(inplace=True) if act_layer is not None else nn.Identity() + + def forward(self, x): + x = self.conv_dw(x) + x = self.bn_dw(x) + x = self.act_dw(x) + x = self.conv_pw(x) + x = self.bn_pw(x) + x = self.act_pw(x) + return x + + +class PreSeparableConv2d(nn.Module): + def __init__( + self, in_chs, out_chs, kernel_size=3, stride=1, dilation=1, padding='', + act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, first_act=True): + super(PreSeparableConv2d, self).__init__() + norm_act_layer = get_norm_act_layer(norm_layer, act_layer=act_layer) + self.kernel_size = kernel_size + self.dilation = dilation + + self.norm = norm_act_layer(in_chs, inplace=True) if first_act else nn.Identity() + # depthwise convolution + self.conv_dw = create_conv2d( + in_chs, in_chs, kernel_size, stride=stride, + padding=padding, dilation=dilation, depthwise=True) + + # pointwise convolution + self.conv_pw = create_conv2d(in_chs, out_chs, kernel_size=1) + + def forward(self, x): + x = self.norm(x) + x = self.conv_dw(x) + x = self.conv_pw(x) + return x + + +class XceptionModule(nn.Module): + def __init__( + self, in_chs, out_chs, stride=1, dilation=1, pad_type='', + start_with_relu=True, no_skip=False, act_layer=nn.ReLU, norm_layer=None): + super(XceptionModule, self).__init__() + out_chs = to_3tuple(out_chs) + self.in_channels = in_chs + self.out_channels = 
out_chs[-1] + self.no_skip = no_skip + if not no_skip and (self.out_channels != self.in_channels or stride != 1): + self.shortcut = ConvNormAct( + in_chs, self.out_channels, 1, stride=stride, norm_layer=norm_layer, apply_act=False) + else: + self.shortcut = None + + separable_act_layer = None if start_with_relu else act_layer + self.stack = nn.Sequential() + for i in range(3): + if start_with_relu: + self.stack.add_module(f'act{i + 1}', act_layer(inplace=i > 0)) + self.stack.add_module(f'conv{i + 1}', SeparableConv2d( + in_chs, out_chs[i], 3, stride=stride if i == 2 else 1, dilation=dilation, padding=pad_type, + act_layer=separable_act_layer, norm_layer=norm_layer)) + in_chs = out_chs[i] + + def forward(self, x): + skip = x + x = self.stack(x) + if self.shortcut is not None: + skip = self.shortcut(skip) + if not self.no_skip: + x = x + skip + return x + + +class PreXceptionModule(nn.Module): + def __init__( + self, in_chs, out_chs, stride=1, dilation=1, pad_type='', + no_skip=False, act_layer=nn.ReLU, norm_layer=None): + super(PreXceptionModule, self).__init__() + out_chs = to_3tuple(out_chs) + self.in_channels = in_chs + self.out_channels = out_chs[-1] + self.no_skip = no_skip + if not no_skip and (self.out_channels != self.in_channels or stride != 1): + self.shortcut = create_conv2d(in_chs, self.out_channels, 1, stride=stride) + else: + self.shortcut = nn.Identity() + + self.norm = get_norm_act_layer(norm_layer, act_layer=act_layer)(in_chs, inplace=True) + self.stack = nn.Sequential() + for i in range(3): + self.stack.add_module(f'conv{i + 1}', PreSeparableConv2d( + in_chs, out_chs[i], 3, stride=stride if i == 2 else 1, dilation=dilation, padding=pad_type, + act_layer=act_layer, norm_layer=norm_layer, first_act=i > 0)) + in_chs = out_chs[i] + + def forward(self, x): + x = self.norm(x) + skip = x + x = self.stack(x) + if not self.no_skip: + x = x + self.shortcut(skip) + return x + + +class XceptionAligned(nn.Module): + """Modified Aligned Xception + """ + + def __init__( + self, block_cfg, num_classes=1000, in_chans=3, output_stride=32, preact=False, + act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, drop_rate=0., global_pool='avg'): + super(XceptionAligned, self).__init__() + assert output_stride in (8, 16, 32) + self.num_classes = num_classes + self.drop_rate = drop_rate + self.grad_checkpointing = False + + layer_args = dict(act_layer=act_layer, norm_layer=norm_layer) + self.stem = nn.Sequential(*[ + ConvNormAct(in_chans, 32, kernel_size=3, stride=2, **layer_args), + create_conv2d(32, 64, kernel_size=3, stride=1) if preact else + ConvNormAct(32, 64, kernel_size=3, stride=1, **layer_args) + ]) + + curr_dilation = 1 + curr_stride = 2 + self.feature_info = [] + self.blocks = nn.Sequential() + module_fn = PreXceptionModule if preact else XceptionModule + for i, b in enumerate(block_cfg): + b['dilation'] = curr_dilation + if b['stride'] > 1: + name = f'blocks.{i}.stack.conv2' if preact else f'blocks.{i}.stack.act3' + self.feature_info += [dict(num_chs=to_3tuple(b['out_chs'])[-2], reduction=curr_stride, module=name)] + next_stride = curr_stride * b['stride'] + if next_stride > output_stride: + curr_dilation *= b['stride'] + b['stride'] = 1 + else: + curr_stride = next_stride + self.blocks.add_module(str(i), module_fn(**b, **layer_args)) + self.num_features = self.blocks[-1].out_channels + + self.feature_info += [dict( + num_chs=self.num_features, reduction=curr_stride, module='blocks.' 
+ str(len(self.blocks) - 1))] + self.act = act_layer(inplace=True) if preact else nn.Identity() + self.head = ClassifierHead( + in_chs=self.num_features, num_classes=num_classes, pool_type=global_pool, drop_rate=drop_rate) + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^stem', + blocks=r'^blocks\.(\d+)', + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = self.stem(x) + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint_seq(self.blocks, x) + else: + x = self.blocks(x) + x = self.act(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + return self.head(x, pre_logits=pre_logits) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def _xception(variant, pretrained=False, **kwargs): + return build_model_with_cfg( + XceptionAligned, variant, pretrained, + feature_cfg=dict(flatten_sequential=True, feature_cls='hook'), + **kwargs) + + +@register_model +def xception41(pretrained=False, **kwargs): + """ Modified Aligned Xception-41 + """ + block_cfg = [ + # entry flow + dict(in_chs=64, out_chs=128, stride=2), + dict(in_chs=128, out_chs=256, stride=2), + dict(in_chs=256, out_chs=728, stride=2), + # middle flow + *([dict(in_chs=728, out_chs=728, stride=1)] * 8), + # exit flow + dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2), + dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True, start_with_relu=False), + ] + model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs) + return _xception('xception41', pretrained=pretrained, **model_args) + + +@register_model +def xception65(pretrained=False, **kwargs): + """ Modified Aligned Xception-65 + """ + block_cfg = [ + # entry flow + dict(in_chs=64, out_chs=128, stride=2), + dict(in_chs=128, out_chs=256, stride=2), + dict(in_chs=256, out_chs=728, stride=2), + # middle flow + *([dict(in_chs=728, out_chs=728, stride=1)] * 16), + # exit flow + dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2), + dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True, start_with_relu=False), + ] + model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs) + return _xception('xception65', pretrained=pretrained, **model_args) + + +@register_model +def xception71(pretrained=False, **kwargs): + """ Modified Aligned Xception-71 + """ + block_cfg = [ + # entry flow + dict(in_chs=64, out_chs=128, stride=2), + dict(in_chs=128, out_chs=256, stride=1), + dict(in_chs=256, out_chs=256, stride=2), + dict(in_chs=256, out_chs=728, stride=1), + dict(in_chs=728, out_chs=728, stride=2), + # middle flow + *([dict(in_chs=728, out_chs=728, stride=1)] * 16), + # exit flow + dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2), + dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True, start_with_relu=False), + ] + model_args = dict(block_cfg=block_cfg, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs) + return _xception('xception71', pretrained=pretrained, **model_args) + + +@register_model +def xception41p(pretrained=False, **kwargs): + """ Modified Aligned 
Xception-41 w/ Pre-Act + """ + block_cfg = [ + # entry flow + dict(in_chs=64, out_chs=128, stride=2), + dict(in_chs=128, out_chs=256, stride=2), + dict(in_chs=256, out_chs=728, stride=2), + # middle flow + *([dict(in_chs=728, out_chs=728, stride=1)] * 8), + # exit flow + dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2), + dict(in_chs=1024, out_chs=(1536, 1536, 2048), no_skip=True, stride=1), + ] + model_args = dict(block_cfg=block_cfg, preact=True, norm_layer=nn.BatchNorm2d, **kwargs) + return _xception('xception41p', pretrained=pretrained, **model_args) + + +@register_model +def xception65p(pretrained=False, **kwargs): + """ Modified Aligned Xception-65 w/ Pre-Act + """ + block_cfg = [ + # entry flow + dict(in_chs=64, out_chs=128, stride=2), + dict(in_chs=128, out_chs=256, stride=2), + dict(in_chs=256, out_chs=728, stride=2), + # middle flow + *([dict(in_chs=728, out_chs=728, stride=1)] * 16), + # exit flow + dict(in_chs=728, out_chs=(728, 1024, 1024), stride=2), + dict(in_chs=1024, out_chs=(1536, 1536, 2048), stride=1, no_skip=True), + ] + model_args = dict( + block_cfg=block_cfg, preact=True, norm_layer=partial(nn.BatchNorm2d, eps=.001, momentum=.1), **kwargs) + return _xception('xception65p', pretrained=pretrained, **model_args) diff --git a/custom_timm/models/xcit.py b/custom_timm/models/xcit.py new file mode 100644 index 0000000000000000000000000000000000000000..8c706df76cc54703c6a74623247298449e508a17 --- /dev/null +++ b/custom_timm/models/xcit.py @@ -0,0 +1,842 @@ +""" Cross-Covariance Image Transformer (XCiT) in PyTorch + +Paper: + - https://arxiv.org/abs/2106.09681 + +Same as the official implementation, with some minor adaptations, original copyright below + - https://github.com/facebookresearch/xcit/blob/master/xcit.py + +Modifications and additions for timm hacked together by / Copyright 2021, Ross Wightman +""" +# Copyright (c) 2015-present, Facebook, Inc. +# All rights reserved. 
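The aligned Xception variants above support reduced output strides for dense prediction by converting block strides into dilation once the running stride would exceed `output_stride`. The helper below is a hedged, standalone approximation of that bookkeeping in `XceptionAligned.__init__`; the function name and its output format are invented for illustration:

```python
# Hypothetical helper that loosely mirrors the stride/dilation bookkeeping in
# XceptionAligned.__init__; for illustration only.
def plan_strides(block_strides, output_stride=8, stem_stride=2):
    curr_stride, curr_dilation = stem_stride, 1
    plan = []
    for s in block_strides:
        dilation = curr_dilation                 # the triggering block keeps the previous dilation
        if curr_stride * s > output_stride:
            curr_dilation *= s                   # trade stride for dilation
            s = 1
        else:
            curr_stride *= s
        plan.append((s, dilation))
    return plan

# entry flow (2, 2, 2), one middle-flow block, exit flow (2), final block (1)
print(plan_strides([2, 2, 2, 1, 2, 1], output_stride=8))
# [(2, 1), (2, 1), (1, 1), (1, 2), (1, 2), (1, 4)]
```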
+ +import math +from functools import partial + +import torch +import torch.nn as nn +from torch.utils.checkpoint import checkpoint + +from custom_timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .helpers import build_model_with_cfg +from .vision_transformer import _cfg, Mlp +from .registry import register_model +from .layers import DropPath, trunc_normal_, to_2tuple +from .cait import ClassAttn +from .fx_features import register_notrace_module + + +def _cfg(url='', **kwargs): + return { + 'url': url, + 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None, + 'crop_pct': 1.0, 'interpolation': 'bicubic', 'fixed_input_size': True, + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'patch_embed.proj.0.0', 'classifier': 'head', + **kwargs + } + + +default_cfgs = { + # Patch size 16 + 'xcit_nano_12_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p16_224.pth'), + 'xcit_nano_12_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p16_224_dist.pth'), + 'xcit_nano_12_p16_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p16_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_tiny_12_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p16_224.pth'), + 'xcit_tiny_12_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p16_224_dist.pth'), + 'xcit_tiny_12_p16_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p16_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_tiny_24_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p16_224.pth'), + 'xcit_tiny_24_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p16_224_dist.pth'), + 'xcit_tiny_24_p16_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p16_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_small_12_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p16_224.pth'), + 'xcit_small_12_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p16_224_dist.pth'), + 'xcit_small_12_p16_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p16_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_small_24_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p16_224.pth'), + 'xcit_small_24_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p16_224_dist.pth'), + 'xcit_small_24_p16_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p16_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_medium_24_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p16_224.pth'), + 'xcit_medium_24_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p16_224_dist.pth'), + 'xcit_medium_24_p16_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p16_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_large_24_p16_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p16_224.pth'), + 'xcit_large_24_p16_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p16_224_dist.pth'), + 'xcit_large_24_p16_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p16_384_dist.pth', input_size=(3, 384, 384)), + + # Patch size 8 + 'xcit_nano_12_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p8_224.pth'), + 'xcit_nano_12_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p8_224_dist.pth'), + 
'xcit_nano_12_p8_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_nano_12_p8_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_tiny_12_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p8_224.pth'), + 'xcit_tiny_12_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p8_224_dist.pth'), + 'xcit_tiny_12_p8_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_12_p8_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_tiny_24_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p8_224.pth'), + 'xcit_tiny_24_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p8_224_dist.pth'), + 'xcit_tiny_24_p8_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_tiny_24_p8_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_small_12_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p8_224.pth'), + 'xcit_small_12_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p8_224_dist.pth'), + 'xcit_small_12_p8_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_small_12_p8_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_small_24_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p8_224.pth'), + 'xcit_small_24_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p8_224_dist.pth'), + 'xcit_small_24_p8_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_small_24_p8_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_medium_24_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p8_224.pth'), + 'xcit_medium_24_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p8_224_dist.pth'), + 'xcit_medium_24_p8_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_medium_24_p8_384_dist.pth', input_size=(3, 384, 384)), + 'xcit_large_24_p8_224': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p8_224.pth'), + 'xcit_large_24_p8_224_dist': _cfg(url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p8_224_dist.pth'), + 'xcit_large_24_p8_384_dist': _cfg( + url='https://dl.fbaipublicfiles.com/xcit/xcit_large_24_p8_384_dist.pth', input_size=(3, 384, 384)), +} + + +@register_notrace_module # reason: FX can't symbolically trace torch.arange in forward method +class PositionalEncodingFourier(nn.Module): + """ + Positional encoding relying on a fourier kernel matching the one used in the "Attention is all of Need" paper. 
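For intuition, the encoding built by this module boils down to per-axis sine/cosine features at geometrically spaced frequencies, computed for both the y and x grids, concatenated, and projected by a 1x1 conv. A simplified, standalone sketch of the per-axis part (illustrative only, not the exact implementation):

```python
# Simplified 1-D sinusoidal features; the module applies this idea to both
# image axes and projects the result with a 1x1 conv. Illustration only.
import math
import torch

def sinusoid_1d(positions, num_feats=32, temperature=10000.0):
    # positions: (N,) normalized coordinates already scaled by 2*pi
    dim_t = torch.arange(num_feats, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / num_feats)
    pos = positions[:, None] / dim_t                               # (N, num_feats)
    return torch.stack([pos[:, 0::2].sin(), pos[:, 1::2].cos()], dim=2).flatten(1)

coords = torch.arange(1, 15, dtype=torch.float32) / 14 * (2 * math.pi)
print(sinusoid_1d(coords).shape)                                   # torch.Size([14, 32])
```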
+ Based on the official XCiT code + - https://github.com/facebookresearch/xcit/blob/master/xcit.py + """ + + def __init__(self, hidden_dim=32, dim=768, temperature=10000): + super().__init__() + self.token_projection = nn.Conv2d(hidden_dim * 2, dim, kernel_size=1) + self.scale = 2 * math.pi + self.temperature = temperature + self.hidden_dim = hidden_dim + self.dim = dim + self.eps = 1e-6 + + def forward(self, B: int, H: int, W: int): + device = self.token_projection.weight.device + y_embed = torch.arange(1, H+1, dtype=torch.float32, device=device).unsqueeze(1).repeat(1, 1, W) + x_embed = torch.arange(1, W+1, dtype=torch.float32, device=device).repeat(1, H, 1) + y_embed = y_embed / (y_embed[:, -1:, :] + self.eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + self.eps) * self.scale + dim_t = torch.arange(self.hidden_dim, dtype=torch.float32, device=device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode='floor') / self.hidden_dim) + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack([pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()], dim=4).flatten(3) + pos_y = torch.stack([pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()], dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + pos = self.token_projection(pos) + return pos.repeat(B, 1, 1, 1) # (B, C, H, W) + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution + batch norm""" + return torch.nn.Sequential( + nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False), + nn.BatchNorm2d(out_planes) + ) + + +class ConvPatchEmbed(nn.Module): + """Image to Patch Embedding using multiple convolutional layers""" + + def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768, act_layer=nn.GELU): + super().__init__() + img_size = to_2tuple(img_size) + num_patches = (img_size[1] // patch_size) * (img_size[0] // patch_size) + self.img_size = img_size + self.patch_size = patch_size + self.num_patches = num_patches + + if patch_size == 16: + self.proj = torch.nn.Sequential( + conv3x3(in_chans, embed_dim // 8, 2), + act_layer(), + conv3x3(embed_dim // 8, embed_dim // 4, 2), + act_layer(), + conv3x3(embed_dim // 4, embed_dim // 2, 2), + act_layer(), + conv3x3(embed_dim // 2, embed_dim, 2), + ) + elif patch_size == 8: + self.proj = torch.nn.Sequential( + conv3x3(in_chans, embed_dim // 4, 2), + act_layer(), + conv3x3(embed_dim // 4, embed_dim // 2, 2), + act_layer(), + conv3x3(embed_dim // 2, embed_dim, 2), + ) + else: + raise('For convolutional projection, patch size has to be in [8, 16]') + + def forward(self, x): + x = self.proj(x) + Hp, Wp = x.shape[2], x.shape[3] + x = x.flatten(2).transpose(1, 2) # (B, N, C) + return x, (Hp, Wp) + + +class LPI(nn.Module): + """ + Local Patch Interaction module that allows explicit communication between tokens in 3x3 windows to augment the + implicit communication performed by the block diagonal scatter attention. 
Implemented using 2 layers of separable + 3x3 convolutions with GeLU and BatchNorm2d + """ + + def __init__(self, in_features, out_features=None, act_layer=nn.GELU, kernel_size=3): + super().__init__() + out_features = out_features or in_features + + padding = kernel_size // 2 + + self.conv1 = torch.nn.Conv2d( + in_features, in_features, kernel_size=kernel_size, padding=padding, groups=in_features) + self.act = act_layer() + self.bn = nn.BatchNorm2d(in_features) + self.conv2 = torch.nn.Conv2d( + in_features, out_features, kernel_size=kernel_size, padding=padding, groups=out_features) + + def forward(self, x, H: int, W: int): + B, N, C = x.shape + x = x.permute(0, 2, 1).reshape(B, C, H, W) + x = self.conv1(x) + x = self.act(x) + x = self.bn(x) + x = self.conv2(x) + x = x.reshape(B, C, N).permute(0, 2, 1) + return x + + +class ClassAttentionBlock(nn.Module): + """Class Attention Layer as in CaiT https://arxiv.org/abs/2103.17239""" + + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., drop_path=0., + act_layer=nn.GELU, norm_layer=nn.LayerNorm, eta=1., tokens_norm=False): + super().__init__() + self.norm1 = norm_layer(dim) + + self.attn = ClassAttn( + dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + + if eta is not None: # LayerScale Initialization (no layerscale when None) + self.gamma1 = nn.Parameter(eta * torch.ones(dim)) + self.gamma2 = nn.Parameter(eta * torch.ones(dim)) + else: + self.gamma1, self.gamma2 = 1.0, 1.0 + + # See https://github.com/rwightman/pytorch-image-models/pull/747#issuecomment-877795721 + self.tokens_norm = tokens_norm + + def forward(self, x): + x_norm1 = self.norm1(x) + x_attn = torch.cat([self.attn(x_norm1), x_norm1[:, 1:]], dim=1) + x = x + self.drop_path(self.gamma1 * x_attn) + if self.tokens_norm: + x = self.norm2(x) + else: + x = torch.cat([self.norm2(x[:, 0:1]), x[:, 1:]], dim=1) + x_res = x + cls_token = x[:, 0:1] + cls_token = self.gamma2 * self.mlp(cls_token) + x = torch.cat([cls_token, x[:, 1:]], dim=1) + x = x_res + self.drop_path(x) + return x + + +class XCA(nn.Module): + """ Cross-Covariance Attention (XCA) + Operation where the channels are updated using a weighted sum. 
The weights are obtained from the (softmax + normalized) Cross-covariance matrix (Q^T \\cdot K \\in d_h \\times d_h) + """ + + def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0.): + super().__init__() + self.num_heads = num_heads + self.temperature = nn.Parameter(torch.ones(num_heads, 1, 1)) + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + # Result of next line is (qkv, B, num (H)eads, (C')hannels per head, N) + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 4, 1) + q, k, v = qkv.unbind(0) # make torchscript happy (cannot use tensor as tuple) + + # Paper section 3.2 l2-Normalization and temperature scaling + q = torch.nn.functional.normalize(q, dim=-1) + k = torch.nn.functional.normalize(k, dim=-1) + attn = (q @ k.transpose(-2, -1)) * self.temperature + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + + # (B, H, C', N), permute -> (B, N, H, C') + x = (attn @ v).permute(0, 3, 1, 2).reshape(B, N, C) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @torch.jit.ignore + def no_weight_decay(self): + return {'temperature'} + + +class XCABlock(nn.Module): + def __init__( + self, dim, num_heads, mlp_ratio=4., qkv_bias=False, drop=0., attn_drop=0., + drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, eta=1.): + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = XCA(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop) + self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity() + + self.norm3 = norm_layer(dim) + self.local_mp = LPI(in_features=dim, act_layer=act_layer) + + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, hidden_features=int(dim * mlp_ratio), act_layer=act_layer, drop=drop) + + self.gamma1 = nn.Parameter(eta * torch.ones(dim)) + self.gamma3 = nn.Parameter(eta * torch.ones(dim)) + self.gamma2 = nn.Parameter(eta * torch.ones(dim)) + + def forward(self, x, H: int, W: int): + x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x))) + # NOTE official code has 3 then 2, so keeping it the same to be consistent with loaded weights + # See https://github.com/rwightman/pytorch-image-models/pull/747#issuecomment-877795721 + x = x + self.drop_path(self.gamma3 * self.local_mp(self.norm3(x), H, W)) + x = x + self.drop_path(self.gamma2 * self.mlp(self.norm2(x))) + return x + + +class XCiT(nn.Module): + """ + Based on timm and DeiT code bases + https://github.com/rwightman/pytorch-image-models/tree/master/timm + https://github.com/facebookresearch/deit/ + """ + + def __init__( + self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, global_pool='token', embed_dim=768, + depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0., + act_layer=None, norm_layer=None, cls_attn_layers=2, use_pos_embed=True, eta=1., tokens_norm=False): + """ + Args: + img_size (int, tuple): input image size + patch_size (int): patch size + in_chans (int): number of input channels + num_classes (int): number of classes for classification head + embed_dim (int): embedding dimension + depth (int): depth of transformer + num_heads (int): number of attention heads + mlp_ratio (int): ratio of mlp hidden dim to embedding dim + qkv_bias (bool): enable bias for qkv if True + drop_rate (float): dropout rate after positional embedding, and in 
XCA/CA projection + MLP + attn_drop_rate (float): attention dropout rate + drop_path_rate (float): stochastic depth rate (constant across all layers) + norm_layer: (nn.Module): normalization layer + cls_attn_layers: (int) Depth of Class attention layers + use_pos_embed: (bool) whether to use positional encoding + eta: (float) layerscale initialization value + tokens_norm: (bool) Whether to normalize all tokens or just the cls_token in the CA + + Notes: + - Although `layer_norm` is user specifiable, there are hard-coded `BatchNorm2d`s in the local patch + interaction (class LPI) and the patch embedding (class ConvPatchEmbed) + """ + super().__init__() + assert global_pool in ('', 'avg', 'token') + img_size = to_2tuple(img_size) + assert (img_size[0] % patch_size == 0) and (img_size[0] % patch_size == 0), \ + '`patch_size` should divide image dimensions evenly' + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + + self.num_classes = num_classes + self.num_features = self.embed_dim = embed_dim + self.global_pool = global_pool + self.grad_checkpointing = False + + self.patch_embed = ConvPatchEmbed( + img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim, act_layer=act_layer) + + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.use_pos_embed = use_pos_embed + if use_pos_embed: + self.pos_embed = PositionalEncodingFourier(dim=embed_dim) + self.pos_drop = nn.Dropout(p=drop_rate) + + self.blocks = nn.ModuleList([ + XCABlock( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate, + attn_drop=attn_drop_rate, drop_path=drop_path_rate, act_layer=act_layer, norm_layer=norm_layer, eta=eta) + for _ in range(depth)]) + + self.cls_attn_blocks = nn.ModuleList([ + ClassAttentionBlock( + dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, drop=drop_rate, + attn_drop=attn_drop_rate, act_layer=act_layer, norm_layer=norm_layer, eta=eta, tokens_norm=tokens_norm) + for _ in range(cls_attn_layers)]) + + # Classifier head + self.norm = norm_layer(embed_dim) + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + # Init weights + trunc_normal_(self.cls_token, std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + + @torch.jit.ignore + def no_weight_decay(self): + return {'pos_embed', 'cls_token'} + + @torch.jit.ignore + def group_matcher(self, coarse=False): + return dict( + stem=r'^cls_token|pos_embed|patch_embed', # stem and embed + blocks=r'^blocks\.(\d+)', + cls_attn_blocks=[(r'^cls_attn_blocks\.(\d+)', None), (r'^norm', (99999,))] + ) + + @torch.jit.ignore + def set_grad_checkpointing(self, enable=True): + self.grad_checkpointing = enable + + @torch.jit.ignore + def get_classifier(self): + return self.head + + def reset_classifier(self, num_classes, global_pool=''): + self.num_classes = num_classes + if global_pool is not None: + assert global_pool in ('', 'avg', 'token') + self.global_pool = global_pool + self.head = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + B = x.shape[0] + # x is (B, N, C). 
(Hp, Hw) is (height in units of patches, width in units of patches) + x, (Hp, Wp) = self.patch_embed(x) + + if self.use_pos_embed: + # `pos_embed` (B, C, Hp, Wp), reshape -> (B, C, N), permute -> (B, N, C) + pos_encoding = self.pos_embed(B, Hp, Wp).reshape(B, -1, x.shape[1]).permute(0, 2, 1) + x = x + pos_encoding + x = self.pos_drop(x) + + for blk in self.blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint(blk, x, Hp, Wp) + else: + x = blk(x, Hp, Wp) + + x = torch.cat((self.cls_token.expand(B, -1, -1), x), dim=1) + + for blk in self.cls_attn_blocks: + if self.grad_checkpointing and not torch.jit.is_scripting(): + x = checkpoint(blk, x) + else: + x = blk(x) + + x = self.norm(x) + return x + + def forward_head(self, x, pre_logits: bool = False): + if self.global_pool: + x = x[:, 1:].mean(dim=1) if self.global_pool == 'avg' else x[:, 0] + return x if pre_logits else self.head(x) + + def forward(self, x): + x = self.forward_features(x) + x = self.forward_head(x) + return x + + +def checkpoint_filter_fn(state_dict, model): + if 'model' in state_dict: + state_dict = state_dict['model'] + # For consistency with timm's transformer models while being compatible with official weights source we rename + # pos_embeder to pos_embed. Also account for use_pos_embed == False + use_pos_embed = getattr(model, 'pos_embed', None) is not None + pos_embed_keys = [k for k in state_dict if k.startswith('pos_embed')] + for k in pos_embed_keys: + if use_pos_embed: + state_dict[k.replace('pos_embeder.', 'pos_embed.')] = state_dict.pop(k) + else: + del state_dict[k] + # timm's implementation of class attention in CaiT is slightly more efficient as it does not compute query vectors + # for all tokens, just the class token. To use official weights source we must split qkv into q, k, v + if 'cls_attn_blocks.0.attn.qkv.weight' in state_dict and 'cls_attn_blocks.0.attn.q.weight' in model.state_dict(): + num_ca_blocks = len(model.cls_attn_blocks) + for i in range(num_ca_blocks): + qkv_weight = state_dict.pop(f'cls_attn_blocks.{i}.attn.qkv.weight') + qkv_weight = qkv_weight.reshape(3, -1, qkv_weight.shape[-1]) + for j, subscript in enumerate('qkv'): + state_dict[f'cls_attn_blocks.{i}.attn.{subscript}.weight'] = qkv_weight[j] + qkv_bias = state_dict.pop(f'cls_attn_blocks.{i}.attn.qkv.bias', None) + if qkv_bias is not None: + qkv_bias = qkv_bias.reshape(3, -1) + for j, subscript in enumerate('qkv'): + state_dict[f'cls_attn_blocks.{i}.attn.{subscript}.bias'] = qkv_bias[j] + return state_dict + + +def _create_xcit(variant, pretrained=False, default_cfg=None, **kwargs): + model = build_model_with_cfg( + XCiT, variant, pretrained, pretrained_filter_fn=checkpoint_filter_fn, **kwargs) + return model + + +@register_model +def xcit_nano_12_p16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs) + model = _create_xcit('xcit_nano_12_p16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_nano_12_p16_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs) + model = _create_xcit('xcit_nano_12_p16_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_nano_12_p16_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, img_size=384, **kwargs) + 
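+    # Editor's note: the variants registered via @register_model in this file are normally
+    # instantiated through timm's registry, e.g. timm.create_model('xcit_nano_12_p16_384_dist'),
+    # an assumption based on the standard timm convention rather than anything this file does itself.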
model = _create_xcit('xcit_nano_12_p16_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_12_p16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_12_p16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_12_p16_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_12_p16_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_12_p16_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_12_p16_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_12_p16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_12_p16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_12_p16_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_12_p16_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_12_p16_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_12_p16_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_24_p16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_24_p16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_24_p16_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_24_p16_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_24_p16_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_24_p16_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_24_p16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_24_p16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_24_p16_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_24_p16_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_24_p16_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=384, depth=24, 
num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_24_p16_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_medium_24_p16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_medium_24_p16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_medium_24_p16_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_medium_24_p16_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_medium_24_p16_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_medium_24_p16_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_large_24_p16_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_large_24_p16_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_large_24_p16_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_large_24_p16_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_large_24_p16_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=16, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_large_24_p16_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +# Patch size 8x8 models +@register_model +def xcit_nano_12_p8_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs) + model = _create_xcit('xcit_nano_12_p8_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_nano_12_p8_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs) + model = _create_xcit('xcit_nano_12_p8_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_nano_12_p8_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=128, depth=12, num_heads=4, eta=1.0, tokens_norm=False, **kwargs) + model = _create_xcit('xcit_nano_12_p8_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_12_p8_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_12_p8_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_12_p8_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_12_p8_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_12_p8_384_dist(pretrained=False, 
**kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=192, depth=12, num_heads=4, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_12_p8_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_12_p8_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_12_p8_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_12_p8_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_12_p8_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_12_p8_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=384, depth=12, num_heads=8, eta=1.0, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_12_p8_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_24_p8_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_24_p8_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_24_p8_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_24_p8_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_tiny_24_p8_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=192, depth=24, num_heads=4, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_tiny_24_p8_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_24_p8_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_24_p8_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_24_p8_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_24_p8_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_small_24_p8_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=384, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_small_24_p8_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_medium_24_p8_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_medium_24_p8_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_medium_24_p8_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_medium_24_p8_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def 
xcit_medium_24_p8_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=512, depth=24, num_heads=8, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_medium_24_p8_384_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_large_24_p8_224(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_large_24_p8_224', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_large_24_p8_224_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_large_24_p8_224_dist', pretrained=pretrained, **model_kwargs) + return model + + +@register_model +def xcit_large_24_p8_384_dist(pretrained=False, **kwargs): + model_kwargs = dict( + patch_size=8, embed_dim=768, depth=24, num_heads=16, eta=1e-5, tokens_norm=True, **kwargs) + model = _create_xcit('xcit_large_24_p8_384_dist', pretrained=pretrained, **model_kwargs) + return model diff --git a/custom_timm/optim/__init__.py b/custom_timm/optim/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7ee4958eb562bcfe06a5da72be4b76ee610a0ccc --- /dev/null +++ b/custom_timm/optim/__init__.py @@ -0,0 +1,15 @@ +from .adabelief import AdaBelief +from .adafactor import Adafactor +from .adahessian import Adahessian +from .adamp import AdamP +from .adamw import AdamW +from .lamb import Lamb +from .lars import Lars +from .lookahead import Lookahead +from .madgrad import MADGRAD +from .nadam import Nadam +from .nvnovograd import NvNovoGrad +from .radam import RAdam +from .rmsprop_tf import RMSpropTF +from .sgdp import SGDP +from .optim_factory import create_optimizer, create_optimizer_v2, optimizer_kwargs diff --git a/custom_timm/optim/adabelief.py b/custom_timm/optim/adabelief.py new file mode 100644 index 0000000000000000000000000000000000000000..951d715cc0b605df2f7313c95840b7784c4d0a70 --- /dev/null +++ b/custom_timm/optim/adabelief.py @@ -0,0 +1,201 @@ +import math +import torch +from torch.optim.optimizer import Optimizer + + +class AdaBelief(Optimizer): + r"""Implements AdaBelief algorithm. Modified from Adam in PyTorch + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-16) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + decoupled_decay (boolean, optional): (default: True) If set as True, then + the optimizer uses decoupled weight decay as in AdamW + fixed_decay (boolean, optional): (default: False) This is used when weight_decouple + is set as True. + When fixed_decay == True, the weight decay is performed as + $W_{new} = W_{old} - W_{old} \times decay$. + When fixed_decay == False, the weight decay is performed as + $W_{new} = W_{old} - W_{old} \times decay \times lr$. Note that in this case, the + weight decay ratio decreases with learning rate (lr). 
+ rectify (boolean, optional): (default: True) If set as True, then perform the rectified + update similar to RAdam + degenerated_to_sgd (boolean, optional) (default:True) If set as True, then perform SGD update + when variance of gradient is high + reference: AdaBelief Optimizer, adapting stepsizes by the belief in observed gradients, NeurIPS 2020 + + For a complete table of recommended hyperparameters, see https://github.com/juntang-zhuang/Adabelief-Optimizer' + For example train/args for EfficientNet see these gists + - link to train_scipt: https://gist.github.com/juntang-zhuang/0a501dd51c02278d952cf159bc233037 + - link to args.yaml: https://gist.github.com/juntang-zhuang/517ce3c27022b908bb93f78e4f786dc3 + """ + + def __init__( + self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-16, weight_decay=0, amsgrad=False, + decoupled_decay=True, fixed_decay=False, rectify=True, degenerated_to_sgd=True): + + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + + if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance(params[0], dict): + for param in params: + if 'betas' in param and (param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): + param['buffer'] = [[None, None, None] for _ in range(10)] + + defaults = dict( + lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, amsgrad=amsgrad, + degenerated_to_sgd=degenerated_to_sgd, decoupled_decay=decoupled_decay, rectify=rectify, + fixed_decay=fixed_decay, buffer=[[None, None, None] for _ in range(10)]) + super(AdaBelief, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdaBelief, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def reset(self): + for group in self.param_groups: + for p in group['params']: + state = self.state[p] + amsgrad = group['amsgrad'] + + # State initialization + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + + # Exponential moving average of squared gradient values + state['exp_avg_var'] = torch.zeros_like(p) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_var'] = torch.zeros_like(p) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
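+
+        Note (editor's summary of the update implemented below): unlike Adam, the second
+        moment here tracks the spread of the gradient around its running mean ("belief"),
+        $s_t = \beta_2 s_{t-1} + (1-\beta_2)(g_t - m_t)^2$ (with a small eps folded in for
+        stability), rather than the raw second moment $v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2$.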
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + if grad.dtype in {torch.float16, torch.bfloat16}: + grad = grad.float() + if grad.is_sparse: + raise RuntimeError( + 'AdaBelief does not support sparse gradients, please consider SparseAdam instead') + + p_fp32 = p + if p.dtype in {torch.float16, torch.bfloat16}: + p_fp32 = p_fp32.float() + + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + state = self.state[p] + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p_fp32) + # Exponential moving average of squared gradient values + state['exp_avg_var'] = torch.zeros_like(p_fp32) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_var'] = torch.zeros_like(p_fp32) + + # perform weight decay, check if decoupled weight decay + if group['decoupled_decay']: + if not group['fixed_decay']: + p_fp32.mul_(1.0 - group['lr'] * group['weight_decay']) + else: + p_fp32.mul_(1.0 - group['weight_decay']) + else: + if group['weight_decay'] != 0: + grad.add_(p_fp32, alpha=group['weight_decay']) + + # get current state variable + exp_avg, exp_avg_var = state['exp_avg'], state['exp_avg_var'] + + state['step'] += 1 + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + # Update first and second moment running average + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + grad_residual = grad - exp_avg + exp_avg_var.mul_(beta2).addcmul_(grad_residual, grad_residual, value=1 - beta2) + + if amsgrad: + max_exp_avg_var = state['max_exp_avg_var'] + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_var, exp_avg_var.add_(group['eps']), out=max_exp_avg_var) + + # Use the max. for normalizing running avg. 
of gradient + denom = (max_exp_avg_var.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + else: + denom = (exp_avg_var.add_(group['eps']).sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + + # update + if not group['rectify']: + # Default update + step_size = group['lr'] / bias_correction1 + p_fp32.addcdiv_(exp_avg, denom, value=-step_size) + else: + # Rectified update, forked from RAdam + buffered = group['buffer'][int(state['step'] % 10)] + if state['step'] == buffered[0]: + num_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + num_sma_max = 2 / (1 - beta2) - 1 + num_sma = num_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = num_sma + + # more conservative since it's an approximated value + if num_sma >= 5: + step_size = math.sqrt( + (1 - beta2_t) * + (num_sma - 4) / (num_sma_max - 4) * + (num_sma - 2) / num_sma * + num_sma_max / (num_sma_max - 2)) / (1 - beta1 ** state['step']) + elif group['degenerated_to_sgd']: + step_size = 1.0 / (1 - beta1 ** state['step']) + else: + step_size = -1 + buffered[2] = step_size + + if num_sma >= 5: + denom = exp_avg_var.sqrt().add_(group['eps']) + p_fp32.addcdiv_(exp_avg, denom, value=-step_size * group['lr']) + elif step_size > 0: + p_fp32.add_(exp_avg, alpha=-step_size * group['lr']) + + if p.dtype in {torch.float16, torch.bfloat16}: + p.copy_(p_fp32) + + return loss diff --git a/custom_timm/optim/adafactor.py b/custom_timm/optim/adafactor.py new file mode 100644 index 0000000000000000000000000000000000000000..06057433a9bffa555bdc13b27a1c56cff26acf15 --- /dev/null +++ b/custom_timm/optim/adafactor.py @@ -0,0 +1,167 @@ +""" Adafactor Optimizer + +Lifted from https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py + +Original header/copyright below. + +""" +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. +import torch +import math + + +class Adafactor(torch.optim.Optimizer): + """Implements Adafactor algorithm. + This implementation is based on: `Adafactor: Adaptive Learning Rates with Sublinear Memory Cost` + (see https://arxiv.org/abs/1804.04235) + + Note that this optimizer internally adjusts the learning rate depending on the + *scale_parameter*, *relative_step* and *warmup_init* options. + + To use a manual (external) learning rate schedule you should set `scale_parameter=False` and + `relative_step=False`. 
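+
+    In this implementation `relative_step` is inferred from `lr` (an explicit learning rate
+    disables the relative-step schedule). Illustrative usage, an editor's sketch rather than
+    part of the original file:
+
+        import torch
+        model = torch.nn.Linear(10, 10)   # placeholder module
+        # internal, time-dependent learning rate (lr=None)
+        opt = Adafactor(model.parameters(), scale_parameter=True, warmup_init=True)
+        # manual / externally scheduled learning rate
+        opt = Adafactor(model.parameters(), lr=1e-3, scale_parameter=False)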
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups + lr (float, optional): external learning rate (default: None) + eps (tuple[float, float]): regularization constants for square gradient + and parameter scale respectively (default: (1e-30, 1e-3)) + clip_threshold (float): threshold of root mean square of final gradient update (default: 1.0) + decay_rate (float): coefficient used to compute running averages of square gradient (default: -0.8) + beta1 (float): coefficient used for computing running averages of gradient (default: None) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + scale_parameter (bool): if True, learning rate is scaled by root mean square of parameter (default: True) + warmup_init (bool): time-dependent learning rate computation depends on + whether warm-up initialization is being used (default: False) + """ + + def __init__(self, params, lr=None, eps=1e-30, eps_scale=1e-3, clip_threshold=1.0, + decay_rate=-0.8, betas=None, weight_decay=0.0, scale_parameter=True, warmup_init=False): + relative_step = not lr + if warmup_init and not relative_step: + raise ValueError('warmup_init requires relative_step=True') + + beta1 = None if betas is None else betas[0] # make it compat with standard betas arg + defaults = dict(lr=lr, eps=eps, eps_scale=eps_scale, clip_threshold=clip_threshold, decay_rate=decay_rate, + beta1=beta1, weight_decay=weight_decay, scale_parameter=scale_parameter, + relative_step=relative_step, warmup_init=warmup_init) + super(Adafactor, self).__init__(params, defaults) + + @staticmethod + def _get_lr(param_group, param_state): + if param_group['relative_step']: + min_step = 1e-6 * param_state['step'] if param_group['warmup_init'] else 1e-2 + lr_t = min(min_step, 1.0 / math.sqrt(param_state['step'])) + param_scale = 1.0 + if param_group['scale_parameter']: + param_scale = max(param_group['eps_scale'], param_state['RMS']) + param_group['lr'] = lr_t * param_scale + return param_group['lr'] + + @staticmethod + def _get_options(param_group, param_shape): + factored = len(param_shape) >= 2 + use_first_moment = param_group['beta1'] is not None + return factored, use_first_moment + + @staticmethod + def _rms(tensor): + return tensor.norm(2) / (tensor.numel() ** 0.5) + + def _approx_sq_grad(self, exp_avg_sq_row, exp_avg_sq_col): + r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1) + c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt() + return torch.mul(r_factor, c_factor) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model and returns the loss. 
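+
+        Note (editor's summary): for parameters with two or more dimensions the second moment
+        is kept in factored form as per-row and per-column running averages of the squared
+        gradient; `_approx_sq_grad` recombines them as $\hat{V}_{ij} \approx R_i C_j /
+        \mathrm{mean}(R)$, keeping optimizer memory sublinear in the parameter size.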
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + if grad.dtype in {torch.float16, torch.bfloat16}: + grad = grad.float() + if grad.is_sparse: + raise RuntimeError('Adafactor does not support sparse gradients.') + + state = self.state[p] + + factored, use_first_moment = self._get_options(group, grad.shape) + # State Initialization + if len(state) == 0: + state['step'] = 0 + + if use_first_moment: + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(grad) + if factored: + state['exp_avg_sq_row'] = torch.zeros(grad.shape[:-1]).to(grad) + state['exp_avg_sq_col'] = torch.zeros(grad.shape[:-2] + grad.shape[-1:]).to(grad) + else: + state['exp_avg_sq'] = torch.zeros_like(grad) + + state['RMS'] = 0 + else: + if use_first_moment: + state['exp_avg'] = state['exp_avg'].to(grad) + if factored: + state['exp_avg_sq_row'] = state['exp_avg_sq_row'].to(grad) + state['exp_avg_sq_col'] = state['exp_avg_sq_col'].to(grad) + else: + state['exp_avg_sq'] = state['exp_avg_sq'].to(grad) + + p_fp32 = p + if p.dtype in {torch.float16, torch.bfloat16}: + p_fp32 = p_fp32.float() + + state['step'] += 1 + state['RMS'] = self._rms(p_fp32) + lr_t = self._get_lr(group, state) + + beta2t = 1.0 - math.pow(state['step'], group['decay_rate']) + update = grad ** 2 + group['eps'] + if factored: + exp_avg_sq_row = state['exp_avg_sq_row'] + exp_avg_sq_col = state['exp_avg_sq_col'] + + exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=1.0 - beta2t) + exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=1.0 - beta2t) + + # Approximation of exponential moving average of square of gradient + update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col) + update.mul_(grad) + else: + exp_avg_sq = state['exp_avg_sq'] + + exp_avg_sq.mul_(beta2t).add_(update, alpha=1.0 - beta2t) + update = exp_avg_sq.rsqrt().mul_(grad) + + update.div_((self._rms(update) / group['clip_threshold']).clamp_(min=1.0)) + update.mul_(lr_t) + + if use_first_moment: + exp_avg = state['exp_avg'] + exp_avg.mul_(group['beta1']).add_(update, alpha=1 - group['beta1']) + update = exp_avg + + if group['weight_decay'] != 0: + p_fp32.add_(p_fp32, alpha=-group['weight_decay'] * lr_t) + + p_fp32.add_(-update) + if p.dtype in {torch.float16, torch.bfloat16}: + p.copy_(p_fp32) + + return loss diff --git a/custom_timm/optim/adahessian.py b/custom_timm/optim/adahessian.py new file mode 100644 index 0000000000000000000000000000000000000000..985c67ca686a65f61f5c5b1a7db3e5bba815a19b --- /dev/null +++ b/custom_timm/optim/adahessian.py @@ -0,0 +1,156 @@ +""" AdaHessian Optimizer + +Lifted from https://github.com/davda54/ada-hessian/blob/master/ada_hessian.py +Originally licensed MIT, Copyright 2020, David Samuel +""" +import torch + + +class Adahessian(torch.optim.Optimizer): + """ + Implements the AdaHessian algorithm from "ADAHESSIAN: An Adaptive Second OrderOptimizer for Machine Learning" + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups + lr (float, optional): learning rate (default: 0.1) + betas ((float, float), optional): coefficients used for computing running averages of gradient and the + squared hessian trace (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0.0) + 
hessian_power (float, optional): exponent of the hessian trace (default: 1.0) + update_each (int, optional): compute the hessian trace approximation only after *this* number of steps + (to save time) (default: 1) + n_samples (int, optional): how many times to sample `z` for the approximation of the hessian trace (default: 1) + """ + + def __init__(self, params, lr=0.1, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.0, + hessian_power=1.0, update_each=1, n_samples=1, avg_conv_kernel=False): + if not 0.0 <= lr: + raise ValueError(f"Invalid learning rate: {lr}") + if not 0.0 <= eps: + raise ValueError(f"Invalid epsilon value: {eps}") + if not 0.0 <= betas[0] < 1.0: + raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}") + if not 0.0 <= betas[1] < 1.0: + raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}") + if not 0.0 <= hessian_power <= 1.0: + raise ValueError(f"Invalid Hessian power value: {hessian_power}") + + self.n_samples = n_samples + self.update_each = update_each + self.avg_conv_kernel = avg_conv_kernel + + # use a separate generator that deterministically generates the same `z`s across all GPUs in case of distributed training + self.seed = 2147483647 + self.generator = torch.Generator().manual_seed(self.seed) + + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, hessian_power=hessian_power) + super(Adahessian, self).__init__(params, defaults) + + for p in self.get_params(): + p.hess = 0.0 + self.state[p]["hessian step"] = 0 + + @property + def is_second_order(self): + return True + + def get_params(self): + """ + Gets all parameters in all param_groups with gradients + """ + + return (p for group in self.param_groups for p in group['params'] if p.requires_grad) + + def zero_hessian(self): + """ + Zeros out the accumalated hessian traces. + """ + + for p in self.get_params(): + if not isinstance(p.hess, float) and self.state[p]["hessian step"] % self.update_each == 0: + p.hess.zero_() + + @torch.no_grad() + def set_hessian(self): + """ + Computes the Hutchinson approximation of the hessian trace and accumulates it for each trainable parameter. + """ + + params = [] + for p in filter(lambda p: p.grad is not None, self.get_params()): + if self.state[p]["hessian step"] % self.update_each == 0: # compute the trace only each `update_each` step + params.append(p) + self.state[p]["hessian step"] += 1 + + if len(params) == 0: + return + + if self.generator.device != params[0].device: # hackish way of casting the generator to the right device + self.generator = torch.Generator(params[0].device).manual_seed(self.seed) + + grads = [p.grad for p in params] + + for i in range(self.n_samples): + # Rademacher distribution {-1.0, 1.0} + zs = [torch.randint(0, 2, p.size(), generator=self.generator, device=p.device) * 2.0 - 1.0 for p in params] + h_zs = torch.autograd.grad( + grads, params, grad_outputs=zs, only_inputs=True, retain_graph=i < self.n_samples - 1) + for h_z, z, p in zip(h_zs, zs, params): + p.hess += h_z * z / self.n_samples # approximate the expected values of z*(H@z) + + @torch.no_grad() + def step(self, closure=None): + """ + Performs a single optimization step. 
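+
+        The Hessian diagonal used below is the Hutchinson estimate computed in `set_hessian()`:
+        with Rademacher vectors $z$, $\mathrm{diag}(H) \approx \mathbb{E}_z[z \odot (Hz)]$,
+        averaged over `n_samples` draws (editor's note, added for clarity).
+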
+ Arguments: + closure (callable, optional) -- a closure that reevaluates the model and returns the loss (default: None) + """ + + loss = None + if closure is not None: + loss = closure() + + self.zero_hessian() + self.set_hessian() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None or p.hess is None: + continue + + if self.avg_conv_kernel and p.dim() == 4: + p.hess = torch.abs(p.hess).mean(dim=[2, 3], keepdim=True).expand_as(p.hess).clone() + + # Perform correct stepweight decay as in AdamW + p.mul_(1 - group['lr'] * group['weight_decay']) + + state = self.state[p] + + # State initialization + if len(state) == 1: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of Hessian diagonal square values + state['exp_hessian_diag_sq'] = torch.zeros_like(p) + + exp_avg, exp_hessian_diag_sq = state['exp_avg'], state['exp_hessian_diag_sq'] + beta1, beta2 = group['betas'] + state['step'] += 1 + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(p.grad, alpha=1 - beta1) + exp_hessian_diag_sq.mul_(beta2).addcmul_(p.hess, p.hess, value=1 - beta2) + + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + k = group['hessian_power'] + denom = (exp_hessian_diag_sq / bias_correction2).pow_(k / 2).add_(group['eps']) + + # make update + step_size = group['lr'] / bias_correction1 + p.addcdiv_(exp_avg, denom, value=-step_size) + + return loss diff --git a/custom_timm/optim/adamp.py b/custom_timm/optim/adamp.py new file mode 100644 index 0000000000000000000000000000000000000000..ee187633ab745dbb0344dcdc3dcb1cf40e6ae5e9 --- /dev/null +++ b/custom_timm/optim/adamp.py @@ -0,0 +1,105 @@ +""" +AdamP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/adamp.py + +Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217 +Code: https://github.com/clovaai/AdamP + +Copyright (c) 2020-present NAVER Corp. +MIT license +""" + +import torch +import torch.nn.functional as F +from torch.optim.optimizer import Optimizer +import math + + +def _channel_view(x) -> torch.Tensor: + return x.reshape(x.size(0), -1) + + +def _layer_view(x) -> torch.Tensor: + return x.reshape(1, -1) + + +def projection(p, grad, perturb, delta: float, wd_ratio: float, eps: float): + wd = 1. 
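+    # Editor's note: following AdamP, when the gradient is nearly perpendicular to the
+    # parameter (cosine similarity below delta / sqrt(dim), checked first per channel, then
+    # per layer), the component of `perturb` along the parameter direction is removed and the
+    # effective weight decay is scaled down by `wd_ratio`.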
+ expand_size = (-1,) + (1,) * (len(p.shape) - 1) + for view_func in [_channel_view, _layer_view]: + param_view = view_func(p) + grad_view = view_func(grad) + cosine_sim = F.cosine_similarity(grad_view, param_view, dim=1, eps=eps).abs_() + + # FIXME this is a problem for PyTorch XLA + if cosine_sim.max() < delta / math.sqrt(param_view.size(1)): + p_n = p / param_view.norm(p=2, dim=1).add_(eps).reshape(expand_size) + perturb -= p_n * view_func(p_n * perturb).sum(dim=1).reshape(expand_size) + wd = wd_ratio + return perturb, wd + + return perturb, wd + + +class AdamP(Optimizer): + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, delta=0.1, wd_ratio=0.1, nesterov=False): + defaults = dict( + lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + delta=delta, wd_ratio=wd_ratio, nesterov=nesterov) + super(AdamP, self).__init__(params, defaults) + + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + grad = p.grad + beta1, beta2 = group['betas'] + nesterov = group['nesterov'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + + # Adam + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + + state['step'] += 1 + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + step_size = group['lr'] / bias_correction1 + + if nesterov: + perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom + else: + perturb = exp_avg / denom + + # Projection + wd_ratio = 1. + if len(p.shape) > 1: + perturb, wd_ratio = projection(p, grad, perturb, group['delta'], group['wd_ratio'], group['eps']) + + # Weight decay + if group['weight_decay'] > 0: + p.mul_(1. - group['lr'] * group['weight_decay'] * wd_ratio) + + # Step + p.add_(perturb, alpha=-step_size) + + return loss diff --git a/custom_timm/optim/adamw.py b/custom_timm/optim/adamw.py new file mode 100644 index 0000000000000000000000000000000000000000..66478bc6ef3c50ab9d40cabb0cfb2bd24277c815 --- /dev/null +++ b/custom_timm/optim/adamw.py @@ -0,0 +1,122 @@ +""" AdamW Optimizer +Impl copied from PyTorch master + +NOTE: Builtin optim.AdamW is used by the factory, this impl only serves as a Python based reference, will be removed +someday +""" +import math +import torch +from torch.optim.optimizer import Optimizer + + +class AdamW(Optimizer): + r"""Implements AdamW algorithm. + + The original Adam algorithm was proposed in `Adam: A Method for Stochastic Optimization`_. + The AdamW variant was proposed in `Decoupled Weight Decay Regularization`_. 
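+
+    Editor's note: the decay here is decoupled from the gradient: each step first scales the
+    parameter as $p \leftarrow p \cdot (1 - lr \cdot weight\_decay)$ (the "stepweight decay"
+    line in `step()` below) and only then applies the Adam update, instead of adding an L2
+    term to the gradient.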
+ + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay coefficient (default: 1e-2) + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + + .. _Adam\: A Method for Stochastic Optimization: + https://arxiv.org/abs/1412.6980 + .. _Decoupled Weight Decay Regularization: + https://arxiv.org/abs/1711.05101 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=1e-2, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad) + super(AdamW, self).__init__(params, defaults) + + def __setstate__(self, state): + super(AdamW, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + + # Perform stepweight decay + p.data.mul_(1 - group['lr'] * group['weight_decay']) + + # Perform optimization step + grad = p.grad + if grad.is_sparse: + raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros_like(p) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + bias_correction1 = 1 - beta1 ** state['step'] + bias_correction2 = 1 - beta2 ** state['step'] + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. 
of gradient + denom = (max_exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + else: + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + + step_size = group['lr'] / bias_correction1 + + p.addcdiv_(exp_avg, denom, value=-step_size) + + return loss diff --git a/custom_timm/optim/lamb.py b/custom_timm/optim/lamb.py new file mode 100644 index 0000000000000000000000000000000000000000..12c7c49b8a01ef793c97654ac938259ca6508449 --- /dev/null +++ b/custom_timm/optim/lamb.py @@ -0,0 +1,192 @@ +""" PyTorch Lamb optimizer w/ behaviour similar to NVIDIA FusedLamb + +This optimizer code was adapted from the following (starting with latest) +* https://github.com/HabanaAI/Model-References/blob/2b435114fe8e31f159b1d3063b8280ae37af7423/PyTorch/nlp/bert/pretraining/lamb.py +* https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py +* https://github.com/cybertronai/pytorch-lamb + +Use FusedLamb if you can (GPU). The reason for including this variant of Lamb is to have a version that is +similar in behaviour to APEX FusedLamb if you aren't using NVIDIA GPUs or cannot install/use APEX. + +In addition to some cleanup, this Lamb impl has been modified to support PyTorch XLA and has been tested on TPU. + +Original copyrights for above sources are below. + +Modifications Copyright 2021 Ross Wightman +""" +# Copyright (c) 2021, Habana Labs Ltd. All rights reserved. + +# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# MIT License +# +# Copyright (c) 2019 cybertronai +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
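+
+# Illustrative usage (editor's sketch, not part of the original sources): Lamb follows the
+# standard torch.optim interface, e.g.
+#
+#   import torch
+#   model = torch.nn.Linear(10, 10)                    # placeholder module
+#   optimizer = Lamb(model.parameters(), lr=5e-3, weight_decay=0.02)
+#
+# It can also be selected through this package's optimizer factory (optim_factory.py),
+# presumably via an opt string such as 'lamb'.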
+import math + +import torch +from torch.optim import Optimizer + + +class Lamb(Optimizer): + """Implements a pure pytorch variant of FuseLAMB (NvLamb variant) optimizer from apex.optimizers.FusedLAMB + reference: https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py + + LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups. + lr (float, optional): learning rate. (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its norm. (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability. (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + grad_averaging (bool, optional): whether apply (1-beta2) to grad when + calculating running averages of gradient. (default: True) + max_grad_norm (float, optional): value used to clip global grad norm (default: 1.0) + trust_clip (bool): enable LAMBC trust ratio clipping (default: False) + always_adapt (boolean, optional): Apply adaptive learning rate to 0.0 + weight decay parameter (default: False) + + .. _Large Batch Optimization for Deep Learning - Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 + .. _On the Convergence of Adam and Beyond: + https://openreview.net/forum?id=ryQu7f-RZ + """ + + def __init__( + self, params, lr=1e-3, bias_correction=True, betas=(0.9, 0.999), eps=1e-6, + weight_decay=0.01, grad_averaging=True, max_grad_norm=1.0, trust_clip=False, always_adapt=False): + defaults = dict( + lr=lr, bias_correction=bias_correction, betas=betas, eps=eps, weight_decay=weight_decay, + grad_averaging=grad_averaging, max_grad_norm=max_grad_norm, + trust_clip=trust_clip, always_adapt=always_adapt) + super().__init__(params, defaults) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
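+
+        Note (editor's summary of the code below): before any per-parameter work, the step
+        computes the global gradient norm over all parameter groups and divides every gradient
+        by $\max(1, \|g\|_{global} / max\_grad\_norm)$, mirroring the FusedLAMB behaviour
+        described in the module docstring.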
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + device = self.param_groups[0]['params'][0].device + one_tensor = torch.tensor(1.0, device=device) # because torch.where doesn't handle scalars correctly + global_grad_norm = torch.zeros(1, device=device) + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('Lamb does not support sparse gradients, consider SparseAdam instad.') + global_grad_norm.add_(grad.pow(2).sum()) + + global_grad_norm = torch.sqrt(global_grad_norm) + # FIXME it'd be nice to remove explicit tensor conversion of scalars when torch.where promotes + # scalar types properly https://github.com/pytorch/pytorch/issues/9190 + max_grad_norm = torch.tensor(self.defaults['max_grad_norm'], device=device) + clip_global_grad_norm = torch.where( + global_grad_norm > max_grad_norm, + global_grad_norm / max_grad_norm, + one_tensor) + + for group in self.param_groups: + bias_correction = 1 if group['bias_correction'] else 0 + beta1, beta2 = group['betas'] + grad_averaging = 1 if group['grad_averaging'] else 0 + beta3 = 1 - beta1 if grad_averaging else 1.0 + + # assume same step across group now to simplify things + # per parameter step can be easily support by making it tensor, or pass list into kernel + if 'step' in group: + group['step'] += 1 + else: + group['step'] = 1 + + if bias_correction: + bias_correction1 = 1 - beta1 ** group['step'] + bias_correction2 = 1 - beta2 ** group['step'] + else: + bias_correction1, bias_correction2 = 1.0, 1.0 + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.div_(clip_global_grad_norm) + state = self.state[p] + + # State initialization + if len(state) == 0: + # Exponential moving average of gradient valuesa + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=beta3) # m_t + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) # v_t + + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(group['eps']) + update = (exp_avg / bias_correction1).div_(denom) + + weight_decay = group['weight_decay'] + if weight_decay != 0: + update.add_(p, alpha=weight_decay) + + if weight_decay != 0 or group['always_adapt']: + # Layer-wise LR adaptation. By default, skip adaptation on parameters that are + # excluded from weight decay, unless always_adapt == True, then always enabled. 
+ w_norm = p.norm(2.0) + g_norm = update.norm(2.0) + # FIXME nested where required since logical and/or not working in PT XLA + trust_ratio = torch.where( + w_norm > 0, + torch.where(g_norm > 0, w_norm / g_norm, one_tensor), + one_tensor, + ) + if group['trust_clip']: + # LAMBC trust clipping, upper bound fixed at one + trust_ratio = torch.minimum(trust_ratio, one_tensor) + update.mul_(trust_ratio) + + p.add_(update, alpha=-group['lr']) + + return loss diff --git a/custom_timm/optim/lars.py b/custom_timm/optim/lars.py new file mode 100644 index 0000000000000000000000000000000000000000..38ca9e0b5cb90855104ce7b5ff358cb7fa343f12 --- /dev/null +++ b/custom_timm/optim/lars.py @@ -0,0 +1,135 @@ +""" PyTorch LARS / LARC Optimizer + +An implementation of LARS (SGD) + LARC in PyTorch + +Based on: + * PyTorch SGD: https://github.com/pytorch/pytorch/blob/1.7/torch/optim/sgd.py#L100 + * NVIDIA APEX LARC: https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py + +Additional cleanup and modifications to properly support PyTorch XLA. + +Copyright 2021 Ross Wightman +""" +import torch +from torch.optim.optimizer import Optimizer + + +class Lars(Optimizer): + """ LARS for PyTorch + + Paper: `Large batch training of Convolutional Networks` - https://arxiv.org/pdf/1708.03888.pdf + + Args: + params (iterable): iterable of parameters to optimize or dicts defining parameter groups. + lr (float, optional): learning rate (default: 1.0). + momentum (float, optional): momentum factor (default: 0) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + dampening (float, optional): dampening for momentum (default: 0) + nesterov (bool, optional): enables Nesterov momentum (default: False) + trust_coeff (float): trust coefficient for computing adaptive lr / trust_ratio (default: 0.001) + eps (float): eps for division denominator (default: 1e-8) + trust_clip (bool): enable LARC trust ratio clipping (default: False) + always_adapt (bool): always apply LARS LR adapt, otherwise only when group weight_decay != 0 (default: False) + """ + + def __init__( + self, + params, + lr=1.0, + momentum=0, + dampening=0, + weight_decay=0, + nesterov=False, + trust_coeff=0.001, + eps=1e-8, + trust_clip=False, + always_adapt=False, + ): + if lr < 0.0: + raise ValueError(f"Invalid learning rate: {lr}") + if momentum < 0.0: + raise ValueError(f"Invalid momentum value: {momentum}") + if weight_decay < 0.0: + raise ValueError(f"Invalid weight_decay value: {weight_decay}") + if nesterov and (momentum <= 0 or dampening != 0): + raise ValueError("Nesterov momentum requires a momentum and zero dampening") + + defaults = dict( + lr=lr, + momentum=momentum, + dampening=dampening, + weight_decay=weight_decay, + nesterov=nesterov, + trust_coeff=trust_coeff, + eps=eps, + trust_clip=trust_clip, + always_adapt=always_adapt, + ) + super().__init__(params, defaults) + + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault("nesterov", False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Args: + closure (callable, optional): A closure that reevaluates the model and returns the loss. 
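+            In short: for each parameter tensor the gradient (plus weight decay) is rescaled
+            by a layer-wise trust ratio, roughly
+            trust_coeff * ||p|| / (||grad|| + weight_decay * ||p|| + eps),
+            before the usual SGD momentum update is applied.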
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + device = self.param_groups[0]['params'][0].device + one_tensor = torch.tensor(1.0, device=device) # because torch.where doesn't handle scalars correctly + + for group in self.param_groups: + weight_decay = group['weight_decay'] + momentum = group['momentum'] + dampening = group['dampening'] + nesterov = group['nesterov'] + trust_coeff = group['trust_coeff'] + eps = group['eps'] + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + + # apply LARS LR adaptation, LARC clipping, weight decay + # ref: https://github.com/NVIDIA/apex/blob/master/apex/parallel/LARC.py + if weight_decay != 0 or group['always_adapt']: + w_norm = p.norm(2.0) + g_norm = grad.norm(2.0) + trust_ratio = trust_coeff * w_norm / (g_norm + w_norm * weight_decay + eps) + # FIXME nested where required since logical and/or not working in PT XLA + trust_ratio = torch.where( + w_norm > 0, + torch.where(g_norm > 0, trust_ratio, one_tensor), + one_tensor, + ) + if group['trust_clip']: + trust_ratio = torch.minimum(trust_ratio / group['lr'], one_tensor) + grad.add_(p, alpha=weight_decay) + grad.mul_(trust_ratio) + + # apply SGD update https://github.com/pytorch/pytorch/blob/1.7/torch/optim/sgd.py#L100 + if momentum != 0: + param_state = self.state[p] + if 'momentum_buffer' not in param_state: + buf = param_state['momentum_buffer'] = torch.clone(grad).detach() + else: + buf = param_state['momentum_buffer'] + buf.mul_(momentum).add_(grad, alpha=1. - dampening) + if nesterov: + grad = grad.add(buf, alpha=momentum) + else: + grad = buf + + p.add_(grad, alpha=-group['lr']) + + return loss \ No newline at end of file diff --git a/custom_timm/optim/lookahead.py b/custom_timm/optim/lookahead.py new file mode 100644 index 0000000000000000000000000000000000000000..462c3acd247016a94acd39a27dd44f29ae854d31 --- /dev/null +++ b/custom_timm/optim/lookahead.py @@ -0,0 +1,61 @@ +""" Lookahead Optimizer Wrapper. 
+Implementation modified from: https://github.com/alphadl/lookahead.pytorch +Paper: `Lookahead Optimizer: k steps forward, 1 step back` - https://arxiv.org/abs/1907.08610 + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +from torch.optim.optimizer import Optimizer +from collections import defaultdict + + +class Lookahead(Optimizer): + def __init__(self, base_optimizer, alpha=0.5, k=6): + # NOTE super().__init__() not called on purpose + if not 0.0 <= alpha <= 1.0: + raise ValueError(f'Invalid slow update rate: {alpha}') + if not 1 <= k: + raise ValueError(f'Invalid lookahead steps: {k}') + defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0) + self._base_optimizer = base_optimizer + self.param_groups = base_optimizer.param_groups + self.defaults = base_optimizer.defaults + self.defaults.update(defaults) + self.state = defaultdict(dict) + # manually add our defaults to the param groups + for name, default in defaults.items(): + for group in self._base_optimizer.param_groups: + group.setdefault(name, default) + + @torch.no_grad() + def update_slow(self, group): + for fast_p in group["params"]: + if fast_p.grad is None: + continue + param_state = self._base_optimizer.state[fast_p] + if 'lookahead_slow_buff' not in param_state: + param_state['lookahead_slow_buff'] = torch.empty_like(fast_p) + param_state['lookahead_slow_buff'].copy_(fast_p) + slow = param_state['lookahead_slow_buff'] + slow.add_(fast_p - slow, alpha=group['lookahead_alpha']) + fast_p.copy_(slow) + + def sync_lookahead(self): + for group in self._base_optimizer.param_groups: + self.update_slow(group) + + @torch.no_grad() + def step(self, closure=None): + loss = self._base_optimizer.step(closure) + for group in self._base_optimizer.param_groups: + group['lookahead_step'] += 1 + if group['lookahead_step'] % group['lookahead_k'] == 0: + self.update_slow(group) + return loss + + def state_dict(self): + return self._base_optimizer.state_dict() + + def load_state_dict(self, state_dict): + self._base_optimizer.load_state_dict(state_dict) + self.param_groups = self._base_optimizer.param_groups diff --git a/custom_timm/optim/madgrad.py b/custom_timm/optim/madgrad.py new file mode 100644 index 0000000000000000000000000000000000000000..a76713bf27ed1daf0ce598ac5f25c6238c7fdb57 --- /dev/null +++ b/custom_timm/optim/madgrad.py @@ -0,0 +1,184 @@ +""" PyTorch MADGRAD optimizer + +MADGRAD: https://arxiv.org/abs/2101.11075 + +Code from: https://github.com/facebookresearch/madgrad +""" +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import TYPE_CHECKING, Any, Callable, Optional + +import torch +import torch.optim + +if TYPE_CHECKING: + from torch.optim.optimizer import _params_t +else: + _params_t = Any + + +class MADGRAD(torch.optim.Optimizer): + """ + MADGRAD_: A Momentumized, Adaptive, Dual Averaged Gradient Method for Stochastic + Optimization. + + .. _MADGRAD: https://arxiv.org/abs/2101.11075 + + MADGRAD is a general purpose optimizer that can be used in place of SGD or + Adam may converge faster and generalize better. Currently GPU-only. + Typically, the same learning rate schedule that is used for SGD or Adam may + be used. The overall learning rate is not comparable to either method and + should be determined by a hyper-parameter sweep. + + MADGRAD requires less weight decay than other methods, often as little as + zero. 
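+    When weight decay is used, the `decoupled_decay=True` option applies it AdamW-style by
+    scaling the parameters directly by (1 - lr * weight_decay) instead of adding it to the
+    gradient.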
Momentum values used for SGD or Adam's beta1 should work here also. + + On sparse problems both weight_decay and momentum should be set to 0. + + Arguments: + params (iterable): + Iterable of parameters to optimize or dicts defining parameter groups. + lr (float): + Learning rate (default: 1e-2). + momentum (float): + Momentum value in the range [0,1) (default: 0.9). + weight_decay (float): + Weight decay, i.e. a L2 penalty (default: 0). + eps (float): + Term added to the denominator outside of the root operation to improve numerical stability. (default: 1e-6). + """ + + def __init__( + self, + params: _params_t, + lr: float = 1e-2, + momentum: float = 0.9, + weight_decay: float = 0, + eps: float = 1e-6, + decoupled_decay: bool = False, + ): + if momentum < 0 or momentum >= 1: + raise ValueError(f"Momentum {momentum} must be in the range [0,1]") + if lr <= 0: + raise ValueError(f"Learning rate {lr} must be positive") + if weight_decay < 0: + raise ValueError(f"Weight decay {weight_decay} must be non-negative") + if eps < 0: + raise ValueError(f"Eps must be non-negative") + + defaults = dict( + lr=lr, eps=eps, momentum=momentum, weight_decay=weight_decay, decoupled_decay=decoupled_decay) + super().__init__(params, defaults) + + @property + def supports_memory_efficient_fp16(self) -> bool: + return False + + @property + def supports_flat_params(self) -> bool: + return True + + @torch.no_grad() + def step(self, closure: Optional[Callable[[], float]] = None) -> Optional[float]: + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + eps = group['eps'] + lr = group['lr'] + eps + weight_decay = group['weight_decay'] + momentum = group['momentum'] + ck = 1 - momentum + + for p in group["params"]: + if p.grad is None: + continue + grad = p.grad + if momentum != 0.0 and grad.is_sparse: + raise RuntimeError("momentum != 0 is not compatible with sparse gradients") + + state = self.state[p] + if len(state) == 0: + state['step'] = 0 + state['grad_sum_sq'] = torch.zeros_like(p) + state['s'] = torch.zeros_like(p) + if momentum != 0: + state['x0'] = torch.clone(p).detach() + + state['step'] += 1 + grad_sum_sq = state['grad_sum_sq'] + s = state['s'] + lamb = lr * math.sqrt(state['step']) + + # Apply weight decay + if weight_decay != 0: + if group['decoupled_decay']: + p.mul_(1.0 - group['lr'] * weight_decay) + else: + if grad.is_sparse: + raise RuntimeError("weight_decay option is not compatible with sparse gradients") + grad.add_(p, alpha=weight_decay) + + if grad.is_sparse: + grad = grad.coalesce() + grad_val = grad._values() + + p_masked = p.sparse_mask(grad) + grad_sum_sq_masked = grad_sum_sq.sparse_mask(grad) + s_masked = s.sparse_mask(grad) + + # Compute x_0 from other known quantities + rms_masked_vals = grad_sum_sq_masked._values().pow(1 / 3).add_(eps) + x0_masked_vals = p_masked._values().addcdiv(s_masked._values(), rms_masked_vals, value=1) + + # Dense + sparse op + grad_sq = grad * grad + grad_sum_sq.add_(grad_sq, alpha=lamb) + grad_sum_sq_masked.add_(grad_sq, alpha=lamb) + + rms_masked_vals = grad_sum_sq_masked._values().pow_(1 / 3).add_(eps) + + s.add_(grad, alpha=lamb) + s_masked._values().add_(grad_val, alpha=lamb) + + # update masked copy of p + p_kp1_masked_vals = x0_masked_vals.addcdiv(s_masked._values(), rms_masked_vals, value=-1) + # Copy updated masked p to dense p 
using an add operation + p_masked._values().add_(p_kp1_masked_vals, alpha=-1) + p.add_(p_masked, alpha=-1) + else: + if momentum == 0: + # Compute x_0 from other known quantities + rms = grad_sum_sq.pow(1 / 3).add_(eps) + x0 = p.addcdiv(s, rms, value=1) + else: + x0 = state['x0'] + + # Accumulate second moments + grad_sum_sq.addcmul_(grad, grad, value=lamb) + rms = grad_sum_sq.pow(1 / 3).add_(eps) + + # Update s + s.add_(grad, alpha=lamb) + + # Step + if momentum == 0: + p.copy_(x0.addcdiv(s, rms, value=-1)) + else: + z = x0.addcdiv(s, rms, value=-1) + + # p is a moving average of z + p.mul_(1 - ck).add_(z, alpha=ck) + + return loss diff --git a/custom_timm/optim/nadam.py b/custom_timm/optim/nadam.py new file mode 100644 index 0000000000000000000000000000000000000000..6268d5d451ed2fe26b47e46476dc1feee7da9649 --- /dev/null +++ b/custom_timm/optim/nadam.py @@ -0,0 +1,92 @@ +import math + +import torch +from torch.optim.optimizer import Optimizer + + +class Nadam(Optimizer): + """Implements Nadam algorithm (a variant of Adam based on Nesterov momentum). + + It has been proposed in `Incorporating Nesterov Momentum into Adam`__. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 2e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + schedule_decay (float, optional): momentum schedule decay (default: 4e-3) + + __ http://cs229.stanford.edu/proj2015/054_report.pdf + __ http://www.cs.toronto.edu/~fritz/absps/momentum.pdf + + Originally taken from: https://github.com/pytorch/pytorch/pull/1408 + NOTE: Has potential issues but does work well on some problems. + """ + + def __init__(self, params, lr=2e-3, betas=(0.9, 0.999), eps=1e-8, + weight_decay=0, schedule_decay=4e-3): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + defaults = dict( + lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, schedule_decay=schedule_decay) + super(Nadam, self).__init__(params, defaults) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + state['m_schedule'] = 1. + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + + # Warming momentum schedule + m_schedule = state['m_schedule'] + schedule_decay = group['schedule_decay'] + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + eps = group['eps'] + state['step'] += 1 + t = state['step'] + bias_correction2 = 1 - beta2 ** t + + if group['weight_decay'] != 0: + grad = grad.add(p, alpha=group['weight_decay']) + + momentum_cache_t = beta1 * (1. - 0.5 * (0.96 ** (t * schedule_decay))) + momentum_cache_t_1 = beta1 * (1. 
- 0.5 * (0.96 ** ((t + 1) * schedule_decay))) + m_schedule_new = m_schedule * momentum_cache_t + m_schedule_next = m_schedule * momentum_cache_t * momentum_cache_t_1 + state['m_schedule'] = m_schedule_new + + # Decay the first and second moment running average coefficient + exp_avg.mul_(beta1).add_(grad, alpha=1. - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1. - beta2) + + denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(eps) + p.addcdiv_(grad, denom, value=-group['lr'] * (1. - momentum_cache_t) / (1. - m_schedule_new)) + p.addcdiv_(exp_avg, denom, value=-group['lr'] * momentum_cache_t_1 / (1. - m_schedule_next)) + + return loss diff --git a/custom_timm/optim/nvnovograd.py b/custom_timm/optim/nvnovograd.py new file mode 100644 index 0000000000000000000000000000000000000000..fda3f4a620fcca5593034dfb9683f2c8f3b78ac1 --- /dev/null +++ b/custom_timm/optim/nvnovograd.py @@ -0,0 +1,120 @@ +""" Nvidia NovoGrad Optimizer. +Original impl by Nvidia from Jasper example: + - https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/SpeechRecognition/Jasper +Paper: `Stochastic Gradient Methods with Layer-wise Adaptive Moments for Training of Deep Networks` + - https://arxiv.org/abs/1905.11286 +""" + +import torch +from torch.optim.optimizer import Optimizer +import math + + +class NvNovoGrad(Optimizer): + """ + Implements Novograd algorithm. + + Args: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.95, 0.98)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + grad_averaging: gradient averaging + amsgrad (boolean, optional): whether to use the AMSGrad variant of this + algorithm from the paper `On the Convergence of Adam and Beyond`_ + (default: False) + """ + + def __init__(self, params, lr=1e-3, betas=(0.95, 0.98), eps=1e-8, + weight_decay=0, grad_averaging=False, amsgrad=False): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, + grad_averaging=grad_averaging, + amsgrad=amsgrad) + + super(NvNovoGrad, self).__init__(params, defaults) + + def __setstate__(self, state): + super(NvNovoGrad, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('Sparse gradients are not supported.') + amsgrad = group['amsgrad'] + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros([]).to(state['exp_avg'].device) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + if amsgrad: + max_exp_avg_sq = state['max_exp_avg_sq'] + beta1, beta2 = group['betas'] + + state['step'] += 1 + + norm = torch.sum(torch.pow(grad, 2)) + + if exp_avg_sq == 0: + exp_avg_sq.copy_(norm) + else: + exp_avg_sq.mul_(beta2).add_(norm, alpha=1 - beta2) + + if amsgrad: + # Maintains the maximum of all 2nd moment running avg. till now + torch.max(max_exp_avg_sq, exp_avg_sq, out=max_exp_avg_sq) + # Use the max. for normalizing running avg. of gradient + denom = max_exp_avg_sq.sqrt().add_(group['eps']) + else: + denom = exp_avg_sq.sqrt().add_(group['eps']) + + grad.div_(denom) + if group['weight_decay'] != 0: + grad.add_(p, alpha=group['weight_decay']) + if group['grad_averaging']: + grad.mul_(1 - beta1) + exp_avg.mul_(beta1).add_(grad) + + p.add_(exp_avg, alpha=-group['lr']) + + return loss diff --git a/custom_timm/optim/optim_factory.py b/custom_timm/optim/optim_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..4acaec67bb094c870b5ecd34b41b14a172de8bdd --- /dev/null +++ b/custom_timm/optim/optim_factory.py @@ -0,0 +1,340 @@ +""" Optimizer Factory w/ Custom Weight Decay +Hacked together by / Copyright 2021 Ross Wightman +""" +import logging +from itertools import islice +from typing import Optional, Callable, Tuple + +import torch +import torch.nn as nn +import torch.optim as optim + +from custom_timm.models.helpers import group_parameters + +from .adabelief import AdaBelief +from .adafactor import Adafactor +from .adahessian import Adahessian +from .adamp import AdamP +from .lamb import Lamb +from .lars import Lars +from .lookahead import Lookahead +from .madgrad import MADGRAD +from .nadam import Nadam +from .nvnovograd import NvNovoGrad +from .radam import RAdam +from .rmsprop_tf import RMSpropTF +from .sgdp import SGDP + +try: + from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD + has_apex = True +except ImportError: + has_apex = False + +_logger = logging.getLogger(__name__) + + +def param_groups_weight_decay( + model: nn.Module, + weight_decay=1e-5, + no_weight_decay_list=() +): + no_weight_decay_list = set(no_weight_decay_list) + decay = [] + no_decay = [] + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + if param.ndim <= 1 or name.endswith(".bias") or name in no_weight_decay_list: + no_decay.append(param) + else: + decay.append(param) + + return [ + {'params': no_decay, 'weight_decay': 0.}, + {'params': decay, 'weight_decay': weight_decay}] + + +def _group(it, size): + it = iter(it) + return iter(lambda: tuple(islice(it, size)), ()) + + +def _layer_map(model, layers_per_group=12, num_groups=None): + def _in_head(n, hp): + if not hp: + return True + elif 
isinstance(hp, (tuple, list)): + return any([n.startswith(hpi) for hpi in hp]) + else: + return n.startswith(hp) + + head_prefix = getattr(model, 'pretrained_cfg', {}).get('classifier', None) + names_trunk = [] + names_head = [] + for n, _ in model.named_parameters(): + names_head.append(n) if _in_head(n, head_prefix) else names_trunk.append(n) + + # group non-head layers + num_trunk_layers = len(names_trunk) + if num_groups is not None: + layers_per_group = -(num_trunk_layers // -num_groups) + names_trunk = list(_group(names_trunk, layers_per_group)) + + num_trunk_groups = len(names_trunk) + layer_map = {n: i for i, l in enumerate(names_trunk) for n in l} + layer_map.update({n: num_trunk_groups for n in names_head}) + return layer_map + + +def param_groups_layer_decay( + model: nn.Module, + weight_decay: float = 0.05, + no_weight_decay_list: Tuple[str] = (), + layer_decay: float = .75, + end_layer_decay: Optional[float] = None, + verbose: bool = False, +): + """ + Parameter groups for layer-wise lr decay & weight decay + Based on BEiT: https://github.com/microsoft/unilm/blob/master/beit/optim_factory.py#L58 + """ + no_weight_decay_list = set(no_weight_decay_list) + param_group_names = {} # NOTE for debugging + param_groups = {} + + if hasattr(model, 'group_matcher'): + # FIXME interface needs more work + layer_map = group_parameters(model, model.group_matcher(coarse=False), reverse=True) + else: + # fallback + layer_map = _layer_map(model) + num_layers = max(layer_map.values()) + 1 + layer_max = num_layers - 1 + layer_scales = list(layer_decay ** (layer_max - i) for i in range(num_layers)) + + for name, param in model.named_parameters(): + if not param.requires_grad: + continue + + # no decay: all 1D parameters and model specific ones + if param.ndim == 1 or name in no_weight_decay_list: + g_decay = "no_decay" + this_decay = 0. + else: + g_decay = "decay" + this_decay = weight_decay + + layer_id = layer_map.get(name, layer_max) + group_name = "layer_%d_%s" % (layer_id, g_decay) + + if group_name not in param_groups: + this_scale = layer_scales[layer_id] + param_group_names[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "param_names": [], + } + param_groups[group_name] = { + "lr_scale": this_scale, + "weight_decay": this_decay, + "params": [], + } + + param_group_names[group_name]["param_names"].append(name) + param_groups[group_name]["params"].append(param) + + if verbose: + import json + _logger.info("parameter groups: \n%s" % json.dumps(param_group_names, indent=2)) + + return list(param_groups.values()) + + +def optimizer_kwargs(cfg): + """ cfg/argparse to kwargs helper + Convert optimizer args in argparse args or cfg like object to keyword args for updated create fn. + """ + kwargs = dict( + opt=cfg.opt, + lr=cfg.lr, + weight_decay=cfg.weight_decay, + momentum=cfg.momentum) + if getattr(cfg, 'opt_eps', None) is not None: + kwargs['eps'] = cfg.opt_eps + if getattr(cfg, 'opt_betas', None) is not None: + kwargs['betas'] = cfg.opt_betas + if getattr(cfg, 'layer_decay', None) is not None: + kwargs['layer_decay'] = cfg.layer_decay + if getattr(cfg, 'opt_args', None) is not None: + kwargs.update(cfg.opt_args) + return kwargs + + +def create_optimizer(args, model, filter_bias_and_bn=True): + """ Legacy optimizer factory for backwards compatibility. + NOTE: Use create_optimizer_v2 for new code. 
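+    The v2 interface it wraps can also be called directly, e.g. (illustrative sketch)::
+
+        optimizer = create_optimizer_v2(model, opt='adamw', lr=1e-3, weight_decay=0.05)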
+ """ + return create_optimizer_v2( + model, + **optimizer_kwargs(cfg=args), + filter_bias_and_bn=filter_bias_and_bn, + ) + + +def create_optimizer_v2( + model_or_params, + opt: str = 'sgd', + lr: Optional[float] = None, + weight_decay: float = 0., + momentum: float = 0.9, + filter_bias_and_bn: bool = True, + layer_decay: Optional[float] = None, + param_group_fn: Optional[Callable] = None, + **kwargs): + """ Create an optimizer. + + TODO currently the model is passed in and all parameters are selected for optimization. + For more general use an interface that allows selection of parameters to optimize and lr groups, one of: + * a filter fn interface that further breaks params into groups in a weight_decay compatible fashion + * expose the parameters interface and leave it up to caller + + Args: + model_or_params (nn.Module): model containing parameters to optimize + opt: name of optimizer to create + lr: initial learning rate + weight_decay: weight decay to apply in optimizer + momentum: momentum for momentum based optimizers (others may use betas via kwargs) + filter_bias_and_bn: filter out bias, bn and other 1d params from weight decay + **kwargs: extra optimizer specific kwargs to pass through + + Returns: + Optimizer + """ + if isinstance(model_or_params, nn.Module): + # a model was passed in, extract parameters and add weight decays to appropriate layers + no_weight_decay = {} + if hasattr(model_or_params, 'no_weight_decay'): + no_weight_decay = model_or_params.no_weight_decay() + + if param_group_fn: + parameters = param_group_fn(model_or_params) + elif layer_decay is not None: + parameters = param_groups_layer_decay( + model_or_params, + weight_decay=weight_decay, + layer_decay=layer_decay, + no_weight_decay_list=no_weight_decay) + weight_decay = 0. + elif weight_decay and filter_bias_and_bn: + parameters = param_groups_weight_decay(model_or_params, weight_decay, no_weight_decay) + weight_decay = 0. 
+ else: + parameters = model_or_params.parameters() + else: + # iterable of parameters or param groups passed in + parameters = model_or_params + + opt_lower = opt.lower() + opt_split = opt_lower.split('_') + opt_lower = opt_split[-1] + if 'fused' in opt_lower: + assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' + + opt_args = dict(weight_decay=weight_decay, **kwargs) + if lr is not None: + opt_args.setdefault('lr', lr) + + # basic SGD & related + if opt_lower == 'sgd' or opt_lower == 'nesterov': + # NOTE 'sgd' refers to SGD + nesterov momentum for legacy / backwards compat reasons + opt_args.pop('eps', None) + optimizer = optim.SGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'momentum': + opt_args.pop('eps', None) + optimizer = optim.SGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'sgdp': + optimizer = SGDP(parameters, momentum=momentum, nesterov=True, **opt_args) + + # adaptive + elif opt_lower == 'adam': + optimizer = optim.Adam(parameters, **opt_args) + elif opt_lower == 'adamw': + optimizer = optim.AdamW(parameters, **opt_args) + elif opt_lower == 'adamp': + optimizer = AdamP(parameters, wd_ratio=0.01, nesterov=True, **opt_args) + elif opt_lower == 'nadam': + try: + # NOTE PyTorch >= 1.10 should have native NAdam + optimizer = optim.Nadam(parameters, **opt_args) + except AttributeError: + optimizer = Nadam(parameters, **opt_args) + elif opt_lower == 'radam': + optimizer = RAdam(parameters, **opt_args) + elif opt_lower == 'adamax': + optimizer = optim.Adamax(parameters, **opt_args) + elif opt_lower == 'adabelief': + optimizer = AdaBelief(parameters, rectify=False, **opt_args) + elif opt_lower == 'radabelief': + optimizer = AdaBelief(parameters, rectify=True, **opt_args) + elif opt_lower == 'adadelta': + optimizer = optim.Adadelta(parameters, **opt_args) + elif opt_lower == 'adagrad': + opt_args.setdefault('eps', 1e-8) + optimizer = optim.Adagrad(parameters, **opt_args) + elif opt_lower == 'adafactor': + optimizer = Adafactor(parameters, **opt_args) + elif opt_lower == 'lamb': + optimizer = Lamb(parameters, **opt_args) + elif opt_lower == 'lambc': + optimizer = Lamb(parameters, trust_clip=True, **opt_args) + elif opt_lower == 'larc': + optimizer = Lars(parameters, momentum=momentum, trust_clip=True, **opt_args) + elif opt_lower == 'lars': + optimizer = Lars(parameters, momentum=momentum, **opt_args) + elif opt_lower == 'nlarc': + optimizer = Lars(parameters, momentum=momentum, trust_clip=True, nesterov=True, **opt_args) + elif opt_lower == 'nlars': + optimizer = Lars(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower == 'madgrad': + optimizer = MADGRAD(parameters, momentum=momentum, **opt_args) + elif opt_lower == 'madgradw': + optimizer = MADGRAD(parameters, momentum=momentum, decoupled_decay=True, **opt_args) + elif opt_lower == 'novograd' or opt_lower == 'nvnovograd': + optimizer = NvNovoGrad(parameters, **opt_args) + elif opt_lower == 'rmsprop': + optimizer = optim.RMSprop(parameters, alpha=0.9, momentum=momentum, **opt_args) + elif opt_lower == 'rmsproptf': + optimizer = RMSpropTF(parameters, alpha=0.9, momentum=momentum, **opt_args) + + # second order + elif opt_lower == 'adahessian': + optimizer = Adahessian(parameters, **opt_args) + + # NVIDIA fused optimizers, require APEX to be installed + elif opt_lower == 'fusedsgd': + opt_args.pop('eps', None) + optimizer = FusedSGD(parameters, momentum=momentum, nesterov=True, **opt_args) + elif opt_lower 
== 'fusedmomentum': + opt_args.pop('eps', None) + optimizer = FusedSGD(parameters, momentum=momentum, nesterov=False, **opt_args) + elif opt_lower == 'fusedadam': + optimizer = FusedAdam(parameters, adam_w_mode=False, **opt_args) + elif opt_lower == 'fusedadamw': + optimizer = FusedAdam(parameters, adam_w_mode=True, **opt_args) + elif opt_lower == 'fusedlamb': + optimizer = FusedLAMB(parameters, **opt_args) + elif opt_lower == 'fusednovograd': + opt_args.setdefault('betas', (0.95, 0.98)) + optimizer = FusedNovoGrad(parameters, **opt_args) + + else: + assert False and "Invalid optimizer" + raise ValueError + + if len(opt_split) > 1: + if opt_split[0] == 'lookahead': + optimizer = Lookahead(optimizer) + + return optimizer diff --git a/custom_timm/optim/radam.py b/custom_timm/optim/radam.py new file mode 100644 index 0000000000000000000000000000000000000000..eb8d22e06c42e487c831297008851b4adc254d78 --- /dev/null +++ b/custom_timm/optim/radam.py @@ -0,0 +1,89 @@ +"""RAdam Optimizer. +Implementation lifted from: https://github.com/LiyuanLucasLiu/RAdam +Paper: `On the Variance of the Adaptive Learning Rate and Beyond` - https://arxiv.org/abs/1908.03265 +""" +import math +import torch +from torch.optim.optimizer import Optimizer + + +class RAdam(Optimizer): + + def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0): + defaults = dict( + lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, + buffer=[[None, None, None] for _ in range(10)]) + super(RAdam, self).__init__(params, defaults) + + def __setstate__(self, state): + super(RAdam, self).__setstate__(state) + + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad.float() + if grad.is_sparse: + raise RuntimeError('RAdam does not support sparse gradients') + + p_fp32 = p.float() + + state = self.state[p] + + if len(state) == 0: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_fp32) + state['exp_avg_sq'] = torch.zeros_like(p_fp32) + else: + state['exp_avg'] = state['exp_avg'].type_as(p_fp32) + state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_fp32) + + exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] + beta1, beta2 = group['betas'] + + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) + + state['step'] += 1 + buffered = group['buffer'][int(state['step'] % 10)] + if state['step'] == buffered[0]: + num_sma, step_size = buffered[1], buffered[2] + else: + buffered[0] = state['step'] + beta2_t = beta2 ** state['step'] + num_sma_max = 2 / (1 - beta2) - 1 + num_sma = num_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + buffered[1] = num_sma + + # more conservative since it's an approximated value + if num_sma >= 5: + step_size = group['lr'] * math.sqrt( + (1 - beta2_t) * + (num_sma - 4) / (num_sma_max - 4) * + (num_sma - 2) / num_sma * + num_sma_max / (num_sma_max - 2)) / (1 - beta1 ** state['step']) + else: + step_size = group['lr'] / (1 - beta1 ** state['step']) + buffered[2] = step_size + + if group['weight_decay'] != 0: + p_fp32.add_(p_fp32, alpha=-group['weight_decay'] * group['lr']) + + # more conservative since it's an approximated value + if num_sma >= 5: + denom = exp_avg_sq.sqrt().add_(group['eps']) + p_fp32.addcdiv_(exp_avg, denom, value=-step_size) + else: + p_fp32.add_(exp_avg, alpha=-step_size) + + p.copy_(p_fp32) + + return 
loss diff --git a/custom_timm/optim/rmsprop_tf.py b/custom_timm/optim/rmsprop_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..0817887db380261dfee3fcd4bd155b5d923f5248 --- /dev/null +++ b/custom_timm/optim/rmsprop_tf.py @@ -0,0 +1,139 @@ +""" RMSProp modified to behave like Tensorflow impl + +Originally cut & paste from PyTorch RMSProp +https://github.com/pytorch/pytorch/blob/063946d2b3f3f1e953a2a3b54e0b34f1393de295/torch/optim/rmsprop.py +Licensed under BSD-Clause 3 (ish), https://github.com/pytorch/pytorch/blob/master/LICENSE + +Modifications Copyright 2021 Ross Wightman +""" + +import torch +from torch.optim import Optimizer + + +class RMSpropTF(Optimizer): + """Implements RMSprop algorithm (TensorFlow style epsilon) + + NOTE: This is a direct cut-and-paste of PyTorch RMSprop with eps applied before sqrt + and a few other modifications to closer match Tensorflow for matching hyper-params. + + Noteworthy changes include: + 1. Epsilon applied inside square-root + 2. square_avg initialized to ones + 3. LR scaling of update accumulated in momentum buffer + + Proposed by G. Hinton in his + `course `_. + + The centered version first appears in `Generating Sequences + With Recurrent Neural Networks `_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-2) + momentum (float, optional): momentum factor (default: 0) + alpha (float, optional): smoothing (decay) constant (default: 0.9) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-10) + centered (bool, optional) : if ``True``, compute the centered RMSProp, + the gradient is normalized by an estimation of its variance + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + decoupled_decay (bool, optional): decoupled weight decay as per https://arxiv.org/abs/1711.05101 + lr_in_momentum (bool, optional): learning rate scaling is included in the momentum buffer + update as per defaults in Tensorflow + + """ + + def __init__(self, params, lr=1e-2, alpha=0.9, eps=1e-10, weight_decay=0, momentum=0., centered=False, + decoupled_decay=False, lr_in_momentum=True): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= momentum: + raise ValueError("Invalid momentum value: {}".format(momentum)) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + if not 0.0 <= alpha: + raise ValueError("Invalid alpha value: {}".format(alpha)) + + defaults = dict( + lr=lr, momentum=momentum, alpha=alpha, eps=eps, centered=centered, weight_decay=weight_decay, + decoupled_decay=decoupled_decay, lr_in_momentum=lr_in_momentum) + super(RMSpropTF, self).__init__(params, defaults) + + def __setstate__(self, state): + super(RMSpropTF, self).__setstate__(state) + for group in self.param_groups: + group.setdefault('momentum', 0) + group.setdefault('centered', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
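+            The update below follows the TensorFlow ordering of operations: the squared-gradient
+            average is blended as square_avg += (1 - alpha) * (grad**2 - square_avg), eps is added
+            inside the square root, and (with the default lr_in_momentum=True) the learning rate is
+            folded into the momentum buffer rather than applied to the final parameter update.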
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + if grad.is_sparse: + raise RuntimeError('RMSprop does not support sparse gradients') + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + state['square_avg'] = torch.ones_like(p) # PyTorch inits to zero + if group['momentum'] > 0: + state['momentum_buffer'] = torch.zeros_like(p) + if group['centered']: + state['grad_avg'] = torch.zeros_like(p) + + square_avg = state['square_avg'] + one_minus_alpha = 1. - group['alpha'] + + state['step'] += 1 + + if group['weight_decay'] != 0: + if group['decoupled_decay']: + p.mul_(1. - group['lr'] * group['weight_decay']) + else: + grad = grad.add(p, alpha=group['weight_decay']) + + # Tensorflow order of ops for updating squared avg + square_avg.add_(grad.pow(2) - square_avg, alpha=one_minus_alpha) + # square_avg.mul_(alpha).addcmul_(grad, grad, value=1 - alpha) # PyTorch original + + if group['centered']: + grad_avg = state['grad_avg'] + grad_avg.add_(grad - grad_avg, alpha=one_minus_alpha) + avg = square_avg.addcmul(grad_avg, grad_avg, value=-1).add(group['eps']).sqrt_() # eps in sqrt + # grad_avg.mul_(alpha).add_(grad, alpha=1 - alpha) # PyTorch original + else: + avg = square_avg.add(group['eps']).sqrt_() # eps moved in sqrt + + if group['momentum'] > 0: + buf = state['momentum_buffer'] + # Tensorflow accumulates the LR scaling in the momentum buffer + if group['lr_in_momentum']: + buf.mul_(group['momentum']).addcdiv_(grad, avg, value=group['lr']) + p.add_(-buf) + else: + # PyTorch scales the param update by LR + buf.mul_(group['momentum']).addcdiv_(grad, avg) + p.add_(buf, alpha=-group['lr']) + else: + p.addcdiv_(grad, avg, value=-group['lr']) + + return loss diff --git a/custom_timm/optim/sgdp.py b/custom_timm/optim/sgdp.py new file mode 100644 index 0000000000000000000000000000000000000000..baf05fa55c632371498ec53ff679b11023429df6 --- /dev/null +++ b/custom_timm/optim/sgdp.py @@ -0,0 +1,70 @@ +""" +SGDP Optimizer Implementation copied from https://github.com/clovaai/AdamP/blob/master/adamp/sgdp.py + +Paper: `Slowing Down the Weight Norm Increase in Momentum-based Optimizers` - https://arxiv.org/abs/2006.08217 +Code: https://github.com/clovaai/AdamP + +Copyright (c) 2020-present NAVER Corp. +MIT license +""" + +import torch +import torch.nn.functional as F +from torch.optim.optimizer import Optimizer, required +import math + +from .adamp import projection + + +class SGDP(Optimizer): + def __init__(self, params, lr=required, momentum=0, dampening=0, + weight_decay=0, nesterov=False, eps=1e-8, delta=0.1, wd_ratio=0.1): + defaults = dict( + lr=lr, momentum=momentum, dampening=dampening, weight_decay=weight_decay, + nesterov=nesterov, eps=eps, delta=delta, wd_ratio=wd_ratio) + super(SGDP, self).__init__(params, defaults) + + @torch.no_grad() + def step(self, closure=None): + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + weight_decay = group['weight_decay'] + momentum = group['momentum'] + dampening = group['dampening'] + nesterov = group['nesterov'] + + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + state = self.state[p] + + # State initialization + if len(state) == 0: + state['momentum'] = torch.zeros_like(p) + + # SGD + buf = state['momentum'] + buf.mul_(momentum).add_(grad, alpha=1. 
- dampening) + if nesterov: + d_p = grad + momentum * buf + else: + d_p = buf + + # Projection + wd_ratio = 1. + if len(p.shape) > 1: + d_p, wd_ratio = projection(p, grad, d_p, group['delta'], group['wd_ratio'], group['eps']) + + # Weight decay + if weight_decay != 0: + p.mul_(1. - group['lr'] * group['weight_decay'] * wd_ratio / (1-momentum)) + + # Step + p.add_(d_p, alpha=-group['lr']) + + return loss diff --git a/custom_timm/scheduler/__init__.py b/custom_timm/scheduler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f1961b88fc3c37cdd8c73f9fddd4bfa1ada95f23 --- /dev/null +++ b/custom_timm/scheduler/__init__.py @@ -0,0 +1,8 @@ +from .cosine_lr import CosineLRScheduler +from .multistep_lr import MultiStepLRScheduler +from .plateau_lr import PlateauLRScheduler +from .poly_lr import PolyLRScheduler +from .step_lr import StepLRScheduler +from .tanh_lr import TanhLRScheduler + +from .scheduler_factory import create_scheduler diff --git a/custom_timm/scheduler/cosine_lr.py b/custom_timm/scheduler/cosine_lr.py new file mode 100644 index 0000000000000000000000000000000000000000..84ee349ec281f89e331be3643b613e158bb3c194 --- /dev/null +++ b/custom_timm/scheduler/cosine_lr.py @@ -0,0 +1,119 @@ +""" Cosine Scheduler + +Cosine LR schedule with warmup, cycle/restarts, noise, k-decay. + +Hacked together by / Copyright 2021 Ross Wightman +""" +import logging +import math +import numpy as np +import torch + +from .scheduler import Scheduler + + +_logger = logging.getLogger(__name__) + + +class CosineLRScheduler(Scheduler): + """ + Cosine decay with restarts. + This is described in the paper https://arxiv.org/abs/1608.03983. + + Inspiration from + https://github.com/allenai/allennlp/blob/master/allennlp/training/learning_rate_schedulers/cosine.py + + k-decay option based on `k-decay: A New Method For Learning Rate Schedule` - https://arxiv.org/abs/2004.05909 + """ + + def __init__(self, + optimizer: torch.optim.Optimizer, + t_initial: int, + lr_min: float = 0., + cycle_mul: float = 1., + cycle_decay: float = 1., + cycle_limit: int = 1, + warmup_t=0, + warmup_lr_init=0, + warmup_prefix=False, + t_in_epochs=True, + noise_range_t=None, + noise_pct=0.67, + noise_std=1.0, + noise_seed=42, + k_decay=1.0, + initialize=True) -> None: + super().__init__( + optimizer, param_group_field="lr", + noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, + initialize=initialize) + + assert t_initial > 0 + assert lr_min >= 0 + if t_initial == 1 and cycle_mul == 1 and cycle_decay == 1: + _logger.warning("Cosine annealing scheduler will have no effect on the learning " + "rate since t_initial = t_mul = eta_mul = 1.") + self.t_initial = t_initial + self.lr_min = lr_min + self.cycle_mul = cycle_mul + self.cycle_decay = cycle_decay + self.cycle_limit = cycle_limit + self.warmup_t = warmup_t + self.warmup_lr_init = warmup_lr_init + self.warmup_prefix = warmup_prefix + self.t_in_epochs = t_in_epochs + self.k_decay = k_decay + if self.warmup_t: + self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] + super().update_groups(self.warmup_lr_init) + else: + self.warmup_steps = [1 for _ in self.base_values] + + def _get_lr(self, t): + if t < self.warmup_t: + lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] + else: + if self.warmup_prefix: + t = t - self.warmup_t + + if self.cycle_mul != 1: + i = math.floor(math.log(1 - t / self.t_initial * (1 - self.cycle_mul), self.cycle_mul)) + t_i = self.cycle_mul ** i * 
self.t_initial + t_curr = t - (1 - self.cycle_mul ** i) / (1 - self.cycle_mul) * self.t_initial + else: + i = t // self.t_initial + t_i = self.t_initial + t_curr = t - (self.t_initial * i) + + gamma = self.cycle_decay ** i + lr_max_values = [v * gamma for v in self.base_values] + k = self.k_decay + + if i < self.cycle_limit: + lrs = [ + self.lr_min + 0.5 * (lr_max - self.lr_min) * (1 + math.cos(math.pi * t_curr ** k / t_i ** k)) + for lr_max in lr_max_values + ] + else: + lrs = [self.lr_min for _ in self.base_values] + + return lrs + + def get_epoch_values(self, epoch: int): + if self.t_in_epochs: + return self._get_lr(epoch) + else: + return None + + def get_update_values(self, num_updates: int): + if not self.t_in_epochs: + return self._get_lr(num_updates) + else: + return None + + def get_cycle_length(self, cycles=0): + cycles = max(1, cycles or self.cycle_limit) + if self.cycle_mul == 1.0: + return self.t_initial * cycles + else: + return int(math.floor(-self.t_initial * (self.cycle_mul ** cycles - 1) / (1 - self.cycle_mul))) diff --git a/custom_timm/scheduler/multistep_lr.py b/custom_timm/scheduler/multistep_lr.py new file mode 100644 index 0000000000000000000000000000000000000000..8b0ca920307fa4ee6e63340d76ca278b729091e3 --- /dev/null +++ b/custom_timm/scheduler/multistep_lr.py @@ -0,0 +1,65 @@ +""" MultiStep LR Scheduler + +Basic multi step LR schedule with warmup, noise. +""" +import torch +import bisect +from custom_timm.scheduler.scheduler import Scheduler +from typing import List + +class MultiStepLRScheduler(Scheduler): + """ + """ + + def __init__(self, + optimizer: torch.optim.Optimizer, + decay_t: List[int], + decay_rate: float = 1., + warmup_t=0, + warmup_lr_init=0, + t_in_epochs=True, + noise_range_t=None, + noise_pct=0.67, + noise_std=1.0, + noise_seed=42, + initialize=True, + ) -> None: + super().__init__( + optimizer, param_group_field="lr", + noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, + initialize=initialize) + + self.decay_t = decay_t + self.decay_rate = decay_rate + self.warmup_t = warmup_t + self.warmup_lr_init = warmup_lr_init + self.t_in_epochs = t_in_epochs + if self.warmup_t: + self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] + super().update_groups(self.warmup_lr_init) + else: + self.warmup_steps = [1 for _ in self.base_values] + + def get_curr_decay_steps(self, t): + # find where in the array t goes, + # assumes self.decay_t is sorted + return bisect.bisect_right(self.decay_t, t+1) + + def _get_lr(self, t): + if t < self.warmup_t: + lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] + else: + lrs = [v * (self.decay_rate ** self.get_curr_decay_steps(t)) for v in self.base_values] + return lrs + + def get_epoch_values(self, epoch: int): + if self.t_in_epochs: + return self._get_lr(epoch) + else: + return None + + def get_update_values(self, num_updates: int): + if not self.t_in_epochs: + return self._get_lr(num_updates) + else: + return None diff --git a/custom_timm/scheduler/plateau_lr.py b/custom_timm/scheduler/plateau_lr.py new file mode 100644 index 0000000000000000000000000000000000000000..cacfab3ce7f073c9a99037ed85259fa3286f51ad --- /dev/null +++ b/custom_timm/scheduler/plateau_lr.py @@ -0,0 +1,103 @@ +""" Plateau Scheduler + +Adapts PyTorch plateau scheduler and allows application of noise, warmup. 
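+
+A thin wrapper around torch.optim.lr_scheduler.ReduceLROnPlateau that adds warmup and the
+shared LR-noise options. Typical use (sketch): call `scheduler.step(epoch, metric=val_metric)`
+once per epoch with the monitored validation metric.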
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch + +from .scheduler import Scheduler + + +class PlateauLRScheduler(Scheduler): + """Decay the LR by a factor every time the validation loss plateaus.""" + + def __init__(self, + optimizer, + decay_rate=0.1, + patience_t=10, + verbose=True, + threshold=1e-4, + cooldown_t=0, + warmup_t=0, + warmup_lr_init=0, + lr_min=0, + mode='max', + noise_range_t=None, + noise_type='normal', + noise_pct=0.67, + noise_std=1.0, + noise_seed=None, + initialize=True, + ): + super().__init__( + optimizer, + 'lr', + noise_range_t=noise_range_t, + noise_type=noise_type, + noise_pct=noise_pct, + noise_std=noise_std, + noise_seed=noise_seed, + initialize=initialize, + ) + + self.lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( + self.optimizer, + patience=patience_t, + factor=decay_rate, + verbose=verbose, + threshold=threshold, + cooldown=cooldown_t, + mode=mode, + min_lr=lr_min + ) + + self.warmup_t = warmup_t + self.warmup_lr_init = warmup_lr_init + if self.warmup_t: + self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] + super().update_groups(self.warmup_lr_init) + else: + self.warmup_steps = [1 for _ in self.base_values] + self.restore_lr = None + + def state_dict(self): + return { + 'best': self.lr_scheduler.best, + 'last_epoch': self.lr_scheduler.last_epoch, + } + + def load_state_dict(self, state_dict): + self.lr_scheduler.best = state_dict['best'] + if 'last_epoch' in state_dict: + self.lr_scheduler.last_epoch = state_dict['last_epoch'] + + # override the base class step fn completely + def step(self, epoch, metric=None): + if epoch <= self.warmup_t: + lrs = [self.warmup_lr_init + epoch * s for s in self.warmup_steps] + super().update_groups(lrs) + else: + if self.restore_lr is not None: + # restore actual LR from before our last noise perturbation before stepping base + for i, param_group in enumerate(self.optimizer.param_groups): + param_group['lr'] = self.restore_lr[i] + self.restore_lr = None + + self.lr_scheduler.step(metric, epoch) # step the base scheduler + + if self._is_apply_noise(epoch): + self._apply_noise(epoch) + + def _apply_noise(self, epoch): + noise = self._calculate_noise(epoch) + + # apply the noise on top of previous LR, cache the old value so we can restore for normal + # stepping of base scheduler + restore_lr = [] + for i, param_group in enumerate(self.optimizer.param_groups): + old_lr = float(param_group['lr']) + restore_lr.append(old_lr) + new_lr = old_lr + old_lr * noise + param_group['lr'] = new_lr + self.restore_lr = restore_lr diff --git a/custom_timm/scheduler/poly_lr.py b/custom_timm/scheduler/poly_lr.py new file mode 100644 index 0000000000000000000000000000000000000000..9c351be6ed56f8fe130cd391df0a7a7f89c7a96c --- /dev/null +++ b/custom_timm/scheduler/poly_lr.py @@ -0,0 +1,116 @@ +""" Polynomial Scheduler + +Polynomial LR schedule with warmup, noise. 
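+
+Within a cycle the schedule roughly follows
+    lr(t) = lr_min + (lr_max - lr_min) * (1 - t**k / t_i**k) ** power
+where t_i is the current cycle length and k is the optional k-decay exponent.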
+ +Hacked together by / Copyright 2021 Ross Wightman +""" +import math +import logging + +import torch + +from .scheduler import Scheduler + + +_logger = logging.getLogger(__name__) + + +class PolyLRScheduler(Scheduler): + """ Polynomial LR Scheduler w/ warmup, noise, and k-decay + + k-decay option based on `k-decay: A New Method For Learning Rate Schedule` - https://arxiv.org/abs/2004.05909 + """ + + def __init__(self, + optimizer: torch.optim.Optimizer, + t_initial: int, + power: float = 0.5, + lr_min: float = 0., + cycle_mul: float = 1., + cycle_decay: float = 1., + cycle_limit: int = 1, + warmup_t=0, + warmup_lr_init=0, + warmup_prefix=False, + t_in_epochs=True, + noise_range_t=None, + noise_pct=0.67, + noise_std=1.0, + noise_seed=42, + k_decay=1.0, + initialize=True) -> None: + super().__init__( + optimizer, param_group_field="lr", + noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, + initialize=initialize) + + assert t_initial > 0 + assert lr_min >= 0 + if t_initial == 1 and cycle_mul == 1 and cycle_decay == 1: + _logger.warning("Cosine annealing scheduler will have no effect on the learning " + "rate since t_initial = t_mul = eta_mul = 1.") + self.t_initial = t_initial + self.power = power + self.lr_min = lr_min + self.cycle_mul = cycle_mul + self.cycle_decay = cycle_decay + self.cycle_limit = cycle_limit + self.warmup_t = warmup_t + self.warmup_lr_init = warmup_lr_init + self.warmup_prefix = warmup_prefix + self.t_in_epochs = t_in_epochs + self.k_decay = k_decay + if self.warmup_t: + self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] + super().update_groups(self.warmup_lr_init) + else: + self.warmup_steps = [1 for _ in self.base_values] + + def _get_lr(self, t): + if t < self.warmup_t: + lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] + else: + if self.warmup_prefix: + t = t - self.warmup_t + + if self.cycle_mul != 1: + i = math.floor(math.log(1 - t / self.t_initial * (1 - self.cycle_mul), self.cycle_mul)) + t_i = self.cycle_mul ** i * self.t_initial + t_curr = t - (1 - self.cycle_mul ** i) / (1 - self.cycle_mul) * self.t_initial + else: + i = t // self.t_initial + t_i = self.t_initial + t_curr = t - (self.t_initial * i) + + gamma = self.cycle_decay ** i + lr_max_values = [v * gamma for v in self.base_values] + k = self.k_decay + + if i < self.cycle_limit: + lrs = [ + self.lr_min + (lr_max - self.lr_min) * (1 - t_curr ** k / t_i ** k) ** self.power + for lr_max in lr_max_values + ] + else: + lrs = [self.lr_min for _ in self.base_values] + + return lrs + + def get_epoch_values(self, epoch: int): + if self.t_in_epochs: + return self._get_lr(epoch) + else: + return None + + def get_update_values(self, num_updates: int): + if not self.t_in_epochs: + return self._get_lr(num_updates) + else: + return None + + def get_cycle_length(self, cycles=0): + cycles = max(1, cycles or self.cycle_limit) + if self.cycle_mul == 1.0: + return self.t_initial * cycles + else: + return int(math.floor(-self.t_initial * (self.cycle_mul ** cycles - 1) / (1 - self.cycle_mul))) diff --git a/custom_timm/scheduler/scheduler.py b/custom_timm/scheduler/scheduler.py new file mode 100644 index 0000000000000000000000000000000000000000..af20be9b59d2fecfd813785ea6bc06093f57858d --- /dev/null +++ b/custom_timm/scheduler/scheduler.py @@ -0,0 +1,117 @@ +from typing import Dict, Any + +import torch + + +class Scheduler: + """ Parameter Scheduler Base Class + A scheduler base class that can be used to schedule any optimizer 
parameter groups. + + Unlike the builtin PyTorch schedulers, this is intended to be consistently called + * At the END of each epoch, before incrementing the epoch count, to calculate next epoch's value + * At the END of each optimizer update, after incrementing the update count, to calculate next update's value + + The schedulers built on this should try to remain as stateless as possible (for simplicity). + + This family of schedulers is attempting to avoid the confusion of the meaning of 'last_epoch' + and -1 values for special behaviour. All epoch and update counts must be tracked in the training + code and explicitly passed in to the schedulers on the corresponding step or step_update call. + + Based on ideas from: + * https://github.com/pytorch/fairseq/tree/master/fairseq/optim/lr_scheduler + * https://github.com/allenai/allennlp/tree/master/allennlp/training/learning_rate_schedulers + """ + + def __init__(self, + optimizer: torch.optim.Optimizer, + param_group_field: str, + noise_range_t=None, + noise_type='normal', + noise_pct=0.67, + noise_std=1.0, + noise_seed=None, + initialize: bool = True) -> None: + self.optimizer = optimizer + self.param_group_field = param_group_field + self._initial_param_group_field = f"initial_{param_group_field}" + if initialize: + for i, group in enumerate(self.optimizer.param_groups): + if param_group_field not in group: + raise KeyError(f"{param_group_field} missing from param_groups[{i}]") + group.setdefault(self._initial_param_group_field, group[param_group_field]) + else: + for i, group in enumerate(self.optimizer.param_groups): + if self._initial_param_group_field not in group: + raise KeyError(f"{self._initial_param_group_field} missing from param_groups[{i}]") + self.base_values = [group[self._initial_param_group_field] for group in self.optimizer.param_groups] + self.metric = None # any point to having this for all? 
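+        # LR "noise" settings: when the current epoch/update t falls inside noise_range_t
+        # (a (start, end) pair, or a single start point), the scheduled value v is perturbed
+        # multiplicatively to v + v * noise, with noise drawn from a generator seeded by
+        # noise_seed + t so results stay reproducible.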
+ self.noise_range_t = noise_range_t + self.noise_pct = noise_pct + self.noise_type = noise_type + self.noise_std = noise_std + self.noise_seed = noise_seed if noise_seed is not None else 42 + self.update_groups(self.base_values) + + def state_dict(self) -> Dict[str, Any]: + return {key: value for key, value in self.__dict__.items() if key != 'optimizer'} + + def load_state_dict(self, state_dict: Dict[str, Any]) -> None: + self.__dict__.update(state_dict) + + def get_epoch_values(self, epoch: int): + return None + + def get_update_values(self, num_updates: int): + return None + + def step(self, epoch: int, metric: float = None) -> None: + self.metric = metric + values = self.get_epoch_values(epoch) + if values is not None: + values = self._add_noise(values, epoch) + self.update_groups(values) + + def step_update(self, num_updates: int, metric: float = None): + self.metric = metric + values = self.get_update_values(num_updates) + if values is not None: + values = self._add_noise(values, num_updates) + self.update_groups(values) + + def update_groups(self, values): + if not isinstance(values, (list, tuple)): + values = [values] * len(self.optimizer.param_groups) + for param_group, value in zip(self.optimizer.param_groups, values): + if 'lr_scale' in param_group: + param_group[self.param_group_field] = value * param_group['lr_scale'] + else: + param_group[self.param_group_field] = value + + def _add_noise(self, lrs, t): + if self._is_apply_noise(t): + noise = self._calculate_noise(t) + lrs = [v + v * noise for v in lrs] + return lrs + + def _is_apply_noise(self, t) -> bool: + """Return True if scheduler in noise range.""" + apply_noise = False + if self.noise_range_t is not None: + if isinstance(self.noise_range_t, (list, tuple)): + apply_noise = self.noise_range_t[0] <= t < self.noise_range_t[1] + else: + apply_noise = t >= self.noise_range_t + return apply_noise + + def _calculate_noise(self, t) -> float: + g = torch.Generator() + g.manual_seed(self.noise_seed + t) + if self.noise_type == 'normal': + while True: + # resample if noise out of percent limit, brute force but shouldn't spin much + noise = torch.randn(1, generator=g).item() + if abs(noise) < self.noise_pct: + return noise + else: + noise = 2 * (torch.rand(1, generator=g).item() - 0.5) * self.noise_pct + return noise diff --git a/custom_timm/scheduler/scheduler_factory.py b/custom_timm/scheduler/scheduler_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..3e100fe029c3bc2405d3cae0695376603dd78618 --- /dev/null +++ b/custom_timm/scheduler/scheduler_factory.py @@ -0,0 +1,107 @@ +""" Scheduler Factory +Hacked together by / Copyright 2021 Ross Wightman +""" +from .cosine_lr import CosineLRScheduler +from .multistep_lr import MultiStepLRScheduler +from .plateau_lr import PlateauLRScheduler +from .poly_lr import PolyLRScheduler +from .step_lr import StepLRScheduler +from .tanh_lr import TanhLRScheduler + + +def create_scheduler(args, optimizer): + num_epochs = args.epochs + + if getattr(args, 'lr_noise', None) is not None: + lr_noise = getattr(args, 'lr_noise') + if isinstance(lr_noise, (list, tuple)): + noise_range = [n * num_epochs for n in lr_noise] + if len(noise_range) == 1: + noise_range = noise_range[0] + else: + noise_range = lr_noise * num_epochs + else: + noise_range = None + noise_args = dict( + noise_range_t=noise_range, + noise_pct=getattr(args, 'lr_noise_pct', 0.67), + noise_std=getattr(args, 'lr_noise_std', 1.), + noise_seed=getattr(args, 'seed', 42), + ) + cycle_args = dict( + 
cycle_mul=getattr(args, 'lr_cycle_mul', 1.), + cycle_decay=getattr(args, 'lr_cycle_decay', 0.1), + cycle_limit=getattr(args, 'lr_cycle_limit', 1), + ) + + lr_scheduler = None + if args.sched == 'cosine': + lr_scheduler = CosineLRScheduler( + optimizer, + t_initial=num_epochs, + lr_min=args.min_lr, + warmup_lr_init=args.warmup_lr, + warmup_t=args.warmup_epochs, + k_decay=getattr(args, 'lr_k_decay', 1.0), + **cycle_args, + **noise_args, + ) + num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs + elif args.sched == 'tanh': + lr_scheduler = TanhLRScheduler( + optimizer, + t_initial=num_epochs, + lr_min=args.min_lr, + warmup_lr_init=args.warmup_lr, + warmup_t=args.warmup_epochs, + t_in_epochs=True, + **cycle_args, + **noise_args, + ) + num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs + elif args.sched == 'step': + lr_scheduler = StepLRScheduler( + optimizer, + decay_t=args.decay_epochs, + decay_rate=args.decay_rate, + warmup_lr_init=args.warmup_lr, + warmup_t=args.warmup_epochs, + **noise_args, + ) + elif args.sched == 'multistep': + lr_scheduler = MultiStepLRScheduler( + optimizer, + decay_t=args.decay_milestones, + decay_rate=args.decay_rate, + warmup_lr_init=args.warmup_lr, + warmup_t=args.warmup_epochs, + **noise_args, + ) + elif args.sched == 'plateau': + mode = 'min' if 'loss' in getattr(args, 'eval_metric', '') else 'max' + lr_scheduler = PlateauLRScheduler( + optimizer, + decay_rate=args.decay_rate, + patience_t=args.patience_epochs, + lr_min=args.min_lr, + mode=mode, + warmup_lr_init=args.warmup_lr, + warmup_t=args.warmup_epochs, + cooldown_t=0, + **noise_args, + ) + elif args.sched == 'poly': + lr_scheduler = PolyLRScheduler( + optimizer, + power=args.decay_rate, # overloading 'decay_rate' as polynomial power + t_initial=num_epochs, + lr_min=args.min_lr, + warmup_lr_init=args.warmup_lr, + warmup_t=args.warmup_epochs, + k_decay=getattr(args, 'lr_k_decay', 1.0), + **cycle_args, + **noise_args, + ) + num_epochs = lr_scheduler.get_cycle_length() + args.cooldown_epochs + + return lr_scheduler, num_epochs diff --git a/custom_timm/scheduler/step_lr.py b/custom_timm/scheduler/step_lr.py new file mode 100644 index 0000000000000000000000000000000000000000..f797e1a8cf35999531dd5f1ccbbe09a9d0cf30a9 --- /dev/null +++ b/custom_timm/scheduler/step_lr.py @@ -0,0 +1,63 @@ +""" Step Scheduler + +Basic step LR schedule with warmup, noise. 
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +import math +import torch + +from .scheduler import Scheduler + + +class StepLRScheduler(Scheduler): + """ Step LR schedule: decay the base LR by decay_rate every decay_t epochs (or updates), with optional warmup and noise. + """ + + def __init__(self, + optimizer: torch.optim.Optimizer, + decay_t: float, + decay_rate: float = 1., + warmup_t=0, + warmup_lr_init=0, + t_in_epochs=True, + noise_range_t=None, + noise_pct=0.67, + noise_std=1.0, + noise_seed=42, + initialize=True, + ) -> None: + super().__init__( + optimizer, param_group_field="lr", + noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, + initialize=initialize) + + self.decay_t = decay_t + self.decay_rate = decay_rate + self.warmup_t = warmup_t + self.warmup_lr_init = warmup_lr_init + self.t_in_epochs = t_in_epochs + if self.warmup_t: + self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values] + super().update_groups(self.warmup_lr_init) + else: + self.warmup_steps = [1 for _ in self.base_values] + + def _get_lr(self, t): + if t < self.warmup_t: + lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] + else: + lrs = [v * (self.decay_rate ** (t // self.decay_t)) for v in self.base_values] + return lrs + + def get_epoch_values(self, epoch: int): + if self.t_in_epochs: + return self._get_lr(epoch) + else: + return None + + def get_update_values(self, num_updates: int): + if not self.t_in_epochs: + return self._get_lr(num_updates) + else: + return None diff --git a/custom_timm/scheduler/tanh_lr.py b/custom_timm/scheduler/tanh_lr.py new file mode 100644 index 0000000000000000000000000000000000000000..f2d3c9cdb11ad31766062f1a8d3e69d3f845edc1 --- /dev/null +++ b/custom_timm/scheduler/tanh_lr.py @@ -0,0 +1,117 @@ +""" TanH Scheduler + +TanH schedule with warmup, cycle/restarts, noise. + +Hacked together by / Copyright 2021 Ross Wightman +""" +import logging +import math +import numpy as np +import torch + +from .scheduler import Scheduler + + +_logger = logging.getLogger(__name__) + + +class TanhLRScheduler(Scheduler): + """ + Hyperbolic-Tangent decay with restarts.
+ This is described in the paper https://arxiv.org/abs/1806.01593 + """ + + def __init__(self, + optimizer: torch.optim.Optimizer, + t_initial: int, + lb: float = -7., + ub: float = 3., + lr_min: float = 0., + cycle_mul: float = 1., + cycle_decay: float = 1., + cycle_limit: int = 1, + warmup_t=0, + warmup_lr_init=0, + warmup_prefix=False, + t_in_epochs=True, + noise_range_t=None, + noise_pct=0.67, + noise_std=1.0, + noise_seed=42, + initialize=True) -> None: + super().__init__( + optimizer, param_group_field="lr", + noise_range_t=noise_range_t, noise_pct=noise_pct, noise_std=noise_std, noise_seed=noise_seed, + initialize=initialize) + + assert t_initial > 0 + assert lr_min >= 0 + assert lb < ub + assert cycle_limit >= 0 + assert warmup_t >= 0 + assert warmup_lr_init >= 0 + self.lb = lb + self.ub = ub + self.t_initial = t_initial + self.lr_min = lr_min + self.cycle_mul = cycle_mul + self.cycle_decay = cycle_decay + self.cycle_limit = cycle_limit + self.warmup_t = warmup_t + self.warmup_lr_init = warmup_lr_init + self.warmup_prefix = warmup_prefix + self.t_in_epochs = t_in_epochs + if self.warmup_t: + t_v = self.base_values if self.warmup_prefix else self._get_lr(self.warmup_t) + self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in t_v] + super().update_groups(self.warmup_lr_init) + else: + self.warmup_steps = [1 for _ in self.base_values] + + def _get_lr(self, t): + if t < self.warmup_t: + lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps] + else: + if self.warmup_prefix: + t = t - self.warmup_t + + if self.cycle_mul != 1: + i = math.floor(math.log(1 - t / self.t_initial * (1 - self.cycle_mul), self.cycle_mul)) + t_i = self.cycle_mul ** i * self.t_initial + t_curr = t - (1 - self.cycle_mul ** i) / (1 - self.cycle_mul) * self.t_initial + else: + i = t // self.t_initial + t_i = self.t_initial + t_curr = t - (self.t_initial * i) + + if i < self.cycle_limit: + gamma = self.cycle_decay ** i + lr_max_values = [v * gamma for v in self.base_values] + + tr = t_curr / t_i + lrs = [ + self.lr_min + 0.5 * (lr_max - self.lr_min) * (1 - math.tanh(self.lb * (1. 
- tr) + self.ub * tr)) + for lr_max in lr_max_values + ] + else: + lrs = [self.lr_min for _ in self.base_values] + return lrs + + def get_epoch_values(self, epoch: int): + if self.t_in_epochs: + return self._get_lr(epoch) + else: + return None + + def get_update_values(self, num_updates: int): + if not self.t_in_epochs: + return self._get_lr(num_updates) + else: + return None + + def get_cycle_length(self, cycles=0): + cycles = max(1, cycles or self.cycle_limit) + if self.cycle_mul == 1.0: + return self.t_initial * cycles + else: + return int(math.floor(-self.t_initial * (self.cycle_mul ** cycles - 1) / (1 - self.cycle_mul))) diff --git a/custom_timm/utils/__init__.py b/custom_timm/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..7b139852d79644f97de7cf373a1a4c3dbd17f050 --- /dev/null +++ b/custom_timm/utils/__init__.py @@ -0,0 +1,14 @@ +from .agc import adaptive_clip_grad +from .checkpoint_saver import CheckpointSaver +from .clip_grad import dispatch_clip_grad +from .cuda import ApexScaler, NativeScaler +from .decay_batch import decay_batch_step, check_batch_size_retry +from .distributed import distribute_bn, reduce_tensor +from .jit import set_jit_legacy, set_jit_fuser +from .log import setup_default_logging, FormatterNoInfo +from .metrics import AverageMeter, accuracy +from .misc import natural_key, add_bool_arg +from .model import unwrap_model, get_state_dict, freeze, unfreeze +from .model_ema import ModelEma, ModelEmaV2 +from .random import random_seed +from .summary import update_summary, get_outdir diff --git a/custom_timm/utils/agc.py b/custom_timm/utils/agc.py new file mode 100644 index 0000000000000000000000000000000000000000..f51401726ff6810d97d0fa567f4e31b474325a59 --- /dev/null +++ b/custom_timm/utils/agc.py @@ -0,0 +1,42 @@ +""" Adaptive Gradient Clipping + +An impl of AGC, as per (https://arxiv.org/abs/2102.06171): + +@article{brock2021high, + author={Andrew Brock and Soham De and Samuel L. 
Smith and Karen Simonyan}, + title={High-Performance Large-Scale Image Recognition Without Normalization}, + journal={arXiv preprint arXiv:}, + year={2021} +} + +Code references: + * Official JAX impl (paper authors): https://github.com/deepmind/deepmind-research/tree/master/nfnets + * Phil Wang's PyTorch gist: https://gist.github.com/lucidrains/0d6560077edac419ab5d3aa29e674d5c + +Hacked together by / Copyright 2021 Ross Wightman +""" +import torch + + +def unitwise_norm(x, norm_type=2.0): + if x.ndim <= 1: + return x.norm(norm_type) + else: + # works for nn.ConvNd and nn,Linear where output dim is first in the kernel/weight tensor + # might need special cases for other weights (possibly MHA) where this may not be true + return x.norm(norm_type, dim=tuple(range(1, x.ndim)), keepdim=True) + + +def adaptive_clip_grad(parameters, clip_factor=0.01, eps=1e-3, norm_type=2.0): + if isinstance(parameters, torch.Tensor): + parameters = [parameters] + for p in parameters: + if p.grad is None: + continue + p_data = p.detach() + g_data = p.grad.detach() + max_norm = unitwise_norm(p_data, norm_type=norm_type).clamp_(min=eps).mul_(clip_factor) + grad_norm = unitwise_norm(g_data, norm_type=norm_type) + clipped_grad = g_data * (max_norm / grad_norm.clamp(min=1e-6)) + new_grads = torch.where(grad_norm < max_norm, g_data, clipped_grad) + p.grad.detach().copy_(new_grads) diff --git a/custom_timm/utils/checkpoint_saver.py b/custom_timm/utils/checkpoint_saver.py new file mode 100644 index 0000000000000000000000000000000000000000..6aad74ee52655f68220f799efaffcbccdd0748ad --- /dev/null +++ b/custom_timm/utils/checkpoint_saver.py @@ -0,0 +1,150 @@ +""" Checkpoint Saver + +Track top-n training checkpoints and maintain recovery checkpoints on specified intervals. + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import glob +import operator +import os +import logging + +import torch + +from .model import unwrap_model, get_state_dict + + +_logger = logging.getLogger(__name__) + + +class CheckpointSaver: + def __init__( + self, + model, + optimizer, + args=None, + model_ema=None, + amp_scaler=None, + checkpoint_prefix='checkpoint', + recovery_prefix='recovery', + checkpoint_dir='', + recovery_dir='', + decreasing=False, + max_history=10, + unwrap_fn=unwrap_model): + + # objects to save state_dicts of + self.model = model + self.optimizer = optimizer + self.args = args + self.model_ema = model_ema + self.amp_scaler = amp_scaler + + # state + self.checkpoint_files = [] # (filename, metric) tuples in order of decreasing betterness + self.best_epoch = None + self.best_metric = None + self.curr_recovery_file = '' + self.last_recovery_file = '' + + # config + self.checkpoint_dir = checkpoint_dir + self.recovery_dir = recovery_dir + self.save_prefix = checkpoint_prefix + self.recovery_prefix = recovery_prefix + self.extension = '.pth.tar' + self.decreasing = decreasing # a lower metric is better if True + self.cmp = operator.lt if decreasing else operator.gt # True if lhs better than rhs + self.max_history = max_history + self.unwrap_fn = unwrap_fn + assert self.max_history >= 1 + + def save_checkpoint(self, epoch, metric=None): + assert epoch >= 0 + tmp_save_path = os.path.join(self.checkpoint_dir, 'tmp' + self.extension) + last_save_path = os.path.join(self.checkpoint_dir, 'last' + self.extension) + self._save(tmp_save_path, epoch, metric) + if os.path.exists(last_save_path): + os.unlink(last_save_path) # required for Windows support. 
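+ # promote the freshly written tmp checkpoint to 'last'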
+ os.rename(tmp_save_path, last_save_path) + worst_file = self.checkpoint_files[-1] if self.checkpoint_files else None + if (len(self.checkpoint_files) < self.max_history + or metric is None or self.cmp(metric, worst_file[1])): + if len(self.checkpoint_files) >= self.max_history: + self._cleanup_checkpoints(1) + filename = '-'.join([self.save_prefix, str(epoch)]) + self.extension + save_path = os.path.join(self.checkpoint_dir, filename) + os.link(last_save_path, save_path) + self.checkpoint_files.append((save_path, metric)) + self.checkpoint_files = sorted( + self.checkpoint_files, key=lambda x: x[1], + reverse=not self.decreasing) # sort in descending order if a lower metric is not better + + checkpoints_str = "Current checkpoints:\n" + for c in self.checkpoint_files: + checkpoints_str += ' {}\n'.format(c) + _logger.info(checkpoints_str) + + if metric is not None and (self.best_metric is None or self.cmp(metric, self.best_metric)): + self.best_epoch = epoch + self.best_metric = metric + best_save_path = os.path.join(self.checkpoint_dir, 'model_best' + self.extension) + if os.path.exists(best_save_path): + os.unlink(best_save_path) + os.link(last_save_path, best_save_path) + + return (None, None) if self.best_metric is None else (self.best_metric, self.best_epoch) + + def _save(self, save_path, epoch, metric=None): + save_state = { + 'epoch': epoch, + 'arch': type(self.model).__name__.lower(), + 'state_dict': get_state_dict(self.model, self.unwrap_fn), + 'optimizer': self.optimizer.state_dict(), + 'version': 2, # version < 2 increments epoch before save + } + if self.args is not None: + save_state['arch'] = self.args.model + save_state['args'] = self.args + if self.amp_scaler is not None: + save_state[self.amp_scaler.state_dict_key] = self.amp_scaler.state_dict() + if self.model_ema is not None: + save_state['state_dict_ema'] = get_state_dict(self.model_ema, self.unwrap_fn) + if metric is not None: + save_state['metric'] = metric + torch.save(save_state, save_path) + + def _cleanup_checkpoints(self, trim=0): + trim = min(len(self.checkpoint_files), trim) + delete_index = self.max_history - trim + if delete_index < 0 or len(self.checkpoint_files) <= delete_index: + return + to_delete = self.checkpoint_files[delete_index:] + for d in to_delete: + try: + _logger.debug("Cleaning checkpoint: {}".format(d)) + os.remove(d[0]) + except Exception as e: + _logger.error("Exception '{}' while deleting checkpoint".format(e)) + self.checkpoint_files = self.checkpoint_files[:delete_index] + + def save_recovery(self, epoch, batch_idx=0): + assert epoch >= 0 + filename = '-'.join([self.recovery_prefix, str(epoch), str(batch_idx)]) + self.extension + save_path = os.path.join(self.recovery_dir, filename) + self._save(save_path, epoch) + if os.path.exists(self.last_recovery_file): + try: + _logger.debug("Cleaning recovery: {}".format(self.last_recovery_file)) + os.remove(self.last_recovery_file) + except Exception as e: + _logger.error("Exception '{}' while removing {}".format(e, self.last_recovery_file)) + self.last_recovery_file = self.curr_recovery_file + self.curr_recovery_file = save_path + + def find_recovery(self): + recovery_path = os.path.join(self.recovery_dir, self.recovery_prefix) + files = glob.glob(recovery_path + '*' + self.extension) + files = sorted(files) + return files[0] if len(files) else '' diff --git a/custom_timm/utils/clip_grad.py b/custom_timm/utils/clip_grad.py new file mode 100644 index 0000000000000000000000000000000000000000..73671d3a5d2ad856630ce2b2d7b0d6e6e627c59a --- /dev/null 
+++ b/custom_timm/utils/clip_grad.py @@ -0,0 +1,23 @@ +import torch + +from custom_timm.utils.agc import adaptive_clip_grad + + +def dispatch_clip_grad(parameters, value: float, mode: str = 'norm', norm_type: float = 2.0): + """ Dispatch to gradient clipping method + + Args: + parameters (Iterable): model parameters to clip + value (float): clipping value/factor/norm, mode dependant + mode (str): clipping mode, one of 'norm', 'value', 'agc' + norm_type (float): p-norm, default 2.0 + """ + if mode == 'norm': + torch.nn.utils.clip_grad_norm_(parameters, value, norm_type=norm_type) + elif mode == 'value': + torch.nn.utils.clip_grad_value_(parameters, value) + elif mode == 'agc': + adaptive_clip_grad(parameters, value, norm_type=norm_type) + else: + assert False, f"Unknown clip mode ({mode})." + diff --git a/custom_timm/utils/cuda.py b/custom_timm/utils/cuda.py new file mode 100644 index 0000000000000000000000000000000000000000..9e7bddf30463a7be7186c7def47c4e4dfb9993aa --- /dev/null +++ b/custom_timm/utils/cuda.py @@ -0,0 +1,55 @@ +""" CUDA / AMP utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch + +try: + from apex import amp + has_apex = True +except ImportError: + amp = None + has_apex = False + +from .clip_grad import dispatch_clip_grad + + +class ApexScaler: + state_dict_key = "amp" + + def __call__(self, loss, optimizer, clip_grad=None, clip_mode='norm', parameters=None, create_graph=False): + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward(create_graph=create_graph) + if clip_grad is not None: + dispatch_clip_grad(amp.master_params(optimizer), clip_grad, mode=clip_mode) + optimizer.step() + + def state_dict(self): + if 'state_dict' in amp.__dict__: + return amp.state_dict() + + def load_state_dict(self, state_dict): + if 'load_state_dict' in amp.__dict__: + amp.load_state_dict(state_dict) + + +class NativeScaler: + state_dict_key = "amp_scaler" + + def __init__(self): + self._scaler = torch.cuda.amp.GradScaler() + + def __call__(self, loss, optimizer, clip_grad=None, clip_mode='norm', parameters=None, create_graph=False): + self._scaler.scale(loss).backward(create_graph=create_graph) + if clip_grad is not None: + assert parameters is not None + self._scaler.unscale_(optimizer) # unscale the gradients of optimizer's assigned params in-place + dispatch_clip_grad(parameters, clip_grad, mode=clip_mode) + self._scaler.step(optimizer) + self._scaler.update() + + def state_dict(self): + return self._scaler.state_dict() + + def load_state_dict(self, state_dict): + self._scaler.load_state_dict(state_dict) diff --git a/custom_timm/utils/decay_batch.py b/custom_timm/utils/decay_batch.py new file mode 100644 index 0000000000000000000000000000000000000000..852fa4b8dc3d46932b67ed3e42170a5de92415d9 --- /dev/null +++ b/custom_timm/utils/decay_batch.py @@ -0,0 +1,43 @@ +""" Batch size decay and retry helpers. 
+ +Copyright 2022 Ross Wightman +""" +import math + + +def decay_batch_step(batch_size, num_intra_steps=2, no_odd=False): + """ power of two batch-size decay with intra steps + + Decay by stepping between powers of 2: + * determine power-of-2 floor of current batch size (base batch size) + * divide above value by num_intra_steps to determine step size + * floor batch_size to nearest multiple of step_size (from base batch size) + Examples: + num_steps == 4 --> 64, 56, 48, 40, 32, 28, 24, 20, 16, 14, 12, 10, 8, 7, 6, 5, 4, 3, 2, 1 + num_steps (no_odd=True) == 4 --> 64, 56, 48, 40, 32, 28, 24, 20, 16, 14, 12, 10, 8, 6, 4, 2 + num_steps == 2 --> 64, 48, 32, 24, 16, 12, 8, 6, 4, 3, 2, 1 + num_steps == 1 --> 64, 32, 16, 8, 4, 2, 1 + """ + if batch_size <= 1: + # return 0 for stopping value so easy to use in loop + return 0 + base_batch_size = int(2 ** (math.log(batch_size - 1) // math.log(2))) + step_size = max(base_batch_size // num_intra_steps, 1) + batch_size = base_batch_size + ((batch_size - base_batch_size - 1) // step_size) * step_size + if no_odd and batch_size % 2: + batch_size -= 1 + return batch_size + + +def check_batch_size_retry(error_str): + """ check failure error string for conditions where batch decay retry should not be attempted + """ + error_str = error_str.lower() + if 'required rank' in error_str: + # Errors involving phrase 'required rank' typically happen when a conv is used that's + # not compatible with channels_last memory format. + return False + if 'illegal' in error_str: + # 'Illegal memory access' errors in CUDA typically leave process in unusable state + return False + return True diff --git a/custom_timm/utils/distributed.py b/custom_timm/utils/distributed.py new file mode 100644 index 0000000000000000000000000000000000000000..3c5dba8c1de5a6ff53638207521377fdfbc4f239 --- /dev/null +++ b/custom_timm/utils/distributed.py @@ -0,0 +1,28 @@ +""" Distributed training/validation utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +from torch import distributed as dist + +from .model import unwrap_model + + +def reduce_tensor(tensor, n): + rt = tensor.clone() + dist.all_reduce(rt, op=dist.ReduceOp.SUM) + rt /= n + return rt + + +def distribute_bn(model, world_size, reduce=False): + # ensure every node has the same running bn stats + for bn_name, bn_buf in unwrap_model(model).named_buffers(recurse=True): + if ('running_mean' in bn_name) or ('running_var' in bn_name): + if reduce: + # average bn stats across whole group + torch.distributed.all_reduce(bn_buf, op=dist.ReduceOp.SUM) + bn_buf /= float(world_size) + else: + # broadcast bn stats from rank 0 to whole group + torch.distributed.broadcast(bn_buf, 0) diff --git a/custom_timm/utils/jit.py b/custom_timm/utils/jit.py new file mode 100644 index 0000000000000000000000000000000000000000..d527411fd3e1985639bb0b161bd484142a3619dd --- /dev/null +++ b/custom_timm/utils/jit.py @@ -0,0 +1,58 @@ +""" JIT scripting/tracing utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import os + +import torch + + +def set_jit_legacy(): + """ Set JIT executor to legacy w/ support for op fusion + This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes + in the JIT exectutor. These API are not supported so could change. + """ + # + assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!" 
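+ # fall back to the legacy (non-profiling) executor and allow op fusion on GPU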
+ torch._C._jit_set_profiling_executor(False) + torch._C._jit_set_profiling_mode(False) + torch._C._jit_override_can_fuse_on_gpu(True) + #torch._C._jit_set_texpr_fuser_enabled(True) + + +def set_jit_fuser(fuser): + if fuser == "te": + # default fuser should be == 'te' + torch._C._jit_set_profiling_executor(True) + torch._C._jit_set_profiling_mode(True) + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(True) + torch._C._jit_set_texpr_fuser_enabled(True) + try: + torch._C._jit_set_nvfuser_enabled(False) + except Exception: + pass + elif fuser == "old" or fuser == "legacy": + torch._C._jit_set_profiling_executor(False) + torch._C._jit_set_profiling_mode(False) + torch._C._jit_override_can_fuse_on_gpu(True) + torch._C._jit_set_texpr_fuser_enabled(False) + try: + torch._C._jit_set_nvfuser_enabled(False) + except Exception: + pass + elif fuser == "nvfuser" or fuser == "nvf": + os.environ['PYTORCH_NVFUSER_DISABLE_FALLBACK'] = '1' + #os.environ['PYTORCH_NVFUSER_DISABLE_FMA'] = '1' + #os.environ['PYTORCH_NVFUSER_JIT_OPT_LEVEL'] = '0' + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_profiling_executor(True) + torch._C._jit_set_profiling_mode(True) + torch._C._jit_can_fuse_on_cpu() + torch._C._jit_can_fuse_on_gpu() + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_nvfuser_guard_mode(True) + torch._C._jit_set_nvfuser_enabled(True) + else: + assert False, f"Invalid jit fuser ({fuser})" diff --git a/custom_timm/utils/log.py b/custom_timm/utils/log.py new file mode 100644 index 0000000000000000000000000000000000000000..c99469e0884f3e45905ef7c7f0d1e491092697ad --- /dev/null +++ b/custom_timm/utils/log.py @@ -0,0 +1,28 @@ +""" Logging helpers + +Hacked together by / Copyright 2020 Ross Wightman +""" +import logging +import logging.handlers + + +class FormatterNoInfo(logging.Formatter): + def __init__(self, fmt='%(levelname)s: %(message)s'): + logging.Formatter.__init__(self, fmt) + + def format(self, record): + if record.levelno == logging.INFO: + return str(record.getMessage()) + return logging.Formatter.format(self, record) + + +def setup_default_logging(default_level=logging.INFO, log_path=''): + console_handler = logging.StreamHandler() + console_handler.setFormatter(FormatterNoInfo()) + logging.root.addHandler(console_handler) + logging.root.setLevel(default_level) + if log_path: + file_handler = logging.handlers.RotatingFileHandler(log_path, maxBytes=(1024 ** 2 * 2), backupCount=3) + file_formatter = logging.Formatter("%(asctime)s - %(name)20s: [%(levelname)8s] - %(message)s") + file_handler.setFormatter(file_formatter) + logging.root.addHandler(file_handler) diff --git a/custom_timm/utils/metrics.py b/custom_timm/utils/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..9fdbe13ef15c541679906239374ff8a7eedf5181 --- /dev/null +++ b/custom_timm/utils/metrics.py @@ -0,0 +1,32 @@ +""" Eval metrics and related + +Hacked together by / Copyright 2020 Ross Wightman +""" + + +class AverageMeter: + """Computes and stores the average and current value""" + def __init__(self): + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + maxk = min(max(topk), 
output.size()[1]) + batch_size = target.size(0) + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.reshape(1, -1).expand_as(pred)) + return [correct[:min(k, maxk)].reshape(-1).float().sum(0) * 100. / batch_size for k in topk] diff --git a/custom_timm/utils/misc.py b/custom_timm/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..39c0097c60ed602547f832f1f8dafbe37f156064 --- /dev/null +++ b/custom_timm/utils/misc.py @@ -0,0 +1,18 @@ +""" Misc utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import re + + +def natural_key(string_): + """See http://www.codinghorror.com/blog/archives/001018.html""" + return [int(s) if s.isdigit() else s for s in re.split(r'(\d+)', string_.lower())] + + +def add_bool_arg(parser, name, default=False, help=''): + dest_name = name.replace('-', '_') + group = parser.add_mutually_exclusive_group(required=False) + group.add_argument('--' + name, dest=dest_name, action='store_true', help=help) + group.add_argument('--no-' + name, dest=dest_name, action='store_false', help=help) + parser.set_defaults(**{dest_name: default}) diff --git a/custom_timm/utils/model.py b/custom_timm/utils/model.py new file mode 100644 index 0000000000000000000000000000000000000000..b95c45392bfb551f52bc8b8dca1aaf8c8b1940b1 --- /dev/null +++ b/custom_timm/utils/model.py @@ -0,0 +1,273 @@ +""" Model / state_dict utils + +Hacked together by / Copyright 2020 Ross Wightman +""" +import fnmatch + +import torch +from torchvision.ops.misc import FrozenBatchNorm2d + +from .model_ema import ModelEma + + +def unwrap_model(model): + if isinstance(model, ModelEma): + return unwrap_model(model.ema) + else: + return model.module if hasattr(model, 'module') else model + + +def get_state_dict(model, unwrap_fn=unwrap_model): + return unwrap_fn(model).state_dict() + + +def avg_sq_ch_mean(model, input, output): + """ calculate average channel square mean of output activations + """ + return torch.mean(output.mean(axis=[0, 2, 3]) ** 2).item() + + +def avg_ch_var(model, input, output): + """ calculate average channel variance of output activations + """ + return torch.mean(output.var(axis=[0, 2, 3])).item() + + +def avg_ch_var_residual(model, input, output): + """ calculate average channel variance of output activations + """ + return torch.mean(output.var(axis=[0, 2, 3])).item() + + +class ActivationStatsHook: + """Iterates through each of `model`'s modules and matches modules using unix pattern + matching based on `hook_fn_locs` and registers `hook_fn` to the module if there is + a match. + + Arguments: + model (nn.Module): model from which we will extract the activation stats + hook_fn_locs (List[str]): List of `hook_fn` locations based on Unix type string + matching with the name of model's modules. + hook_fns (List[Callable]): List of hook functions to be registered at every + module in `layer_names`. + + Inspiration from https://docs.fast.ai/callback.hook.html. + + Refer to https://gist.github.com/amaarora/6e56942fcb46e67ba203f3009b30d950 for an example + on how to plot Signal Propogation Plots using `ActivationStatsHook`. 
+ """ + + def __init__(self, model, hook_fn_locs, hook_fns): + self.model = model + self.hook_fn_locs = hook_fn_locs + self.hook_fns = hook_fns + if len(hook_fn_locs) != len(hook_fns): + raise ValueError("Please provide `hook_fns` for each `hook_fn_locs`, \ + their lengths are different.") + self.stats = dict((hook_fn.__name__, []) for hook_fn in hook_fns) + for hook_fn_loc, hook_fn in zip(hook_fn_locs, hook_fns): + self.register_hook(hook_fn_loc, hook_fn) + + def _create_hook(self, hook_fn): + def append_activation_stats(module, input, output): + out = hook_fn(module, input, output) + self.stats[hook_fn.__name__].append(out) + + return append_activation_stats + + def register_hook(self, hook_fn_loc, hook_fn): + for name, module in self.model.named_modules(): + if not fnmatch.fnmatch(name, hook_fn_loc): + continue + module.register_forward_hook(self._create_hook(hook_fn)) + + +def extract_spp_stats( + model, + hook_fn_locs, + hook_fns, + input_shape=[8, 3, 224, 224]): + """Extract average square channel mean and variance of activations during + forward pass to plot Signal Propogation Plots (SPP). + + Paper: https://arxiv.org/abs/2101.08692 + + Example Usage: https://gist.github.com/amaarora/6e56942fcb46e67ba203f3009b30d950 + """ + x = torch.normal(0., 1., input_shape) + hook = ActivationStatsHook(model, hook_fn_locs=hook_fn_locs, hook_fns=hook_fns) + _ = model(x) + return hook.stats + + +def freeze_batch_norm_2d(module): + """ + Converts all `BatchNorm2d` and `SyncBatchNorm` layers of provided module into `FrozenBatchNorm2d`. If `module` is + itself an instance of either `BatchNorm2d` or `SyncBatchNorm`, it is converted into `FrozenBatchNorm2d` and + returned. Otherwise, the module is walked recursively and submodules are converted in place. + + Args: + module (torch.nn.Module): Any PyTorch module. + + Returns: + torch.nn.Module: Resulting module + + Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 + """ + res = module + if isinstance(module, (torch.nn.modules.batchnorm.BatchNorm2d, torch.nn.modules.batchnorm.SyncBatchNorm)): + res = FrozenBatchNorm2d(module.num_features) + res.num_features = module.num_features + res.affine = module.affine + if module.affine: + res.weight.data = module.weight.data.clone().detach() + res.bias.data = module.bias.data.clone().detach() + res.running_mean.data = module.running_mean.data + res.running_var.data = module.running_var.data + res.eps = module.eps + else: + for name, child in module.named_children(): + new_child = freeze_batch_norm_2d(child) + if new_child is not child: + res.add_module(name, new_child) + return res + + +def unfreeze_batch_norm_2d(module): + """ + Converts all `FrozenBatchNorm2d` layers of provided module into `BatchNorm2d`. If `module` is itself and instance + of `FrozenBatchNorm2d`, it is converted into `BatchNorm2d` and returned. Otherwise, the module is walked + recursively and submodules are converted in place. + + Args: + module (torch.nn.Module): Any PyTorch module. 
+ + Returns: + torch.nn.Module: Resulting module + + Inspired by https://github.com/pytorch/pytorch/blob/a5895f85be0f10212791145bfedc0261d364f103/torch/nn/modules/batchnorm.py#L762 + """ + res = module + if isinstance(module, FrozenBatchNorm2d): + res = torch.nn.BatchNorm2d(module.num_features) + if module.affine: + res.weight.data = module.weight.data.clone().detach() + res.bias.data = module.bias.data.clone().detach() + res.running_mean.data = module.running_mean.data + res.running_var.data = module.running_var.data + res.eps = module.eps + else: + for name, child in module.named_children(): + new_child = unfreeze_batch_norm_2d(child) + if new_child is not child: + res.add_module(name, new_child) + return res + + +def _freeze_unfreeze(root_module, submodules=[], include_bn_running_stats=True, mode='freeze'): + """ + Freeze or unfreeze parameters of the specified modules and those of all their hierarchical descendants. This is + done in place. + Args: + root_module (nn.Module, optional): Root module relative to which the `submodules` are referenced. + submodules (list[str]): List of modules for which the parameters will be (un)frozen. They are to be provided as + named modules relative to the root module (accessible via `root_module.named_modules()`). An empty list + means that the whole root module will be (un)frozen. Defaults to [] + include_bn_running_stats (bool): Whether to also (un)freeze the running statistics of batch norm 2d layers. + Defaults to `True`. + mode (bool): Whether to freeze ("freeze") or unfreeze ("unfreeze"). Defaults to `"freeze"`. + """ + assert mode in ["freeze", "unfreeze"], '`mode` must be one of "freeze" or "unfreeze"' + + if isinstance(root_module, (torch.nn.modules.batchnorm.BatchNorm2d, torch.nn.modules.batchnorm.SyncBatchNorm)): + # Raise assertion here because we can't convert it in place + raise AssertionError( + "You have provided a batch norm layer as the `root module`. Please use " + "`timm.utils.model.freeze_batch_norm_2d` or `timm.utils.model.unfreeze_batch_norm_2d` instead.") + + if isinstance(submodules, str): + submodules = [submodules] + + named_modules = submodules + submodules = [root_module.get_submodule(m) for m in submodules] + + if not len(submodules): + named_modules, submodules = list(zip(*root_module.named_children())) + + for n, m in zip(named_modules, submodules): + # (Un)freeze parameters + for p in m.parameters(): + p.requires_grad = False if mode == 'freeze' else True + if include_bn_running_stats: + # Helper to add submodule specified as a named_module + def _add_submodule(module, name, submodule): + split = name.rsplit('.', 1) + if len(split) > 1: + module.get_submodule(split[0]).add_module(split[1], submodule) + else: + module.add_module(name, submodule) + + # Freeze batch norm + if mode == 'freeze': + res = freeze_batch_norm_2d(m) + # It's possible that `m` is a type of BatchNorm in itself, in which case `unfreeze_batch_norm_2d` won't + # convert it in place, but will return the converted result. In this case `res` holds the converted + # result and we may try to re-assign the named module + if isinstance(m, (torch.nn.modules.batchnorm.BatchNorm2d, torch.nn.modules.batchnorm.SyncBatchNorm)): + _add_submodule(root_module, n, res) + # Unfreeze batch norm + else: + res = unfreeze_batch_norm_2d(m) + # Ditto. 
See note above in mode == 'freeze' branch + if isinstance(m, FrozenBatchNorm2d): + _add_submodule(root_module, n, res) + + +def freeze(root_module, submodules=[], include_bn_running_stats=True): + """ + Freeze parameters of the specified modules and those of all their hierarchical descendants. This is done in place. + Args: + root_module (nn.Module): Root module relative to which `submodules` are referenced. + submodules (list[str]): List of modules for which the parameters will be frozen. They are to be provided as + named modules relative to the root module (accessible via `root_module.named_modules()`). An empty list + means that the whole root module will be frozen. Defaults to `[]`. + include_bn_running_stats (bool): Whether to also freeze the running statistics of `BatchNorm2d` and + `SyncBatchNorm` layers. These will be converted to `FrozenBatchNorm2d` in place. Hint: During fine tuning, + it's good practice to freeze batch norm stats. And note that these are different to the affine parameters + which are just normal PyTorch parameters. Defaults to `True`. + + Hint: If you want to freeze batch norm ONLY, use `timm.utils.model.freeze_batch_norm_2d`. + + Examples:: + + >>> model = timm.create_model('resnet18') + >>> # Freeze up to and including layer2 + >>> submodules = [n for n, _ in model.named_children()] + >>> print(submodules) + ['conv1', 'bn1', 'act1', 'maxpool', 'layer1', 'layer2', 'layer3', 'layer4', 'global_pool', 'fc'] + >>> freeze(model, submodules[:submodules.index('layer2') + 1]) + >>> # Check for yourself that it works as expected + >>> print(model.layer2[0].conv1.weight.requires_grad) + False + >>> print(model.layer3[0].conv1.weight.requires_grad) + True + >>> # Unfreeze + >>> unfreeze(model) + """ + _freeze_unfreeze(root_module, submodules, include_bn_running_stats=include_bn_running_stats, mode="freeze") + + +def unfreeze(root_module, submodules=[], include_bn_running_stats=True): + """ + Unfreeze parameters of the specified modules and those of all their hierarchical descendants. This is done in place. + Args: + root_module (nn.Module): Root module relative to which `submodules` are referenced. + submodules (list[str]): List of submodules for which the parameters will be (un)frozen. They are to be provided + as named modules relative to the root module (accessible via `root_module.named_modules()`). An empty + list means that the whole root module will be unfrozen. Defaults to `[]`. + include_bn_running_stats (bool): Whether to also unfreeze the running statistics of `FrozenBatchNorm2d` layers. + These will be converted to `BatchNorm2d` in place. Defaults to `True`. + + See example in docstring for `freeze`. + """ + _freeze_unfreeze(root_module, submodules, include_bn_running_stats=include_bn_running_stats, mode="unfreeze") diff --git a/custom_timm/utils/model_ema.py b/custom_timm/utils/model_ema.py new file mode 100644 index 0000000000000000000000000000000000000000..073d5c5ea1a4afc5aa3817b6354b2566f8cc2cf5 --- /dev/null +++ b/custom_timm/utils/model_ema.py @@ -0,0 +1,126 @@ +""" Exponential Moving Average (EMA) of model updates + +Hacked together by / Copyright 2020 Ross Wightman +""" +import logging +from collections import OrderedDict +from copy import deepcopy + +import torch +import torch.nn as nn + +_logger = logging.getLogger(__name__) + + +class ModelEma: + """ Model Exponential Moving Average (DEPRECATED) + + Keep a moving average of everything in the model state_dict (parameters and buffers). 
+ This version is deprecated, it does not work with scripted models. Will be removed eventually. + + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + + A smoothed version of the weights is necessary for some training schemes to perform well. + E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use + RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA + smoothing of weights to match results. Pay attention to the decay constant you are using + relative to your update count per epoch. + + To keep EMA from using GPU resources, set device='cpu'. This will save a bit of memory but + disable validation of the EMA weights. Validation will have to be done manually in a separate + process, or after the training stops converging. + + This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. + """ + def __init__(self, model, decay=0.9999, device='', resume=''): + # make a copy of the model for accumulating moving average of weights + self.ema = deepcopy(model) + self.ema.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if device: + self.ema.to(device=device) + self.ema_has_module = hasattr(self.ema, 'module') + if resume: + self._load_checkpoint(resume) + for p in self.ema.parameters(): + p.requires_grad_(False) + + def _load_checkpoint(self, checkpoint_path): + checkpoint = torch.load(checkpoint_path, map_location='cpu') + assert isinstance(checkpoint, dict) + if 'state_dict_ema' in checkpoint: + new_state_dict = OrderedDict() + for k, v in checkpoint['state_dict_ema'].items(): + # ema model may have been wrapped by DataParallel, and need module prefix + if self.ema_has_module: + name = 'module.' + k if not k.startswith('module') else k + else: + name = k + new_state_dict[name] = v + self.ema.load_state_dict(new_state_dict) + _logger.info("Loaded state_dict_ema") + else: + _logger.warning("Failed to find state_dict_ema, starting from loaded model weights") + + def update(self, model): + # correct a mismatch in state dict keys + needs_module = hasattr(model, 'module') and not self.ema_has_module + with torch.no_grad(): + msd = model.state_dict() + for k, ema_v in self.ema.state_dict().items(): + if needs_module: + k = 'module.' + k + model_v = msd[k].detach() + if self.device: + model_v = model_v.to(device=self.device) + ema_v.copy_(ema_v * self.decay + (1. - self.decay) * model_v) + + +class ModelEmaV2(nn.Module): + """ Model Exponential Moving Average V2 + + Keep a moving average of everything in the model state_dict (parameters and buffers). + V2 of this module is simpler, it does not match params/buffers based on name but simply + iterates in order. It works with torchscript (JIT of full model). + + This is intended to allow functionality like + https://www.tensorflow.org/api_docs/python/tf/train/ExponentialMovingAverage + + A smoothed version of the weights is necessary for some training schemes to perform well. + E.g. Google's hyper-params for training MNASNet, MobileNet-V3, EfficientNet, etc that use + RMSprop with a short 2.4-3 epoch decay period and slow LR decay rate of .96-.99 requires EMA + smoothing of weights to match results. Pay attention to the decay constant you are using + relative to your update count per epoch. + + To keep EMA from using GPU resources, set device='cpu'. 
This will save a bit of memory but + disable validation of the EMA weights. Validation will have to be done manually in a separate + process, or after the training stops converging. + + This class is sensitive where it is initialized in the sequence of model init, + GPU assignment and distributed training wrappers. + """ + def __init__(self, model, decay=0.9999, device=None): + super(ModelEmaV2, self).__init__() + # make a copy of the model for accumulating moving average of weights + self.module = deepcopy(model) + self.module.eval() + self.decay = decay + self.device = device # perform ema on different device from model if set + if self.device is not None: + self.module.to(device=device) + + def _update(self, model, update_fn): + with torch.no_grad(): + for ema_v, model_v in zip(self.module.state_dict().values(), model.state_dict().values()): + if self.device is not None: + model_v = model_v.to(device=self.device) + ema_v.copy_(update_fn(ema_v, model_v)) + + def update(self, model): + self._update(model, update_fn=lambda e, m: self.decay * e + (1. - self.decay) * m) + + def set(self, model): + self._update(model, update_fn=lambda e, m: m) diff --git a/custom_timm/utils/random.py b/custom_timm/utils/random.py new file mode 100644 index 0000000000000000000000000000000000000000..a9679983e96a9a6634c0b77aaf7b996e70eff50b --- /dev/null +++ b/custom_timm/utils/random.py @@ -0,0 +1,9 @@ +import random +import numpy as np +import torch + + +def random_seed(seed=42, rank=0): + torch.manual_seed(seed + rank) + np.random.seed(seed + rank) + random.seed(seed + rank) diff --git a/custom_timm/utils/summary.py b/custom_timm/utils/summary.py new file mode 100644 index 0000000000000000000000000000000000000000..9f5af9a08598556c3fed136f258f88bd578c1e1c --- /dev/null +++ b/custom_timm/utils/summary.py @@ -0,0 +1,39 @@ +""" Summary utilities + +Hacked together by / Copyright 2020 Ross Wightman +""" +import csv +import os +from collections import OrderedDict +try: + import wandb +except ImportError: + pass + +def get_outdir(path, *paths, inc=False): + outdir = os.path.join(path, *paths) + if not os.path.exists(outdir): + os.makedirs(outdir) + elif inc: + count = 1 + outdir_inc = outdir + '-' + str(count) + while os.path.exists(outdir_inc): + count = count + 1 + outdir_inc = outdir + '-' + str(count) + assert count < 100 + outdir = outdir_inc + os.makedirs(outdir) + return outdir + + +def update_summary(epoch, train_metrics, eval_metrics, filename, write_header=False, log_wandb=False): + rowd = OrderedDict(epoch=epoch) + rowd.update([('train_' + k, v) for k, v in train_metrics.items()]) + rowd.update([('eval_' + k, v) for k, v in eval_metrics.items()]) + if log_wandb: + wandb.log(rowd) + with open(filename, mode='a') as cf: + dw = csv.DictWriter(cf, fieldnames=rowd.keys()) + if write_header: # first iteration (epoch == 1 can't be used) + dw.writeheader() + dw.writerow(rowd) diff --git a/custom_timm/version.py b/custom_timm/version.py new file mode 100644 index 0000000000000000000000000000000000000000..70039a4cbb99f75059bfe30c5e56c2295a73a5cd --- /dev/null +++ b/custom_timm/version.py @@ -0,0 +1 @@ +__version__ = '0.6.13' diff --git a/wrapper_for_mps/__init__.py b/wrapper_for_mps/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d40bcf5d6be0ff616c8704bdf8c699fcda876ba8 --- /dev/null +++ b/wrapper_for_mps/__init__.py @@ -0,0 +1,7 @@ +import torch +from comfy.model_management import get_torch_device + +device = get_torch_device() 
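+# get_torch_device() returns the device ComfyUI selected for inference (e.g. CUDA, MPS, or CPU)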
+ +# Workaround for backends without full sparse tensor support (e.g. MPS, DirectML): convert sparse tensors to dense. +# See https://github.com/microsoft/DirectML/issues/414#issuecomment-1541319479 +def sparse_to_dense(sparse_tensor): + return sparse_tensor.to_dense() \ No newline at end of file