# Copyright (c) Facebook, Inc. and its affiliates.
import copy
import logging

import numpy as np
import pycocotools.mask as mask_util
import torch
from torch.nn import functional as F

from detectron2.data import detection_utils as utils
from detectron2.data import transforms as T
from detectron2.projects.point_rend import ColorAugSSDTransform
from detectron2.structures import BitMasks, Instances, polygons_to_bitmask

from modeling.utils import configurable

__all__ = ["MaskFormerInstanceDatasetMapper"]


class MaskFormerInstanceDatasetMapper:
    """
    A callable which takes a dataset dict in Detectron2 Dataset format,
    and maps it into a format used by MaskFormer for instance segmentation.

    The callable currently does the following:

    1. Reads the image from "file_name"
    2. Applies geometric transforms to the image and annotation
    3. Finds and applies suitable cropping to the image and annotation
    4. Prepares the image and annotation as Tensors
    """

    @configurable
    def __init__(
        self,
        is_train=True,
        *,
        augmentations,
        image_format,
        size_divisibility,
    ):
        """
        NOTE: this interface is experimental.

        Args:
            is_train: for training or inference
            augmentations: a list of augmentations or deterministic transforms to apply
            image_format: an image format supported by :func:`detection_utils.read_image`.
            size_divisibility: pad image size to be divisible by this value
        """
        self.is_train = is_train
        self.tfm_gens = augmentations
        self.img_format = image_format
        self.size_divisibility = size_divisibility

        logger = logging.getLogger(__name__)
        mode = "training" if is_train else "inference"
        logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")

    @classmethod
    def from_config(cls, cfg, is_train=True):
        # Build augmentations: resize, optional crop, optional SSD-style color
        # augmentation, and a random flip.
        cfg_input = cfg['INPUT']
        augs = [
            T.ResizeShortestEdge(
                cfg_input['MIN_SIZE_TRAIN'],
                cfg_input['MAX_SIZE_TRAIN'],
                cfg_input['MIN_SIZE_TRAIN_SAMPLING'],
            )
        ]
        cfg_input_crop = cfg_input['CROP']
        if cfg_input_crop['ENABLED']:
            augs.append(
                T.RandomCrop(
                    cfg_input_crop['TYPE'],
                    cfg_input_crop['SIZE'],
                )
            )
        if cfg_input['COLOR_AUG_SSD']:
            augs.append(ColorAugSSDTransform(img_format=cfg_input['FORMAT']))
        augs.append(T.RandomFlip())

        ret = {
            "is_train": is_train,
            "augmentations": augs,
            "image_format": cfg_input['FORMAT'],
            "size_divisibility": cfg_input['SIZE_DIVISIBILITY'],
        }
        return ret

    def __call__(self, dataset_dict):
        """
        Args:
            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.

        Returns:
            dict: a format that builtin models in detectron2 accept
        """
        assert self.is_train, "MaskFormerInstanceDatasetMapper should only be used for training!"
        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
        utils.check_image_size(dataset_dict, image)

        aug_input = T.AugInput(image)
        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
        image = aug_input.image

        # transform instance masks
        assert "annotations" in dataset_dict
        for anno in dataset_dict["annotations"]:
            anno.pop("keypoints", None)

        annos = [
            utils.transform_instance_annotations(obj, transforms, image.shape[:2])
            for obj in dataset_dict.pop("annotations")
            if obj.get("iscrowd", 0) == 0
        ]

        if len(annos):
            assert "segmentation" in annos[0]
        segms = [obj["segmentation"] for obj in annos]
        masks = []
        for segm in segms:
            if isinstance(segm, list):
                # polygon
                masks.append(polygons_to_bitmask(segm, *image.shape[:2]))
            elif isinstance(segm, dict):
                # COCO RLE
                masks.append(mask_util.decode(segm))
            elif isinstance(segm, np.ndarray):
                assert segm.ndim == 2, "Expect segmentation of 2 dimensions, got {}.".format(
                    segm.ndim
                )
                # mask array
                masks.append(segm)
            else:
                raise ValueError(
                    "Cannot convert segmentation of type '{}' to BitMasks! "
                    "Supported types are: polygons as list[list[float] or ndarray],"
                    " COCO-style RLE as a dict, or a binary segmentation mask"
                    " in a 2D numpy array of shape HxW.".format(type(segm))
                )

        # Pad image and segmentation label here!
        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
        masks = [torch.from_numpy(np.ascontiguousarray(x)) for x in masks]

        classes = [int(obj["category_id"]) for obj in annos]
        classes = torch.tensor(classes, dtype=torch.int64)

        if self.size_divisibility > 0:
            image_size = (image.shape[-2], image.shape[-1])
            padding_size = [
                0,
                self.size_divisibility - image_size[1],
                0,
                self.size_divisibility - image_size[0],
            ]
            # pad image
            image = F.pad(image, padding_size, value=128).contiguous()
            # pad mask
            masks = [F.pad(x, padding_size, value=0).contiguous() for x in masks]

        image_shape = (image.shape[-2], image.shape[-1])  # h, w

        # PyTorch's dataloader is efficient on torch.Tensor due to shared memory,
        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
        # Therefore it's important to use torch.Tensor.
        dataset_dict["image"] = image

        # Prepare per-instance binary masks
        instances = Instances(image_shape)
        instances.gt_classes = classes
        if len(masks) == 0:
            # The image has no valid annotations (all ignored)
            instances.gt_masks = torch.zeros((0, image.shape[-2], image.shape[-1]))
        else:
            masks = BitMasks(torch.stack(masks))
            instances.gt_masks = masks.tensor

        dataset_dict["instances"] = instances

        return dataset_dict
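

if __name__ == "__main__":
    # Usage sketch (illustrative, not part of the original module): builds the
    # mapper from a plain-dict cfg mirroring the keys `from_config` reads, then
    # runs a synthetic dataset dict through it. All cfg values, the temporary
    # image, and the single square annotation below are assumptions chosen for
    # the demo; in real training the mapper is passed to detectron2's
    # build_detection_train_loader instead.
    import os
    import tempfile

    from PIL import Image
    from detectron2.structures import BoxMode

    cfg = {
        "INPUT": {
            "MIN_SIZE_TRAIN": (480,),
            "MAX_SIZE_TRAIN": 512,
            "MIN_SIZE_TRAIN_SAMPLING": "choice",
            "CROP": {"ENABLED": False, "TYPE": "absolute", "SIZE": (384, 384)},
            "COLOR_AUG_SSD": False,
            "FORMAT": "RGB",
            "SIZE_DIVISIBILITY": 512,
        }
    }
    # Calling from_config explicitly avoids depending on whether the project's
    # `configurable` decorator recognizes plain dicts as configs.
    mapper = MaskFormerInstanceDatasetMapper(
        **MaskFormerInstanceDatasetMapper.from_config(cfg, is_train=True)
    )

    with tempfile.TemporaryDirectory() as tmpdir:
        file_name = os.path.join(tmpdir, "demo.png")
        Image.fromarray(np.zeros((300, 400, 3), dtype=np.uint8)).save(file_name)
        dataset_dict = {
            "file_name": file_name,
            "height": 300,
            "width": 400,
            "annotations": [
                {
                    "category_id": 0,
                    "iscrowd": 0,
                    "bbox": [50.0, 50.0, 100.0, 100.0],
                    "bbox_mode": BoxMode.XYWH_ABS,
                    # one square polygon, matching the bbox
                    "segmentation": [[50.0, 50.0, 150.0, 50.0, 150.0, 150.0, 50.0, 150.0]],
                }
            ],
        }
        out = mapper(dataset_dict)
        # With the cfg above, the 300x400 input resizes to 384x512 and is then
        # padded (bottom/right) up to SIZE_DIVISIBILITY, i.e. 3x512x512.
        print(out["image"].shape)                # torch.Size([3, 512, 512])
        print(out["instances"].gt_masks.shape)   # torch.Size([1, 512, 512])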