# Spaces: rafiaashraf / IDM-VTON (Paused)
# File size: 14,471 Bytes
# e36117a

# Copyright (c) Facebook, Inc. and its affiliates.

from typing import Any, List
import torch
from torch.nn import functional as F

from detectron2.config import CfgNode
from detectron2.structures import Instances

from .mask_or_segm import MaskOrSegmentationLoss
from .registry import DENSEPOSE_LOSS_REGISTRY
from .utils import (
    BilinearInterpolationHelper,
    ChartBasedAnnotationsAccumulator,
    LossDict,
    extract_packed_annotations_from_matches,
)


@DENSEPOSE_LOSS_REGISTRY.register()
class DensePoseChartLoss:
    """
    DensePose loss for chart-based training. A mesh is split into charts,
    each chart is given a label (I) and parametrized by 2 coordinates referred to
    as U and V. Ground truth consists of a number of points annotated with
    I, U and V values and coarse segmentation S defined for all pixels of the
    object bounding box. In some cases (see `COARSE_SEGM_TRAINED_BY_MASKS`),
    semantic segmentation annotations can be used as ground truth inputs as well.

    Estimated values are tensors:
     * U coordinates, tensor of shape [N, C, S, S]
     * V coordinates, tensor of shape [N, C, S, S]
     * fine segmentation estimates, tensor of shape [N, C, S, S] with raw unnormalized
       scores for each fine segmentation label at each location
     * coarse segmentation estimates, tensor of shape [N, D, S, S] with raw unnormalized
       scores for each coarse segmentation label at each location
    where N is the number of detections, C is the number of fine segmentation
    labels, S is the estimate size ( = width = height) and D is the number of
    coarse segmentation channels.

    The losses are:
    * regression (smooth L1) loss for U and V coordinates
    * cross entropy loss for fine segmentation (I)
    * coarse segmentation (S) loss, delegated to `MaskOrSegmentationLoss`
    Each loss has an associated weight
    """

    def __init__(self, cfg: CfgNode):
        """
        Initialize chart-based loss from configuration options

        Args:
            cfg (CfgNode): configuration options; all values are read from
                `cfg.MODEL.ROI_DENSEPOSE_HEAD`
        """
        # `w_points` scales the U/V losses, `w_part` the fine segmentation (I)
        # loss and `w_segm` the coarse segmentation (S) loss.
        # `heatmap_size`, `n_segm_chan` and `segm_trained_by_masks` are stored
        # but not read by any method of this class.
        # fmt: off
        self.heatmap_size = cfg.MODEL.ROI_DENSEPOSE_HEAD.HEATMAP_SIZE
        self.w_points     = cfg.MODEL.ROI_DENSEPOSE_HEAD.POINT_REGRESSION_WEIGHTS
        self.w_part       = cfg.MODEL.ROI_DENSEPOSE_HEAD.PART_WEIGHTS
        self.w_segm       = cfg.MODEL.ROI_DENSEPOSE_HEAD.INDEX_WEIGHTS
        self.n_segm_chan  = cfg.MODEL.ROI_DENSEPOSE_HEAD.NUM_COARSE_SEGM_CHANNELS
        # fmt: on
        self.segm_trained_by_masks = cfg.MODEL.ROI_DENSEPOSE_HEAD.COARSE_SEGM_TRAINED_BY_MASKS
        self.segm_loss = MaskOrSegmentationLoss(cfg)

    def __call__(
        self, proposals_with_gt: List[Instances], densepose_predictor_outputs: Any, **kwargs
    ) -> LossDict:
        """
        Produce chart-based DensePose losses

        Fake (zero-valued) losses are returned instead of real ones when
        `proposals_with_gt` is empty, when no annotations could be packed,
        or when no valid annotated point has a positive fine segmentation label.

        Args:
            proposals_with_gt (list of Instances): detections with associated ground truth data
            densepose_predictor_outputs: an object of a dataclass that contains predictor outputs
                with estimated values; assumed to have the following attributes:
                * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
                * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
                * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
                * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
            kwargs: accepted for interface compatibility; not used by this method
            where N is the number of detections, C is the number of fine segmentation
            labels, S is the estimate size ( = width = height) and D is the number of
            coarse segmentation channels.

        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
             * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
             * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine
                 segmentation estimates given ground truth labels;
             * `loss_densepose_S`: coarse segmentation loss computed by `self.segm_loss`
        """
        # densepose outputs are computed for all images and all bounding boxes;
        # i.e. if a batch has 4 images with (3, 1, 2, 1) proposals respectively,
        # the outputs will have size(0) == 3+1+2+1 == 7

        if not proposals_with_gt:
            return self.produce_fake_densepose_losses(densepose_predictor_outputs)

        accumulator = ChartBasedAnnotationsAccumulator()
        packed_annotations = extract_packed_annotations_from_matches(proposals_with_gt, accumulator)

        # NOTE: we need to keep the same computation graph on all the GPUs to
        # perform reduction properly. Hence even if we have no data on one
        # of the GPUs, we still need to generate the computation graph.
        # Add fake (zero) loss in the form Tensor.sum() * 0
        if packed_annotations is None:
            return self.produce_fake_densepose_losses(densepose_predictor_outputs)

        # spatial size of the estimates is taken from the U output
        h, w = densepose_predictor_outputs.u.shape[2:]
        interpolator = BilinearInterpolationHelper.from_matches(
            packed_annotations,
            (h, w),
        )

        # valid points that also carry a positive fine segmentation label
        j_valid_fg = interpolator.j_valid * (  # pyre-ignore[16]
            packed_annotations.fine_segm_labels_gt > 0
        )
        # pyre-fixme[6]: For 1st param expected `Tensor` but got `int`.
        if not torch.any(j_valid_fg):
            return self.produce_fake_densepose_losses(densepose_predictor_outputs)

        losses_uv = self.produce_densepose_losses_uv(
            proposals_with_gt,
            densepose_predictor_outputs,
            packed_annotations,
            interpolator,
            j_valid_fg,  # pyre-ignore[6]
        )

        losses_segm = self.produce_densepose_losses_segm(
            proposals_with_gt,
            densepose_predictor_outputs,
            packed_annotations,
            interpolator,
            j_valid_fg,  # pyre-ignore[6]
        )

        return {**losses_uv, **losses_segm}

    def produce_fake_densepose_losses(self, densepose_predictor_outputs: Any) -> LossDict:
        """
        Fake losses for fine segmentation and U/V coordinates. These are used when
        no suitable ground truth data was found in a batch. The loss has a value 0
        and is primarily used to construct the computation graph, so that
        `DistributedDataParallel` has similar graphs on all GPUs and can perform
        reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_U`: has value 0
             * `loss_densepose_V`: has value 0
             * `loss_densepose_I`: has value 0
             * `loss_densepose_S`: fake value produced by `self.segm_loss.fake_value`
                 (expected to be 0)
        """
        losses_uv = self.produce_fake_densepose_losses_uv(densepose_predictor_outputs)
        losses_segm = self.produce_fake_densepose_losses_segm(densepose_predictor_outputs)
        return {**losses_uv, **losses_segm}

    def produce_fake_densepose_losses_uv(self, densepose_predictor_outputs: Any) -> LossDict:
        """
        Fake losses for U/V coordinates. These are used when no suitable ground
        truth data was found in a batch. The loss has a value 0
        and is primarily used to construct the computation graph, so that
        `DistributedDataParallel` has similar graphs on all GPUs and can perform
        reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_U`: has value 0
             * `loss_densepose_V`: has value 0
        """
        # `sum() * 0` keeps the outputs in the autograd graph while contributing 0
        return {
            "loss_densepose_U": densepose_predictor_outputs.u.sum() * 0,
            "loss_densepose_V": densepose_predictor_outputs.v.sum() * 0,
        }

    def produce_fake_densepose_losses_segm(self, densepose_predictor_outputs: Any) -> LossDict:
        """
        Fake losses for fine / coarse segmentation. These are used when
        no suitable ground truth data was found in a batch. The loss has a value 0
        and is primarily used to construct the computation graph, so that
        `DistributedDataParallel` has similar graphs on all GPUs and can perform
        reduction properly.

        Args:
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
             * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_I`: has value 0
             * `loss_densepose_S`: fake value produced by `self.segm_loss.fake_value`
                 (expected to be 0); always included
        """
        return {
            "loss_densepose_I": densepose_predictor_outputs.fine_segm.sum() * 0,
            "loss_densepose_S": self.segm_loss.fake_value(densepose_predictor_outputs),
        }

    def produce_densepose_losses_uv(
        self,
        proposals_with_gt: List[Instances],
        densepose_predictor_outputs: Any,
        packed_annotations: Any,
        interpolator: BilinearInterpolationHelper,
        j_valid_fg: torch.Tensor,
    ) -> LossDict:
        """
        Compute losses for U/V coordinates: smooth L1 loss between
        estimated coordinates and the ground truth.

        Args:
            proposals_with_gt (list of Instances): detections with associated ground truth data;
                not used by this method
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * u - U coordinate estimates per fine labels, tensor of shape [N, C, S, S]
             * v - V coordinate estimates per fine labels, tensor of shape [N, C, S, S]
            packed_annotations: packed ground truth; `u_gt` and `v_gt` are read
            interpolator (BilinearInterpolationHelper): helper used to sample the
                estimates at annotated points
            j_valid_fg (tensor): mask selecting the points that contribute to the loss
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_U`: smooth L1 loss for U coordinate estimates
             * `loss_densepose_V`: smooth L1 loss for V coordinate estimates
            Both are summed over the selected points and scaled by `self.w_points`.
        """
        # presumably `extract_at_points` interpolates the estimates at the
        # annotated point locations - confirm against BilinearInterpolationHelper
        u_gt = packed_annotations.u_gt[j_valid_fg]
        u_est = interpolator.extract_at_points(densepose_predictor_outputs.u)[j_valid_fg]
        v_gt = packed_annotations.v_gt[j_valid_fg]
        v_est = interpolator.extract_at_points(densepose_predictor_outputs.v)[j_valid_fg]
        return {
            "loss_densepose_U": F.smooth_l1_loss(u_est, u_gt, reduction="sum") * self.w_points,
            "loss_densepose_V": F.smooth_l1_loss(v_est, v_gt, reduction="sum") * self.w_points,
        }

    def produce_densepose_losses_segm(
        self,
        proposals_with_gt: List[Instances],
        densepose_predictor_outputs: Any,
        packed_annotations: Any,
        interpolator: BilinearInterpolationHelper,
        j_valid_fg: torch.Tensor,
    ) -> LossDict:
        """
        Losses for fine / coarse segmentation: cross-entropy
        for fine segmentation unnormalized scores given ground truth labels at
        annotated points, and the coarse segmentation loss computed by
        `self.segm_loss`.

        Args:
            proposals_with_gt (list of Instances): detections with associated ground truth data
            densepose_predictor_outputs: DensePose predictor outputs, an object
                of a dataclass that is assumed to have the following attributes:
             * fine_segm - fine segmentation estimates, tensor of shape [N, C, S, S]
             * coarse_segm - coarse segmentation estimates, tensor of shape [N, D, S, S]
            packed_annotations: packed ground truth; `fine_segm_labels_gt` is read here
                and the whole object is forwarded to `self.segm_loss`
            interpolator (BilinearInterpolationHelper): helper used to sample the
                estimates at annotated points
            j_valid_fg (tensor): not used by this method; the fine segmentation loss
                is computed over all points selected by `interpolator.j_valid`
        Return:
            dict: str -> tensor: dict of losses with the following entries:
             * `loss_densepose_I`: cross entropy for raw unnormalized scores for fine
                 segmentation estimates given ground truth labels, scaled by `self.w_part`
             * `loss_densepose_S`: value returned by `self.segm_loss` (a
                 `MaskOrSegmentationLoss`), scaled by `self.w_segm`; always included
        """
        fine_segm_gt = packed_annotations.fine_segm_labels_gt[
            interpolator.j_valid  # pyre-ignore[16]
        ]
        # all channels are extracted (`slice(None)`); the weights get an extra
        # trailing axis, presumably to broadcast over channels - TODO confirm
        fine_segm_est = interpolator.extract_at_points(
            densepose_predictor_outputs.fine_segm,
            slice_fine_segm=slice(None),
            w_ylo_xlo=interpolator.w_ylo_xlo[:, None],  # pyre-ignore[16]
            w_ylo_xhi=interpolator.w_ylo_xhi[:, None],  # pyre-ignore[16]
            w_yhi_xlo=interpolator.w_yhi_xlo[:, None],  # pyre-ignore[16]
            w_yhi_xhi=interpolator.w_yhi_xhi[:, None],  # pyre-ignore[16]
        )[interpolator.j_valid, :]
        return {
            "loss_densepose_I": F.cross_entropy(fine_segm_est, fine_segm_gt.long()) * self.w_part,
            "loss_densepose_S": self.segm_loss(
                proposals_with_gt, densepose_predictor_outputs, packed_annotations
            )
            * self.w_segm,
        }