# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Tuple, Union

import torch

from cosmos_predict1.diffusion.functional.batch_ops import batch_mul
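# batch_mul (from batch_ops) broadcasts-and-multiplies its arguments so that a per-sample
# weight of shape (B,) can scale a mask of shape (B, C, H, W), as used below.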


def create_per_sample_loss_mask(
    loss_masking_cfg: dict,
    data_batch: dict,
    x_shape: Tuple[int, ...],
    dtype: torch.dtype,
    device: Union[str, torch.device] = "cuda",
):
    """
    Creates a per-sample loss mask based on the given configuration and input data batch.

    This function generates a dictionary of loss masks for each specified key in the loss masking configuration.
    For keys present in both the configuration and the data batch, the corresponding data batch value is used.
    For keys present only in the configuration, a tensor of zeros with the specified shape is created.
    Additionally, it computes loss mask weights for each key based on the configuration values and adjusts them
    based on the presence of certain keys in the data batch, such as "skip_face" and "object_loss_map".

    Note:
    - `loss_masking_cfg` is not modified by this function; `data_batch` may gain a default "skip_face"
      entry (all zeros) if one is not already present.
    - For image data, the channel is assumed to be the first dimension after the batch dimension.
    - `skip_face` marks face regions that should be skipped during training. The key is provided so that we can
      generate diverse humans and avoid collapsing to a single face for certain prompts. This issue arises in the
      Getty projects, where the face distribution in the dataset is so unbalanced that a single person's face can
      appear in more than 100 images.

    Parameters:
        loss_masking_cfg (dict): Configuration for loss masking, specifying which keys to include and their weights.
        data_batch (dict): The batch of data containing actual data points and potential mask indicators like "skip_face".
        x_shape (tuple): The shape of the input data, used to initialize zero masks for keys not in the data batch.
        dtype (torch.dtype): The data type for the tensors in the loss masks.
        device (str or torch.device, optional): The device on which to create the tensors. Defaults to "cuda".

    Returns:
        dict: A dictionary containing combined loss masks adjusted according to the `loss_masking_cfg` and `data_batch`.

    Note: `create_combined_loss_mask`, defined below, combines the individual loss masks into a single mask;
    see its docstring for details.
    """
    loss_mask_data: dict = {}
    for key in loss_masking_cfg:
        if key not in data_batch:
            # No mask was provided for this key: fall back to an all-zero (inactive) mask.
            loss_mask_data[key] = torch.zeros((x_shape[0], 1, x_shape[2], x_shape[3]), dtype=dtype, device=device)
        else:
            loss_mask_data[key] = data_batch[key]

    if "skip_face" not in data_batch:
        # When "skip_face" is absent from data_batch, default to zeros so that no sample is skipped.
        data_batch["skip_face"] = torch.zeros((x_shape[0],), dtype=dtype, device=device)

    loss_mask_weight: dict = {}
    for k, v in loss_masking_cfg.items():
        loss_mask_weight[k] = torch.tensor(v, device=device).expand(data_batch["skip_face"].size())

    if "human_face_mask" in loss_mask_weight:
        loss_mask_weight["human_face_mask"] = (1 - data_batch["skip_face"]) * loss_mask_weight["human_face_mask"]

    if "object_loss_map" in data_batch:
        loss_mask_weight["object_loss_map"] = torch.ones(data_batch["object_loss_map"].shape[0], device=device)

    return create_combined_loss_mask(loss_mask_data, x_shape, dtype, device, loss_mask_weight)
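
# Example usage of create_per_sample_loss_mask (a minimal sketch; the key names, weights, and
# shapes below are hypothetical, not from this repo):
#
#     cfg = {"human_face_mask": 3.0}
#     face = torch.zeros(2, 1, 8, 8)
#     face[:, :, 2:6, 2:6] = 1.0
#     batch = {"human_face_mask": face, "skip_face": torch.zeros(2)}
#     mask = create_per_sample_loss_mask(cfg, batch, (2, 3, 8, 8), torch.float32, "cpu")
#     # mask has shape (2, 3, 8, 8): 3.0 inside the face region, 1.0 elsewhere.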


def create_combined_loss_mask(data, x_shape, dtype, device="cuda", loss_masking=None):
    """
    Creates a combined loss mask from multiple input masks.

    This function combines several loss masks into a single mask. Where masks overlap, the weight of the
    last-processed mask wins (in the example below this coincides with the highest weight). Regions not
    covered by any mask default to 1, and regions covered by a zero-weight mask are explicitly zeroed out,
    which is essential for padded loss calculations.

    Example:
        Given the following masks and weights:
            mask1: [0, 1, 1, 1, 0, 0], weight: 2
            mask2: [1, 0, 1, 0, 0, 0], weight: 4
            mask3: [0, 1, 0, 0, 0, 0], weight: 0
        The resulting combined loss mask would be:
            [4, 0, 4, 2, 1, 1]

    Parameters:
        data (dict): The individual loss masks, keyed by name.
        x_shape (tuple): The shape of the output mask.
        dtype: The data type for the output mask.
        device: The device on which the output mask will be allocated.
        loss_masking (dict, optional): Per-key weights for the masks in `data`. If falsy, a mask of ones
            is returned.

    Returns:
        torch.Tensor: The combined loss mask.
    """

    loss_mask = torch.ones(x_shape, dtype=dtype, device=device)
    # zero_mask accumulates hard zeros from zero-weight masks; it starts as all ones (nothing zeroed yet).
    zero_mask = torch.ones(x_shape, dtype=dtype, device=device)

    if loss_masking:
        for key in loss_masking:
            # Repeat the mask along the channel dimension (masks are ndim == 4 for images).
            repeat_dims = (1, x_shape[1]) + tuple([1] * (data[key].ndim - 2))
            mask_key = torch.tile(data[key], dims=repeat_dims)
            weight_key = loss_masking[key]

            # Zero-weight masks hard-zero their covered region in the final mask;
            # masks with nonzero weight leave zero_mask untouched.
            is_zero_weight = (weight_key == 0).float()[:, None, None, None]
            zero_mask = zero_mask * (
                (1 - is_zero_weight) * torch.ones(x_shape, dtype=dtype, device=device)
                + is_zero_weight * (1 - mask_key.bool().float())
            )

            # Where this mask is active, take its weight; elsewhere keep the previously accumulated value.
            no_mask_region = (mask_key.bool() == 0).float()
            loss_mask = batch_mul(mask_key, weight_key) + batch_mul(no_mask_region, loss_mask)

    loss_mask_final = loss_mask * zero_mask
    return loss_mask_final
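

if __name__ == "__main__":
    # Sanity check reproducing the example from the create_combined_loss_mask docstring.
    # (Illustrative sketch only; the "mask1"/"mask2"/"mask3" keys are hypothetical.)
    x_shape = (1, 1, 1, 6)
    data = {
        "mask1": torch.tensor([0.0, 1.0, 1.0, 1.0, 0.0, 0.0]).view(x_shape),
        "mask2": torch.tensor([1.0, 0.0, 1.0, 0.0, 0.0, 0.0]).view(x_shape),
        "mask3": torch.tensor([0.0, 1.0, 0.0, 0.0, 0.0, 0.0]).view(x_shape),
    }
    weights = {
        "mask1": torch.tensor([2.0]),
        "mask2": torch.tensor([4.0]),
        "mask3": torch.tensor([0.0]),
    }
    combined = create_combined_loss_mask(data, x_shape, torch.float32, "cpu", weights)
    print(combined.flatten())  # expected: tensor([4., 0., 4., 2., 1., 1.])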