from detectron2.layers.nms import batched_nms | |
from pytorch3d.ops.iou_box3d import box3d_overlap | |
from ProposalNetwork.utils.plane import Plane_torch as Plane_torch | |
# these imports are only needed when the 'segmentation' or 'depth' losses are enabled (see _init_cube_head)
from segment_anything.utils.transforms import ResizeLongestSide
from cubercnn.data.generate_ground_segmentations import init_segmentation
import logging | |
import numpy as np | |
from torchvision.ops import sigmoid_focal_loss | |
from typing import Dict, List, Tuple | |
import torch | |
from torch import nn | |
import torch.nn.functional as F | |
from pytorch3d.transforms.so3 import ( | |
so3_relative_angle | |
) | |
from detectron2.config import configurable | |
from detectron2.structures import Instances, Boxes, pairwise_iou, pairwise_ioa | |
from detectron2.layers import ShapeSpec | |
from detectron2.modeling.proposal_generator.proposal_utils import add_ground_truth_to_proposals | |
from detectron2.utils.events import get_event_storage | |
from detectron2.modeling.roi_heads import ( | |
StandardROIHeads, ROI_HEADS_REGISTRY, select_foreground_proposals, | |
) | |
from detectron2.modeling.poolers import ROIPooler | |
from ProposalNetwork.utils.conversions import cubes_to_box | |
from ProposalNetwork.utils.spaces import Cubes | |
from ProposalNetwork.utils.utils import iou_2d, convex_hull | |
from cubercnn.modeling.roi_heads.cube_head import build_cube_head | |
from cubercnn.modeling.proposal_generator.rpn import subsample_labels | |
from cubercnn.modeling.roi_heads.fast_rcnn import FastRCNNOutputs | |
from cubercnn import util | |
from torchvision.ops import generalized_box_iou_loss | |
from cubercnn.util.math_util import so3_relative_angle_batched | |
logger = logging.getLogger(__name__) | |
E_CONSTANT = 2.71828183 | |
SQRT_2_CONSTANT = 1.41421356 | |
def build_roi_heads(cfg, input_shape=None, priors=None): | |
""" | |
Build ROIHeads defined by `cfg.MODEL.ROI_HEADS.NAME`. | |
""" | |
name = cfg.MODEL.ROI_HEADS.NAME | |
return ROI_HEADS_REGISTRY.get(name)(cfg, input_shape, priors=priors) | |
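# Minimal usage sketch (illustrative, assuming a detectron2-style cfg and a backbone whose
# output_shape() covers cfg.MODEL.ROI_HEADS.IN_FEATURES; `priors` is the optional dict of
# pre-computed category statistics loaded elsewhere in the repo):
#
#   roi_heads = build_roi_heads(cfg, backbone.output_shape(), priors=priors)
#   instances_3d, losses = roi_heads(images, images_raw, ground_maps, depth_maps,
#                                    features, proposals, Ks, im_scales_ratio, targets)
#
# The forward signature above is the one of ROIHeads3DScore; ROIHeads3D further below uses the
# shorter (images, features, proposals, Ks, im_scales_ratio, targets) form.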
class ROIHeads3DScore(StandardROIHeads): | |
'''3D head for the weak cube rcnn model''' | |
    @configurable
    def __init__(
self, | |
*, | |
ignore_thresh: float, | |
cube_head: nn.Module, | |
cube_pooler: nn.Module, | |
loss_w_3d: float, | |
loss_w_iou: float, | |
loss_w_seg: float, | |
loss_w_pose: float, | |
loss_w_normal_vec: float, | |
loss_w_z: float, | |
loss_w_dims: float, | |
loss_w_depth: float, | |
use_confidence: float, | |
inverse_z_weight: bool, | |
z_type: str, | |
pose_type: str, | |
cluster_bins: int, | |
priors = None, | |
dims_priors_enabled = None, | |
dims_priors_func = None, | |
disentangled_loss=None, | |
virtual_depth=None, | |
virtual_focal=None, | |
test_scale=None, | |
allocentric_pose=None, | |
chamfer_pose=None, | |
scale_roi_boxes=None, | |
loss_functions=['dims', 'pose_alignment', 'pose_ground', 'iou', 'segmentation', 'z', 'z_pseudo_gt_patch'], | |
segmentor, | |
**kwargs, | |
): | |
super().__init__(**kwargs) | |
self.scale_roi_boxes = scale_roi_boxes | |
self.segmentor = segmentor | |
# rotation settings | |
self.allocentric_pose = allocentric_pose | |
self.chamfer_pose = chamfer_pose | |
# virtual settings | |
self.virtual_depth = virtual_depth | |
self.virtual_focal = virtual_focal | |
# loss weights, <=0 is off | |
self.loss_w_3d = loss_w_3d | |
self.loss_w_iou = loss_w_iou | |
self.loss_w_seg = loss_w_seg | |
self.loss_w_pose = loss_w_pose | |
self.loss_w_normal_vec = loss_w_normal_vec | |
self.loss_w_z = loss_w_z | |
self.loss_w_dims = loss_w_dims | |
self.loss_w_depth = loss_w_depth | |
# loss functions | |
self.loss_functions = loss_functions | |
# loss modes | |
self.disentangled_loss = disentangled_loss | |
self.inverse_z_weight = inverse_z_weight | |
# misc | |
self.test_scale = test_scale | |
self.ignore_thresh = ignore_thresh | |
# related to network outputs | |
self.z_type = z_type | |
self.pose_type = pose_type | |
self.use_confidence = use_confidence | |
# related to priors | |
self.cluster_bins = cluster_bins | |
self.dims_priors_enabled = dims_priors_enabled | |
self.dims_priors_func = dims_priors_func | |
        # unlike ROIHeads3D below, the cube head and pooler are always built here,
        # even when loss_w_3d <= 0
self.cube_head = cube_head | |
self.cube_pooler = cube_pooler | |
# the dimensions could rely on pre-computed priors | |
if self.dims_priors_enabled and priors is not None: | |
self.priors_dims_per_cat = nn.Parameter(torch.FloatTensor(priors['priors_dims_per_cat']).unsqueeze(0)) | |
else: | |
self.priors_dims_per_cat = nn.Parameter(torch.ones(1, self.num_classes, 2, 3)) | |
# Optionally, refactor priors and store them in the network params | |
if self.cluster_bins > 1 and priors is not None: | |
# the depth could have been clustered based on 2D scales | |
priors_z_scales = torch.stack([torch.FloatTensor(prior[1]) for prior in priors['priors_bins']]) | |
self.priors_z_scales = nn.Parameter(priors_z_scales) | |
else: | |
self.priors_z_scales = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins)) | |
# the depth can be based on priors | |
if self.z_type == 'clusters': | |
            assert self.cluster_bins > 1, 'To use z_type of clusters, there must be more than 1 cluster bin'
if priors is None: | |
self.priors_z_stats = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins, 2).float()) | |
else: | |
# stats | |
priors_z_stats = torch.cat([torch.FloatTensor(prior[2]).unsqueeze(0) for prior in priors['priors_bins']]) | |
self.priors_z_stats = nn.Parameter(priors_z_stats) | |
    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], priors=None):
ret = super().from_config(cfg, input_shape) | |
# pass along priors | |
ret["box_predictor"] = FastRCNNOutputs(cfg, ret['box_head'].output_shape) | |
ret.update(cls._init_cube_head(cfg, input_shape)) | |
ret["priors"] = priors | |
return ret | |
    @classmethod
    def _init_cube_head(cls, cfg, input_shape: Dict[str, ShapeSpec]):
in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES | |
pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) | |
pooler_resolution = cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION | |
pooler_sampling_ratio = cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO | |
pooler_type = cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE | |
cube_pooler = ROIPooler( | |
output_size=pooler_resolution, | |
scales=pooler_scales, | |
sampling_ratio=pooler_sampling_ratio, | |
pooler_type=pooler_type, | |
) | |
in_channels = [input_shape[f].channels for f in in_features][0] | |
shape = ShapeSpec( | |
channels=in_channels, width=pooler_resolution, height=pooler_resolution | |
) | |
cube_head = build_cube_head(cfg, shape) | |
logger.info('Loss functions: %s', cfg.loss_functions) | |
possible_losses = ['dims', 'pose_alignment', 'pose_ground', 'pose_ground2', 'iou', 'segmentation', 'z', 'z_pseudo_gt_patch', 'z_pseudo_gt_center','depth'] | |
assert all([x in possible_losses for x in cfg.loss_functions]), f'loss functions must be in {possible_losses}, but was {cfg.loss_functions}' | |
if 'segmentation' in cfg.loss_functions or 'depth' in cfg.loss_functions: | |
segmentor = init_segmentation(device=cfg.MODEL.DEVICE) | |
else: | |
segmentor = None | |
return { | |
'cube_head': cube_head, | |
'cube_pooler': cube_pooler, | |
'use_confidence': cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE, | |
'inverse_z_weight': cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT, | |
'loss_w_3d': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D, | |
'loss_w_iou': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_IOU, | |
'loss_w_seg': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_SEG, | |
'loss_w_pose': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE, | |
'loss_w_dims': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS, | |
'loss_w_normal_vec': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_NORMAL_VEC, | |
'loss_w_z': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z, | |
'loss_w_depth': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DEPTH, | |
'z_type': cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE, | |
'pose_type': cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE, | |
'dims_priors_enabled': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED, | |
'dims_priors_func': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC, | |
'disentangled_loss': cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS, | |
'virtual_depth': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH, | |
'virtual_focal': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL, | |
'test_scale': cfg.INPUT.MIN_SIZE_TEST, | |
'chamfer_pose': cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE, | |
'allocentric_pose': cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE, | |
'cluster_bins': cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS, | |
'ignore_thresh': cfg.MODEL.RPN.IGNORE_THRESHOLD, | |
'scale_roi_boxes': cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES, | |
'loss_functions': cfg.loss_functions, | |
'segmentor': segmentor, | |
} | |
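    # Config sketch (hypothetical values; the keys are the ones read above from
    # cfg.MODEL.ROI_CUBE_HEAD and cfg.loss_functions, which must be set before the model is built):
    #
    #   cfg.loss_functions = ['dims', 'pose_alignment', 'pose_ground', 'iou', 'z_pseudo_gt_patch']
    #   cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION = 7
    #   cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE = 1.0
    #
    # Enabling 'segmentation' or 'depth' additionally requires init_segmentation() to be
    # importable (see the imports at the top of this file).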
def forward(self, images, images_raw, ground_maps, depth_maps, features, proposals, Ks, im_scales_ratio, targets): | |
im_dims = [image.shape[1:] for image in images] | |
del images | |
if self.training: | |
proposals = self.label_and_sample_proposals(proposals, targets) | |
losses = self._forward_box(features, proposals) | |
if self.loss_w_3d > 0: | |
tmp_list = [x.gt_boxes3D.tolist() for x in targets] | |
idx_list = [] | |
for i in range(len(tmp_list)): | |
for j in range(len(tmp_list[i])): | |
idx_list.append(tmp_list[i][j][0]) | |
first_occurrence_indices = {} | |
unique_counter = 0 | |
result_indices = [] | |
for entry in idx_list: | |
if entry not in first_occurrence_indices: | |
first_occurrence_indices[entry] = unique_counter | |
unique_counter += 1 | |
result_indices.append(first_occurrence_indices[entry]) | |
if 'segmentation' in self.loss_functions or 'depth' in self.loss_functions: | |
mask_per_image = self.object_masks(images_raw.tensor, targets) # over all images in batch | |
masks_all_images = [sublist for outer_list in mask_per_image for sublist in outer_list] | |
else: | |
mask_per_image, masks_all_images = None, None | |
instances_3d, losses_cube = self._forward_cube(features, proposals, Ks, im_dims, im_scales_ratio, masks_all_images, first_occurrence_indices, ground_maps, depth_maps) | |
losses.update(losses_cube) | |
else: | |
instances_3d = None | |
return instances_3d, losses | |
else: | |
            # when an oracle is available, bypass the box forward pass and
            # simulate the predicted instances by creating a new
            # Instances object for each image that was passed in.
            if isinstance(proposals, list) and not np.any([isinstance(p, Instances) for p in proposals]):
pred_instances = [] | |
for proposal, im_dim in zip(proposals, im_dims): | |
pred_instances_i = Instances(im_dim) | |
pred_instances_i.pred_boxes = Boxes(proposal['gt_bbox2D']) | |
pred_instances_i.pred_classes = proposal['gt_classes'] | |
pred_instances_i.scores = torch.ones_like(proposal['gt_classes']).float() | |
pred_instances.append(pred_instances_i) | |
else: | |
pred_instances = self._forward_box(features, proposals) | |
mask_per_image, masks_all_images, first_occurrence_indices = None, None, None | |
pred_instances = self._forward_cube(features, pred_instances, Ks, im_dims, im_scales_ratio, masks_all_images, first_occurrence_indices, ground_maps, depth_maps) | |
return pred_instances, {} | |
def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances]): | |
""" | |
Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`, | |
the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument. | |
Args: | |
features (dict[str, Tensor]): mapping from feature map names to tensor. | |
Same as in :meth:`ROIHeads.forward`. | |
proposals (list[Instances]): the per-image object proposals with | |
their matching ground truth. | |
Each has fields "proposal_boxes", and "objectness_logits", | |
"gt_classes", "gt_boxes". | |
Returns: | |
In training, a dict of losses. | |
In inference, a list of `Instances`, the predicted instances. | |
""" | |
features = [features[f] for f in self.box_in_features] | |
box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) | |
box_features = self.box_head(box_features) | |
predictions = self.box_predictor(box_features) | |
del box_features | |
if self.training: | |
losses = self.box_predictor.losses( | |
predictions, proposals, | |
) | |
pred_boxes = self.box_predictor.predict_boxes_for_gt_classes( | |
predictions, proposals | |
) | |
for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes): | |
proposals_per_image.pred_boxes = Boxes(pred_boxes_per_image) | |
# proposals is modified in-place below, so losses must be computed first. | |
if self.train_on_pred_boxes: | |
with torch.no_grad(): | |
pred_boxes = self.box_predictor.predict_boxes_for_gt_classes( | |
predictions, proposals | |
) | |
for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes): | |
proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image) | |
return losses | |
else: | |
pred_instances, _ = self.box_predictor.inference(predictions, proposals, ) | |
return pred_instances | |
def l1_loss(self, vals, target): | |
return F.smooth_l1_loss(vals, target, reduction='none', beta=0.0) | |
def chamfer_loss(self, vals, target): | |
B = vals.shape[0] | |
xx = vals.view(B, 8, 1, 3) | |
yy = target.view(B, 1, 8, 3) | |
l1_dist = (xx - yy).abs().sum(-1) | |
l1 = (l1_dist.min(1).values.mean(-1) + l1_dist.min(2).values.mean(-1)) | |
return l1 | |
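    # Sketch of what chamfer_loss computes (purely illustrative): for each batch element it
    # builds the 8x8 matrix of pairwise L1 distances between predicted and target corners and
    # averages the row-wise and column-wise minima, so the loss is invariant to corner ordering.
    #
    #   vals = torch.rand(2, 8, 3)                 # e.g. predicted cuboid corners
    #   perm = vals[:, torch.randperm(8)]          # the same corners, shuffled
    #   # heads.chamfer_loss(vals, perm) is ~0 because every corner finds an exact match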
    # optionally, scale proposals to zoom the RoI in (<1.0) or out (>1.0)
def scale_proposals(self, proposal_boxes): | |
if self.scale_roi_boxes > 0: | |
proposal_boxes_scaled = [] | |
for boxes in proposal_boxes: | |
centers = boxes.get_centers() | |
widths = boxes.tensor[:, 2] - boxes.tensor[:, 0] | |
                heights = boxes.tensor[:, 3] - boxes.tensor[:, 1]
x1 = centers[:, 0] - 0.5*widths*self.scale_roi_boxes | |
x2 = centers[:, 0] + 0.5*widths*self.scale_roi_boxes | |
y1 = centers[:, 1] - 0.5*heights*self.scale_roi_boxes | |
y2 = centers[:, 1] + 0.5*heights*self.scale_roi_boxes | |
boxes_scaled = Boxes(torch.stack([x1, y1, x2, y2], dim=1)) | |
proposal_boxes_scaled.append(boxes_scaled) | |
else: | |
proposal_boxes_scaled = proposal_boxes | |
return proposal_boxes_scaled | |
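    # Worked example of the scaling above (illustrative numbers): with scale_roi_boxes = 1.5,
    # a box (x1, y1, x2, y2) = (10, 10, 30, 50) has center (20, 30), width 20 and height 40,
    # and becomes (20 - 15, 30 - 30, 20 + 15, 30 + 30) = (5, 0, 35, 60).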
def object_masks(self, images, instances): | |
'''list of masks for each object in the image. | |
Returns | |
------ | |
mask_per_image: List of torch.Tensor of shape (N_instance, 1, H, W) | |
''' | |
org_shape = images.shape[-2:] | |
resize_transform = ResizeLongestSide(self.segmentor.image_encoder.img_size) | |
batched_input = [] | |
images = resize_transform.apply_image_torch(images*1.0)# .permute(2, 0, 1).contiguous() | |
for image, instance in zip(images, instances): | |
boxes = instance.gt_boxes.tensor | |
transformed_boxes = resize_transform.apply_boxes_torch(boxes, org_shape) # Bx4 | |
batched_input.append({'image': image, 'boxes': transformed_boxes, 'original_size':org_shape}) | |
seg_out = self.segmentor(batched_input, multimask_output=False) | |
mask_per_image = [i['masks'] for i in seg_out] | |
return mask_per_image | |
def dice_loss(self, y, y_hat): | |
        '''Andreas: I am not at all confident in the correctness of this implementation;
        taken from my implementation in the DLCV course.
        See also: https://gist.github.com/weiliu620/52d140b22685cf9552da4899e2160183'''
smooth = 1 | |
y_hat = F.sigmoid(y_hat) | |
y_hat = y_hat.view(-1) | |
y = y.view(-1) | |
intersection = (y_hat * y).sum() | |
dice = (2.*intersection + smooth)/(y_hat.sum() + y.sum() + smooth) | |
return 1 - dice | |
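    # Tiny illustrative check of the dice loss (assuming `heads` is an instance of this class):
    # a confident, perfect prediction drives the loss towards 0, a fully wrong one towards 1.
    #
    #   y = torch.tensor([[1.0, 0.0], [0.0, 1.0]])
    #   heads.dice_loss(y, (y * 2 - 1) * 20)    # logits +20 where y==1, -20 elsewhere -> ~0
    #   heads.dice_loss(y, (1 - 2 * y) * 20)    # flipped logits -> ~0.8 (the smoothing term keeps tiny masks below 1)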
def segment_loss(self, gt_mask, bube_corners, at_which_mask_idx, loss='focal'): | |
n = len(bube_corners) | |
y_hat = [] | |
y = [] | |
for i in range(n): | |
gt_mask_i = gt_mask[at_which_mask_idx[i]][0] | |
bube_corners_i = bube_corners[i] | |
# just need the shape of the gt_mask | |
bube_mask = convex_hull(gt_mask[0].squeeze(), bube_corners_i) | |
gt_mask_i = (gt_mask_i * 1.0).float() | |
y.append(gt_mask_i) | |
y_hat.append(bube_mask) | |
y = torch.stack(y) | |
y_hat = torch.stack(y_hat) | |
        if loss == 'bce':
            # F.binary_cross_entropy_with_logits expects (input_logits, target)
            score = F.binary_cross_entropy_with_logits(y_hat, y, reduction='none').mean((1, 2))  # mean over h, w
        elif loss == 'dice':
            score = self.dice_loss(y, y_hat)
        elif loss == 'focal':
            # sigmoid_focal_loss also expects (inputs, targets)
            score = sigmoid_focal_loss(y_hat, y, reduction='none').mean((1, 2))
return score | |
def pose_loss(self, cube_pose:torch.Tensor, num_boxes_per_image:list[int]): | |
        '''
        Loss based on pose consistency within a single image.
        All pose pairs are generated one row of the combination matrix at a time,
        which gives the equivalent of the lower triangle of the pairwise matrix.
        '''
loss_pose = torch.zeros(1, device=cube_pose.device) | |
fail_count = 0 | |
for cube_pose_ in cube_pose.split(num_boxes_per_image): | |
            # normalise by the number of elements in the lower triangle so the loss is
            # comparable between images with different numbers of boxes
            # (the exact eps value does not matter here)
            # pose consistency is undefined when an image has only one cube, so skip it
if len(cube_pose_) == 1: | |
fail_count += 1 | |
continue | |
loss_pose_t = 1-so3_relative_angle_batched(cube_pose_, eps=10000, cos_angle=True).abs() | |
loss_pose += torch.mean(loss_pose_t) | |
if fail_count == len(num_boxes_per_image): # ensure that loss is None if all images in batch only had 1 box | |
return None | |
return loss_pose * 1/(fail_count+1) | |
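    # Illustrative sketch of the pose-consistency idea (assuming so3_relative_angle_batched
    # returns the pairwise cosines used above): two identical rotations have relative angle 0,
    # cos(angle) = 1, so the per-pair loss 1 - |cos| is 0; disagreeing rotations push it towards 1.
    #
    #   R = torch.eye(3).unsqueeze(0).repeat(2, 1, 1)   # two identical poses in one image
    #   # heads.pose_loss(R, [2])    -> ~0
    #   # heads.pose_loss(R, [1, 1]) -> None (single-box images are skipped)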
def normal_vector_from_maps(self, ground_maps, depth_maps, Ks, use_nth=5): | |
'''compute a normal vector corresponding to the ground from a point ground generated from a depth map''' | |
# ### point cloud | |
dvc = depth_maps.device | |
normal_vecs = [] | |
        # I cannot really see any other option than to loop over them, because the images have different sizes
for ground_map, depth_map, org_image_size, K in zip(ground_maps, depth_maps, depth_maps.image_sizes, Ks): | |
if ground_map.shape == (1,1): ground_map = None | |
z = depth_map[::use_nth,::use_nth] | |
            # it is unclear whether using the (subsampled) image shape for the principal point is strictly correct,
            # but it produces visibly more sensible point clouds; see
            # https://github.com/DepthAnything/Depth-Anything-V2/blob/31dc97708961675ce6b3a8d8ffa729170a4aa273/metric_depth/depth_to_pointcloud.py#L100
width, height = z.shape[1], z.shape[0] | |
focal_length_x, focal_length_y = K[0,0] // use_nth, K[1,1] // use_nth | |
u, v = torch.meshgrid(torch.arange(width, device=dvc), torch.arange(height,device=dvc), indexing='xy') | |
cx, cy = width / 2, height / 2 # principal point of camera | |
# https://www.open3d.org/docs/0.7.0/python_api/open3d.geometry.create_point_cloud_from_depth_image.html | |
x = (u - cx) * z / focal_length_x | |
y = (v - cy) * z / focal_length_y | |
if ground_map is not None: | |
# select only the points in x,y,z that are part of the ground map | |
ground = ground_map[::use_nth,::use_nth] | |
zg = z[ground > 0] | |
xg = x[ground > 0] | |
yg = y[ground > 0] | |
else: | |
                # the ground map also removes the zero padding added to the depth maps,
                # so when no ground map is available we must make sure to select only
                # the valid (un-padded) part of the image ourselves
mask = torch.ones(org_image_size, device=dvc) | |
image_without_pad = mask[::use_nth,::use_nth] | |
zg = z[image_without_pad > 0] | |
xg = x[image_without_pad > 0] | |
yg = y[image_without_pad > 0] | |
            # stack the selected points into an (N, 3) point cloud
points = torch.stack((xg, yg, zg), axis=-1) | |
plane = Plane_torch() | |
# best_eq is the ground plane as a,b,c,d in the equation ax + by + cz + d = 0 | |
# if this errors out, run the filter ground script first | |
best_eq, best_inliers = plane.fit_parallel(points, thresh=0.05, maxIteration=1000) | |
normal_vec = best_eq[:-1] | |
x_up = torch.tensor([1.0, 0.0, 0.0], device=dvc) | |
y_up = torch.tensor([0.0, 1.0, 0.0], device=dvc) | |
z_up = torch.tensor([0.0, 0.0, 1.0], device=dvc) | |
# make sure normal vector is consistent with y-up | |
if (normal_vec @ z_up).abs() > (normal_vec @ y_up).abs(): | |
# this means the plane has been found as the back wall | |
# to rectify this we can turn the vector 90 degrees around the local x-axis | |
# note that this assumes that the walls are perpendicular to the floor | |
normal_vec = normal_vec[torch.tensor([0,2,1], device=dvc)] * torch.tensor([1, 1, -1], device=dvc) | |
if (normal_vec @ x_up).abs() > (normal_vec @ y_up).abs(): | |
# this means the plane has been found as the side wall | |
# to rectify this we can turn the vector 90 degrees around the local y-axis | |
# note that this assumes that the walls are perpendicular to the floor | |
normal_vec = normal_vec[torch.tensor([2,0,1], device=dvc)] * torch.tensor([-1, 1, 1], device=dvc) | |
if normal_vec @ y_up < 0: | |
normal_vec *= -1 | |
normal_vecs.append(normal_vec) | |
return torch.stack(normal_vecs) | |
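    # Back-projection sketch used above (illustrative numbers): a pixel at u = 320 in a
    # 640-wide subsampled map with cx = 320 and depth z = 2.0 m lies on the optical axis, so
    # x = (320 - 320) * 2.0 / fx = 0; a pixel 100 px further right with fx = 500 gives
    # x = 100 * 2.0 / 500 = 0.4 m.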
def z_loss(self, gt_boxes:Boxes, cubes:Cubes, Ks, im_sizes, proj_boxes:Boxes): | |
max_count = 50 # 50 steps of 0.1 meters | |
num_preds = cubes.num_instances | |
# Find losses | |
scores = torch.zeros((num_preds), device=cubes.device) | |
gt_area = gt_boxes.area() | |
pred_center = proj_boxes.get_centers() | |
pred_area = proj_boxes.area() | |
gt_boxes_t = gt_boxes.tensor | |
        is_within_gt_box = (gt_boxes_t[:, 0] - max_count <= pred_center[:, 0]) & (pred_center[:, 0] <= gt_boxes_t[:, 2] + max_count) & \
                           (gt_boxes_t[:, 1] - max_count <= pred_center[:, 1]) & (pred_center[:, 1] <= gt_boxes_t[:, 3] + max_count)
values_tensor = torch.linspace(0.0, (max_count-1)/10, max_count, device=cubes.device) | |
is_gt_smaller = gt_area < pred_area | |
for i in range(num_preds): | |
# Check if pred center is within gt box | |
if is_within_gt_box[i]: | |
cube_tensor = cubes[i].tensor | |
mod_cube_tensor = cube_tensor[0,0].clone().unsqueeze(0).repeat((max_count,1)) | |
# Check if too small or too big. | |
if is_gt_smaller[i]: # NOTE has disadvantage when box has different shape, CAN FAIL TODO Change to checking each corner instead | |
mod_cube_tensor[:, 2] += values_tensor | |
else: | |
mod_cube_tensor[:, 2] -= values_tensor | |
mod_cube = Cubes(mod_cube_tensor) | |
mod_box = Boxes(cubes_to_box(mod_cube, Ks[i], im_sizes[i])[0].tensor) | |
pred_areas = mod_box.area() | |
mask_zero_area = (pred_areas == 0) * 10000000 | |
pred_areas = pred_areas + mask_zero_area | |
idx = torch.argmin(self.l1_loss(gt_area[i].repeat(max_count), pred_areas)) | |
scores[i] = self.l1_loss(cubes[i].tensor[0,0,2], mod_cube_tensor[idx,2]) | |
else: | |
                # if the center is outside the gt box, fall back to a fixed high penalty
scores[i] = torch.tensor(0.1 * max_count, requires_grad=True) | |
return scores/2 | |
def pseudo_gt_z_box_loss(self, depth_maps, proposal_boxes:tuple[torch.Tensor], pred_z): | |
'''Compute the pseudo ground truth z loss based on the depth map | |
        for now, use the median depth inside the proposal box as the pseudo ground truth depth
Args: | |
depth_maps: detectron2 Imagelist | |
proposal_boxes: predicted 2d box. list[detectron2 Boxes of shape (N, 4)] | |
pred_z: predicted z. torch.Tensor of shape (N, 1) | |
Returns: | |
z_loss: torch.Tensor of shape (N, 1)''' | |
gt_z = [] | |
for depth_map, boxes in zip(depth_maps, proposal_boxes): | |
boxes = Boxes(boxes) | |
h, w = depth_map.shape | |
# x1, y1, x2, y2 = box | |
# clamp boxes extending the image | |
boxes.clip((h, w)) | |
# remove boxes fully outside the image | |
mask = boxes.area() > 0 | |
boxes_in = boxes[mask] | |
            # median of the depth-map region corresponding to each box
for box in boxes_in: | |
                # TODO: this could be done much more efficiently, but I don't know how to slice many boxes at once
gt_z.append(torch.median((depth_map[box[1].long():box[3].long(), box[0].long():box[2].long()])).unsqueeze(0)) | |
# for boxes outside image, fall back to same method as in pseudo_gt_z_loss_point | |
boxes_out = boxes[~mask] | |
if len(boxes_out) == 0: | |
continue | |
xy = boxes_out.get_centers() | |
x = torch.clamp(xy[:,0],10,w-11) | |
y = torch.clamp(xy[:,1],10,h-11) | |
gt_z.append(depth_map[y.long(), x.long()]) | |
gt_z_o = torch.cat(gt_z) | |
l1loss = self.l1_loss(pred_z, gt_z_o) | |
return l1loss | |
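    # Minimal illustration of the patch-based pseudo ground truth (hypothetical numbers,
    # assuming `heads` is an instance of this class): for a depth map whose values inside a
    # proposal box are all 3.0 m, the pseudo GT depth is the median 3.0, and a prediction of
    # 3.4 incurs an L1 loss of 0.4.
    #
    #   depth_map = torch.full((100, 100), 3.0)
    #   boxes = (torch.tensor([[10.0, 10.0, 40.0, 40.0]]),)   # one image, one box
    #   # heads.pseudo_gt_z_box_loss([depth_map], boxes, torch.tensor([3.4])) -> tensor([0.4])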
def dim_loss(self, priors:tuple[torch.Tensor], dimensions): | |
''' | |
priors : List | |
dimensions : List of Lists | |
P(dim|priors) | |
''' | |
[prior_mean, prior_std] = priors | |
        # if any prior_std row contains NaN, skip the dims loss for this batch entirely
        # (dropping only the offending rows would change the number of per-instance scores returned)
        mask = ~torch.isnan(prior_std).any(dim=1)
        if not mask.all():
return None, None, None | |
prior_mean = prior_mean[mask] | |
prior_std = prior_std[mask] | |
dimensions = dimensions[mask] | |
# z-score ie how many std's we are from the mean | |
dimensions_scores = (dimensions - prior_mean).abs()/prior_std | |
dimensions_scores = torch.max(dimensions_scores - 1.0, torch.zeros_like(dimensions_scores, device=dimensions_scores.device)) | |
return dimensions_scores[:,0], dimensions_scores[:,1], dimensions_scores[:,2] | |
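    # Worked example of the dimension score (illustrative numbers): with a prior mean of 1.0 m
    # and std of 0.2 m, a predicted dimension of 1.5 m is |1.5 - 1.0| / 0.2 = 2.5 stds away;
    # after subtracting the one-std tolerance the contribution is max(2.5 - 1.0, 0) = 1.5,
    # while anything within one std of the prior mean contributes 0.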
def pseudo_gt_z_point_loss(self, depth_maps, pred_xy, pred_z, num_boxes_per_image): | |
'''Compute the pseudo ground truth z loss based on the depth map | |
for now, use the point in depth map corresponding to the center point of the pred box as the pseudo ground truth | |
Args: | |
depth_maps: detectron2 Imagelist | |
pred_xy: predicted centre. torch.Tensor of shape (N, 2) | |
pred_z: predicted z. torch.Tensor of shape (N, 1) | |
Returns: | |
z_loss: torch.Tensor of shape (N, 1)''' | |
gt_z = [] | |
for depth_map, xy in zip(depth_maps, pred_xy.split(num_boxes_per_image)): | |
h, w = depth_map.shape | |
y, x = xy[:,1], xy[:,0] | |
# clamp points outside the image | |
x = torch.clamp(x,10,w-11) | |
y = torch.clamp(y,10,h-11) | |
gt_z.append(depth_map[y.long(), x.long()]) | |
gt_z_o = torch.cat(gt_z) | |
l1loss = self.l1_loss(pred_z, gt_z_o) | |
return l1loss | |
def depth_range_loss(self, gt_mask, at_which_mask_idx, depth_maps, cubes, gt_boxes, num_instances): | |
""" | |
Apply seg_mask on depth image, take difference in min and max values as GT value. Take length as prediction value. Then l1-loss. | |
""" | |
gt_boxes_t = gt_boxes.tensor | |
counter = 0 | |
gt_depths = [] | |
corner_depths = cubes.get_all_corners()[:,0,:,2] | |
# max function gives both vals and idx, so we take only the vals | |
pred_depth = torch.max(corner_depths,dim=1)[0] - torch.min(corner_depths,dim=1)[0] | |
for depth_map, cube in zip(depth_maps, cubes.split(num_instances, dim=0)): | |
for j in range(cube.num_instances): | |
segmentation_mask = gt_mask[at_which_mask_idx[counter]][0] | |
depth_map = F.interpolate(depth_map.unsqueeze(0).unsqueeze(0),size=segmentation_mask.shape, mode='bilinear', align_corners=True).squeeze() | |
depth_range = depth_map[segmentation_mask] | |
# if segmentation fails, fall back to the bbox | |
if depth_range.numel() == 0: | |
depth_range = depth_map[gt_boxes_t[counter,1].long():gt_boxes_t[counter,3].long(), gt_boxes_t[counter,0].long():gt_boxes_t[counter,2].long()] | |
gt_depth = torch.quantile(depth_range,0.9) - torch.quantile(depth_range,0.1) #torch.max(depth_range) - torch.min(depth_range) | |
gt_depths.append(gt_depth) | |
counter += 1 | |
gt_depths = torch.stack(gt_depths) | |
scores = self.l1_loss(gt_depths, pred_depth) | |
return scores | |
def normal_to_rotation(self, normal): | |
'''https://gamedev.stackexchange.com/questions/22204/from-normal-to-rotation-matrix''' | |
x1 = torch.tensor([1.0, 0, 0], device=normal.device).repeat(normal.shape[0],1) | |
t0 = torch.cross(normal, x1, dim=1) | |
        # if the normal is (nearly) parallel to the x-axis, the first cross product degenerates,
        # so fall back to the y-axis instead
        if (torch.bmm(t0.view(normal.shape[0], 1, 3), t0.view(normal.shape[0], 3, 1)).flatten() < 0.001).any():
            y1 = torch.tensor([0, 1.0, 0], device=normal.device).repeat(normal.shape[0], 1)
            t0 = torch.cross(normal, y1, dim=1)
        t0 = t0 / torch.norm(t0, dim=1, keepdim=True)
        t1t = torch.cross(normal, t0, dim=1)
        t1 = t1t / torch.norm(t1t, dim=1, keepdim=True)
return torch.cat([t0, t1, normal],dim=1).reshape((normal.shape[0],3,3))#.permute((0,2,1)) | |
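    # Quick sanity sketch (illustrative, assuming a unit-length normal and `heads` an instance
    # of this class): the three stacked rows form an orthonormal frame whose last row is the
    # normal, so the returned matrix maps the normal onto the third basis vector.
    #
    #   n = torch.tensor([[0.0, 1.0, 0.0]])   # ground normal pointing along +y
    #   R = heads.normal_to_rotation(n)
    #   # torch.allclose(R[0] @ R[0].T, torch.eye(3), atol=1e-5) is expected to hold
    #   # R[0] @ n[0] is expected to be close to torch.tensor([0.0, 0.0, 1.0])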
def _forward_cube(self, features, instances, Ks, im_current_dims, im_scales_ratio, masks_all_images, first_occurrence_indices, ground_maps, depth_maps): | |
features = [features[f] for f in self.in_features] | |
# training on foreground | |
if self.training: | |
losses = {} | |
# add up the amount we should normalize the losses by. | |
# this follows the same logic as the BoxHead, where each FG proposal | |
# is able to contribute the same amount of supervision. Technically, | |
# this value doesn't change during training unless the batch size is dynamic. | |
self.normalize_factor = max(sum([i.gt_classes.numel() for i in instances]), 1.0) | |
# The loss is only defined on positive proposals | |
proposals, _ = select_foreground_proposals(instances, self.num_classes) | |
proposal_boxes = [x.proposal_boxes for x in proposals] | |
pred_boxes = [x.pred_boxes for x in proposals] | |
box_classes = (torch.cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)) | |
gt_boxes3D = torch.cat([p.gt_boxes3D for p in proposals], dim=0,) | |
gt_poses = torch.cat([p.gt_poses for p in proposals], dim=0,) | |
assert len(gt_poses) == len(gt_boxes3D) == len(box_classes) | |
at_which_mask_idx = [] | |
for entry in gt_boxes3D: | |
entry = entry[0].item() | |
at_which_mask_idx.append(first_occurrence_indices[entry]) | |
# eval on all instances | |
else: | |
proposals = instances | |
pred_boxes = [x.pred_boxes for x in instances] | |
proposal_boxes = pred_boxes | |
box_classes = torch.cat([x.pred_classes for x in instances]) | |
proposal_boxes_scaled = self.scale_proposals(proposal_boxes) | |
# forward features | |
cube_features = self.cube_pooler(features, proposal_boxes_scaled).flatten(1) | |
n = cube_features.shape[0] | |
# nothing to do.. | |
if n == 0: | |
return instances if not self.training else (instances, {}) | |
num_boxes_per_image = [len(i) for i in proposals] | |
# scale the intrinsics according to the ratio the image has been scaled. | |
# this means the projections at the current scale are in sync. | |
Ks_scaled_per_box = torch.cat([ | |
(Ks[i]/im_scales_ratio[i]).unsqueeze(0).repeat([num, 1, 1]) | |
for (i, num) in enumerate(num_boxes_per_image) | |
]).to(cube_features.device) | |
Ks_scaled_per_box[:, -1, -1] = 1 | |
focal_lengths_per_box = torch.cat([ | |
(Ks[i][1, 1]).unsqueeze(0).repeat([num]) | |
for (i, num) in enumerate(num_boxes_per_image) | |
]).to(cube_features.device) | |
im_ratios_per_box = torch.cat([ | |
torch.FloatTensor([im_scales_ratio[i]]).repeat(num) | |
for (i, num) in enumerate(num_boxes_per_image) | |
]).to(cube_features.device) | |
# scaling factor for Network resolution -> Original | |
im_scales_per_box = torch.cat([ | |
torch.FloatTensor([im_current_dims[i][0]]).repeat(num) | |
for (i, num) in enumerate(num_boxes_per_image) | |
]).to(cube_features.device) | |
im_scales_original_per_box = im_scales_per_box * im_ratios_per_box | |
if self.virtual_depth: | |
virtual_to_real = util.compute_virtual_scale_from_focal_spaces( | |
focal_lengths_per_box, im_scales_original_per_box, | |
self.virtual_focal, im_scales_per_box | |
) | |
real_to_virtual = 1 / virtual_to_real | |
else: | |
real_to_virtual = virtual_to_real = 1.0 | |
# 2D boxes are needed to apply deltas | |
src_boxes = torch.cat([box_per_im.tensor for box_per_im in proposal_boxes], dim=0) | |
src_widths = src_boxes[:, 2] - src_boxes[:, 0] | |
src_heights = src_boxes[:, 3] - src_boxes[:, 1] | |
src_scales = (src_heights**2 + src_widths**2).sqrt() | |
src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths | |
src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights | |
# For some methods, we need the predicted 2D box, | |
# e.g., the differentiable tensors from the 2D box head. | |
pred_src_boxes = torch.cat([box_per_im.tensor for box_per_im in pred_boxes], dim=0) | |
pred_widths = pred_src_boxes[:, 2] - pred_src_boxes[:, 0] | |
pred_heights = pred_src_boxes[:, 3] - pred_src_boxes[:, 1] | |
pred_src_x = (pred_src_boxes[:, 2] + pred_src_boxes[:, 0]) * 0.5 | |
pred_src_y = (pred_src_boxes[:, 3] + pred_src_boxes[:, 1]) * 0.5 | |
im_sizes = [] | |
im_idx = [] | |
for i,j in enumerate(num_boxes_per_image): | |
for _ in range(j): | |
im_sizes.append(list(im_current_dims[i])) | |
im_idx.append(i) | |
# forward predictions | |
cube_2d_deltas, cube_z, cube_dims, cube_pose, cube_uncert = self.cube_head(cube_features) | |
# simple indexing re-used commonly for selection purposes | |
fg_inds = torch.arange(n) | |
# Z when clusters are used | |
if cube_z is not None and self.cluster_bins > 1: | |
# compute closest bin assignments per batch per category (batch x n_category) | |
scales_diff = (self.priors_z_scales.detach().T.unsqueeze(0) - src_scales.unsqueeze(1).unsqueeze(2)).abs() | |
# assign the correct scale prediction. | |
# (the others are not used / thrown away) | |
assignments = scales_diff.argmin(1) | |
# select FG, category, and correct cluster | |
cube_z = cube_z[fg_inds, :, box_classes, :][fg_inds, assignments[fg_inds, box_classes]] | |
elif cube_z is not None: | |
# if z is available, collect the per-category predictions. | |
cube_z = cube_z[fg_inds, box_classes, :] | |
cube_dims = cube_dims[fg_inds, box_classes, :] | |
cube_pose = cube_pose[fg_inds, box_classes, :, :] | |
if self.use_confidence: | |
# if uncertainty is available, collect the per-category predictions. | |
cube_uncert = cube_uncert[fg_inds, box_classes] | |
cube_2d_deltas = cube_2d_deltas[fg_inds, box_classes, :] | |
# apply our predicted deltas based on src boxes. | |
cube_x = src_ctr_x + src_widths * cube_2d_deltas[:, 0] | |
cube_y = src_ctr_y + src_heights * cube_2d_deltas[:, 1] | |
cube_xy = torch.cat((cube_x.unsqueeze(1), cube_y.unsqueeze(1)), dim=1) | |
cube_dims_norm = cube_dims | |
if self.dims_priors_enabled: | |
# gather prior dimensions | |
prior_dims = self.priors_dims_per_cat.detach().repeat([n, 1, 1, 1])[fg_inds, box_classes] | |
prior_dims_mean = prior_dims[:, 0, :] | |
prior_dims_std = prior_dims[:, 1, :] | |
if self.dims_priors_func == 'sigmoid': | |
prior_dims_min = (prior_dims_mean - 3*prior_dims_std).clip(0.0) | |
prior_dims_max = (prior_dims_mean + 3*prior_dims_std) | |
cube_dims = util.scaled_sigmoid(cube_dims_norm, min=prior_dims_min, max=prior_dims_max) | |
elif self.dims_priors_func == 'exp': | |
cube_dims = torch.exp(cube_dims_norm.clip(max=5)) * prior_dims_mean | |
else: | |
# no priors are used | |
cube_dims = torch.exp(cube_dims_norm.clip(max=5)) | |
if self.allocentric_pose: | |
# To compare with GTs, we need the pose to be egocentric, not allocentric | |
cube_pose_allocentric = cube_pose | |
cube_pose = util.R_from_allocentric(Ks_scaled_per_box, cube_pose, u=cube_x.detach(), v=cube_y.detach()) | |
cube_z = cube_z.squeeze() | |
if self.z_type =='sigmoid': | |
cube_z_norm = torch.sigmoid(cube_z) | |
cube_z = cube_z_norm * 100 | |
elif self.z_type == 'log': | |
cube_z_norm = cube_z | |
cube_z = torch.exp(cube_z) | |
elif self.z_type == 'clusters': | |
# gather the mean depth, same operation as above, for a n x c result | |
z_means = self.priors_z_stats[:, :, 0].T.unsqueeze(0).repeat([n, 1, 1]) | |
z_means = torch.gather(z_means, 1, assignments.unsqueeze(1)).squeeze(1) | |
# gather the std depth, same operation as above, for a n x c result | |
z_stds = self.priors_z_stats[:, :, 1].T.unsqueeze(0).repeat([n, 1, 1]) | |
z_stds = torch.gather(z_stds, 1, assignments.unsqueeze(1)).squeeze(1) | |
# do not learn these, they are static | |
z_means = z_means.detach() | |
z_stds = z_stds.detach() | |
z_means = z_means[fg_inds, box_classes] | |
z_stds = z_stds[fg_inds, box_classes] | |
z_mins = (z_means - 3*z_stds).clip(0) | |
z_maxs = (z_means + 3*z_stds) | |
cube_z_norm = cube_z | |
cube_z = util.scaled_sigmoid(cube_z, min=z_mins, max=z_maxs) | |
if self.virtual_depth: | |
cube_z = (cube_z * virtual_to_real) | |
if self.training: | |
prefix = 'Cube/' | |
storage = get_event_storage() | |
# Pull off necessary GT information | |
gt_2d = gt_boxes3D[:, :2] | |
gt_z = gt_boxes3D[:, 2] | |
gt_dims = gt_boxes3D[:, 3:6] | |
# this box may have been mirrored and scaled so | |
# we need to recompute XYZ in 3D by backprojecting. | |
gt_x3d = gt_z * (gt_2d[:, 0] - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] | |
gt_y3d = gt_z * (gt_2d[:, 1] - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] | |
gt_3d = torch.stack((gt_x3d, gt_y3d, gt_z)).T | |
# put together the GT boxes | |
gt_cubes = Cubes(torch.cat((gt_3d, gt_dims, gt_poses.view(*gt_poses.shape[:-2], -1)), dim=1).unsqueeze(1)) | |
# Get center in meters and create cubes | |
#cube_z = gt_boxes3D[:,2] | |
cube_x3d = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] | |
cube_y3d = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] | |
cubes_tensor = torch.cat((cube_x3d.unsqueeze(1),cube_y3d.unsqueeze(1),cube_z.unsqueeze(1),cube_dims,cube_pose.reshape(n,9)),axis=1).unsqueeze(1) | |
cubes = Cubes(cubes_tensor) | |
# 3d iou | |
IoU3Ds = None | |
storage = get_event_storage() | |
# log 3d iou less frequently because it is slow | |
if storage.iter % 200 == 0: | |
gt_corners = gt_cubes.get_all_corners().squeeze(1) | |
proposal_corners = cubes.get_all_corners().squeeze(1) | |
try: | |
vol, iou = box3d_overlap(gt_corners.cpu(),proposal_corners.cpu()) | |
IoU3Ds = torch.diag(iou) | |
except ValueError: | |
IoU3Ds = torch.zeros(n, device=cubes.device) | |
# Get bube corners | |
bube_corners = torch.zeros((n,8,2)) | |
for i in range(n): | |
bube_corner = cubes[i].get_bube_corners(Ks_scaled_per_box[i], im_sizes[i]) | |
                x = torch.clamp(bube_corner[..., 0], 0, int(im_sizes[i][0]-1)) # clamp for the segment loss, otherwise a CUDA error from accessing elements outside the mask range
y = torch.clamp(bube_corner[..., 1], 0, int(im_sizes[i][1]-1)) | |
bube_corner = torch.stack((x, y), dim=-1) | |
bube_corners[i] = bube_corner | |
# Project to 2D | |
proj_boxes = [] | |
for i in range(cubes.num_instances): | |
proj_boxes.append(cubes_to_box(cubes[i], Ks_scaled_per_box[i], im_sizes[i])[0].tensor[0]) | |
proj_boxes = Boxes(torch.stack(proj_boxes)) | |
### Loss | |
loss_iou = None | |
loss_pose = None | |
loss_seg = None | |
loss_z = None | |
loss_dims_w = None | |
loss_pseudo_gt_z = None | |
loss_ground_rot = None | |
loss_depth = None | |
# 2D IoU | |
gt_boxes = [x.gt_boxes for x in proposals] | |
gt_boxes = Boxes(torch.cat([gt_boxes[i].tensor for i in range(len(gt_boxes))])) | |
# 2D IoU | |
if 'iou' in self.loss_functions: | |
loss_iou = generalized_box_iou_loss(gt_boxes.tensor, proj_boxes.tensor, reduction='none').view(n, -1).mean(dim=1) | |
# Pose | |
if 'pose_alignment' in self.loss_functions: | |
loss_pose = self.pose_loss(cube_pose, num_boxes_per_image) | |
if loss_pose is not None: | |
loss_pose = loss_pose.repeat(n) | |
# normal vector to ground loss | |
if 'pose_ground' in self.loss_functions: | |
valid_ground_maps_conf = torch.tensor([0.1 if shape == (1,1) else 1.0 for shape in ground_maps.image_sizes],device=cube_pose.device) | |
num_boxes_per_image_tensor = torch.tensor(num_boxes_per_image,device=Ks_scaled_per_box.device) | |
normal_vectors = self.normal_vector_from_maps(ground_maps, depth_maps, Ks_scaled_per_box) | |
normal_vectors = normal_vectors.repeat_interleave(num_boxes_per_image_tensor, 0) | |
valid_ground_maps_conf = valid_ground_maps_conf.repeat_interleave(num_boxes_per_image_tensor, 0) | |
pred_normal = cube_pose[:, 1, :] | |
loss_ground_rot = 1-F.cosine_similarity(normal_vectors, pred_normal, dim=1).abs() | |
loss_ground_rot = loss_ground_rot * valid_ground_maps_conf | |
if 'pose_ground2' in self.loss_functions: | |
valid_ground_maps_conf = torch.tensor([0.1 if shape == (1,1) else 1.0 for shape in ground_maps.image_sizes],device=cube_pose.device) | |
num_boxes_per_image_tensor = torch.tensor(num_boxes_per_image,device=Ks_scaled_per_box.device) | |
normal_vectors = self.normal_vector_from_maps(ground_maps, depth_maps, Ks_scaled_per_box) | |
normal_vectors = normal_vectors.repeat_interleave(num_boxes_per_image_tensor, 0) | |
valid_ground_maps_conf = valid_ground_maps_conf.repeat_interleave(num_boxes_per_image_tensor, 0) | |
ps_gt_rotation_matrix = self.normal_to_rotation(normal_vectors) | |
# might need to transpose the rotation matrices | |
pred_rotation_matrix = cube_pose | |
loss_ground_rot = 1 - so3_relative_angle(pred_rotation_matrix, ps_gt_rotation_matrix, cos_angle=True)#.abs() | |
loss_ground_rot = loss_ground_rot * valid_ground_maps_conf | |
# pseudo ground truth z loss | |
if 'z_pseudo_gt_patch' in self.loss_functions: | |
loss_pseudo_gt_z = self.pseudo_gt_z_box_loss(depth_maps, proj_boxes.tensor.split(num_boxes_per_image), cube_z) | |
elif 'z_pseudo_gt_center' in self.loss_functions: | |
loss_pseudo_gt_z = self.pseudo_gt_z_point_loss(depth_maps, cube_xy, cube_z, num_boxes_per_image) | |
# segment | |
if 'segmentation' in self.loss_functions: | |
loss_seg = self.segment_loss(masks_all_images, bube_corners, at_which_mask_idx) | |
# Z | |
if 'z' in self.loss_functions: | |
loss_z = self.z_loss(gt_boxes, cubes, Ks_scaled_per_box, im_sizes, proj_boxes) | |
# Dimensions | |
if 'dims' in self.loss_functions: | |
loss_dims_w, loss_dims_h, loss_dims_l = self.dim_loss((prior_dims_mean, prior_dims_std), cubes.dimensions.squeeze(1)) | |
# Depth Range | |
if 'depth' in self.loss_functions: | |
loss_depth = self.depth_range_loss(masks_all_images, at_which_mask_idx, depth_maps, cubes, gt_boxes, num_boxes_per_image) | |
total_3D_loss_for_reporting = 0 | |
if loss_iou is not None: | |
total_3D_loss_for_reporting += loss_iou*self.loss_w_iou | |
if loss_seg is not None: | |
total_3D_loss_for_reporting += loss_seg*self.loss_w_seg | |
if loss_pose is not None: | |
                # note: this loss is a single number that is broadcast here, rather than one value per instance like the other losses
total_3D_loss_for_reporting += loss_pose*self.loss_w_pose | |
if loss_ground_rot is not None: | |
total_3D_loss_for_reporting += loss_ground_rot * self.loss_w_normal_vec * valid_ground_maps_conf | |
if loss_z is not None: | |
total_3D_loss_for_reporting += loss_z*self.loss_w_z | |
if loss_pseudo_gt_z is not None: | |
total_3D_loss_for_reporting += loss_pseudo_gt_z*self.loss_w_z | |
if loss_dims_w is not None: | |
total_3D_loss_for_reporting += loss_dims_w*self.loss_w_dims | |
total_3D_loss_for_reporting += loss_dims_h*self.loss_w_dims | |
total_3D_loss_for_reporting += loss_dims_l*self.loss_w_dims | |
if loss_depth is not None: | |
total_3D_loss_for_reporting += loss_depth*self.loss_w_depth | |
# reporting does not need gradients | |
if not isinstance(total_3D_loss_for_reporting, int): | |
total_3D_loss_for_reporting = total_3D_loss_for_reporting.detach() | |
# compute errors for tracking purposes | |
xy_error = (cube_xy - gt_2d).detach().abs() | |
z_error = (cube_z - gt_z).detach().abs() | |
dims_error = (cube_dims - gt_dims).detach().abs() | |
storage.put_scalar(prefix + 'z_error', z_error.mean().item(), smoothing_hint=False) | |
storage.put_scalar(prefix + 'dims_error', dims_error.mean().item(), smoothing_hint=False) | |
storage.put_scalar(prefix + 'xy_error', xy_error.mean().item(), smoothing_hint=False) | |
storage.put_scalar(prefix + 'z_close', (z_error<0.20).float().mean().item(), smoothing_hint=False) | |
IoU2D = iou_2d(gt_boxes, proj_boxes).detach() | |
IoU2D = torch.diag(IoU2D.view(n, n)) | |
if IoU3Ds is not None: | |
storage.put_scalar(prefix + '3D IoU', IoU3Ds.detach().mean().item(), smoothing_hint=False) | |
storage.put_scalar(prefix + '2D IoU', IoU2D.mean().item(), smoothing_hint=False) | |
if not isinstance(total_3D_loss_for_reporting, int): | |
storage.put_scalar(prefix + 'total_3D_loss', self.loss_w_3d * self.safely_reduce_losses(total_3D_loss_for_reporting), smoothing_hint=False) | |
if self.use_confidence > 0: | |
uncert_sf = SQRT_2_CONSTANT * torch.exp(-cube_uncert) | |
if loss_iou is not None: | |
loss_iou *= uncert_sf | |
if loss_seg is not None: | |
loss_seg *= uncert_sf | |
if loss_pose is not None: | |
loss_pose *= uncert_sf | |
if loss_ground_rot is not None: | |
loss_ground_rot *= uncert_sf | |
if loss_z is not None: | |
loss_z *= uncert_sf | |
if loss_pseudo_gt_z is not None: | |
loss_pseudo_gt_z *= uncert_sf | |
if loss_dims_w is not None: | |
loss_dims_w *= uncert_sf | |
loss_dims_h *= uncert_sf | |
loss_dims_l *= uncert_sf | |
if loss_depth is not None: | |
loss_depth *= uncert_sf | |
losses.update({prefix + 'uncert': self.use_confidence*self.safely_reduce_losses(cube_uncert.clone())}) | |
storage.put_scalar(prefix + 'conf', torch.exp(-cube_uncert).mean().item(), smoothing_hint=False) | |
if loss_iou is not None: | |
losses.update({ | |
prefix + 'loss_iou': self.safely_reduce_losses(loss_iou) * self.loss_w_iou * self.loss_w_3d, | |
}) | |
if loss_pose is not None: | |
losses.update({ | |
prefix + 'loss_pose': self.safely_reduce_losses(loss_pose) * self.loss_w_pose * self.loss_w_3d, | |
}) | |
if loss_ground_rot is not None: | |
losses.update({ | |
prefix + 'loss_normal_vec': self.safely_reduce_losses(loss_ground_rot) * self.loss_w_normal_vec * self.loss_w_3d, | |
}) | |
if loss_seg is not None: | |
losses.update({ | |
prefix + 'loss_seg': self.safely_reduce_losses(loss_seg) * self.loss_w_seg * self.loss_w_3d, | |
}) | |
if loss_z is not None: | |
losses.update({ | |
prefix + 'loss_z': self.safely_reduce_losses(loss_z) * self.loss_w_z * self.loss_w_3d, | |
}) | |
if loss_pseudo_gt_z is not None: | |
losses.update({ | |
prefix + 'loss_pseudo_gt_z': self.safely_reduce_losses(loss_pseudo_gt_z) * self.loss_w_z * self.loss_w_3d, | |
}) | |
if loss_dims_w is not None: | |
losses.update({ | |
prefix + 'loss_dims_w': self.safely_reduce_losses(loss_dims_w) * self.loss_w_dims * self.loss_w_3d, | |
}) | |
losses.update({ | |
prefix + 'loss_dims_h': self.safely_reduce_losses(loss_dims_h) * self.loss_w_dims * self.loss_w_3d, | |
}) | |
losses.update({ | |
prefix + 'loss_dims_l': self.safely_reduce_losses(loss_dims_l) * self.loss_w_dims * self.loss_w_3d, | |
}) | |
if loss_depth is not None: | |
losses.update({ | |
prefix + 'loss_depth': self.safely_reduce_losses(loss_depth) * self.loss_w_depth * self.loss_w_3d, | |
}) | |
''' | |
Inference | |
''' | |
if len(cube_z.shape) == 0: | |
cube_z = cube_z.unsqueeze(0) | |
# inference | |
cube_x3d = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] | |
cube_y3d = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] | |
cube_3D = torch.cat((torch.stack((cube_x3d, cube_y3d, cube_z)).T, cube_dims, cube_xy*im_ratios_per_box.unsqueeze(1)), dim=1) | |
if self.use_confidence: | |
cube_conf = torch.exp(-cube_uncert) | |
cube_3D = torch.cat((cube_3D, cube_conf.unsqueeze(1)), dim=1) | |
        # convert the predictions to instances per image
cube_3D = cube_3D.split(num_boxes_per_image) | |
cube_pose = cube_pose.split(num_boxes_per_image) | |
box_classes = box_classes.split(num_boxes_per_image) | |
        pred_instances = instances if not self.training else \
            [Instances(image_size) for image_size in im_current_dims]
for cube_3D_i, cube_pose_i, instances_i, K, im_dim, im_scale_ratio, box_classes_i, pred_boxes_i in \ | |
zip(cube_3D, cube_pose, pred_instances, Ks, im_current_dims, im_scales_ratio, box_classes, pred_boxes): | |
# merge scores if they already exist | |
if hasattr(instances_i, 'scores'): | |
instances_i.scores = (instances_i.scores * cube_3D_i[:, -1])**(1/2) | |
# assign scores if none are present | |
else: | |
instances_i.scores = cube_3D_i[:, -1] | |
# assign box classes if none exist | |
if not hasattr(instances_i, 'pred_classes'): | |
instances_i.pred_classes = box_classes_i | |
# assign predicted boxes if none exist | |
if not hasattr(instances_i, 'pred_boxes'): | |
instances_i.pred_boxes = pred_boxes_i | |
instances_i.pred_bbox3D = util.get_cuboid_verts_faces(cube_3D_i[:, :6], cube_pose_i)[0] | |
instances_i.pred_center_cam = cube_3D_i[:, :3] | |
instances_i.pred_center_2D = cube_3D_i[:, 6:8] | |
instances_i.pred_dimensions = cube_3D_i[:, 3:6] | |
instances_i.pred_pose = cube_pose_i | |
if self.training: | |
return pred_instances, losses | |
else: | |
return pred_instances | |
def _sample_proposals( | |
self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor, matched_ious=None | |
) -> Tuple[torch.Tensor, torch.Tensor]: | |
""" | |
Based on the matching between N proposals and M groundtruth, | |
sample the proposals and set their classification labels. | |
Args: | |
matched_idxs (Tensor): a vector of length N, each is the best-matched | |
gt index in [0, M) for each proposal. | |
matched_labels (Tensor): a vector of length N, the matcher's label | |
(one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal. | |
gt_classes (Tensor): a vector of length M. | |
Returns: | |
Tensor: a vector of indices of sampled proposals. Each is in [0, N). | |
Tensor: a vector of the same length, the classification label for | |
each sampled proposal. Each sample is labeled as either a category in | |
[0, num_classes) or the background (num_classes). | |
""" | |
has_gt = gt_classes.numel() > 0 | |
# Get the corresponding GT for each proposal | |
if has_gt: | |
gt_classes = gt_classes[matched_idxs] | |
# Label unmatched proposals (0 label from matcher) as background (label=num_classes) | |
gt_classes[matched_labels == 0] = self.num_classes | |
# Label ignore proposals (-1 label) | |
gt_classes[matched_labels == -1] = -1 | |
else: | |
gt_classes = torch.zeros_like(matched_idxs) + self.num_classes | |
sampled_fg_idxs, sampled_bg_idxs = subsample_labels( | |
gt_classes, self.batch_size_per_image, self.positive_fraction, self.num_classes, matched_ious=matched_ious | |
) | |
sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0) | |
return sampled_idxs, gt_classes[sampled_idxs] | |
def label_and_sample_proposals(self, proposals: List[Instances], targets: List[Instances]) -> List[Instances]: | |
#separate valid and ignore gts | |
targets_ign = [target[target.gt_classes < 0] for target in targets] | |
targets = [target[target.gt_classes >= 0] for target in targets] | |
if self.proposal_append_gt: | |
proposals = add_ground_truth_to_proposals(targets, proposals) | |
proposals_with_gt = [] | |
num_fg_samples = [] | |
num_bg_samples = [] | |
for proposals_per_image, targets_per_image, targets_ign_per_image in zip(proposals, targets, targets_ign): | |
has_gt = len(targets_per_image) > 0 | |
match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, proposals_per_image.proposal_boxes) | |
matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix) | |
try: | |
if len(targets_ign_per_image) > 0: | |
# compute the quality matrix, only on subset of background | |
background_inds = (matched_labels == 0).nonzero().squeeze() | |
# determine the boxes inside ignore regions with sufficient threshold | |
if background_inds.numel() > 1: | |
match_quality_matrix_ign = pairwise_ioa(targets_ign_per_image.gt_boxes, proposals_per_image.proposal_boxes[background_inds]) | |
matched_labels[background_inds[match_quality_matrix_ign.max(0)[0] >= self.ignore_thresh]] = -1 | |
del match_quality_matrix_ign | |
except: | |
pass | |
gt_arange = torch.arange(match_quality_matrix.shape[1]).to(matched_idxs.device) | |
matched_ious = match_quality_matrix[matched_idxs, gt_arange] | |
sampled_idxs, gt_classes = self._sample_proposals(matched_idxs, matched_labels, targets_per_image.gt_classes, matched_ious=matched_ious) | |
# Set target attributes of the sampled proposals: | |
proposals_per_image = proposals_per_image[sampled_idxs] | |
proposals_per_image.gt_classes = gt_classes | |
if has_gt: | |
sampled_targets = matched_idxs[sampled_idxs] | |
# We index all the attributes of targets that start with "gt_" | |
# and have not been added to proposals yet (="gt_classes"). | |
# NOTE: here the indexing waste some compute, because heads | |
# like masks, keypoints, etc, will filter the proposals again, | |
# (by foreground/background, or number of keypoints in the image, etc) | |
# so we essentially index the data twice. | |
for (trg_name, trg_value) in targets_per_image.get_fields().items(): | |
if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name): | |
proposals_per_image.set(trg_name, trg_value[sampled_targets]) | |
num_bg_samples.append((gt_classes == self.num_classes).sum().item()) | |
num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) | |
proposals_with_gt.append(proposals_per_image) | |
# Log the number of fg/bg samples that are selected for training ROI heads | |
storage = get_event_storage() | |
storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples)) | |
storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples)) | |
return proposals_with_gt | |
def safely_reduce_losses(self, loss): | |
valid = (~(loss.isinf())) & (~(loss.isnan())) | |
if valid.any(): | |
return loss[valid].mean() | |
else: | |
# no valid losses, simply zero out | |
return loss.mean()*0.0 | |
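    # Behaviour sketch of safely_reduce_losses (illustrative, assuming `heads` is an instance
    # of this class): non-finite entries are ignored whenever at least one finite value exists.
    #
    #   heads.safely_reduce_losses(torch.tensor([1.0, float('inf'), 3.0]))  # -> tensor(2.)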
class ROIHeads3D(StandardROIHeads): | |
    @configurable
    def __init__(
self, | |
*, | |
ignore_thresh: float, | |
cube_head: nn.Module, | |
cube_pooler: nn.Module, | |
loss_w_3d: float, | |
loss_w_xy: float, | |
loss_w_z: float, | |
loss_w_dims: float, | |
loss_w_pose: float, | |
loss_w_joint: float, | |
use_confidence: float, | |
inverse_z_weight: bool, | |
z_type: str, | |
pose_type: str, | |
cluster_bins: int, | |
priors = None, | |
dims_priors_enabled = None, | |
dims_priors_func = None, | |
disentangled_loss=None, | |
virtual_depth=None, | |
virtual_focal=None, | |
test_scale=None, | |
allocentric_pose=None, | |
chamfer_pose=None, | |
scale_roi_boxes=None, | |
**kwargs, | |
): | |
super().__init__(**kwargs) | |
self.scale_roi_boxes = scale_roi_boxes | |
# rotation settings | |
self.allocentric_pose = allocentric_pose | |
self.chamfer_pose = chamfer_pose | |
# virtual settings | |
self.virtual_depth = virtual_depth | |
self.virtual_focal = virtual_focal | |
# loss weights, <=0 is off | |
self.loss_w_3d = loss_w_3d | |
self.loss_w_xy = loss_w_xy | |
self.loss_w_z = loss_w_z | |
self.loss_w_dims = loss_w_dims | |
self.loss_w_pose = loss_w_pose | |
self.loss_w_joint = loss_w_joint | |
# loss modes | |
self.disentangled_loss = disentangled_loss | |
self.inverse_z_weight = inverse_z_weight | |
# misc | |
self.test_scale = test_scale | |
self.ignore_thresh = ignore_thresh | |
# related to network outputs | |
self.z_type = z_type | |
self.pose_type = pose_type | |
self.use_confidence = use_confidence | |
# related to priors | |
self.cluster_bins = cluster_bins | |
self.dims_priors_enabled = dims_priors_enabled | |
self.dims_priors_func = dims_priors_func | |
# if there is no 3D loss, then we don't need any heads. | |
if loss_w_3d > 0: | |
self.cube_head = cube_head | |
self.cube_pooler = cube_pooler | |
# the dimensions could rely on pre-computed priors | |
if self.dims_priors_enabled and priors is not None: | |
self.priors_dims_per_cat = nn.Parameter(torch.FloatTensor(priors['priors_dims_per_cat']).unsqueeze(0)) | |
else: | |
self.priors_dims_per_cat = nn.Parameter(torch.ones(1, self.num_classes, 2, 3)) | |
# Optionally, refactor priors and store them in the network params | |
if self.cluster_bins > 1 and priors is not None: | |
# the depth could have been clustered based on 2D scales | |
priors_z_scales = torch.stack([torch.FloatTensor(prior[1]) for prior in priors['priors_bins']]) | |
self.priors_z_scales = nn.Parameter(priors_z_scales) | |
else: | |
self.priors_z_scales = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins)) | |
# the depth can be based on priors | |
if self.z_type == 'clusters': | |
                assert self.cluster_bins > 1, 'To use z_type of clusters, there must be more than 1 cluster bin'
if priors is None: | |
self.priors_z_stats = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins, 2).float()) | |
else: | |
# stats | |
priors_z_stats = torch.cat([torch.FloatTensor(prior[2]).unsqueeze(0) for prior in priors['priors_bins']]) | |
self.priors_z_stats = nn.Parameter(priors_z_stats) | |
    @classmethod
    def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], priors=None):
ret = super().from_config(cfg, input_shape) | |
# pass along priors | |
ret["box_predictor"] = FastRCNNOutputs(cfg, ret['box_head'].output_shape) | |
ret.update(cls._init_cube_head(cfg, input_shape)) | |
ret["priors"] = priors | |
return ret | |
    @classmethod
    def _init_cube_head(cls, cfg, input_shape: Dict[str, ShapeSpec]):
in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES | |
pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features) | |
pooler_resolution = cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION | |
pooler_sampling_ratio = cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO | |
pooler_type = cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE | |
cube_pooler = ROIPooler( | |
output_size=pooler_resolution, | |
scales=pooler_scales, | |
sampling_ratio=pooler_sampling_ratio, | |
pooler_type=pooler_type, | |
) | |
in_channels = [input_shape[f].channels for f in in_features][0] | |
shape = ShapeSpec( | |
channels=in_channels, width=pooler_resolution, height=pooler_resolution | |
) | |
cube_head = build_cube_head(cfg, shape) | |
return { | |
'cube_head': cube_head, | |
'cube_pooler': cube_pooler, | |
'use_confidence': cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE, | |
'inverse_z_weight': cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT, | |
'loss_w_3d': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D, | |
'loss_w_xy': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_XY, | |
'loss_w_z': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z, | |
'loss_w_dims': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS, | |
'loss_w_pose': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE, | |
'loss_w_joint': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_JOINT, | |
'z_type': cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE, | |
'pose_type': cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE, | |
'dims_priors_enabled': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED, | |
'dims_priors_func': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC, | |
'disentangled_loss': cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS, | |
'virtual_depth': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH, | |
'virtual_focal': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL, | |
'test_scale': cfg.INPUT.MIN_SIZE_TEST, | |
'chamfer_pose': cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE, | |
'allocentric_pose': cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE, | |
'cluster_bins': cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS, | |
'ignore_thresh': cfg.MODEL.RPN.IGNORE_THRESHOLD, | |
'scale_roi_boxes': cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES, | |
} | |
def forward(self, images, features, proposals, Ks, im_scales_ratio, targets=None): | |
im_dims = [image.shape[1:] for image in images] | |
del images | |
if self.training: | |
proposals = self.label_and_sample_proposals(proposals, targets) | |
del targets | |
if self.training: | |
losses = self._forward_box(features, proposals) | |
if self.loss_w_3d > 0: | |
instances_3d, losses_cube = self._forward_cube(features, proposals, Ks, im_dims, im_scales_ratio) | |
losses.update(losses_cube) | |
else: | |
instances_3d = None | |
return instances_3d, losses | |
else: | |
# when an oracle (GT) is available, bypass the box forward. | |
# simulate the predicted instances by creating a new | |
# instance for each passed-in image. | |
if isinstance(proposals, list) and not any(isinstance(p, Instances) for p in proposals): | |
pred_instances = [] | |
for proposal, im_dim in zip(proposals, im_dims): | |
pred_instances_i = Instances(im_dim) | |
pred_instances_i.pred_boxes = Boxes(proposal['gt_bbox2D']) | |
pred_instances_i.pred_classes = proposal['gt_classes'] | |
pred_instances_i.scores = torch.ones_like(proposal['gt_classes']).float() | |
pred_instances.append(pred_instances_i) | |
else: | |
pred_instances = self._forward_box(features, proposals) | |
if self.loss_w_3d > 0: | |
pred_instances = self._forward_cube(features, pred_instances, Ks, im_dims, im_scales_ratio) | |
return pred_instances, {} | |
def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances]): | |
""" | |
Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`, | |
the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument. | |
Args: | |
features (dict[str, Tensor]): mapping from feature map names to tensor. | |
Same as in :meth:`ROIHeads.forward`. | |
proposals (list[Instances]): the per-image object proposals with | |
their matching ground truth. | |
Each has fields "proposal_boxes", and "objectness_logits", | |
"gt_classes", "gt_boxes". | |
Returns: | |
In training, a dict of losses. | |
In inference, a list of `Instances`, the predicted instances. | |
""" | |
features = [features[f] for f in self.box_in_features] | |
box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals]) | |
box_features = self.box_head(box_features) | |
predictions = self.box_predictor(box_features) | |
del box_features | |
if self.training: | |
losses = self.box_predictor.losses( | |
predictions, proposals, | |
) | |
pred_boxes = self.box_predictor.predict_boxes_for_gt_classes( | |
predictions, proposals | |
) | |
for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes): | |
proposals_per_image.pred_boxes = Boxes(pred_boxes_per_image) | |
# proposals is modified in-place below, so losses must be computed first. | |
if self.train_on_pred_boxes: | |
with torch.no_grad(): | |
pred_boxes = self.box_predictor.predict_boxes_for_gt_classes( | |
predictions, proposals | |
) | |
for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes): | |
proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image) | |
return losses | |
else: | |
pred_instances, _ = self.box_predictor.inference(predictions, proposals, ) | |
return pred_instances | |
def l1_loss(self, vals, target): | |
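# with beta=0.0, smooth_l1_loss reduces to a plain elementwise L1 distance (no reduction). | |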
return F.smooth_l1_loss(vals, target, reduction='none', beta=0.0) | |
def chamfer_loss(self, vals, target): | |
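# symmetric chamfer distance (L1 metric) between two sets of 8 cuboid corners: | |
# for every GT corner take the closest predicted corner and vice versa, average over | |
# corners, and sum the two directional terms. | |
# vals, target: (B, 8, 3) -> per-sample loss of shape (B,); being order-invariant, | |
# it does not penalize a mismatch in corner ordering. | |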
B = vals.shape[0] | |
xx = vals.view(B, 8, 1, 3) | |
yy = target.view(B, 1, 8, 3) | |
l1_dist = (xx - yy).abs().sum(-1) | |
l1 = (l1_dist.min(1).values.mean(-1) + l1_dist.min(2).values.mean(-1)) | |
return l1 | |
# optionally, scale proposals to zoom the RoI in (<1.0) or out (>1.0) | |
def scale_proposals(self, proposal_boxes): | |
if self.scale_roi_boxes > 0: | |
proposal_boxes_scaled = [] | |
for boxes in proposal_boxes: | |
centers = boxes.get_centers() | |
widths = boxes.tensor[:, 2] - boxes.tensor[:, 0] | |
heights = boxes.tensor[:, 3] - boxes.tensor[:, 1] | |
x1 = centers[:, 0] - 0.5*widths*self.scale_roi_boxes | |
x2 = centers[:, 0] + 0.5*widths*self.scale_roi_boxes | |
y1 = centers[:, 1] - 0.5*heights*self.scale_roi_boxes | |
y2 = centers[:, 1] + 0.5*heights*self.scale_roi_boxes | |
boxes_scaled = Boxes(torch.stack([x1, y1, x2, y2], dim=1)) | |
proposal_boxes_scaled.append(boxes_scaled) | |
else: | |
proposal_boxes_scaled = proposal_boxes | |
return proposal_boxes_scaled | |
def _forward_cube(self, features, instances, Ks, im_current_dims, im_scales_ratio): | |
features = [features[f] for f in self.in_features] | |
# training on foreground | |
if self.training: | |
losses = {} | |
# add up the amount we should normalize the losses by. | |
# this follows the same logic as the BoxHead, where each FG proposal | |
# is able to contribute the same amount of supervision. Technically, | |
# this value doesn't change during training unless the batch size is dynamic. | |
self.normalize_factor = max(sum([i.gt_classes.numel() for i in instances]), 1.0) | |
# The loss is only defined on positive proposals | |
proposals, _ = select_foreground_proposals(instances, self.num_classes) | |
proposal_boxes = [x.proposal_boxes for x in proposals] | |
pred_boxes = [x.pred_boxes for x in proposals] | |
box_classes = (torch.cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0)) | |
gt_boxes3D = torch.cat([p.gt_boxes3D for p in proposals], dim=0,) | |
gt_poses = torch.cat([p.gt_poses for p in proposals], dim=0,) | |
assert len(gt_poses) == len(gt_boxes3D) == len(box_classes) | |
# eval on all instances | |
else: | |
proposals = instances | |
pred_boxes = [x.pred_boxes for x in instances] | |
proposal_boxes = pred_boxes | |
box_classes = torch.cat([x.pred_classes for x in instances]) | |
proposal_boxes_scaled = self.scale_proposals(proposal_boxes) | |
# forward features | |
cube_features = self.cube_pooler(features, proposal_boxes_scaled).flatten(1) | |
n = cube_features.shape[0] | |
# nothing to do.. | |
if n == 0: | |
return instances if not self.training else (instances, {}) | |
num_boxes_per_image = [len(i) for i in proposals] | |
# scale the intrinsics according to the ratio the image has been scaled. | |
# this means the projections at the current scale are in sync. | |
Ks_scaled_per_box = torch.cat([ | |
(Ks[i]/im_scales_ratio[i]).unsqueeze(0).repeat([num, 1, 1]) | |
for (i, num) in enumerate(num_boxes_per_image) | |
]).to(cube_features.device) | |
Ks_scaled_per_box[:, -1, -1] = 1 | |
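# per-box copies of the focal length (fy), the image scale ratio, and the network-input | |
# image height; these feed the virtual-depth scaling below and map predicted 2D centers | |
# back to original-image coordinates at inference. | |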
focal_lengths_per_box = torch.cat([ | |
(Ks[i][1, 1]).unsqueeze(0).repeat([num]) | |
for (i, num) in enumerate(num_boxes_per_image) | |
]).to(cube_features.device) | |
im_ratios_per_box = torch.cat([ | |
torch.FloatTensor([im_scales_ratio[i]]).repeat(num) | |
for (i, num) in enumerate(num_boxes_per_image) | |
]).to(cube_features.device) | |
# scaling factor for Network resolution -> Original | |
im_scales_per_box = torch.cat([ | |
torch.FloatTensor([im_current_dims[i][0]]).repeat(num) | |
for (i, num) in enumerate(num_boxes_per_image) | |
]).to(cube_features.device) | |
im_scales_original_per_box = im_scales_per_box * im_ratios_per_box | |
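# virtual depth (sketch of intent): rescale metric depth to a canonical virtual focal | |
# length / image size so depth supervision is comparable across cameras with different | |
# intrinsics; virtual_to_real converts a virtual-depth prediction back to metric depth. | |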
if self.virtual_depth: | |
virtual_to_real = util.compute_virtual_scale_from_focal_spaces( | |
focal_lengths_per_box, im_scales_original_per_box, | |
self.virtual_focal, im_scales_per_box | |
) | |
real_to_virtual = 1 / virtual_to_real | |
else: | |
real_to_virtual = virtual_to_real = 1.0 | |
# 2D boxes are needed to apply deltas | |
src_boxes = torch.cat([box_per_im.tensor for box_per_im in proposal_boxes], dim=0) | |
src_widths = src_boxes[:, 2] - src_boxes[:, 0] | |
src_heights = src_boxes[:, 3] - src_boxes[:, 1] | |
src_scales = (src_heights**2 + src_widths**2).sqrt() | |
src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths | |
src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights | |
# For some methods, we need the predicted 2D box, | |
# e.g., the differentiable tensors from the 2D box head. | |
pred_src_boxes = torch.cat([box_per_im.tensor for box_per_im in pred_boxes], dim=0) | |
pred_widths = pred_src_boxes[:, 2] - pred_src_boxes[:, 0] | |
pred_heights = pred_src_boxes[:, 3] - pred_src_boxes[:, 1] | |
pred_src_x = (pred_src_boxes[:, 2] + pred_src_boxes[:, 0]) * 0.5 | |
pred_src_y = (pred_src_boxes[:, 3] + pred_src_boxes[:, 1]) * 0.5 | |
# forward predictions | |
cube_2d_deltas, cube_z, cube_dims, cube_pose, cube_uncert = self.cube_head(cube_features) | |
# simple indexing re-used commonly for selection purposes | |
fg_inds = torch.arange(n) | |
# Z when clusters are used | |
if cube_z is not None and self.cluster_bins > 1: | |
# compute closest bin assignments per batch per category (batch x n_category) | |
scales_diff = (self.priors_z_scales.detach().T.unsqueeze(0) - src_scales.unsqueeze(1).unsqueeze(2)).abs() | |
# assign the correct scale prediction. | |
# (the others are not used / thrown away) | |
assignments = scales_diff.argmin(1) | |
# select FG, category, and correct cluster | |
cube_z = cube_z[fg_inds, :, box_classes, :][fg_inds, assignments[fg_inds, box_classes]] | |
elif cube_z is not None: | |
# if z is available, collect the per-category predictions. | |
cube_z = cube_z[fg_inds, box_classes, :] | |
cube_dims = cube_dims[fg_inds, box_classes, :] | |
cube_pose = cube_pose[fg_inds, box_classes, :, :] | |
if self.use_confidence: | |
# if uncertainty is available, collect the per-category predictions. | |
cube_uncert = cube_uncert[fg_inds, box_classes] | |
cube_2d_deltas = cube_2d_deltas[fg_inds, box_classes, :] | |
# apply our predicted deltas based on src boxes. | |
cube_x = src_ctr_x + src_widths * cube_2d_deltas[:, 0] | |
cube_y = src_ctr_y + src_heights * cube_2d_deltas[:, 1] | |
cube_xy = torch.cat((cube_x.unsqueeze(1), cube_y.unsqueeze(1)), dim=1) | |
cube_dims_norm = cube_dims | |
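# decode dimensions: with priors, either squash the raw output into [mean-3*std, mean+3*std] | |
# via a scaled sigmoid, or treat it as a log-scale residual on the prior mean ('exp'); | |
# without priors, plain exponentiation of the (clipped) raw output. | |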
if self.dims_priors_enabled: | |
# gather prior dimensions | |
prior_dims = self.priors_dims_per_cat.detach().repeat([n, 1, 1, 1])[fg_inds, box_classes] | |
prior_dims_mean = prior_dims[:, 0, :] | |
prior_dims_std = prior_dims[:, 1, :] | |
if self.dims_priors_func == 'sigmoid': | |
prior_dims_min = (prior_dims_mean - 3*prior_dims_std).clip(0.0) | |
prior_dims_max = (prior_dims_mean + 3*prior_dims_std) | |
cube_dims = util.scaled_sigmoid(cube_dims_norm, min=prior_dims_min, max=prior_dims_max) | |
elif self.dims_priors_func == 'exp': | |
cube_dims = torch.exp(cube_dims_norm.clip(max=5)) * prior_dims_mean | |
else: | |
# no priors are used | |
cube_dims = torch.exp(cube_dims_norm.clip(max=5)) | |
if self.allocentric_pose: | |
# To compare with GTs, we need the pose to be egocentric, not allocentric | |
cube_pose_allocentric = cube_pose | |
cube_pose = util.R_from_allocentric(Ks_scaled_per_box, cube_pose, u=cube_x.detach(), v=cube_y.detach()) | |
cube_z = cube_z.squeeze() | |
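# decode depth according to z_type: 'sigmoid' squashes into (0, 100), 'log' exponentiates, | |
# 'clusters' squashes into [mean-3*std, mean+3*std] of the assigned depth bin. | |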
if self.z_type == 'sigmoid': | |
cube_z_norm = torch.sigmoid(cube_z) | |
cube_z = cube_z_norm * 100 | |
elif self.z_type == 'log': | |
cube_z_norm = cube_z | |
cube_z = torch.exp(cube_z) | |
elif self.z_type == 'clusters': | |
# gather the mean depth, same operation as above, for a n x c result | |
z_means = self.priors_z_stats[:, :, 0].T.unsqueeze(0).repeat([n, 1, 1]) | |
z_means = torch.gather(z_means, 1, assignments.unsqueeze(1)).squeeze(1) | |
# gather the std depth, same operation as above, for a n x c result | |
z_stds = self.priors_z_stats[:, :, 1].T.unsqueeze(0).repeat([n, 1, 1]) | |
z_stds = torch.gather(z_stds, 1, assignments.unsqueeze(1)).squeeze(1) | |
# do not learn these, they are static | |
z_means = z_means.detach() | |
z_stds = z_stds.detach() | |
z_means = z_means[fg_inds, box_classes] | |
z_stds = z_stds[fg_inds, box_classes] | |
z_mins = (z_means - 3*z_stds).clip(0) | |
z_maxs = (z_means + 3*z_stds) | |
cube_z_norm = cube_z | |
cube_z = util.scaled_sigmoid(cube_z, min=z_mins, max=z_maxs) | |
if self.virtual_depth: | |
cube_z = (cube_z * virtual_to_real) | |
if self.training: | |
prefix = 'Cube/' | |
storage = get_event_storage() | |
# Pull off necessary GT information | |
# let lowercase->2D and uppercase->3D | |
# [x, y, Z, W, H, L] | |
gt_2d = gt_boxes3D[:, :2] | |
gt_z = gt_boxes3D[:, 2] | |
gt_dims = gt_boxes3D[:, 3:6] | |
# this box may have been mirrored and scaled so | |
# we need to recompute XYZ in 3D by backprojecting. | |
gt_x3d = gt_z * (gt_2d[:, 0] - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] | |
gt_y3d = gt_z * (gt_2d[:, 1] - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] | |
gt_3d = torch.stack((gt_x3d, gt_y3d, gt_z)).T | |
# put together the GT boxes | |
gt_box3d = torch.cat((gt_3d, gt_dims), dim=1) | |
# These are the corners which will be the target for all losses!! | |
gt_corners = util.get_cuboid_verts_faces(gt_box3d, gt_poses)[0] | |
# project GT corners | |
gt_proj_boxes = torch.bmm(Ks_scaled_per_box, gt_corners.transpose(1,2)) | |
gt_proj_boxes /= gt_proj_boxes[:, -1, :].clone().unsqueeze(1) | |
gt_proj_x1 = gt_proj_boxes[:, 0, :].min(1)[0] | |
gt_proj_y1 = gt_proj_boxes[:, 1, :].min(1)[0] | |
gt_proj_x2 = gt_proj_boxes[:, 0, :].max(1)[0] | |
gt_proj_y2 = gt_proj_boxes[:, 1, :].max(1)[0] | |
gt_widths = gt_proj_x2 - gt_proj_x1 | |
gt_heights = gt_proj_y2 - gt_proj_y1 | |
gt_x = gt_proj_x1 + 0.5 * gt_widths | |
gt_y = gt_proj_y1 + 0.5 * gt_heights | |
gt_proj_boxes = torch.stack((gt_proj_x1, gt_proj_y1, gt_proj_x2, gt_proj_y2), dim=1) | |
if self.disentangled_loss: | |
''' | |
Disentangled loss compares each variable group to the | |
cuboid corners, which is generally more robust to hyperparams. | |
''' | |
# compute disentangled Z corners | |
cube_dis_x3d_from_z = cube_z * (gt_2d[:, 0] - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] | |
cube_dis_y3d_from_z = cube_z * (gt_2d[:, 1] - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] | |
cube_dis_z = torch.cat((torch.stack((cube_dis_x3d_from_z, cube_dis_y3d_from_z, cube_z)).T, gt_dims), dim=1) | |
dis_z_corners = util.get_cuboid_verts_faces(cube_dis_z, gt_poses)[0] | |
# compute disentangled XY corners | |
cube_dis_x3d = gt_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] | |
cube_dis_y3d = gt_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] | |
cube_dis_XY = torch.cat((torch.stack((cube_dis_x3d, cube_dis_y3d, gt_z)).T, gt_dims), dim=1) | |
dis_XY_corners = util.get_cuboid_verts_faces(cube_dis_XY, gt_poses)[0] | |
loss_xy = self.l1_loss(dis_XY_corners, gt_corners).contiguous().view(n, -1).mean(dim=1) | |
# Pose | |
dis_pose_corners = util.get_cuboid_verts_faces(gt_box3d, cube_pose)[0] | |
# Dims | |
dis_dims_corners = util.get_cuboid_verts_faces(torch.cat((gt_3d, cube_dims), dim=1), gt_poses)[0] | |
# Loss dims | |
loss_dims = self.l1_loss(dis_dims_corners, gt_corners).contiguous().view(n, -1).mean(dim=1) | |
# Loss z | |
loss_z = self.l1_loss(dis_z_corners, gt_corners).contiguous().view(n, -1).mean(dim=1) | |
# Rotation uses chamfer or l1 like others | |
if self.chamfer_pose: | |
loss_pose = self.chamfer_loss(dis_pose_corners, gt_corners) | |
else: | |
loss_pose = self.l1_loss(dis_pose_corners, gt_corners).contiguous().view(n, -1).mean(dim=1) | |
# Non-disentangled training losses | |
else: | |
''' | |
These loss functions are fairly arbitrarily designed. | |
Generally, they are in some normalized space but there | |
are many alternative implementations for most functions. | |
''' | |
# XY | |
gt_deltas = (gt_2d.clone() - torch.cat((src_ctr_x.unsqueeze(1), src_ctr_y.unsqueeze(1)), dim=1)) \ | |
/ torch.cat((src_widths.unsqueeze(1), src_heights.unsqueeze(1)), dim=1) | |
loss_xy = self.l1_loss(cube_2d_deltas, gt_deltas).mean(1) | |
# Dims | |
if self.dims_priors_enabled: | |
# express GT dims in the same normalized space as cube_dims_norm (log relative to the prior mean) | |
cube_dims_gt_normspace = torch.log(gt_dims/prior_dims_mean) | |
loss_dims = self.l1_loss(cube_dims_norm, cube_dims_gt_normspace).mean(1) | |
else: | |
loss_dims = self.l1_loss(cube_dims_norm, torch.log(gt_dims)).mean(1) | |
# Pose | |
try: | |
if self.allocentric_pose: | |
gt_poses_allocentric = util.R_to_allocentric(Ks_scaled_per_box, gt_poses, u=cube_x.detach(), v=cube_y.detach()) | |
loss_pose = 1-so3_relative_angle(cube_pose_allocentric, gt_poses_allocentric, eps=0.1, cos_angle=True) | |
else: | |
loss_pose = 1-so3_relative_angle(cube_pose, gt_poses, eps=0.1, cos_angle=True) | |
# Can fail with bad EPS values/instability | |
except Exception: | |
loss_pose = None | |
if self.z_type == 'direct': | |
loss_z = self.l1_loss(cube_z, gt_z) | |
elif self.z_type == 'sigmoid': | |
loss_z = self.l1_loss(cube_z_norm, (gt_z * real_to_virtual / 100).clip(0, 1)) | |
elif self.z_type == 'log': | |
loss_z = self.l1_loss(cube_z_norm, torch.log((gt_z * real_to_virtual).clip(0.01))) | |
elif self.z_type == 'clusters': | |
loss_z = self.l1_loss(cube_z_norm, (((gt_z * real_to_virtual) - z_means)/(z_stds))) | |
total_3D_loss_for_reporting = loss_dims*self.loss_w_dims | |
if loss_pose is not None: | |
total_3D_loss_for_reporting += loss_pose*self.loss_w_pose | |
if cube_2d_deltas is not None: | |
total_3D_loss_for_reporting += loss_xy*self.loss_w_xy | |
if loss_z is not None: | |
total_3D_loss_for_reporting += loss_z*self.loss_w_z | |
# reporting does not need gradients | |
total_3D_loss_for_reporting = total_3D_loss_for_reporting.detach() | |
if self.loss_w_joint > 0: | |
''' | |
If we are using joint [entangled] loss, then we also need to pair all | |
predictions together and compute a chamfer or l1 loss vs. cube corners. | |
''' | |
cube_dis_x3d_from_z = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] | |
cube_dis_y3d_from_z = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] | |
cube_dis_z = torch.cat((torch.stack((cube_dis_x3d_from_z, cube_dis_y3d_from_z, cube_z)).T, cube_dims), dim=1) | |
dis_z_corners_joint = util.get_cuboid_verts_faces(cube_dis_z, cube_pose)[0] | |
if self.chamfer_pose and self.disentangled_loss: | |
loss_joint = self.chamfer_loss(dis_z_corners_joint, gt_corners) | |
else: | |
loss_joint = self.l1_loss(dis_z_corners_joint, gt_corners).contiguous().view(n, -1).mean(dim=1) | |
valid_joint = loss_joint < np.inf | |
total_3D_loss_for_reporting += (loss_joint*self.loss_w_joint).detach() | |
# compute errors for tracking purposes | |
z_error = (cube_z - gt_z).detach().abs() | |
dims_error = (cube_dims - gt_dims).detach().abs() | |
xy_error = (cube_xy - gt_2d).detach().abs() | |
storage.put_scalar(prefix + 'z_error', z_error.mean().item(), smoothing_hint=False) | |
storage.put_scalar(prefix + 'dims_error', dims_error.mean().item(), smoothing_hint=False) | |
storage.put_scalar(prefix + 'xy_error', xy_error.mean().item(), smoothing_hint=False) | |
storage.put_scalar(prefix + 'z_close', (z_error<0.20).float().mean().item(), smoothing_hint=False) | |
storage.put_scalar(prefix + 'total_3D_loss', self.loss_w_3d * self.safely_reduce_losses(total_3D_loss_for_reporting), smoothing_hint=False) | |
if self.inverse_z_weight: | |
''' | |
Weights all losses to prioritize close up boxes. | |
''' | |
gt_z = gt_boxes3D[:, 2] | |
# scale based on log of depth, but clip at e so the weight never exceeds 1 | |
inverse_z_w = 1/torch.log(gt_z.clip(E_CONSTANT)) | |
loss_dims *= inverse_z_w | |
if cube_2d_deltas is not None: | |
loss_xy *= inverse_z_w | |
if loss_z is not None: | |
loss_z *= inverse_z_w | |
if loss_pose is not None: | |
loss_pose *= inverse_z_w | |
if self.loss_w_joint > 0: | |
loss_joint *= inverse_z_w | |
if self.use_confidence > 0: | |
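# confidence weighting: each loss term is scaled by sqrt(2)*exp(-u) and a separate | |
# 'uncert' term (added below) penalizes large u, so the head can trade accuracy for | |
# confidence per box, similar in spirit to aleatoric-uncertainty loss weighting. | |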
uncert_sf = SQRT_2_CONSTANT * torch.exp(-cube_uncert) | |
loss_dims *= uncert_sf | |
if cube_2d_deltas is not None: | |
loss_xy *= uncert_sf | |
if loss_z is not None: | |
loss_z *= uncert_sf | |
if loss_pose is not None: | |
loss_pose *= uncert_sf | |
if self.loss_w_joint > 0: | |
loss_joint *= uncert_sf | |
losses.update({prefix + 'uncert': self.use_confidence*self.safely_reduce_losses(cube_uncert.clone())}) | |
storage.put_scalar(prefix + 'conf', torch.exp(-cube_uncert).mean().item(), smoothing_hint=False) | |
# store per batch loss stats temporarily | |
self.batch_losses = [batch_losses.mean().item() for batch_losses in total_3D_loss_for_reporting.split(num_boxes_per_image)] | |
if self.loss_w_dims > 0: | |
losses.update({ | |
prefix + 'loss_dims': self.safely_reduce_losses(loss_dims) * self.loss_w_dims * self.loss_w_3d, | |
}) | |
if cube_2d_deltas is not None: | |
losses.update({ | |
prefix + 'loss_xy': self.safely_reduce_losses(loss_xy) * self.loss_w_xy * self.loss_w_3d, | |
}) | |
if loss_z is not None: | |
losses.update({ | |
prefix + 'loss_z': self.safely_reduce_losses(loss_z) * self.loss_w_z * self.loss_w_3d, | |
}) | |
if loss_pose is not None: | |
losses.update({ | |
prefix + 'loss_pose': self.safely_reduce_losses(loss_pose) * self.loss_w_pose * self.loss_w_3d, | |
}) | |
if self.loss_w_joint > 0: | |
if valid_joint.any(): | |
losses.update({prefix + 'loss_joint': self.safely_reduce_losses(loss_joint[valid_joint]) * self.loss_w_joint * self.loss_w_3d}) | |
''' | |
Inference | |
''' | |
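# squeeze() above can collapse a single prediction to a 0-d tensor; restore the batch dim. | |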
if len(cube_z.shape) == 0: | |
cube_z = cube_z.unsqueeze(0) | |
# inference | |
cube_x3d = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0] | |
cube_y3d = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1] | |
cube_3D = torch.cat((torch.stack((cube_x3d, cube_y3d, cube_z)).T, cube_dims, cube_xy*im_ratios_per_box.unsqueeze(1)), dim=1) | |
if self.use_confidence: | |
cube_conf = torch.exp(-cube_uncert) | |
cube_3D = torch.cat((cube_3D, cube_conf.unsqueeze(1)), dim=1) | |
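# cube_3D layout per row: [X, Y, Z (camera coords), w, h, l, u, v (2D center in the | |
# original image)], plus the confidence as a last column when use_confidence is on. | |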
# convert the predictions to instances per image | |
cube_3D = cube_3D.split(num_boxes_per_image) | |
cube_pose = cube_pose.split(num_boxes_per_image) | |
box_classes = box_classes.split(num_boxes_per_image) | |
pred_instances = instances if not self.training else \ | |
[Instances(image_size) for image_size in im_current_dims] | |
for cube_3D_i, cube_pose_i, instances_i, K, im_dim, im_scale_ratio, box_classes_i, pred_boxes_i in \ | |
zip(cube_3D, cube_pose, pred_instances, Ks, im_current_dims, im_scales_ratio, box_classes, pred_boxes): | |
# merge scores if they already exist: geometric mean of the 2D score and the last | |
# column of cube_3D_i (the 3D confidence, assuming use_confidence is enabled) | |
if hasattr(instances_i, 'scores'): | |
instances_i.scores = (instances_i.scores * cube_3D_i[:, -1])**(1/2) | |
# assign scores if none are present | |
else: | |
instances_i.scores = cube_3D_i[:, -1] | |
# assign box classes if none exist | |
if not hasattr(instances_i, 'pred_classes'): | |
instances_i.pred_classes = box_classes_i | |
# assign predicted boxes if none exist | |
if not hasattr(instances_i, 'pred_boxes'): | |
instances_i.pred_boxes = pred_boxes_i | |
instances_i.pred_bbox3D = util.get_cuboid_verts_faces(cube_3D_i[:, :6], cube_pose_i)[0] | |
instances_i.pred_center_cam = cube_3D_i[:, :3] | |
instances_i.pred_center_2D = cube_3D_i[:, 6:8] | |
instances_i.pred_dimensions = cube_3D_i[:, 3:6] | |
instances_i.pred_pose = cube_pose_i | |
if self.training: | |
return pred_instances, losses | |
else: | |
return pred_instances | |
def _sample_proposals( | |
self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor, matched_ious=None | |
) -> Tuple[torch.Tensor, torch.Tensor]: | |
""" | |
Based on the matching between N proposals and M groundtruth, | |
sample the proposals and set their classification labels. | |
Args: | |
matched_idxs (Tensor): a vector of length N, each is the best-matched | |
gt index in [0, M) for each proposal. | |
matched_labels (Tensor): a vector of length N, the matcher's label | |
(one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal. | |
gt_classes (Tensor): a vector of length M. | |
Returns: | |
Tensor: a vector of indices of sampled proposals. Each is in [0, N). | |
Tensor: a vector of the same length, the classification label for | |
each sampled proposal. Each sample is labeled as either a category in | |
[0, num_classes) or the background (num_classes). | |
""" | |
has_gt = gt_classes.numel() > 0 | |
# Get the corresponding GT for each proposal | |
if has_gt: | |
gt_classes = gt_classes[matched_idxs] | |
# Label unmatched proposals (0 label from matcher) as background (label=num_classes) | |
gt_classes[matched_labels == 0] = self.num_classes | |
# Label ignore proposals (-1 label) | |
gt_classes[matched_labels == -1] = -1 | |
else: | |
gt_classes = torch.zeros_like(matched_idxs) + self.num_classes | |
sampled_fg_idxs, sampled_bg_idxs = subsample_labels( | |
gt_classes, self.batch_size_per_image, self.positive_fraction, self.num_classes, matched_ious=matched_ious | |
) | |
sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0) | |
return sampled_idxs, gt_classes[sampled_idxs] | |
def label_and_sample_proposals(self, proposals: List[Instances], targets: List[Instances]) -> List[Instances]: | |
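# Match proposals to GT boxes by IoU, mark background proposals that fall inside | |
# ignore regions (IoA >= ignore_thresh) with label -1, subsample fg/bg, and copy | |
# the matched gt_* fields onto the sampled proposals. | |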
# separate valid and ignore gts | |
targets_ign = [target[target.gt_classes < 0] for target in targets] | |
targets = [target[target.gt_classes >= 0] for target in targets] | |
if self.proposal_append_gt: | |
proposals = add_ground_truth_to_proposals(targets, proposals) | |
proposals_with_gt = [] | |
num_fg_samples = [] | |
num_bg_samples = [] | |
for proposals_per_image, targets_per_image, targets_ign_per_image in zip(proposals, targets, targets_ign): | |
has_gt = len(targets_per_image) > 0 | |
match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, proposals_per_image.proposal_boxes) | |
matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix) | |
try: | |
if len(targets_ign_per_image) > 0: | |
# compute the quality matrix, only on subset of background | |
background_inds = (matched_labels == 0).nonzero().squeeze() | |
# determine the boxes inside ignore regions with sufficient threshold | |
if background_inds.numel() > 1: | |
match_quality_matrix_ign = pairwise_ioa(targets_ign_per_image.gt_boxes, proposals_per_image.proposal_boxes[background_inds]) | |
matched_labels[background_inds[match_quality_matrix_ign.max(0)[0] >= self.ignore_thresh]] = -1 | |
del match_quality_matrix_ign | |
except Exception: | |
pass | |
gt_arange = torch.arange(match_quality_matrix.shape[1]).to(matched_idxs.device) | |
matched_ious = match_quality_matrix[matched_idxs, gt_arange] | |
sampled_idxs, gt_classes = self._sample_proposals(matched_idxs, matched_labels, targets_per_image.gt_classes, matched_ious=matched_ious) | |
# Set target attributes of the sampled proposals: | |
proposals_per_image = proposals_per_image[sampled_idxs] | |
proposals_per_image.gt_classes = gt_classes | |
if has_gt: | |
sampled_targets = matched_idxs[sampled_idxs] | |
# We index all the attributes of targets that start with "gt_" | |
# and have not been added to proposals yet (="gt_classes"). | |
# NOTE: here the indexing waste some compute, because heads | |
# like masks, keypoints, etc, will filter the proposals again, | |
# (by foreground/background, or number of keypoints in the image, etc) | |
# so we essentially index the data twice. | |
for (trg_name, trg_value) in targets_per_image.get_fields().items(): | |
if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name): | |
proposals_per_image.set(trg_name, trg_value[sampled_targets]) | |
num_bg_samples.append((gt_classes == self.num_classes).sum().item()) | |
num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1]) | |
proposals_with_gt.append(proposals_per_image) | |
# Log the number of fg/bg samples that are selected for training ROI heads | |
storage = get_event_storage() | |
storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples)) | |
storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples)) | |
return proposals_with_gt | |
def safely_reduce_losses(self, loss): | |
valid = loss.isfinite() | |
if valid.any(): | |
return loss[valid].mean() | |
else: | |
# no valid losses; return a true zero (loss.mean()*0.0 would propagate inf/nan) | |
return loss.new_zeros([]) | |
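# Sanity sketch (not part of the model; `head` is a hypothetical instance of this class): | |
#   loss = torch.tensor([1.0, float('inf'), float('nan'), 3.0]) | |
#   head.safely_reduce_losses(loss)                          # -> tensor(2.), mean over finite entries only | |
#   head.safely_reduce_losses(torch.full((3,), float('nan')))  # -> tensor(0.), an all-invalid batch is zeroed | |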