from detectron2.layers.nms import batched_nms
from pytorch3d.ops.iou_box3d import box3d_overlap
from ProposalNetwork.utils.plane import Plane_torch as Plane_torch
from segment_anything.utils.transforms import ResizeLongestSide # used by object_masks() when the segmentation/depth losses are enabled
from cubercnn.data.generate_ground_segmentations import init_segmentation # used by _init_cube_head() for the same losses
import logging
import numpy as np
from torchvision.ops import sigmoid_focal_loss
from typing import Dict, List, Tuple
import torch
from torch import nn
import torch.nn.functional as F
from pytorch3d.transforms.so3 import (
so3_relative_angle
)
from detectron2.config import configurable
from detectron2.structures import Instances, Boxes, pairwise_iou, pairwise_ioa
from detectron2.layers import ShapeSpec
from detectron2.modeling.proposal_generator.proposal_utils import add_ground_truth_to_proposals
from detectron2.utils.events import get_event_storage
from detectron2.modeling.roi_heads import (
StandardROIHeads, ROI_HEADS_REGISTRY, select_foreground_proposals,
)
from detectron2.modeling.poolers import ROIPooler
from ProposalNetwork.utils.conversions import cubes_to_box
from ProposalNetwork.utils.spaces import Cubes
from ProposalNetwork.utils.utils import iou_2d, convex_hull
from cubercnn.modeling.roi_heads.cube_head import build_cube_head
from cubercnn.modeling.proposal_generator.rpn import subsample_labels
from cubercnn.modeling.roi_heads.fast_rcnn import FastRCNNOutputs
from cubercnn import util
from torchvision.ops import generalized_box_iou_loss
from cubercnn.util.math_util import so3_relative_angle_batched
logger = logging.getLogger(__name__)
E_CONSTANT = 2.71828183
SQRT_2_CONSTANT = 1.41421356
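# SQRT_2_CONSTANT (~sqrt(2)) scales the confidence weighting of the losses below
# (uncert_sf = sqrt(2) * exp(-uncertainty)); E_CONSTANT appears to be retained from the original Cube R-CNN head.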
def build_roi_heads(cfg, input_shape=None, priors=None):
"""
Build ROIHeads defined by `cfg.MODEL.ROI_HEADS.NAME`.
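Typically called by the meta-architecture with the backbone output shape and (optionally)
pre-computed category priors, e.g. (sketch only; the actual call site may differ):
    roi_heads = build_roi_heads(cfg, backbone.output_shape(), priors=priors)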
"""
name = cfg.MODEL.ROI_HEADS.NAME
return ROI_HEADS_REGISTRY.get(name)(cfg, input_shape, priors=priors)
@ROI_HEADS_REGISTRY.register()
class ROIHeads3DScore(StandardROIHeads):
'''3D head for the weak cube rcnn model'''
@configurable
def __init__(
self,
*,
ignore_thresh: float,
cube_head: nn.Module,
cube_pooler: nn.Module,
loss_w_3d: float,
loss_w_iou: float,
loss_w_seg: float,
loss_w_pose: float,
loss_w_normal_vec: float,
loss_w_z: float,
loss_w_dims: float,
loss_w_depth: float,
use_confidence: float,
inverse_z_weight: bool,
z_type: str,
pose_type: str,
cluster_bins: int,
priors = None,
dims_priors_enabled = None,
dims_priors_func = None,
disentangled_loss=None,
virtual_depth=None,
virtual_focal=None,
test_scale=None,
allocentric_pose=None,
chamfer_pose=None,
scale_roi_boxes=None,
loss_functions=['dims', 'pose_alignment', 'pose_ground', 'iou', 'segmentation', 'z', 'z_pseudo_gt_patch'],
segmentor,
**kwargs,
):
super().__init__(**kwargs)
self.scale_roi_boxes = scale_roi_boxes
self.segmentor = segmentor
# rotation settings
self.allocentric_pose = allocentric_pose
self.chamfer_pose = chamfer_pose
# virtual settings
self.virtual_depth = virtual_depth
self.virtual_focal = virtual_focal
# loss weights, <=0 is off
self.loss_w_3d = loss_w_3d
self.loss_w_iou = loss_w_iou
self.loss_w_seg = loss_w_seg
self.loss_w_pose = loss_w_pose
self.loss_w_normal_vec = loss_w_normal_vec
self.loss_w_z = loss_w_z
self.loss_w_dims = loss_w_dims
self.loss_w_depth = loss_w_depth
# loss functions
self.loss_functions = loss_functions
# loss modes
self.disentangled_loss = disentangled_loss
self.inverse_z_weight = inverse_z_weight
# misc
self.test_scale = test_scale
self.ignore_thresh = ignore_thresh
# related to network outputs
self.z_type = z_type
self.pose_type = pose_type
self.use_confidence = use_confidence
# related to priors
self.cluster_bins = cluster_bins
self.dims_priors_enabled = dims_priors_enabled
self.dims_priors_func = dims_priors_func
# if there is no 3D loss, then we don't need any heads.
# if loss_w_3d > 0:
self.cube_head = cube_head
self.cube_pooler = cube_pooler
# the dimensions could rely on pre-computed priors
if self.dims_priors_enabled and priors is not None:
self.priors_dims_per_cat = nn.Parameter(torch.FloatTensor(priors['priors_dims_per_cat']).unsqueeze(0))
else:
self.priors_dims_per_cat = nn.Parameter(torch.ones(1, self.num_classes, 2, 3))
# Optionally, refactor priors and store them in the network params
if self.cluster_bins > 1 and priors is not None:
# the depth could have been clustered based on 2D scales
priors_z_scales = torch.stack([torch.FloatTensor(prior[1]) for prior in priors['priors_bins']])
self.priors_z_scales = nn.Parameter(priors_z_scales)
else:
self.priors_z_scales = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins))
# the depth can be based on priors
if self.z_type == 'clusters':
assert self.cluster_bins > 1, 'To use z_type clusters, there must be more than 1 cluster bin'
if priors is None:
self.priors_z_stats = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins, 2).float())
else:
# stats
priors_z_stats = torch.cat([torch.FloatTensor(prior[2]).unsqueeze(0) for prior in priors['priors_bins']])
self.priors_z_stats = nn.Parameter(priors_z_stats)
@classmethod
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], priors=None):
ret = super().from_config(cfg, input_shape)
# pass along priors
ret["box_predictor"] = FastRCNNOutputs(cfg, ret['box_head'].output_shape)
ret.update(cls._init_cube_head(cfg, input_shape))
ret["priors"] = priors
return ret
@classmethod
def _init_cube_head(self, cfg, input_shape: Dict[str, ShapeSpec]):
in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features)
pooler_resolution = cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION
pooler_sampling_ratio = cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO
pooler_type = cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE
cube_pooler = ROIPooler(
output_size=pooler_resolution,
scales=pooler_scales,
sampling_ratio=pooler_sampling_ratio,
pooler_type=pooler_type,
)
in_channels = [input_shape[f].channels for f in in_features][0]
shape = ShapeSpec(
channels=in_channels, width=pooler_resolution, height=pooler_resolution
)
cube_head = build_cube_head(cfg, shape)
logger.info('Loss functions: %s', cfg.loss_functions)
possible_losses = ['dims', 'pose_alignment', 'pose_ground', 'pose_ground2', 'iou', 'segmentation', 'z', 'z_pseudo_gt_patch', 'z_pseudo_gt_center','depth']
assert all([x in possible_losses for x in cfg.loss_functions]), f'loss functions must be in {possible_losses}, but was {cfg.loss_functions}'
if 'segmentation' in cfg.loss_functions or 'depth' in cfg.loss_functions:
segmentor = init_segmentation(device=cfg.MODEL.DEVICE)
else:
segmentor = None
return {
'cube_head': cube_head,
'cube_pooler': cube_pooler,
'use_confidence': cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE,
'inverse_z_weight': cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT,
'loss_w_3d': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D,
'loss_w_iou': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_IOU,
'loss_w_seg': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_SEG,
'loss_w_pose': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE,
'loss_w_dims': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS,
'loss_w_normal_vec': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_NORMAL_VEC,
'loss_w_z': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z,
'loss_w_depth': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DEPTH,
'z_type': cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE,
'pose_type': cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE,
'dims_priors_enabled': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED,
'dims_priors_func': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC,
'disentangled_loss': cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS,
'virtual_depth': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH,
'virtual_focal': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL,
'test_scale': cfg.INPUT.MIN_SIZE_TEST,
'chamfer_pose': cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE,
'allocentric_pose': cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE,
'cluster_bins': cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS,
'ignore_thresh': cfg.MODEL.RPN.IGNORE_THRESHOLD,
'scale_roi_boxes': cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES,
'loss_functions': cfg.loss_functions,
'segmentor': segmentor,
}
def forward(self, images, images_raw, ground_maps, depth_maps, features, proposals, Ks, im_scales_ratio, targets):
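'''Run the 2D box branch and the weak-supervision cube branch.
Args (sketch; shapes follow the call site in the meta-architecture):
images / images_raw: preprocessed and raw image batches (ImageLists)
ground_maps, depth_maps: per-image pseudo ground-truth maps used by the weak losses
features: FPN feature dict; proposals: per-image RPN proposals
Ks: per-image camera intrinsics; im_scales_ratio: per-image scale ratios between original and network resolution
targets: ground-truth Instances (used in training)
'''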
im_dims = [image.shape[1:] for image in images]
del images
if self.training:
proposals = self.label_and_sample_proposals(proposals, targets)
losses = self._forward_box(features, proposals)
if self.loss_w_3d > 0:
tmp_list = [x.gt_boxes3D.tolist() for x in targets]
idx_list = []
for i in range(len(tmp_list)):
for j in range(len(tmp_list[i])):
idx_list.append(tmp_list[i][j][0])
first_occurrence_indices = {}
unique_counter = 0
result_indices = []
for entry in idx_list:
if entry not in first_occurrence_indices:
first_occurrence_indices[entry] = unique_counter
unique_counter += 1
result_indices.append(first_occurrence_indices[entry])
if 'segmentation' in self.loss_functions or 'depth' in self.loss_functions:
mask_per_image = self.object_masks(images_raw.tensor, targets) # over all images in batch
masks_all_images = [sublist for outer_list in mask_per_image for sublist in outer_list]
else:
mask_per_image, masks_all_images = None, None
instances_3d, losses_cube = self._forward_cube(features, proposals, Ks, im_dims, im_scales_ratio, masks_all_images, first_occurrence_indices, ground_maps, depth_maps)
losses.update(losses_cube)
else:
instances_3d = None
return instances_3d, losses
else:
# when oracle is available, bypass the box forward.
# simulate the predicted instances by creating a new
# instance for each passed in image.
if isinstance(proposals, list) and ~np.any([isinstance(p, Instances) for p in proposals]):
pred_instances = []
for proposal, im_dim in zip(proposals, im_dims):
pred_instances_i = Instances(im_dim)
pred_instances_i.pred_boxes = Boxes(proposal['gt_bbox2D'])
pred_instances_i.pred_classes = proposal['gt_classes']
pred_instances_i.scores = torch.ones_like(proposal['gt_classes']).float()
pred_instances.append(pred_instances_i)
else:
pred_instances = self._forward_box(features, proposals)
mask_per_image, masks_all_images, first_occurrence_indices = None, None, None
pred_instances = self._forward_cube(features, pred_instances, Ks, im_dims, im_scales_ratio, masks_all_images, first_occurrence_indices, ground_maps, depth_maps)
return pred_instances, {}
def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances]):
"""
Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`,
the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument.
Args:
features (dict[str, Tensor]): mapping from feature map names to tensor.
Same as in :meth:`ROIHeads.forward`.
proposals (list[Instances]): the per-image object proposals with
their matching ground truth.
Each has fields "proposal_boxes", and "objectness_logits",
"gt_classes", "gt_boxes".
Returns:
In training, a dict of losses.
In inference, a list of `Instances`, the predicted instances.
"""
features = [features[f] for f in self.box_in_features]
box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
box_features = self.box_head(box_features)
predictions = self.box_predictor(box_features)
del box_features
if self.training:
losses = self.box_predictor.losses(
predictions, proposals,
)
pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
predictions, proposals
)
for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
proposals_per_image.pred_boxes = Boxes(pred_boxes_per_image)
# proposals is modified in-place below, so losses must be computed first.
if self.train_on_pred_boxes:
with torch.no_grad():
pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
predictions, proposals
)
for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image)
return losses
else:
pred_instances, _ = self.box_predictor.inference(predictions, proposals, )
return pred_instances
def l1_loss(self, vals, target):
return F.smooth_l1_loss(vals, target, reduction='none', beta=0.0)
def chamfer_loss(self, vals, target):
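'''Symmetric chamfer-style L1 distance between two sets of 8 box corners (both of shape (B, 8, 3)):
for every corner, the closest corner of the other set is found, and the two directed means are
summed per instance.'''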
B = vals.shape[0]
xx = vals.view(B, 8, 1, 3)
yy = target.view(B, 1, 8, 3)
l1_dist = (xx - yy).abs().sum(-1)
l1 = (l1_dist.min(1).values.mean(-1) + l1_dist.min(2).values.mean(-1))
return l1
# optionally, scale proposals to zoom RoI in (<1.0) or out (>1.0)
def scale_proposals(self, proposal_boxes):
if self.scale_roi_boxes > 0:
proposal_boxes_scaled = []
for boxes in proposal_boxes:
centers = boxes.get_centers()
widths = boxes.tensor[:, 2] - boxes.tensor[:, 0]
heights = boxes.tensor[:, 3] - boxes.tensor[:, 1]
x1 = centers[:, 0] - 0.5*widths*self.scale_roi_boxes
x2 = centers[:, 0] + 0.5*widths*self.scale_roi_boxes
y1 = centers[:, 1] - 0.5*heights*self.scale_roi_boxes
y2 = centers[:, 1] + 0.5*heights*self.scale_roi_boxes
boxes_scaled = Boxes(torch.stack([x1, y1, x2, y2], dim=1))
proposal_boxes_scaled.append(boxes_scaled)
else:
proposal_boxes_scaled = proposal_boxes
return proposal_boxes_scaled
def object_masks(self, images, instances):
'''list of masks for each object in the image.
Returns
------
mask_per_image: List of torch.Tensor of shape (N_instance, 1, H, W)
'''
org_shape = images.shape[-2:]
resize_transform = ResizeLongestSide(self.segmentor.image_encoder.img_size)
batched_input = []
images = resize_transform.apply_image_torch(images*1.0)# .permute(2, 0, 1).contiguous()
for image, instance in zip(images, instances):
boxes = instance.gt_boxes.tensor
transformed_boxes = resize_transform.apply_boxes_torch(boxes, org_shape) # Bx4
batched_input.append({'image': image, 'boxes': transformed_boxes, 'original_size':org_shape})
seg_out = self.segmentor(batched_input, multimask_output=False)
mask_per_image = [i['masks'] for i in seg_out]
return mask_per_image
def dice_loss(self, y, y_hat):
'''Soft Dice loss: 1 - (2*intersection + smooth) / (sum(y_hat) + sum(y) + smooth).
NOTE (Andreas): the correctness of this implementation is unverified; it is adapted from a DLCV course exercise.
See also: https://gist.github.com/weiliu620/52d140b22685cf9552da4899e2160183'''
smooth = 1
y_hat = F.sigmoid(y_hat)
y_hat = y_hat.view(-1)
y = y.view(-1)
intersection = (y_hat * y).sum()
dice = (2.*intersection + smooth)/(y_hat.sum() + y.sum() + smooth)
return 1 - dice
def segment_loss(self, gt_mask, bube_corners, at_which_mask_idx, loss='focal'):
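'''Mask loss between the pseudo GT object masks and the convex hull of the projected cuboid ("bube") corners.
gt_mask: list of per-object masks of shape (1, H, W); bube_corners: projected cuboid corners (N, 8, 2);
at_which_mask_idx: maps each instance to its mask in gt_mask; loss: 'bce', 'dice' or 'focal'.'''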
n = len(bube_corners)
y_hat = []
y = []
for i in range(n):
gt_mask_i = gt_mask[at_which_mask_idx[i]][0]
bube_corners_i = bube_corners[i]
# just need the shape of the gt_mask
bube_mask = convex_hull(gt_mask[0].squeeze(), bube_corners_i)
gt_mask_i = (gt_mask_i * 1.0).float()
y.append(gt_mask_i)
y_hat.append(bube_mask)
y = torch.stack(y)
y_hat = torch.stack(y_hat)
if loss == 'bce':
# note: the predicted bube masks (y_hat) are the input/logits, the pseudo GT masks (y) are the target
score = F.binary_cross_entropy_with_logits(y_hat, y, reduction='none').mean((1,2)) # mean over h,w
elif loss == 'dice':
score = self.dice_loss(y, y_hat)
elif loss == 'focal':
score = sigmoid_focal_loss(y_hat, y, reduction='none').mean((1,2))
return score
def pose_loss(self, cube_pose:torch.Tensor, num_boxes_per_image:list[int]):
'''
Loss based on pose consistency within a single image
generate all combinations of poses as one row of the combination matrix at the time
this will give the equivalent to the lower triangle of the matrix
'''
loss_pose = torch.zeros(1, device=cube_pose.device)
fail_count = 0
for cube_pose_ in cube_pose.split(num_boxes_per_image):
# normalise with the number of elements in the lower triangle to make the loss more fair between images with different number of boxes
# we don't really care about the eps
# we cannot use this when there is only one cube in an image, so skip it
if len(cube_pose_) == 1:
fail_count += 1
continue
loss_pose_t = 1-so3_relative_angle_batched(cube_pose_, eps=10000, cos_angle=True).abs()
loss_pose += torch.mean(loss_pose_t)
if fail_count == len(num_boxes_per_image): # ensure that loss is None if all images in batch only had 1 box
return None
return loss_pose * 1/(fail_count+1)
def normal_vector_from_maps(self, ground_maps, depth_maps, Ks, use_nth=5):
'''compute a normal vector corresponding to the ground from a point cloud generated from a depth map'''
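# Outline of the procedure implemented below: subsample the depth map, back-project it to a
# point cloud with the (scaled) intrinsics, keep only ground pixels when a ground map is
# available, fit a plane with RANSAC, and flip/permute the resulting normal so that it is
# consistent with the y-up convention (rather than pointing along a wall).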
# ### point cloud
dvc = depth_maps.device
normal_vecs = []
# loop over the images individually, since they have different sizes
for ground_map, depth_map, org_image_size, K in zip(ground_maps, depth_maps, depth_maps.image_sizes, Ks):
if ground_map.shape == (1,1): ground_map = None
z = depth_map[::use_nth,::use_nth]
# the (subsampled) depth-map shape is used for the back-projection grid; empirically this looks correct, see
# https://github.com/DepthAnything/Depth-Anything-V2/blob/31dc97708961675ce6b3a8d8ffa729170a4aa273/metric_depth/depth_to_pointcloud.py#L100
width, height = z.shape[1], z.shape[0]
focal_length_x, focal_length_y = K[0,0] // use_nth, K[1,1] // use_nth
u, v = torch.meshgrid(torch.arange(width, device=dvc), torch.arange(height,device=dvc), indexing='xy')
cx, cy = width / 2, height / 2 # principal point of camera
# https://www.open3d.org/docs/0.7.0/python_api/open3d.geometry.create_point_cloud_from_depth_image.html
x = (u - cx) * z / focal_length_x
y = (v - cy) * z / focal_length_y
if ground_map is not None:
# select only the points in x,y,z that are part of the ground map
ground = ground_map[::use_nth,::use_nth]
zg = z[ground > 0]
xg = x[ground > 0]
yg = y[ground > 0]
else:
# the ground map also removes the zero padding added to the depth maps,
# so when no ground map is available we must make sure to select only the valid (unpadded) part of the image
mask = torch.ones(org_image_size, device=dvc)
image_without_pad = mask[::use_nth,::use_nth]
zg = z[image_without_pad > 0]
xg = x[image_without_pad > 0]
yg = y[image_without_pad > 0]
# stack the selected points into an (M, 3) point cloud
points = torch.stack((xg, yg, zg), axis=-1)
plane = Plane_torch()
# best_eq is the ground plane as a,b,c,d in the equation ax + by + cz + d = 0
# if this errors out, run the filter ground script first
best_eq, best_inliers = plane.fit_parallel(points, thresh=0.05, maxIteration=1000)
normal_vec = best_eq[:-1]
x_up = torch.tensor([1.0, 0.0, 0.0], device=dvc)
y_up = torch.tensor([0.0, 1.0, 0.0], device=dvc)
z_up = torch.tensor([0.0, 0.0, 1.0], device=dvc)
# make sure normal vector is consistent with y-up
if (normal_vec @ z_up).abs() > (normal_vec @ y_up).abs():
# this means the plane has been found as the back wall
# to rectify this we can turn the vector 90 degrees around the local x-axis
# note that this assumes that the walls are perpendicular to the floor
normal_vec = normal_vec[torch.tensor([0,2,1], device=dvc)] * torch.tensor([1, 1, -1], device=dvc)
if (normal_vec @ x_up).abs() > (normal_vec @ y_up).abs():
# this means the plane has been found as the side wall
# to rectify this we can turn the vector 90 degrees around the local y-axis
# note that this assumes that the walls are perpendicular to the floor
normal_vec = normal_vec[torch.tensor([2,0,1], device=dvc)] * torch.tensor([-1, 1, 1], device=dvc)
if normal_vec @ y_up < 0:
normal_vec *= -1
normal_vecs.append(normal_vec)
return torch.stack(normal_vecs)
def z_loss(self, gt_boxes:Boxes, cubes:Cubes, Ks, im_sizes, proj_boxes:Boxes):
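'''Weak depth loss: for each predicted cube, step its z in 0.1 m increments (up to max_count steps,
moving away from the camera when the projected box is larger than the GT 2D box, and closer when it
is smaller), pick the depth whose projection best matches the GT box area, and penalise the L1
distance between the predicted z and that pseudo-optimal z.'''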
max_count = 50 # 50 steps of 0.1 meters
num_preds = cubes.num_instances
# Find losses
scores = torch.zeros((num_preds), device=cubes.device)
gt_area = gt_boxes.area()
pred_center = proj_boxes.get_centers()
pred_area = proj_boxes.area()
gt_boxes_t = gt_boxes.tensor
is_within_gt_box = (gt_boxes_t[:, 0] - max_count <= pred_center[:, 0]) & (pred_center[:, 0] <= gt_boxes_t[:, 2] + max_count) & \
(gt_boxes_t[:, 1] - max_count <= pred_center[:, 1]) & (pred_center[:, 1] <= gt_boxes_t[:, 3] + max_count)
values_tensor = torch.linspace(0.0, (max_count-1)/10, max_count, device=cubes.device)
is_gt_smaller = gt_area < pred_area
for i in range(num_preds):
# Check if pred center is within gt box
if is_within_gt_box[i]:
cube_tensor = cubes[i].tensor
mod_cube_tensor = cube_tensor[0,0].clone().unsqueeze(0).repeat((max_count,1))
# Check if too small or too big.
if is_gt_smaller[i]: # NOTE has disadvantage when box has different shape, CAN FAIL TODO Change to checking each corner instead
mod_cube_tensor[:, 2] += values_tensor
else:
mod_cube_tensor[:, 2] -= values_tensor
mod_cube = Cubes(mod_cube_tensor)
mod_box = Boxes(cubes_to_box(mod_cube, Ks[i], im_sizes[i])[0].tensor)
pred_areas = mod_box.area()
mask_zero_area = (pred_areas == 0) * 10000000
pred_areas = pred_areas + mask_zero_area
idx = torch.argmin(self.l1_loss(gt_area[i].repeat(max_count), pred_areas))
scores[i] = self.l1_loss(cubes[i].tensor[0,0,2], mod_cube_tensor[idx,2])
else:
# if the projected centre is far outside the GT box, fall back to a fixed high penalty
scores[i] = torch.tensor(0.1 * max_count, device=cubes.device)
return scores/2
def pseudo_gt_z_box_loss(self, depth_maps, proposal_boxes:tuple[torch.Tensor], pred_z):
'''Compute the pseudo ground truth z loss based on the depth map
for now, the median depth within each proposal box is used as the pseudo ground-truth depth
Args:
depth_maps: detectron2 Imagelist
proposal_boxes: predicted 2d box. list[detectron2 Boxes of shape (N, 4)]
pred_z: predicted z. torch.Tensor of shape (N, 1)
Returns:
z_loss: torch.Tensor of shape (N, 1)'''
gt_z = []
for depth_map, boxes in zip(depth_maps, proposal_boxes):
boxes = Boxes(boxes)
h, w = depth_map.shape
# x1, y1, x2, y2 = box
# clamp boxes extending the image
boxes.clip((h, w))
# remove boxes fully outside the image
mask = boxes.area() > 0
boxes_in = boxes[mask]
# median of the depth-map patch corresponding to each box
for box in boxes_in:
# TODO: this could be done more efficiently, but it is not obvious how to slice many boxes at once
gt_z.append(torch.median((depth_map[box[1].long():box[3].long(), box[0].long():box[2].long()])).unsqueeze(0))
# for boxes fully outside the image, fall back to the same method as pseudo_gt_z_point_loss
boxes_out = boxes[~mask]
if len(boxes_out) == 0:
continue
xy = boxes_out.get_centers()
x = torch.clamp(xy[:,0],10,w-11)
y = torch.clamp(xy[:,1],10,h-11)
gt_z.append(depth_map[y.long(), x.long()])
gt_z_o = torch.cat(gt_z)
l1loss = self.l1_loss(pred_z, gt_z_o)
return l1loss
def dim_loss(self, priors:tuple[torch.Tensor], dimensions):
'''
Dimension prior loss: per-axis z-score of the predicted dimensions under the category priors,
i.e. how many standard deviations each dimension is from the prior mean, where deviations
within one standard deviation are not penalised.
priors : (prior_mean, prior_std), each of shape (N, 3)
dimensions : predicted dimensions, shape (N, 3)
'''
[prior_mean, prior_std] = priors
# if any row of prior_std contains NaN, the dims loss is skipped for this batch entirely
mask = ~torch.isnan(prior_std).any(dim=1)
if not mask.all():
return None, None, None
prior_mean = prior_mean[mask]
prior_std = prior_std[mask]
dimensions = dimensions[mask]
# z-score ie how many std's we are from the mean
dimensions_scores = (dimensions - prior_mean).abs()/prior_std
dimensions_scores = torch.max(dimensions_scores - 1.0, torch.zeros_like(dimensions_scores, device=dimensions_scores.device))
return dimensions_scores[:,0], dimensions_scores[:,1], dimensions_scores[:,2]
def pseudo_gt_z_point_loss(self, depth_maps, pred_xy, pred_z, num_boxes_per_image):
'''Compute the pseudo ground truth z loss based on the depth map
for now, use the point in depth map corresponding to the center point of the pred box as the pseudo ground truth
Args:
depth_maps: detectron2 Imagelist
pred_xy: predicted centre. torch.Tensor of shape (N, 2)
pred_z: predicted z. torch.Tensor of shape (N, 1)
Returns:
z_loss: torch.Tensor of shape (N, 1)'''
gt_z = []
for depth_map, xy in zip(depth_maps, pred_xy.split(num_boxes_per_image)):
h, w = depth_map.shape
y, x = xy[:,1], xy[:,0]
# clamp points outside the image
x = torch.clamp(x,10,w-11)
y = torch.clamp(y,10,h-11)
gt_z.append(depth_map[y.long(), x.long()])
gt_z_o = torch.cat(gt_z)
l1loss = self.l1_loss(pred_z, gt_z_o)
return l1loss
def depth_range_loss(self, gt_mask, at_which_mask_idx, depth_maps, cubes, gt_boxes, num_instances):
"""
Apply the segmentation mask to the depth map and take the spread between the 0.9 and 0.1 depth quantiles as the pseudo GT value; the prediction is the depth range spanned by the cube corners. Then l1-loss. Falls back to the GT 2D box if the mask is empty.
"""
gt_boxes_t = gt_boxes.tensor
counter = 0
gt_depths = []
corner_depths = cubes.get_all_corners()[:,0,:,2]
# max function gives both vals and idx, so we take only the vals
pred_depth = torch.max(corner_depths,dim=1)[0] - torch.min(corner_depths,dim=1)[0]
for depth_map, cube in zip(depth_maps, cubes.split(num_instances, dim=0)):
for j in range(cube.num_instances):
segmentation_mask = gt_mask[at_which_mask_idx[counter]][0]
depth_map = F.interpolate(depth_map.unsqueeze(0).unsqueeze(0),size=segmentation_mask.shape, mode='bilinear', align_corners=True).squeeze()
depth_range = depth_map[segmentation_mask]
# if segmentation fails, fall back to the bbox
if depth_range.numel() == 0:
depth_range = depth_map[gt_boxes_t[counter,1].long():gt_boxes_t[counter,3].long(), gt_boxes_t[counter,0].long():gt_boxes_t[counter,2].long()]
gt_depth = torch.quantile(depth_range,0.9) - torch.quantile(depth_range,0.1) #torch.max(depth_range) - torch.min(depth_range)
gt_depths.append(gt_depth)
counter += 1
gt_depths = torch.stack(gt_depths)
scores = self.l1_loss(gt_depths, pred_depth)
return scores
def normal_to_rotation(self, normal):
'''https://gamedev.stackexchange.com/questions/22204/from-normal-to-rotation-matrix'''
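# Sketch of what follows: two tangent directions t0 and t1 are obtained via cross products with
# the ground normal (falling back to the y-axis if the normal is parallel to the x-axis), and
# [t0, t1, normal] are stacked into a rotation-like matrix per instance.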
x1 = torch.tensor([1.0, 0, 0], device=normal.device).repeat(normal.shape[0],1)
t0 = torch.cross(normal, x1, dim=1)
if (torch.bmm(t0.view(normal.shape[0],1,3), t0.view(normal.shape[0], 3, 1)).flatten() < 0.001).any():
y1 = torch.tensor([0, 1.0, 0], device=normal.device).repeat(normal.shape[0],1)
t0 = torch.cross(normal, y1, dim=1)
t0 = t0 / torch.norm(t0, dim=1, keepdim=True)
t1t = torch.cross(normal, t0, dim=1)
t1 = t1t / torch.norm(t1t, dim=1, keepdim=True)
return torch.cat([t0, t1, normal],dim=1).reshape((normal.shape[0],3,3))#.permute((0,2,1))
def _forward_cube(self, features, instances, Ks, im_current_dims, im_scales_ratio, masks_all_images, first_occurrence_indices, ground_maps, depth_maps):
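'''Cube head forward pass. During training this pools RoI features, predicts 2D-centre deltas,
z, dimensions, pose and (optionally) a confidence per foreground proposal, assembles them into
Cubes, and evaluates the enabled weak losses (projection IoU, pose consistency, ground normal,
segmentation, pseudo-GT z, dimension priors, depth range). During inference the same predictions
are attached to the per-image Instances.'''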
features = [features[f] for f in self.in_features]
# training on foreground
if self.training:
losses = {}
# add up the amount we should normalize the losses by.
# this follows the same logic as the BoxHead, where each FG proposal
# is able to contribute the same amount of supervision. Technically,
# this value doesn't change during training unless the batch size is dynamic.
self.normalize_factor = max(sum([i.gt_classes.numel() for i in instances]), 1.0)
# The loss is only defined on positive proposals
proposals, _ = select_foreground_proposals(instances, self.num_classes)
proposal_boxes = [x.proposal_boxes for x in proposals]
pred_boxes = [x.pred_boxes for x in proposals]
box_classes = (torch.cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0))
gt_boxes3D = torch.cat([p.gt_boxes3D for p in proposals], dim=0,)
gt_poses = torch.cat([p.gt_poses for p in proposals], dim=0,)
assert len(gt_poses) == len(gt_boxes3D) == len(box_classes)
at_which_mask_idx = []
for entry in gt_boxes3D:
entry = entry[0].item()
at_which_mask_idx.append(first_occurrence_indices[entry])
# eval on all instances
else:
proposals = instances
pred_boxes = [x.pred_boxes for x in instances]
proposal_boxes = pred_boxes
box_classes = torch.cat([x.pred_classes for x in instances])
proposal_boxes_scaled = self.scale_proposals(proposal_boxes)
# forward features
cube_features = self.cube_pooler(features, proposal_boxes_scaled).flatten(1)
n = cube_features.shape[0]
# nothing to do..
if n == 0:
return instances if not self.training else (instances, {})
num_boxes_per_image = [len(i) for i in proposals]
# scale the intrinsics according to the ratio the image has been scaled.
# this means the projections at the current scale are in sync.
Ks_scaled_per_box = torch.cat([
(Ks[i]/im_scales_ratio[i]).unsqueeze(0).repeat([num, 1, 1])
for (i, num) in enumerate(num_boxes_per_image)
]).to(cube_features.device)
Ks_scaled_per_box[:, -1, -1] = 1
focal_lengths_per_box = torch.cat([
(Ks[i][1, 1]).unsqueeze(0).repeat([num])
for (i, num) in enumerate(num_boxes_per_image)
]).to(cube_features.device)
im_ratios_per_box = torch.cat([
torch.FloatTensor([im_scales_ratio[i]]).repeat(num)
for (i, num) in enumerate(num_boxes_per_image)
]).to(cube_features.device)
# scaling factor for Network resolution -> Original
im_scales_per_box = torch.cat([
torch.FloatTensor([im_current_dims[i][0]]).repeat(num)
for (i, num) in enumerate(num_boxes_per_image)
]).to(cube_features.device)
im_scales_original_per_box = im_scales_per_box * im_ratios_per_box
if self.virtual_depth:
virtual_to_real = util.compute_virtual_scale_from_focal_spaces(
focal_lengths_per_box, im_scales_original_per_box,
self.virtual_focal, im_scales_per_box
)
real_to_virtual = 1 / virtual_to_real
else:
real_to_virtual = virtual_to_real = 1.0
# 2D boxes are needed to apply deltas
src_boxes = torch.cat([box_per_im.tensor for box_per_im in proposal_boxes], dim=0)
src_widths = src_boxes[:, 2] - src_boxes[:, 0]
src_heights = src_boxes[:, 3] - src_boxes[:, 1]
src_scales = (src_heights**2 + src_widths**2).sqrt()
src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights
# For some methods, we need the predicted 2D box,
# e.g., the differentiable tensors from the 2D box head.
pred_src_boxes = torch.cat([box_per_im.tensor for box_per_im in pred_boxes], dim=0)
pred_widths = pred_src_boxes[:, 2] - pred_src_boxes[:, 0]
pred_heights = pred_src_boxes[:, 3] - pred_src_boxes[:, 1]
pred_src_x = (pred_src_boxes[:, 2] + pred_src_boxes[:, 0]) * 0.5
pred_src_y = (pred_src_boxes[:, 3] + pred_src_boxes[:, 1]) * 0.5
im_sizes = []
im_idx = []
for i,j in enumerate(num_boxes_per_image):
for _ in range(j):
im_sizes.append(list(im_current_dims[i]))
im_idx.append(i)
# forward predictions
cube_2d_deltas, cube_z, cube_dims, cube_pose, cube_uncert = self.cube_head(cube_features)
# simple indexing re-used commonly for selection purposes
fg_inds = torch.arange(n)
# Z when clusters are used
if cube_z is not None and self.cluster_bins > 1:
# compute closest bin assignments per batch per category (batch x n_category)
scales_diff = (self.priors_z_scales.detach().T.unsqueeze(0) - src_scales.unsqueeze(1).unsqueeze(2)).abs()
# assign the correct scale prediction.
# (the others are not used / thrown away)
assignments = scales_diff.argmin(1)
# select FG, category, and correct cluster
cube_z = cube_z[fg_inds, :, box_classes, :][fg_inds, assignments[fg_inds, box_classes]]
elif cube_z is not None:
# if z is available, collect the per-category predictions.
cube_z = cube_z[fg_inds, box_classes, :]
cube_dims = cube_dims[fg_inds, box_classes, :]
cube_pose = cube_pose[fg_inds, box_classes, :, :]
if self.use_confidence:
# if uncertainty is available, collect the per-category predictions.
cube_uncert = cube_uncert[fg_inds, box_classes]
cube_2d_deltas = cube_2d_deltas[fg_inds, box_classes, :]
# apply our predicted deltas based on src boxes.
cube_x = src_ctr_x + src_widths * cube_2d_deltas[:, 0]
cube_y = src_ctr_y + src_heights * cube_2d_deltas[:, 1]
cube_xy = torch.cat((cube_x.unsqueeze(1), cube_y.unsqueeze(1)), dim=1)
cube_dims_norm = cube_dims
if self.dims_priors_enabled:
# gather prior dimensions
prior_dims = self.priors_dims_per_cat.detach().repeat([n, 1, 1, 1])[fg_inds, box_classes]
prior_dims_mean = prior_dims[:, 0, :]
prior_dims_std = prior_dims[:, 1, :]
if self.dims_priors_func == 'sigmoid':
prior_dims_min = (prior_dims_mean - 3*prior_dims_std).clip(0.0)
prior_dims_max = (prior_dims_mean + 3*prior_dims_std)
cube_dims = util.scaled_sigmoid(cube_dims_norm, min=prior_dims_min, max=prior_dims_max)
elif self.dims_priors_func == 'exp':
cube_dims = torch.exp(cube_dims_norm.clip(max=5)) * prior_dims_mean
else:
# no priors are used
cube_dims = torch.exp(cube_dims_norm.clip(max=5))
if self.allocentric_pose:
# To compare with GTs, we need the pose to be egocentric, not allocentric
cube_pose_allocentric = cube_pose
cube_pose = util.R_from_allocentric(Ks_scaled_per_box, cube_pose, u=cube_x.detach(), v=cube_y.detach())
cube_z = cube_z.squeeze()
if self.z_type =='sigmoid':
cube_z_norm = torch.sigmoid(cube_z)
cube_z = cube_z_norm * 100
elif self.z_type == 'log':
cube_z_norm = cube_z
cube_z = torch.exp(cube_z)
elif self.z_type == 'clusters':
# gather the mean depth, same operation as above, for a n x c result
z_means = self.priors_z_stats[:, :, 0].T.unsqueeze(0).repeat([n, 1, 1])
z_means = torch.gather(z_means, 1, assignments.unsqueeze(1)).squeeze(1)
# gather the std depth, same operation as above, for a n x c result
z_stds = self.priors_z_stats[:, :, 1].T.unsqueeze(0).repeat([n, 1, 1])
z_stds = torch.gather(z_stds, 1, assignments.unsqueeze(1)).squeeze(1)
# do not learn these, they are static
z_means = z_means.detach()
z_stds = z_stds.detach()
z_means = z_means[fg_inds, box_classes]
z_stds = z_stds[fg_inds, box_classes]
z_mins = (z_means - 3*z_stds).clip(0)
z_maxs = (z_means + 3*z_stds)
cube_z_norm = cube_z
cube_z = util.scaled_sigmoid(cube_z, min=z_mins, max=z_maxs)
if self.virtual_depth:
cube_z = (cube_z * virtual_to_real)
if self.training:
prefix = 'Cube/'
storage = get_event_storage()
# Pull off necessary GT information
gt_2d = gt_boxes3D[:, :2]
gt_z = gt_boxes3D[:, 2]
gt_dims = gt_boxes3D[:, 3:6]
# this box may have been mirrored and scaled so
# we need to recompute XYZ in 3D by backprojecting.
gt_x3d = gt_z * (gt_2d[:, 0] - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0]
gt_y3d = gt_z * (gt_2d[:, 1] - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1]
gt_3d = torch.stack((gt_x3d, gt_y3d, gt_z)).T
# put together the GT boxes
gt_cubes = Cubes(torch.cat((gt_3d, gt_dims, gt_poses.view(*gt_poses.shape[:-2], -1)), dim=1).unsqueeze(1))
# Get center in meters and create cubes
#cube_z = gt_boxes3D[:,2]
cube_x3d = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0]
cube_y3d = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1]
cubes_tensor = torch.cat((cube_x3d.unsqueeze(1),cube_y3d.unsqueeze(1),cube_z.unsqueeze(1),cube_dims,cube_pose.reshape(n,9)),axis=1).unsqueeze(1)
cubes = Cubes(cubes_tensor)
# 3d iou
IoU3Ds = None
storage = get_event_storage()
# log 3d iou less frequently because it is slow
if storage.iter % 200 == 0:
gt_corners = gt_cubes.get_all_corners().squeeze(1)
proposal_corners = cubes.get_all_corners().squeeze(1)
try:
vol, iou = box3d_overlap(gt_corners.cpu(),proposal_corners.cpu())
IoU3Ds = torch.diag(iou)
except ValueError:
IoU3Ds = torch.zeros(n, device=cubes.device)
# Get bube corners
bube_corners = torch.zeros((n,8,2))
for i in range(n):
bube_corner = cubes[i].get_bube_corners(Ks_scaled_per_box[i], im_sizes[i])
x = torch.clamp(bube_corner[..., 0], 0, int(im_sizes[i][0]-1)) # clamp for the segment loss, otherwise a CUDA error can occur from accessing elements outside the mask range
y = torch.clamp(bube_corner[..., 1], 0, int(im_sizes[i][1]-1))
bube_corner = torch.stack((x, y), dim=-1)
bube_corners[i] = bube_corner
# Project to 2D
proj_boxes = []
for i in range(cubes.num_instances):
proj_boxes.append(cubes_to_box(cubes[i], Ks_scaled_per_box[i], im_sizes[i])[0].tensor[0])
proj_boxes = Boxes(torch.stack(proj_boxes))
### Loss
loss_iou = None
loss_pose = None
loss_seg = None
loss_z = None
loss_dims_w = None
loss_pseudo_gt_z = None
loss_ground_rot = None
loss_depth = None
# gather GT 2D boxes
gt_boxes = [x.gt_boxes for x in proposals]
gt_boxes = Boxes(torch.cat([gt_boxes[i].tensor for i in range(len(gt_boxes))]))
# 2D IoU
if 'iou' in self.loss_functions:
loss_iou = generalized_box_iou_loss(gt_boxes.tensor, proj_boxes.tensor, reduction='none').view(n, -1).mean(dim=1)
# Pose
if 'pose_alignment' in self.loss_functions:
loss_pose = self.pose_loss(cube_pose, num_boxes_per_image)
if loss_pose is not None:
loss_pose = loss_pose.repeat(n)
# normal vector to ground loss
if 'pose_ground' in self.loss_functions:
valid_ground_maps_conf = torch.tensor([0.1 if shape == (1,1) else 1.0 for shape in ground_maps.image_sizes],device=cube_pose.device)
num_boxes_per_image_tensor = torch.tensor(num_boxes_per_image,device=Ks_scaled_per_box.device)
normal_vectors = self.normal_vector_from_maps(ground_maps, depth_maps, Ks_scaled_per_box)
normal_vectors = normal_vectors.repeat_interleave(num_boxes_per_image_tensor, 0)
valid_ground_maps_conf = valid_ground_maps_conf.repeat_interleave(num_boxes_per_image_tensor, 0)
pred_normal = cube_pose[:, 1, :]
loss_ground_rot = 1-F.cosine_similarity(normal_vectors, pred_normal, dim=1).abs()
loss_ground_rot = loss_ground_rot * valid_ground_maps_conf
if 'pose_ground2' in self.loss_functions:
valid_ground_maps_conf = torch.tensor([0.1 if shape == (1,1) else 1.0 for shape in ground_maps.image_sizes],device=cube_pose.device)
num_boxes_per_image_tensor = torch.tensor(num_boxes_per_image,device=Ks_scaled_per_box.device)
normal_vectors = self.normal_vector_from_maps(ground_maps, depth_maps, Ks_scaled_per_box)
normal_vectors = normal_vectors.repeat_interleave(num_boxes_per_image_tensor, 0)
valid_ground_maps_conf = valid_ground_maps_conf.repeat_interleave(num_boxes_per_image_tensor, 0)
ps_gt_rotation_matrix = self.normal_to_rotation(normal_vectors)
# might need to transpose the rotation matrices
pred_rotation_matrix = cube_pose
loss_ground_rot = 1 - so3_relative_angle(pred_rotation_matrix, ps_gt_rotation_matrix, cos_angle=True)#.abs()
loss_ground_rot = loss_ground_rot * valid_ground_maps_conf
# pseudo ground truth z loss
if 'z_pseudo_gt_patch' in self.loss_functions:
loss_pseudo_gt_z = self.pseudo_gt_z_box_loss(depth_maps, proj_boxes.tensor.split(num_boxes_per_image), cube_z)
elif 'z_pseudo_gt_center' in self.loss_functions:
loss_pseudo_gt_z = self.pseudo_gt_z_point_loss(depth_maps, cube_xy, cube_z, num_boxes_per_image)
# segment
if 'segmentation' in self.loss_functions:
loss_seg = self.segment_loss(masks_all_images, bube_corners, at_which_mask_idx)
# Z
if 'z' in self.loss_functions:
loss_z = self.z_loss(gt_boxes, cubes, Ks_scaled_per_box, im_sizes, proj_boxes)
# Dimensions
if 'dims' in self.loss_functions:
loss_dims_w, loss_dims_h, loss_dims_l = self.dim_loss((prior_dims_mean, prior_dims_std), cubes.dimensions.squeeze(1))
# Depth Range
if 'depth' in self.loss_functions:
loss_depth = self.depth_range_loss(masks_all_images, at_which_mask_idx, depth_maps, cubes, gt_boxes, num_boxes_per_image)
total_3D_loss_for_reporting = 0
if loss_iou is not None:
total_3D_loss_for_reporting += loss_iou*self.loss_w_iou
if loss_seg is not None:
total_3D_loss_for_reporting += loss_seg*self.loss_w_seg
if loss_pose is not None:
# note: this loss is a single per-image value that is broadcast here, rather than a value per instance
total_3D_loss_for_reporting += loss_pose*self.loss_w_pose
if loss_ground_rot is not None:
total_3D_loss_for_reporting += loss_ground_rot * self.loss_w_normal_vec # valid_ground_maps_conf is already folded into loss_ground_rot above
if loss_z is not None:
total_3D_loss_for_reporting += loss_z*self.loss_w_z
if loss_pseudo_gt_z is not None:
total_3D_loss_for_reporting += loss_pseudo_gt_z*self.loss_w_z
if loss_dims_w is not None:
total_3D_loss_for_reporting += loss_dims_w*self.loss_w_dims
total_3D_loss_for_reporting += loss_dims_h*self.loss_w_dims
total_3D_loss_for_reporting += loss_dims_l*self.loss_w_dims
if loss_depth is not None:
total_3D_loss_for_reporting += loss_depth*self.loss_w_depth
# reporting does not need gradients
if not isinstance(total_3D_loss_for_reporting, int):
total_3D_loss_for_reporting = total_3D_loss_for_reporting.detach()
# compute errors for tracking purposes
xy_error = (cube_xy - gt_2d).detach().abs()
z_error = (cube_z - gt_z).detach().abs()
dims_error = (cube_dims - gt_dims).detach().abs()
storage.put_scalar(prefix + 'z_error', z_error.mean().item(), smoothing_hint=False)
storage.put_scalar(prefix + 'dims_error', dims_error.mean().item(), smoothing_hint=False)
storage.put_scalar(prefix + 'xy_error', xy_error.mean().item(), smoothing_hint=False)
storage.put_scalar(prefix + 'z_close', (z_error<0.20).float().mean().item(), smoothing_hint=False)
IoU2D = iou_2d(gt_boxes, proj_boxes).detach()
IoU2D = torch.diag(IoU2D.view(n, n))
if IoU3Ds is not None:
storage.put_scalar(prefix + '3D IoU', IoU3Ds.detach().mean().item(), smoothing_hint=False)
storage.put_scalar(prefix + '2D IoU', IoU2D.mean().item(), smoothing_hint=False)
if not isinstance(total_3D_loss_for_reporting, int):
storage.put_scalar(prefix + 'total_3D_loss', self.loss_w_3d * self.safely_reduce_losses(total_3D_loss_for_reporting), smoothing_hint=False)
if self.use_confidence > 0:
uncert_sf = SQRT_2_CONSTANT * torch.exp(-cube_uncert)
if loss_iou is not None:
loss_iou *= uncert_sf
if loss_seg is not None:
loss_seg *= uncert_sf
if loss_pose is not None:
loss_pose *= uncert_sf
if loss_ground_rot is not None:
loss_ground_rot *= uncert_sf
if loss_z is not None:
loss_z *= uncert_sf
if loss_pseudo_gt_z is not None:
loss_pseudo_gt_z *= uncert_sf
if loss_dims_w is not None:
loss_dims_w *= uncert_sf
loss_dims_h *= uncert_sf
loss_dims_l *= uncert_sf
if loss_depth is not None:
loss_depth *= uncert_sf
losses.update({prefix + 'uncert': self.use_confidence*self.safely_reduce_losses(cube_uncert.clone())})
storage.put_scalar(prefix + 'conf', torch.exp(-cube_uncert).mean().item(), smoothing_hint=False)
if loss_iou is not None:
losses.update({
prefix + 'loss_iou': self.safely_reduce_losses(loss_iou) * self.loss_w_iou * self.loss_w_3d,
})
if loss_pose is not None:
losses.update({
prefix + 'loss_pose': self.safely_reduce_losses(loss_pose) * self.loss_w_pose * self.loss_w_3d,
})
if loss_ground_rot is not None:
losses.update({
prefix + 'loss_normal_vec': self.safely_reduce_losses(loss_ground_rot) * self.loss_w_normal_vec * self.loss_w_3d,
})
if loss_seg is not None:
losses.update({
prefix + 'loss_seg': self.safely_reduce_losses(loss_seg) * self.loss_w_seg * self.loss_w_3d,
})
if loss_z is not None:
losses.update({
prefix + 'loss_z': self.safely_reduce_losses(loss_z) * self.loss_w_z * self.loss_w_3d,
})
if loss_pseudo_gt_z is not None:
losses.update({
prefix + 'loss_pseudo_gt_z': self.safely_reduce_losses(loss_pseudo_gt_z) * self.loss_w_z * self.loss_w_3d,
})
if loss_dims_w is not None:
losses.update({
prefix + 'loss_dims_w': self.safely_reduce_losses(loss_dims_w) * self.loss_w_dims * self.loss_w_3d,
})
losses.update({
prefix + 'loss_dims_h': self.safely_reduce_losses(loss_dims_h) * self.loss_w_dims * self.loss_w_3d,
})
losses.update({
prefix + 'loss_dims_l': self.safely_reduce_losses(loss_dims_l) * self.loss_w_dims * self.loss_w_3d,
})
if loss_depth is not None:
losses.update({
prefix + 'loss_depth': self.safely_reduce_losses(loss_depth) * self.loss_w_depth * self.loss_w_3d,
})
'''
Inference
'''
if len(cube_z.shape) == 0:
cube_z = cube_z.unsqueeze(0)
# inference
cube_x3d = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0]
cube_y3d = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1]
cube_3D = torch.cat((torch.stack((cube_x3d, cube_y3d, cube_z)).T, cube_dims, cube_xy*im_ratios_per_box.unsqueeze(1)), dim=1)
if self.use_confidence:
cube_conf = torch.exp(-cube_uncert)
cube_3D = torch.cat((cube_3D, cube_conf.unsqueeze(1)), dim=1)
# convert the predictions to instances per image
cube_3D = cube_3D.split(num_boxes_per_image)
cube_pose = cube_pose.split(num_boxes_per_image)
box_classes = box_classes.split(num_boxes_per_image)
pred_instances = None
pred_instances = instances if not self.training else \
[Instances(image_size) for image_size in im_current_dims]
for cube_3D_i, cube_pose_i, instances_i, K, im_dim, im_scale_ratio, box_classes_i, pred_boxes_i in \
zip(cube_3D, cube_pose, pred_instances, Ks, im_current_dims, im_scales_ratio, box_classes, pred_boxes):
# merge scores if they already exist
if hasattr(instances_i, 'scores'):
instances_i.scores = (instances_i.scores * cube_3D_i[:, -1])**(1/2)
# assign scores if none are present
else:
instances_i.scores = cube_3D_i[:, -1]
# assign box classes if none exist
if not hasattr(instances_i, 'pred_classes'):
instances_i.pred_classes = box_classes_i
# assign predicted boxes if none exist
if not hasattr(instances_i, 'pred_boxes'):
instances_i.pred_boxes = pred_boxes_i
instances_i.pred_bbox3D = util.get_cuboid_verts_faces(cube_3D_i[:, :6], cube_pose_i)[0]
instances_i.pred_center_cam = cube_3D_i[:, :3]
instances_i.pred_center_2D = cube_3D_i[:, 6:8]
instances_i.pred_dimensions = cube_3D_i[:, 3:6]
instances_i.pred_pose = cube_pose_i
if self.training:
return pred_instances, losses
else:
return pred_instances
def _sample_proposals(
self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor, matched_ious=None
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Based on the matching between N proposals and M groundtruth,
sample the proposals and set their classification labels.
Args:
matched_idxs (Tensor): a vector of length N, each is the best-matched
gt index in [0, M) for each proposal.
matched_labels (Tensor): a vector of length N, the matcher's label
(one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
gt_classes (Tensor): a vector of length M.
Returns:
Tensor: a vector of indices of sampled proposals. Each is in [0, N).
Tensor: a vector of the same length, the classification label for
each sampled proposal. Each sample is labeled as either a category in
[0, num_classes) or the background (num_classes).
"""
has_gt = gt_classes.numel() > 0
# Get the corresponding GT for each proposal
if has_gt:
gt_classes = gt_classes[matched_idxs]
# Label unmatched proposals (0 label from matcher) as background (label=num_classes)
gt_classes[matched_labels == 0] = self.num_classes
# Label ignore proposals (-1 label)
gt_classes[matched_labels == -1] = -1
else:
gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
gt_classes, self.batch_size_per_image, self.positive_fraction, self.num_classes, matched_ious=matched_ious
)
sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
return sampled_idxs, gt_classes[sampled_idxs]
@torch.no_grad()
def label_and_sample_proposals(self, proposals: List[Instances], targets: List[Instances]) -> List[Instances]:
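'''Match proposals to ground truth, mark background proposals that fall inside ignore regions
(IoA >= ignore_thresh) with label -1, subsample foreground/background, and copy the matched
"gt_*" fields onto the sampled proposals. Closely follows detectron2's StandardROIHeads
version, with the ignore-region handling added.'''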
#separate valid and ignore gts
targets_ign = [target[target.gt_classes < 0] for target in targets]
targets = [target[target.gt_classes >= 0] for target in targets]
if self.proposal_append_gt:
proposals = add_ground_truth_to_proposals(targets, proposals)
proposals_with_gt = []
num_fg_samples = []
num_bg_samples = []
for proposals_per_image, targets_per_image, targets_ign_per_image in zip(proposals, targets, targets_ign):
has_gt = len(targets_per_image) > 0
match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
try:
if len(targets_ign_per_image) > 0:
# compute the quality matrix, only on subset of background
background_inds = (matched_labels == 0).nonzero().squeeze()
# determine the boxes inside ignore regions with sufficient threshold
if background_inds.numel() > 1:
match_quality_matrix_ign = pairwise_ioa(targets_ign_per_image.gt_boxes, proposals_per_image.proposal_boxes[background_inds])
matched_labels[background_inds[match_quality_matrix_ign.max(0)[0] >= self.ignore_thresh]] = -1
del match_quality_matrix_ign
except:
pass
gt_arange = torch.arange(match_quality_matrix.shape[1]).to(matched_idxs.device)
matched_ious = match_quality_matrix[matched_idxs, gt_arange]
sampled_idxs, gt_classes = self._sample_proposals(matched_idxs, matched_labels, targets_per_image.gt_classes, matched_ious=matched_ious)
# Set target attributes of the sampled proposals:
proposals_per_image = proposals_per_image[sampled_idxs]
proposals_per_image.gt_classes = gt_classes
if has_gt:
sampled_targets = matched_idxs[sampled_idxs]
# We index all the attributes of targets that start with "gt_"
# and have not been added to proposals yet (="gt_classes").
# NOTE: here the indexing waste some compute, because heads
# like masks, keypoints, etc, will filter the proposals again,
# (by foreground/background, or number of keypoints in the image, etc)
# so we essentially index the data twice.
for (trg_name, trg_value) in targets_per_image.get_fields().items():
if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
proposals_per_image.set(trg_name, trg_value[sampled_targets])
num_bg_samples.append((gt_classes == self.num_classes).sum().item())
num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
proposals_with_gt.append(proposals_per_image)
# Log the number of fg/bg samples that are selected for training ROI heads
storage = get_event_storage()
storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
return proposals_with_gt
def safely_reduce_losses(self, loss):
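'''Mean over the finite (non-NaN, non-Inf) loss entries; if no entry is valid, return a zero
that still keeps the computation graph connected.'''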
valid = (~(loss.isinf())) & (~(loss.isnan()))
if valid.any():
return loss[valid].mean()
else:
# no valid losses, simply zero out
return loss.mean()*0.0
@ROI_HEADS_REGISTRY.register()
class ROIHeads3D(StandardROIHeads):
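'''Fully supervised 3D ROI head, kept close to the original Cube R-CNN implementation, and
retained alongside the weakly supervised ROIHeads3DScore variant above.'''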
@configurable
def __init__(
self,
*,
ignore_thresh: float,
cube_head: nn.Module,
cube_pooler: nn.Module,
loss_w_3d: float,
loss_w_xy: float,
loss_w_z: float,
loss_w_dims: float,
loss_w_pose: float,
loss_w_joint: float,
use_confidence: float,
inverse_z_weight: bool,
z_type: str,
pose_type: str,
cluster_bins: int,
priors = None,
dims_priors_enabled = None,
dims_priors_func = None,
disentangled_loss=None,
virtual_depth=None,
virtual_focal=None,
test_scale=None,
allocentric_pose=None,
chamfer_pose=None,
scale_roi_boxes=None,
**kwargs,
):
super().__init__(**kwargs)
self.scale_roi_boxes = scale_roi_boxes
# rotation settings
self.allocentric_pose = allocentric_pose
self.chamfer_pose = chamfer_pose
# virtual settings
self.virtual_depth = virtual_depth
self.virtual_focal = virtual_focal
# loss weights, <=0 is off
self.loss_w_3d = loss_w_3d
self.loss_w_xy = loss_w_xy
self.loss_w_z = loss_w_z
self.loss_w_dims = loss_w_dims
self.loss_w_pose = loss_w_pose
self.loss_w_joint = loss_w_joint
# loss modes
self.disentangled_loss = disentangled_loss
self.inverse_z_weight = inverse_z_weight
# misc
self.test_scale = test_scale
self.ignore_thresh = ignore_thresh
# related to network outputs
self.z_type = z_type
self.pose_type = pose_type
self.use_confidence = use_confidence
# related to priors
self.cluster_bins = cluster_bins
self.dims_priors_enabled = dims_priors_enabled
self.dims_priors_func = dims_priors_func
# if there is no 3D loss, then we don't need any heads.
if loss_w_3d > 0:
self.cube_head = cube_head
self.cube_pooler = cube_pooler
# the dimensions could rely on pre-computed priors
if self.dims_priors_enabled and priors is not None:
self.priors_dims_per_cat = nn.Parameter(torch.FloatTensor(priors['priors_dims_per_cat']).unsqueeze(0))
else:
self.priors_dims_per_cat = nn.Parameter(torch.ones(1, self.num_classes, 2, 3))
# Optionally, refactor priors and store them in the network params
if self.cluster_bins > 1 and priors is not None:
# the depth could have been clustered based on 2D scales
priors_z_scales = torch.stack([torch.FloatTensor(prior[1]) for prior in priors['priors_bins']])
self.priors_z_scales = nn.Parameter(priors_z_scales)
else:
self.priors_z_scales = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins))
# the depth can be based on priors
if self.z_type == 'clusters':
assert self.cluster_bins > 1, 'To use z_type clusters, there must be more than 1 cluster bin'
if priors is None:
self.priors_z_stats = nn.Parameter(torch.ones(self.num_classes, self.cluster_bins, 2).float())
else:
# stats
priors_z_stats = torch.cat([torch.FloatTensor(prior[2]).unsqueeze(0) for prior in priors['priors_bins']])
self.priors_z_stats = nn.Parameter(priors_z_stats)
@classmethod
def from_config(cls, cfg, input_shape: Dict[str, ShapeSpec], priors=None):
ret = super().from_config(cfg, input_shape)
# pass along priors
ret["box_predictor"] = FastRCNNOutputs(cfg, ret['box_head'].output_shape)
ret.update(cls._init_cube_head(cfg, input_shape))
ret["priors"] = priors
return ret
@classmethod
def _init_cube_head(self, cfg, input_shape: Dict[str, ShapeSpec]):
in_features = cfg.MODEL.ROI_HEADS.IN_FEATURES
pooler_scales = tuple(1.0 / input_shape[k].stride for k in in_features)
pooler_resolution = cfg.MODEL.ROI_CUBE_HEAD.POOLER_RESOLUTION
pooler_sampling_ratio = cfg.MODEL.ROI_CUBE_HEAD.POOLER_SAMPLING_RATIO
pooler_type = cfg.MODEL.ROI_CUBE_HEAD.POOLER_TYPE
cube_pooler = ROIPooler(
output_size=pooler_resolution,
scales=pooler_scales,
sampling_ratio=pooler_sampling_ratio,
pooler_type=pooler_type,
)
in_channels = [input_shape[f].channels for f in in_features][0]
shape = ShapeSpec(
channels=in_channels, width=pooler_resolution, height=pooler_resolution
)
cube_head = build_cube_head(cfg, shape)
return {
'cube_head': cube_head,
'cube_pooler': cube_pooler,
'use_confidence': cfg.MODEL.ROI_CUBE_HEAD.USE_CONFIDENCE,
'inverse_z_weight': cfg.MODEL.ROI_CUBE_HEAD.INVERSE_Z_WEIGHT,
'loss_w_3d': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_3D,
'loss_w_xy': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_XY,
'loss_w_z': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_Z,
'loss_w_dims': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_DIMS,
'loss_w_pose': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_POSE,
'loss_w_joint': cfg.MODEL.ROI_CUBE_HEAD.LOSS_W_JOINT,
'z_type': cfg.MODEL.ROI_CUBE_HEAD.Z_TYPE,
'pose_type': cfg.MODEL.ROI_CUBE_HEAD.POSE_TYPE,
'dims_priors_enabled': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_ENABLED,
'dims_priors_func': cfg.MODEL.ROI_CUBE_HEAD.DIMS_PRIORS_FUNC,
'disentangled_loss': cfg.MODEL.ROI_CUBE_HEAD.DISENTANGLED_LOSS,
'virtual_depth': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_DEPTH,
'virtual_focal': cfg.MODEL.ROI_CUBE_HEAD.VIRTUAL_FOCAL,
'test_scale': cfg.INPUT.MIN_SIZE_TEST,
'chamfer_pose': cfg.MODEL.ROI_CUBE_HEAD.CHAMFER_POSE,
'allocentric_pose': cfg.MODEL.ROI_CUBE_HEAD.ALLOCENTRIC_POSE,
'cluster_bins': cfg.MODEL.ROI_CUBE_HEAD.CLUSTER_BINS,
'ignore_thresh': cfg.MODEL.RPN.IGNORE_THRESHOLD,
'scale_roi_boxes': cfg.MODEL.ROI_CUBE_HEAD.SCALE_ROI_BOXES,
}
def forward(self, images, features, proposals, Ks, im_scales_ratio, targets=None):
im_dims = [image.shape[1:] for image in images]
del images
if self.training:
proposals = self.label_and_sample_proposals(proposals, targets)
del targets
if self.training:
losses = self._forward_box(features, proposals)
if self.loss_w_3d > 0:
instances_3d, losses_cube = self._forward_cube(features, proposals, Ks, im_dims, im_scales_ratio)
losses.update(losses_cube)
else:
instances_3d = None
return instances_3d, losses
else:
# when oracle is available, bypass the box forward.
# simulate the predicted instances by creating a new
# instance for each passed in image.
if isinstance(proposals, list) and ~np.any([isinstance(p, Instances) for p in proposals]):
pred_instances = []
for proposal, im_dim in zip(proposals, im_dims):
pred_instances_i = Instances(im_dim)
pred_instances_i.pred_boxes = Boxes(proposal['gt_bbox2D'])
pred_instances_i.pred_classes = proposal['gt_classes']
pred_instances_i.scores = torch.ones_like(proposal['gt_classes']).float()
pred_instances.append(pred_instances_i)
else:
pred_instances = self._forward_box(features, proposals)
if self.loss_w_3d > 0:
pred_instances = self._forward_cube(features, pred_instances, Ks, im_dims, im_scales_ratio)
return pred_instances, {}
def _forward_box(self, features: Dict[str, torch.Tensor], proposals: List[Instances]):
"""
Forward logic of the box prediction branch. If `self.train_on_pred_boxes is True`,
the function puts predicted boxes in the `proposal_boxes` field of `proposals` argument.
Args:
features (dict[str, Tensor]): mapping from feature map names to tensor.
Same as in :meth:`ROIHeads.forward`.
proposals (list[Instances]): the per-image object proposals with
their matching ground truth.
Each has fields "proposal_boxes", and "objectness_logits",
"gt_classes", "gt_boxes".
Returns:
In training, a dict of losses.
In inference, a list of `Instances`, the predicted instances.
"""
features = [features[f] for f in self.box_in_features]
box_features = self.box_pooler(features, [x.proposal_boxes for x in proposals])
box_features = self.box_head(box_features)
predictions = self.box_predictor(box_features)
del box_features
if self.training:
losses = self.box_predictor.losses(
predictions, proposals,
)
pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
predictions, proposals
)
for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
proposals_per_image.pred_boxes = Boxes(pred_boxes_per_image)
# proposals is modified in-place below, so losses must be computed first.
if self.train_on_pred_boxes:
with torch.no_grad():
pred_boxes = self.box_predictor.predict_boxes_for_gt_classes(
predictions, proposals
)
for proposals_per_image, pred_boxes_per_image in zip(proposals, pred_boxes):
proposals_per_image.proposal_boxes = Boxes(pred_boxes_per_image)
return losses
else:
pred_instances, _ = self.box_predictor.inference(predictions, proposals, )
return pred_instances
def l1_loss(self, vals, target):
return F.smooth_l1_loss(vals, target, reduction='none', beta=0.0)
def chamfer_loss(self, vals, target):
B = vals.shape[0]
xx = vals.view(B, 8, 1, 3)
yy = target.view(B, 1, 8, 3)
l1_dist = (xx - yy).abs().sum(-1)
l1 = (l1_dist.min(1).values.mean(-1) + l1_dist.min(2).values.mean(-1))
return l1
# optionally, scale proposals to zoom RoI in (<1.0) or out (>1.0)
def scale_proposals(self, proposal_boxes):
if self.scale_roi_boxes > 0:
proposal_boxes_scaled = []
for boxes in proposal_boxes:
centers = boxes.get_centers()
widths = boxes.tensor[:, 2] - boxes.tensor[:, 0]
heights = boxes.tensor[:, 3] - boxes.tensor[:, 1]
x1 = centers[:, 0] - 0.5*widths*self.scale_roi_boxes
x2 = centers[:, 0] + 0.5*widths*self.scale_roi_boxes
y1 = centers[:, 1] - 0.5*heights*self.scale_roi_boxes
y2 = centers[:, 1] + 0.5*heights*self.scale_roi_boxes
boxes_scaled = Boxes(torch.stack([x1, y1, x2, y2], dim=1))
proposal_boxes_scaled.append(boxes_scaled)
else:
proposal_boxes_scaled = proposal_boxes
return proposal_boxes_scaled
def _forward_cube(self, features, instances, Ks, im_current_dims, im_scales_ratio):
features = [features[f] for f in self.in_features]
# training on foreground
if self.training:
losses = {}
# add up the amount we should normalize the losses by.
# this follows the same logic as the BoxHead, where each FG proposal
# is able to contribute the same amount of supervision. Technically,
# this value doesn't change during training unless the batch size is dynamic.
self.normalize_factor = max(sum([i.gt_classes.numel() for i in instances]), 1.0)
# The loss is only defined on positive proposals
proposals, _ = select_foreground_proposals(instances, self.num_classes)
proposal_boxes = [x.proposal_boxes for x in proposals]
pred_boxes = [x.pred_boxes for x in proposals]
box_classes = (torch.cat([p.gt_classes for p in proposals], dim=0) if len(proposals) else torch.empty(0))
gt_boxes3D = torch.cat([p.gt_boxes3D for p in proposals], dim=0,)
gt_poses = torch.cat([p.gt_poses for p in proposals], dim=0,)
assert len(gt_poses) == len(gt_boxes3D) == len(box_classes)
# eval on all instances
else:
proposals = instances
pred_boxes = [x.pred_boxes for x in instances]
proposal_boxes = pred_boxes
box_classes = torch.cat([x.pred_classes for x in instances])
proposal_boxes_scaled = self.scale_proposals(proposal_boxes)
# forward features
cube_features = self.cube_pooler(features, proposal_boxes_scaled).flatten(1)
n = cube_features.shape[0]
# nothing to do..
if n == 0:
return instances if not self.training else (instances, {})
num_boxes_per_image = [len(i) for i in proposals]
# scale the intrinsics according to the ratio the image has been scaled.
# this means the projections at the current scale are in sync.
Ks_scaled_per_box = torch.cat([
(Ks[i]/im_scales_ratio[i]).unsqueeze(0).repeat([num, 1, 1])
for (i, num) in enumerate(num_boxes_per_image)
]).to(cube_features.device)
Ks_scaled_per_box[:, -1, -1] = 1
focal_lengths_per_box = torch.cat([
(Ks[i][1, 1]).unsqueeze(0).repeat([num])
for (i, num) in enumerate(num_boxes_per_image)
]).to(cube_features.device)
im_ratios_per_box = torch.cat([
torch.FloatTensor([im_scales_ratio[i]]).repeat(num)
for (i, num) in enumerate(num_boxes_per_image)
]).to(cube_features.device)
# scaling factor for Network resolution -> Original
im_scales_per_box = torch.cat([
torch.FloatTensor([im_current_dims[i][0]]).repeat(num)
for (i, num) in enumerate(num_boxes_per_image)
]).to(cube_features.device)
im_scales_original_per_box = im_scales_per_box * im_ratios_per_box
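# virtual depth: compute the factor that converts depths predicted w.r.t. a shared virtual focal length back to the real camera's depth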
if self.virtual_depth:
virtual_to_real = util.compute_virtual_scale_from_focal_spaces(
focal_lengths_per_box, im_scales_original_per_box,
self.virtual_focal, im_scales_per_box
)
real_to_virtual = 1 / virtual_to_real
else:
real_to_virtual = virtual_to_real = 1.0
# 2D boxes are needed to apply deltas
src_boxes = torch.cat([box_per_im.tensor for box_per_im in proposal_boxes], dim=0)
src_widths = src_boxes[:, 2] - src_boxes[:, 0]
src_heights = src_boxes[:, 3] - src_boxes[:, 1]
src_scales = (src_heights**2 + src_widths**2).sqrt()
src_ctr_x = src_boxes[:, 0] + 0.5 * src_widths
src_ctr_y = src_boxes[:, 1] + 0.5 * src_heights
# For some methods, we need the predicted 2D box,
# e.g., the differentiable tensors from the 2D box head.
pred_src_boxes = torch.cat([box_per_im.tensor for box_per_im in pred_boxes], dim=0)
pred_widths = pred_src_boxes[:, 2] - pred_src_boxes[:, 0]
pred_heights = pred_src_boxes[:, 3] - pred_src_boxes[:, 1]
pred_src_x = (pred_src_boxes[:, 2] + pred_src_boxes[:, 0]) * 0.5
pred_src_y = (pred_src_boxes[:, 3] + pred_src_boxes[:, 1]) * 0.5
# forward predictions
cube_2d_deltas, cube_z, cube_dims, cube_pose, cube_uncert = self.cube_head(cube_features)
# simple indexing re-used commonly for selection purposes
fg_inds = torch.arange(n)
# Z when clusters are used
if cube_z is not None and self.cluster_bins > 1:
# compute closest bin assignments per batch per category (batch x n_category)
scales_diff = (self.priors_z_scales.detach().T.unsqueeze(0) - src_scales.unsqueeze(1).unsqueeze(2)).abs()
# assign the correct scale prediction.
# (the others are not used / thrown away)
assignments = scales_diff.argmin(1)
# select FG, category, and correct cluster
cube_z = cube_z[fg_inds, :, box_classes, :][fg_inds, assignments[fg_inds, box_classes]]
elif cube_z is not None:
# if z is available, collect the per-category predictions.
cube_z = cube_z[fg_inds, box_classes, :]
cube_dims = cube_dims[fg_inds, box_classes, :]
cube_pose = cube_pose[fg_inds, box_classes, :, :]
if self.use_confidence:
# if uncertainty is available, collect the per-category predictions.
cube_uncert = cube_uncert[fg_inds, box_classes]
cube_2d_deltas = cube_2d_deltas[fg_inds, box_classes, :]
# apply our predicted deltas based on src boxes.
cube_x = src_ctr_x + src_widths * cube_2d_deltas[:, 0]
cube_y = src_ctr_y + src_heights * cube_2d_deltas[:, 1]
cube_xy = torch.cat((cube_x.unsqueeze(1), cube_y.unsqueeze(1)), dim=1)
cube_dims_norm = cube_dims
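# decode dimensions: either squash the raw prediction into a +/- 3 std band around the category prior (sigmoid), scale the prior by exp of the prediction (exp), or fall back to a plain exp decoding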
if self.dims_priors_enabled:
# gather prior dimensions
prior_dims = self.priors_dims_per_cat.detach().repeat([n, 1, 1, 1])[fg_inds, box_classes]
prior_dims_mean = prior_dims[:, 0, :]
prior_dims_std = prior_dims[:, 1, :]
if self.dims_priors_func == 'sigmoid':
prior_dims_min = (prior_dims_mean - 3*prior_dims_std).clip(0.0)
prior_dims_max = (prior_dims_mean + 3*prior_dims_std)
cube_dims = util.scaled_sigmoid(cube_dims_norm, min=prior_dims_min, max=prior_dims_max)
elif self.dims_priors_func == 'exp':
cube_dims = torch.exp(cube_dims_norm.clip(max=5)) * prior_dims_mean
else:
# no priors are used
cube_dims = torch.exp(cube_dims_norm.clip(max=5))
if self.allocentric_pose:
# To compare with GTs, we need the pose to be egocentric, not allocentric
cube_pose_allocentric = cube_pose
cube_pose = util.R_from_allocentric(Ks_scaled_per_box, cube_pose, u=cube_x.detach(), v=cube_y.detach())
cube_z = cube_z.squeeze()
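# decode depth according to the configured parameterization: sigmoid -> [0, 100], log -> exp, clusters -> +/- 3 std band around the assigned depth cluster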
if self.z_type =='sigmoid':
cube_z_norm = torch.sigmoid(cube_z)
cube_z = cube_z_norm * 100
elif self.z_type == 'log':
cube_z_norm = cube_z
cube_z = torch.exp(cube_z)
elif self.z_type == 'clusters':
# gather the mean depth, same operation as above, for an n x c result
z_means = self.priors_z_stats[:, :, 0].T.unsqueeze(0).repeat([n, 1, 1])
z_means = torch.gather(z_means, 1, assignments.unsqueeze(1)).squeeze(1)
# gather the depth std, same operation as above, for an n x c result
z_stds = self.priors_z_stats[:, :, 1].T.unsqueeze(0).repeat([n, 1, 1])
z_stds = torch.gather(z_stds, 1, assignments.unsqueeze(1)).squeeze(1)
# do not learn these, they are static
z_means = z_means.detach()
z_stds = z_stds.detach()
z_means = z_means[fg_inds, box_classes]
z_stds = z_stds[fg_inds, box_classes]
z_mins = (z_means - 3*z_stds).clip(0)
z_maxs = (z_means + 3*z_stds)
cube_z_norm = cube_z
cube_z = util.scaled_sigmoid(cube_z, min=z_mins, max=z_maxs)
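# convert the depth prediction from virtual space back to the real camera's depth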
if self.virtual_depth:
cube_z = (cube_z * virtual_to_real)
if self.training:
prefix = 'Cube/'
storage = get_event_storage()
# Pull off necessary GT information
# let lowercase->2D and uppercase->3D
# [x, y, Z, W, H, L]
gt_2d = gt_boxes3D[:, :2]
gt_z = gt_boxes3D[:, 2]
gt_dims = gt_boxes3D[:, 3:6]
# this box may have been mirrored and scaled so
# we need to recompute XYZ in 3D by backprojecting.
gt_x3d = gt_z * (gt_2d[:, 0] - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0]
gt_y3d = gt_z * (gt_2d[:, 1] - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1]
gt_3d = torch.stack((gt_x3d, gt_y3d, gt_z)).T
# put together the GT boxes
gt_box3d = torch.cat((gt_3d, gt_dims), dim=1)
# These are the corners which will be the target for all losses!!
gt_corners = util.get_cuboid_verts_faces(gt_box3d, gt_poses)[0]
# project GT corners
gt_proj_boxes = torch.bmm(Ks_scaled_per_box, gt_corners.transpose(1,2))
gt_proj_boxes /= gt_proj_boxes[:, -1, :].clone().unsqueeze(1)
gt_proj_x1 = gt_proj_boxes[:, 0, :].min(1)[0]
gt_proj_y1 = gt_proj_boxes[:, 1, :].min(1)[0]
gt_proj_x2 = gt_proj_boxes[:, 0, :].max(1)[0]
gt_proj_y2 = gt_proj_boxes[:, 1, :].max(1)[0]
gt_widths = gt_proj_x2 - gt_proj_x1
gt_heights = gt_proj_y2 - gt_proj_y1
gt_x = gt_proj_x1 + 0.5 * gt_widths
gt_y = gt_proj_y1 + 0.5 * gt_heights
gt_proj_boxes = torch.stack((gt_proj_x1, gt_proj_y1, gt_proj_x2, gt_proj_y2), dim=1)
if self.disentangled_loss:
'''
Disentangled loss compares each variable group to the
cuboid corners, which is generally more robust to hyperparameters.
'''
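# each factor (z, xy, dims, pose) is swapped into the GT cuboid one at a time and compared against the GT corners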
# compute disentangled Z corners
cube_dis_x3d_from_z = cube_z * (gt_2d[:, 0] - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0]
cube_dis_y3d_from_z = cube_z * (gt_2d[:, 1] - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1]
cube_dis_z = torch.cat((torch.stack((cube_dis_x3d_from_z, cube_dis_y3d_from_z, cube_z)).T, gt_dims), dim=1)
dis_z_corners = util.get_cuboid_verts_faces(cube_dis_z, gt_poses)[0]
# compute disentangled XY corners
cube_dis_x3d = gt_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0]
cube_dis_y3d = gt_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1]
cube_dis_XY = torch.cat((torch.stack((cube_dis_x3d, cube_dis_y3d, gt_z)).T, gt_dims), dim=1)
dis_XY_corners = util.get_cuboid_verts_faces(cube_dis_XY, gt_poses)[0]
loss_xy = self.l1_loss(dis_XY_corners, gt_corners).contiguous().view(n, -1).mean(dim=1)
# Pose
dis_pose_corners = util.get_cuboid_verts_faces(gt_box3d, cube_pose)[0]
# Dims
dis_dims_corners = util.get_cuboid_verts_faces(torch.cat((gt_3d, cube_dims), dim=1), gt_poses)[0]
# Loss dims
loss_dims = self.l1_loss(dis_dims_corners, gt_corners).contiguous().view(n, -1).mean(dim=1)
# Loss z
loss_z = self.l1_loss(dis_z_corners, gt_corners).contiguous().view(n, -1).mean(dim=1)
# Rotation uses chamfer or l1 like others
if self.chamfer_pose:
loss_pose = self.chamfer_loss(dis_pose_corners, gt_corners)
else:
loss_pose = self.l1_loss(dis_pose_corners, gt_corners).contiguous().view(n, -1).mean(dim=1)
# Non-disentangled training losses
else:
'''
These loss functions are fairly arbitrarily designed.
Generally, they are in some normalized space but there
are many alternative implementations for most functions.
'''
# XY
gt_deltas = (gt_2d.clone() - torch.cat((src_ctr_x.unsqueeze(1), src_ctr_y.unsqueeze(1)), dim=1)) \
/ torch.cat((src_widths.unsqueeze(1), src_heights.unsqueeze(1)), dim=1)
loss_xy = self.l1_loss(cube_2d_deltas, gt_deltas).mean(1)
# Dims
if self.dims_priors_enabled:
cube_dims_gt_normspace = torch.log(gt_dims/prior_dims_mean)
loss_dims = self.l1_loss(cube_dims_norm, cube_dims_gt_normspace).mean(1)
else:
loss_dims = self.l1_loss(cube_dims_norm, torch.log(gt_dims)).mean(1)
# Pose
try:
if self.allocentric_pose:
gt_poses_allocentric = util.R_to_allocentric(Ks_scaled_per_box, gt_poses, u=cube_x.detach(), v=cube_y.detach())
loss_pose = 1-so3_relative_angle(cube_pose_allocentric, gt_poses_allocentric, eps=0.1, cos_angle=True)
else:
loss_pose = 1-so3_relative_angle(cube_pose, gt_poses, eps=0.1, cos_angle=True)
# Can fail with bad EPS values/instability
except Exception:
loss_pose = None
if self.z_type == 'direct':
loss_z = self.l1_loss(cube_z, gt_z)
elif self.z_type == 'sigmoid':
loss_z = self.l1_loss(cube_z_norm, (gt_z * real_to_virtual / 100).clip(0, 1))
elif self.z_type == 'log':
loss_z = self.l1_loss(cube_z_norm, torch.log((gt_z * real_to_virtual).clip(0.01)))
elif self.z_type == 'clusters':
loss_z = self.l1_loss(cube_z_norm, (((gt_z * real_to_virtual) - z_means)/(z_stds)))
total_3D_loss_for_reporting = loss_dims*self.loss_w_dims
if not loss_pose is None:
total_3D_loss_for_reporting += loss_pose*self.loss_w_pose
if not cube_2d_deltas is None:
total_3D_loss_for_reporting += loss_xy*self.loss_w_xy
if not loss_z is None:
total_3D_loss_for_reporting += loss_z*self.loss_w_z
# reporting does not need gradients
total_3D_loss_for_reporting = total_3D_loss_for_reporting.detach()
if self.loss_w_joint > 0:
'''
If we are using joint [entangled] loss, then we also need to pair all
predictions together and compute a chamfer or l1 loss vs. cube corners.
'''
cube_dis_x3d_from_z = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0]
cube_dis_y3d_from_z = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1]
cube_dis_z = torch.cat((torch.stack((cube_dis_x3d_from_z, cube_dis_y3d_from_z, cube_z)).T, cube_dims), dim=1)
dis_z_corners_joint = util.get_cuboid_verts_faces(cube_dis_z, cube_pose)[0]
if self.chamfer_pose and self.disentangled_loss:
loss_joint = self.chamfer_loss(dis_z_corners_joint, gt_corners)
else:
loss_joint = self.l1_loss(dis_z_corners_joint, gt_corners).contiguous().view(n, -1).mean(dim=1)
valid_joint = loss_joint < np.inf
total_3D_loss_for_reporting += (loss_joint*self.loss_w_joint).detach()
# compute errors for tracking purposes
z_error = (cube_z - gt_z).detach().abs()
dims_error = (cube_dims - gt_dims).detach().abs()
xy_error = (cube_xy - gt_2d).detach().abs()
storage.put_scalar(prefix + 'z_error', z_error.mean().item(), smoothing_hint=False)
storage.put_scalar(prefix + 'dims_error', dims_error.mean().item(), smoothing_hint=False)
storage.put_scalar(prefix + 'xy_error', xy_error.mean().item(), smoothing_hint=False)
storage.put_scalar(prefix + 'z_close', (z_error<0.20).float().mean().item(), smoothing_hint=False)
storage.put_scalar(prefix + 'total_3D_loss', self.loss_w_3d * self.safely_reduce_losses(total_3D_loss_for_reporting), smoothing_hint=False)
if self.inverse_z_weight:
'''
Weights all losses to prioritize close-up boxes.
'''
gt_z = gt_boxes3D[:, 2]
inverse_z_w = 1/torch.log(gt_z.clip(E_CONSTANT))
loss_dims *= inverse_z_w
# scale based on log, but clip at e
if not cube_2d_deltas is None:
loss_xy *= inverse_z_w
if loss_z is not None:
loss_z *= inverse_z_w
if loss_pose is not None:
loss_pose *= inverse_z_w
if self.loss_w_joint > 0:
loss_joint *= inverse_z_w
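# confidence weighting: scale each loss term by a factor proportional to exp(-u) and add a penalty on the predicted uncertainty u itself (the 'uncert' term below)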
if self.use_confidence > 0:
uncert_sf = SQRT_2_CONSTANT * torch.exp(-cube_uncert)
loss_dims *= uncert_sf
if not cube_2d_deltas is None:
loss_xy *= uncert_sf
if not loss_z is None:
loss_z *= uncert_sf
if loss_pose is not None:
loss_pose *= uncert_sf
if self.loss_w_joint > 0:
loss_joint *= uncert_sf
losses.update({prefix + 'uncert': self.use_confidence*self.safely_reduce_losses(cube_uncert.clone())})
storage.put_scalar(prefix + 'conf', torch.exp(-cube_uncert).mean().item(), smoothing_hint=False)
# store per batch loss stats temporarily
self.batch_losses = [batch_losses.mean().item() for batch_losses in total_3D_loss_for_reporting.split(num_boxes_per_image)]
if self.loss_w_dims > 0:
losses.update({
prefix + 'loss_dims': self.safely_reduce_losses(loss_dims) * self.loss_w_dims * self.loss_w_3d,
})
if not cube_2d_deltas is None:
losses.update({
prefix + 'loss_xy': self.safely_reduce_losses(loss_xy) * self.loss_w_xy * self.loss_w_3d,
})
if not loss_z is None:
losses.update({
prefix + 'loss_z': self.safely_reduce_losses(loss_z) * self.loss_w_z * self.loss_w_3d,
})
if loss_pose is not None:
losses.update({
prefix + 'loss_pose': self.safely_reduce_losses(loss_pose) * self.loss_w_pose * self.loss_w_3d,
})
if self.loss_w_joint > 0:
if valid_joint.any():
losses.update({prefix + 'loss_joint': self.safely_reduce_losses(loss_joint[valid_joint]) * self.loss_w_joint * self.loss_w_3d})
'''
Inference
'''
if len(cube_z.shape) == 0:
cube_z = cube_z.unsqueeze(0)
# inference
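# back-project the predicted 2D center through the (scaled) intrinsics with the predicted depth to obtain the 3D center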
cube_x3d = cube_z * (cube_x - Ks_scaled_per_box[:, 0, 2])/Ks_scaled_per_box[:, 0, 0]
cube_y3d = cube_z * (cube_y - Ks_scaled_per_box[:, 1, 2])/Ks_scaled_per_box[:, 1, 1]
cube_3D = torch.cat((torch.stack((cube_x3d, cube_y3d, cube_z)).T, cube_dims, cube_xy*im_ratios_per_box.unsqueeze(1)), dim=1)
if self.use_confidence:
cube_conf = torch.exp(-cube_uncert)
cube_3D = torch.cat((cube_3D, cube_conf.unsqueeze(1)), dim=1)
# convert the predictions to instances per image
cube_3D = cube_3D.split(num_boxes_per_image)
cube_pose = cube_pose.split(num_boxes_per_image)
box_classes = box_classes.split(num_boxes_per_image)
pred_instances = None
pred_instances = instances if not self.training else \
[Instances(image_size) for image_size in im_current_dims]
for cube_3D_i, cube_pose_i, instances_i, K, im_dim, im_scale_ratio, box_classes_i, pred_boxes_i in \
zip(cube_3D, cube_pose, pred_instances, Ks, im_current_dims, im_scales_ratio, box_classes, pred_boxes):
# merge scores if they already exist
if hasattr(instances_i, 'scores'):
instances_i.scores = (instances_i.scores * cube_3D_i[:, -1])**(1/2)
# assign scores if none are present
else:
instances_i.scores = cube_3D_i[:, -1]
# assign box classes if none exist
if not hasattr(instances_i, 'pred_classes'):
instances_i.pred_classes = box_classes_i
# assign predicted boxes if none exist
if not hasattr(instances_i, 'pred_boxes'):
instances_i.pred_boxes = pred_boxes_i
instances_i.pred_bbox3D = util.get_cuboid_verts_faces(cube_3D_i[:, :6], cube_pose_i)[0]
instances_i.pred_center_cam = cube_3D_i[:, :3]
instances_i.pred_center_2D = cube_3D_i[:, 6:8]
instances_i.pred_dimensions = cube_3D_i[:, 3:6]
instances_i.pred_pose = cube_pose_i
if self.training:
return pred_instances, losses
else:
return pred_instances
def _sample_proposals(
self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor, matched_ious=None
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Based on the matching between N proposals and M groundtruth,
sample the proposals and set their classification labels.
Args:
matched_idxs (Tensor): a vector of length N, each is the best-matched
gt index in [0, M) for each proposal.
matched_labels (Tensor): a vector of length N, the matcher's label
(one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
gt_classes (Tensor): a vector of length M.
Returns:
Tensor: a vector of indices of sampled proposals. Each is in [0, N).
Tensor: a vector of the same length, the classification label for
each sampled proposal. Each sample is labeled as either a category in
[0, num_classes) or the background (num_classes).
"""
has_gt = gt_classes.numel() > 0
# Get the corresponding GT for each proposal
if has_gt:
gt_classes = gt_classes[matched_idxs]
# Label unmatched proposals (0 label from matcher) as background (label=num_classes)
gt_classes[matched_labels == 0] = self.num_classes
# Label ignore proposals (-1 label)
gt_classes[matched_labels == -1] = -1
else:
gt_classes = torch.zeros_like(matched_idxs) + self.num_classes
sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
gt_classes, self.batch_size_per_image, self.positive_fraction, self.num_classes, matched_ious=matched_ious
)
sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
return sampled_idxs, gt_classes[sampled_idxs]
@torch.no_grad()
def label_and_sample_proposals(self, proposals: List[Instances], targets: List[Instances]) -> List[Instances]:
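"""
Match proposals to ground truth, mark proposals that fall inside ignore
regions, and subsample a fixed fraction of foreground/background proposals
per image for training the ROI heads.
"""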
# separate valid and ignored GTs
targets_ign = [target[target.gt_classes < 0] for target in targets]
targets = [target[target.gt_classes >= 0] for target in targets]
if self.proposal_append_gt:
proposals = add_ground_truth_to_proposals(targets, proposals)
proposals_with_gt = []
num_fg_samples = []
num_bg_samples = []
for proposals_per_image, targets_per_image, targets_ign_per_image in zip(proposals, targets, targets_ign):
has_gt = len(targets_per_image) > 0
match_quality_matrix = pairwise_iou(targets_per_image.gt_boxes, proposals_per_image.proposal_boxes)
matched_idxs, matched_labels = self.proposal_matcher(match_quality_matrix)
try:
if len(targets_ign_per_image) > 0:
# compute the quality matrix, only on subset of background
background_inds = (matched_labels == 0).nonzero().squeeze()
# determine the boxes inside ignore regions with sufficient threshold
if background_inds.numel() > 1:
match_quality_matrix_ign = pairwise_ioa(targets_ign_per_image.gt_boxes, proposals_per_image.proposal_boxes[background_inds])
matched_labels[background_inds[match_quality_matrix_ign.max(0)[0] >= self.ignore_thresh]] = -1
del match_quality_matrix_ign
except Exception:
pass
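# IoU of each proposal with its matched GT box, passed along to the proposal sampler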
gt_arange = torch.arange(match_quality_matrix.shape[1]).to(matched_idxs.device)
matched_ious = match_quality_matrix[matched_idxs, gt_arange]
sampled_idxs, gt_classes = self._sample_proposals(matched_idxs, matched_labels, targets_per_image.gt_classes, matched_ious=matched_ious)
# Set target attributes of the sampled proposals:
proposals_per_image = proposals_per_image[sampled_idxs]
proposals_per_image.gt_classes = gt_classes
if has_gt:
sampled_targets = matched_idxs[sampled_idxs]
# We index all the attributes of targets that start with "gt_"
# and have not been added to proposals yet (="gt_classes").
# NOTE: here the indexing wastes some compute, because heads
# like masks, keypoints, etc, will filter the proposals again,
# (by foreground/background, or number of keypoints in the image, etc)
# so we essentially index the data twice.
for (trg_name, trg_value) in targets_per_image.get_fields().items():
if trg_name.startswith("gt_") and not proposals_per_image.has(trg_name):
proposals_per_image.set(trg_name, trg_value[sampled_targets])
num_bg_samples.append((gt_classes == self.num_classes).sum().item())
num_fg_samples.append(gt_classes.numel() - num_bg_samples[-1])
proposals_with_gt.append(proposals_per_image)
# Log the number of fg/bg samples that are selected for training ROI heads
storage = get_event_storage()
storage.put_scalar("roi_head/num_fg_samples", np.mean(num_fg_samples))
storage.put_scalar("roi_head/num_bg_samples", np.mean(num_bg_samples))
return proposals_with_gt
def safely_reduce_losses(self, loss):
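# reduce by averaging only the finite (non-inf, non-NaN) loss entries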
valid = (~(loss.isinf())) & (~(loss.isnan()))
if valid.any():
return loss[valid].mean()
else:
# no valid losses, simply zero out
return loss.mean()*0.0