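"""Metric3D baseline for the MoGe evaluation suite.

Wraps the Metric3D models published on torch.hub ('yvanyin/metric3d') behind
MGEBaselineInterface. When camera intrinsics are provided the predicted depth
is metric; otherwise only scale-invariant depth is returned.
"""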
import os
import sys
from typing import *

import click
import torch
import torch.nn.functional as F
import cv2

from moge.test.baseline import MGEBaselineInterface


class Baseline(MGEBaselineInterface):

    def __init__(self, backbone: Literal['vits', 'vitl', 'vitg'], device):
        backbone_map = {
            'vits': 'metric3d_vit_small',
            'vitl': 'metric3d_vit_large',
            'vitg': 'metric3d_vit_giant2'
        }

        device = torch.device(device)
        model = torch.hub.load('yvanyin/metric3d', backbone_map[backbone], pretrain=True)
        model.to(device).eval()

        self.model = model
        self.device = device

    @click.command()
    @click.option('--backbone', type=click.Choice(['vits', 'vitl', 'vitg']), default='vitl', help='Encoder architecture.')
    @click.option('--device', type=str, default='cuda', help='Device to use.')
    @staticmethod
    def load(backbone: str = 'vitl', device: str = 'cuda'):
        return Baseline(backbone, device)

    @torch.inference_mode()
    def inference_one_image(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None):
        # Convert the (3, H, W) float image in [0, 1] to an (H, W, 3) array in the [0, 255] range.
        rgb_origin = image.cpu().numpy().transpose((1, 2, 0)) * 255

        # Resize so the image fits within the pretrained model's input size, keeping the aspect ratio.
        input_size = (616, 1064)
        h, w = rgb_origin.shape[:2]
        scale = min(input_size[0] / h, input_size[1] / w)
        rgb = cv2.resize(rgb_origin, (int(w * scale), int(h * scale)), interpolation=cv2.INTER_LINEAR)
        if intrinsics is not None:
            # Intrinsics are expected normalized by the image size; multiplying by the resized
            # width recovers the focal length in pixels of the resized image.
            focal = intrinsics[0, 0] * int(w * scale)

        # Pad to the model's fixed input size, filling the border with the per-channel mean.
        padding = [123.675, 116.28, 103.53]
        h, w = rgb.shape[:2]
        pad_h = input_size[0] - h
        pad_w = input_size[1] - w
        pad_h_half = pad_h // 2
        pad_w_half = pad_w // 2
        rgb = cv2.copyMakeBorder(rgb, pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half, cv2.BORDER_CONSTANT, value=padding)
        pad_info = [pad_h_half, pad_h - pad_h_half, pad_w_half, pad_w - pad_w_half]

        # Normalize with the ImageNet mean/std (in the [0, 255] range) and add a batch dimension.
        mean = torch.tensor([123.675, 116.28, 103.53]).float()[:, None, None]
        std = torch.tensor([58.395, 57.12, 57.375]).float()[:, None, None]
        rgb = torch.from_numpy(rgb.transpose((2, 0, 1))).float()
        rgb = torch.div((rgb - mean), std)
        rgb = rgb[None, :, :, :].to(self.device)

        # Run Metric3D inference. Depth is predicted in a canonical camera space with focal length 1000.
        pred_depth, confidence, output_dict = self.model.inference({'input': rgb})

        # Remove the padding and clamp the canonical depth.
        pred_depth = pred_depth.squeeze()
        pred_depth = pred_depth[pad_info[0] : pred_depth.shape[0] - pad_info[1], pad_info[2] : pred_depth.shape[1] - pad_info[3]]
        pred_depth = pred_depth.clamp_min(0.5)

        # Upsample back to the original image resolution.
        pred_depth = F.interpolate(pred_depth[None, None, :, :], image.shape[-2:], mode='bilinear').squeeze()

        if intrinsics is not None:
            # Rescale from the canonical camera space to the real focal length to obtain metric depth.
            canonical_to_real_scale = focal / 1000.0
            pred_depth = pred_depth * canonical_to_real_scale
            pred_depth = torch.clamp(pred_depth, 0, 300)

        # Split the surface normal prediction from its confidence channel.
        pred_normal, normal_confidence = output_dict['prediction_normal'].split([3, 1], dim=1)

        # Remove the padding from the normal map.
        pred_normal = pred_normal.squeeze(0)
        pred_normal = pred_normal[:, pad_info[0] : pred_normal.shape[1] - pad_info[1], pad_info[2] : pred_normal.shape[2] - pad_info[3]]

        # Upsample to the original resolution and re-normalize to unit length.
        pred_normal = F.interpolate(pred_normal[None, :, :, :], image.shape[-2:], mode='bilinear').squeeze(0)
        pred_normal = F.normalize(pred_normal, p=2, dim=0)

        return pred_depth, pred_normal.permute(1, 2, 0)

    @torch.inference_mode()
    def infer(self, image: torch.Tensor, intrinsics: Optional[torch.Tensor] = None):
        if image.ndim == 3:
            # Single (3, H, W) image.
            pred_depth, pred_normal = self.inference_one_image(image, intrinsics)
        else:
            # Batched (N, 3, H, W) input: run the images one by one and stack the results.
            pred_depth, pred_normal = [], []
            for i in range(image.shape[0]):
                pred_depth_i, pred_normal_i = self.inference_one_image(image[i], intrinsics[i] if intrinsics is not None else None)
                pred_depth.append(pred_depth_i)
                pred_normal.append(pred_normal_i)
            pred_depth = torch.stack(pred_depth, dim=0)
            pred_normal = torch.stack(pred_normal, dim=0)

        # Depth is metric when intrinsics are provided, scale-invariant otherwise.
        if intrinsics is not None:
            return {
                "depth_metric": pred_depth,
            }
        else:
            return {
                "depth_scale_invariant": pred_depth,
            }
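
# Minimal usage sketch (an illustration only, not part of the MoGe evaluation pipeline;
# 'example.jpg' is a hypothetical local image path).
if __name__ == '__main__':
    baseline = Baseline(backbone='vitl', device='cuda')
    # Load an RGB image as a (3, H, W) float tensor in [0, 1], as expected by `infer`.
    bgr = cv2.imread('example.jpg')
    image = torch.from_numpy(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)).float().permute(2, 0, 1) / 255
    # Without intrinsics only scale-invariant depth is returned.
    output = baseline.infer(image)
    print({k: tuple(v.shape) for k, v in output.items()})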