import numpy as np
import torch
from einops import repeat
from PIL import Image

from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, DEPTH_ANYTHING_V2_MODEL_NAME_DICT
from custom_controlnet_aux.depth_anything_v2.dpt import DepthAnythingV2
import cv2
import torch.nn.functional as F

# https://github.com/DepthAnything/Depth-Anything-V2/blob/main/app.py
# Maps each supported checkpoint filename to the keyword arguments passed to
# the DepthAnythingV2 constructor for that checkpoint.
model_configs = {
    'depth_anything_v2_vits.pth': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
    'depth_anything_v2_vitb.pth': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
    'depth_anything_v2_vitl.pth': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'depth_anything_v2_vitg.pth': {'encoder': 'vitg', 'features': 384, 'out_channels': [1536, 1536, 1536, 1536]},
    'depth_anything_v2_metric_vkitti_vitl.pth': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
    'depth_anything_v2_metric_hypersim_vitl.pth': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
}


def _normalize_depth_to_uint8(depth):
    """Min-max scale a depth array to the 0..255 range and cast it to uint8.

    A constant (or non-finite) input has no usable range; dividing by that
    range would produce NaN/inf, and casting those to uint8 is undefined.
    In that case an all-zero map of the same shape is returned instead.
    """
    lowest = depth.min()
    span = depth.max() - lowest
    # `span > 0` is False for both a zero span and a NaN span, so the
    # division below only runs when it is well defined.
    if not span > 0:
        return np.zeros_like(depth, dtype=np.uint8)
    return ((depth - lowest) / span * 255.0).astype(np.uint8)


class DepthAnythingV2Detector:
    """Callable wrapper that turns an image into a Depth Anything V2 depth map."""

    def __init__(self, model, filename):
        """Store the model and the checkpoint filename it was built from.

        Args:
            model: The DepthAnythingV2 network used for inference.
            filename: Checkpoint filename; `__call__` checks it for the
                substring 'metric' to decide whether to invert the output.
        """
        self.model = model
        self.device = "cpu"
        self.filename = filename

    @classmethod
    def from_pretrained(cls, pretrained_model_or_path=None, filename="depth_anything_v2_vits.pth"):
        """Download a checkpoint, build the matching model and wrap it.

        Args:
            pretrained_model_or_path: Location handed to `custom_hf_download`.
                When None, it is looked up in
                `DEPTH_ANYTHING_V2_MODEL_NAME_DICT` by `filename`.
            filename: Checkpoint filename; must be a key of `model_configs`.

        Returns:
            A detector holding the model in eval mode, with weights loaded
            onto the CPU.

        Raises:
            KeyError: If `filename` is not a key of `model_configs` (or of
                `DEPTH_ANYTHING_V2_MODEL_NAME_DICT` when no path is given).
        """
        if pretrained_model_or_path is None:
            pretrained_model_or_path = DEPTH_ANYTHING_V2_MODEL_NAME_DICT[filename]
        model_path = custom_hf_download(pretrained_model_or_path, filename)
        model = DepthAnythingV2(**model_configs[filename])
        # NOTE(review): torch.load unpickles the downloaded file; consider
        # weights_only=True if the supported torch versions allow it.
        model.load_state_dict(torch.load(model_path, map_location="cpu"))
        model = model.eval()
        return cls(model, filename)

    def to(self, device):
        """Move the model to `device`, remember it, and return self."""
        self.model.to(device)
        self.device = device
        return self

    def __call__(self, input_image, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", max_depth=20.0, **kwargs):
        """Predict a depth map for `input_image` and return it as a 3-channel image.

        Args:
            input_image: Image to process; validated by `common_input_validate`.
                It is converted with cv2.COLOR_RGB2BGR before inference, so
                RGB channel order is presumably expected -- confirm with caller.
            detect_resolution: Forwarded to `resize_image_with_pad`.
            output_type: Forwarded to `common_input_validate`; when the
                validated value is "pil" the result is a PIL image.
            upscale_method: Forwarded to `resize_image_with_pad`.
            max_depth: Forwarded to the model's `infer_image`.
            **kwargs: Forwarded to `common_input_validate`.

        Returns:
            The depth map, scaled to 0..255 and repeated over three channels,
            as an array or (for output_type "pil") a `PIL.Image.Image`.
        """
        input_image, output_type = common_input_validate(input_image, output_type, **kwargs)

        depth = self.model.infer_image(
            cv2.cvtColor(input_image, cv2.COLOR_RGB2BGR),
            input_size=518,
            max_depth=max_depth,
        )
        # Guarded normalisation: a constant prediction yields zeros, not NaN.
        depth = _normalize_depth_to_uint8(depth)
        if 'metric' in self.filename:
            # Metric checkpoints get the scaled map inverted (presumably so
            # near/far polarity matches the relative models -- TODO confirm).
            depth = 255 - depth

        detected_map = repeat(depth, "h w -> h w 3")
        detected_map, remove_pad = resize_image_with_pad(detected_map, detect_resolution, upscale_method)
        detected_map = remove_pad(detected_map)

        if output_type == "pil":
            detected_map = Image.fromarray(detected_map)

        return detected_map