import cv2
import numpy as np
import torch
import torch.nn.functional as F
from einops import repeat
from PIL import Image
from torchvision.transforms import Compose

from custom_controlnet_aux.util import HWC3, common_input_validate, resize_image_with_pad, custom_hf_download, DEPTH_ANYTHING_MODEL_NAME
from custom_controlnet_aux.depth_anything.depth_anything.dpt import DPT_DINOv2
from custom_controlnet_aux.depth_anything.depth_anything.util.transform import Resize, NormalizeImage, PrepareForNet
# Preprocessing pipeline for Depth Anything: resize so each side is a multiple
# of 14 (the ViT patch size), normalize with ImageNet statistics, and convert
# to CHW float32 for the network.
transform = Compose([
    Resize(
        width=518,
        height=518,
        resize_target=False,
        keep_aspect_ratio=True,
        ensure_multiple_of=14,
        resize_method='lower_bound',
        image_interpolation_method=cv2.INTER_CUBIC,
    ),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])
# Per-checkpoint DPT_DINOv2 constructor arguments, taken from
# https://huggingface.co/LiheYoung/depth_anything_vitl14/raw/main/config.json
DPT_CONFIGS = {
    "depth_anything_vitl14.pth": {"encoder": "vitl", "features": 256, "out_channels": [256, 512, 1024, 1024], "use_bn": False, "use_clstoken": False},
    "depth_anything_vitb14.pth": {"encoder": "vitb", "features": 128, "out_channels": [96, 192, 384, 768], "use_bn": False, "use_clstoken": False},
    "depth_anything_vits14.pth": {"encoder": "vits", "features": 64, "out_channels": [48, 96, 192, 384], "use_bn": False, "use_clstoken": False},
}
class DepthAnythingDetector:
    def __init__(self, model):
        self.model = model
        self.device = "cpu"

    @classmethod
    def from_pretrained(cls, pretrained_model_or_path=DEPTH_ANYTHING_MODEL_NAME, filename="depth_anything_vitl14.pth"):
        # Download the checkpoint and build the matching DPT_DINOv2 backbone.
        model_path = custom_hf_download(pretrained_model_or_path, filename, subfolder="checkpoints", repo_type="space")
        model = DPT_DINOv2(**DPT_CONFIGS[filename], localhub=True)
        model.load_state_dict(torch.load(model_path, map_location="cpu"))
        model.eval()
        return cls(model)

    def to(self, device):
        self.model.to(device)
        self.device = device
        return self

    def __call__(self, input_image, detect_resolution=512, output_type=None, upscale_method="INTER_CUBIC", **kwargs):
        input_image, output_type = common_input_validate(input_image, output_type, **kwargs)
        # Run the resize/pad helper on a dummy array only to recover the
        # unpadded output dimensions; the real input is preprocessed by `transform`.
        t, remove_pad = resize_image_with_pad(np.zeros_like(input_image), detect_resolution, upscale_method)
        t = remove_pad(t)
        h, w = int(t.shape[0]), int(t.shape[1])

        image = transform({'image': input_image / 255.})['image']
        image = torch.from_numpy(image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            depth = self.model(image)
            # Resize the predicted depth back to the detection resolution and
            # rescale it to the 0-255 range.
            depth = F.interpolate(depth[None], (h, w), mode='bilinear', align_corners=False)[0, 0]
            depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0

        # Broadcast the single-channel depth to a 3-channel uint8 map.
        detected_map = repeat(depth, "h w -> h w 3").cpu().numpy().astype(np.uint8)

        if output_type == "pil":
            detected_map = Image.fromarray(detected_map)
        return detected_map
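
# --- Minimal usage sketch (illustrative, not part of the original module) ---
# Assumes an RGB image on disk and the default ViT-L checkpoint; the detector
# and filenames come from this module, while the image path and device choice
# below are hypothetical placeholders.
if __name__ == "__main__":
    img = np.array(Image.open("input.png").convert("RGB"))  # hypothetical input path
    detector = DepthAnythingDetector.from_pretrained()
    detector.to("cuda" if torch.cuda.is_available() else "cpu")
    depth_map = detector(img, detect_resolution=512, output_type="pil")
    depth_map.save("depth.png")  # hypothetical output path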