import os
import sys
import time

import cv2
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision.transforms import Compose

# Make the local Depth-Anything-V2 checkout importable (machine-specific path).
sys.path.append("/home/yepeng_liu/code_python/third_repos/Depth-Anything-V2")
from depth_anything_v2.dpt_opt import DepthAnythingV2
from depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet
# Checkpoint paths for the released encoders (machine-specific).
VITS_MODEL_PATH = "/home/yepeng_liu/code_python/third_repos/Depth-Anything-V2/checkpoints/depth_anything_v2_vits.pth"
VITB_MODEL_PATH = "/home/yepeng_liu/code_python/third_repos/Depth-Anything-V2/checkpoints/depth_anything_v2_vitb.pth"
VITL_MODEL_PATH = "/home/yepeng_liu/code_python/third_repos/Depth-Anything-V2/checkpoints/depth_anything_v2_vitl.pth"
model_configs = {
    "vits": {"encoder": "vits", "features": 64, "out_channels": [48, 96, 192, 384]},
    "vitb": {"encoder": "vitb", "features": 128, "out_channels": [96, 192, 384, 768]},
    "vitl": {"encoder": "vitl", "features": 256, "out_channels": [256, 512, 1024, 1024]},
    "vitg": {"encoder": "vitg", "features": 384, "out_channels": [1536, 1536, 1536, 1536]},
}
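
# Note: "vitg" has a config entry above, but no checkpoint path is wired up in
# DepthAnythingExtractor below, so only vits/vitb/vitl can actually be loaded here.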
class DepthAnythingExtractor(nn.Module):
    """Wraps Depth Anything V2 to produce a depth map and a normal map for a BGR image."""

    def __init__(self, encoder_type, device, input_size, process_size=(608, 800)):
        super().__init__()
        self.net = DepthAnythingV2(**model_configs[encoder_type])
        self.device = device
        if encoder_type == "vits":
            print(f"loading {VITS_MODEL_PATH}")
            self.net.load_state_dict(torch.load(VITS_MODEL_PATH, map_location="cpu"))
        elif encoder_type == "vitb":
            print(f"loading {VITB_MODEL_PATH}")
            self.net.load_state_dict(torch.load(VITB_MODEL_PATH, map_location="cpu"))
        elif encoder_type == "vitl":
            print(f"loading {VITL_MODEL_PATH}")
            self.net.load_state_dict(torch.load(VITL_MODEL_PATH, map_location="cpu"))
        else:
            raise RuntimeError(f"unsupported encoder type: {encoder_type}")
        self.net.to(self.device).eval()
        # Preprocessing follows the official Depth Anything V2 pipeline: resize so
        # the shorter side is at least `input_size` (keeping aspect ratio, with
        # dimensions a multiple of 14 for the ViT patch size), then ImageNet-
        # normalize and convert to CHW.
        self.transform = Compose([
            Resize(
                width=input_size,
                height=input_size,
                resize_target=False,
                keep_aspect_ratio=True,
                ensure_multiple_of=14,
                resize_method="lower_bound",
                image_interpolation_method=cv2.INTER_CUBIC,
            ),
            NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            PrepareForNet(),
        ])
        self.process_size = process_size
        self.input_size = input_size
    def infer_image(self, img):
        # BGR uint8 -> RGB float in [0, 1], then the model's preprocessing.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
        img = self.transform({"image": img})["image"]
        img = torch.from_numpy(img).unsqueeze(0)
        img = img.to(self.device)
        with torch.no_grad():
            depth = self.net.forward(img)
        # Resize the prediction back to the working resolution (H, W).
        depth = F.interpolate(depth[:, None], self.process_size, mode="bilinear", align_corners=True)[0, 0]
        return depth.cpu().numpy()
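
    # Note: the standard Depth Anything V2 checkpoints predict relative
    # (disparity-like) depth rather than metric depth, with larger values for
    # closer surfaces; the normals computed below inherit that convention.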
    def compute_normal_map_torch(self, depth_map, scale=1.0):
        """
        Compute surface normals from a depth map (PyTorch implementation).

        Args:
            depth_map (torch.Tensor): depth map of shape (H, W)
            scale (float): scale factor applied to the depth gradients

        Returns:
            torch.Tensor: normal map of shape (H, W, 3)
        """
        if depth_map.ndim != 2:
            raise ValueError("depth_map must be a 2-D tensor.")
        # Forward differences along x and y; the last column/row is replicated
        # so the output keeps the input resolution.
        dzdx = torch.diff(depth_map, dim=1, append=depth_map[:, -1:]) * scale
        dzdy = torch.diff(depth_map, dim=0, append=depth_map[-1:, :]) * scale
        # Build the (unnormalized) normal field (-dz/dx, -dz/dy, 1).
        H, W = depth_map.shape
        normal_map = torch.zeros((H, W, 3), dtype=depth_map.dtype, device=depth_map.device)
        normal_map[:, :, 0] = -dzdx  # x component
        normal_map[:, :, 1] = -dzdy  # y component
        normal_map[:, :, 2] = 1.0    # z component
        # Normalize to unit length, guarding against division by zero.
        norm = torch.linalg.norm(normal_map, dim=2, keepdim=True)
        norm = torch.where(norm == 0, torch.tensor(1.0, device=depth_map.device), norm)
        normal_map /= norm
        return normal_map
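
    # Worked example for compute_normal_map_torch: a depth ramp d(x, y) = x
    # gives dzdx = 1 and dzdy = 0 everywhere (except the replicated border),
    # so every pixel's normal is (-1, 0, 1) / sqrt(2) ~= (-0.707, 0, 0.707),
    # i.e. a surface tilted uniformly away from the +x axis. A runnable
    # sanity check along these lines appears at the bottom of the file.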
    def extract(self, img):
        depth = self.infer_image(img)
        # Min-max normalize the depth to [0, 255] before estimating normals.
        depth = (depth - depth.min()) / (depth.max() - depth.min()) * 255.0
        depth_t = torch.from_numpy(depth).float().to(self.device)
        normal_map = self.compute_normal_map_torch(depth_t, 1.0)
        return depth_t, normal_map
if __name__ == "__main__":
    img_path = os.path.join(os.path.dirname(__file__), "../assert/ref.jpg")
    img = cv2.imread(img_path)
    img = cv2.resize(img, (800, 608))  # (W, H), matching process_size=(608, 800)
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    DAExtractor = DepthAnythingExtractor("vitb", device, 256)
    depth_t, norm = DAExtractor.extract(img)
    # Map normals from [-1, 1] to [0, 255] for visualization.
    norm = norm.cpu().numpy()
    norm = (norm + 1) / 2 * 255
    norm = norm.astype(np.uint8)
    cv2.imwrite(os.path.join(os.path.dirname(__file__), "norm.png"), norm)
    # Benchmark: extract() copies results to the CPU, so the GPU is implicitly
    # synchronized on every iteration and the wall-clock timing is accurate.
    start = time.perf_counter()
    for _ in range(20):
        depth_t, norm = DAExtractor.extract(img)
    end = time.perf_counter()
    print(f"cost {end - start:.3f} seconds ({(end - start) / 20 * 1000:.1f} ms per frame)")