# src/loss/loss_depth.py
from dataclasses import dataclass, fields
import os
import sys

import torch
import torch.nn.functional as F
from jaxtyping import Float
from torch import Tensor

from src.dataset.types import BatchedExample
from src.model.decoder.decoder import DecoderOutput
from src.model.types import Gaussians

from .loss import Loss

# Make the repository root importable when this file is run directly.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from src.loss.depth_anything.dpt import DepthAnything


@dataclass
class LossDepthCfg:
    weight: float
    sigma_image: float | None
    use_second_derivative: bool


@dataclass
class LossDepthCfgWrapper:
    depth: LossDepthCfg


class LossDepth(Loss[LossDepthCfg, LossDepthCfgWrapper]):
    def __init__(self, cfg: LossDepthCfgWrapper) -> None:
        super().__init__(cfg)

        # Extract the configuration from the wrapper.
        (field,) = fields(type(cfg))
        self.cfg = getattr(cfg, field.name)
        self.name = field.name

        # DPT decoder settings for each available DepthAnything backbone.
        model_configs = {
            'vitl': {'encoder': 'vitl', 'features': 256, 'out_channels': [256, 512, 1024, 1024]},
            'vitb': {'encoder': 'vitb', 'features': 128, 'out_channels': [96, 192, 384, 768]},
            'vits': {'encoder': 'vits', 'features': 64, 'out_channels': [48, 96, 192, 384]},
        }
        encoder = 'vits'  # or 'vitb', 'vitl'

        # DepthAnything serves as a frozen monocular-disparity teacher.
        depth_anything = DepthAnything(model_configs[encoder])
        depth_anything.load_state_dict(torch.load(f'src/loss/depth_anything/depth_anything_{encoder}14.pth'))
        depth_anything.eval()  # keep the teacher in inference mode
        self.depth_anything = depth_anything
        for param in self.depth_anything.parameters():
            param.requires_grad = False
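
        # Expected teacher I/O, as used below (a note added here, not from
        # the original file): images of shape (N, 3, H, W) go in and a
        # relative disparity map of shape (N, H, W) comes out, e.g.
        #   disp = self.depth_anything(imgs)  # (N, H, W) relative disparity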

    def disp_rescale(self, disp: Float[Tensor, "B H W"]):
        # Normalize disparity per image so comparisons are invariant to scale
        # and shift: subtract the median, divide by the mean absolute deviation.
        disp = disp.flatten(1, 2)  # (B, H*W)
        disp_median = torch.median(disp, dim=-1, keepdim=True)[0]  # (B, 1)
        disp_var = (disp - disp_median).abs().mean(dim=-1, keepdim=True)  # (B, 1)
        disp = (disp - disp_median) / (disp_var + 1e-6)
        return disp
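
    # Worked example for disp_rescale (an illustration added here, not from
    # the original file): for a single 2x2 map [[1, 2], [3, 4]], torch.median
    # picks the lower middle value 2, the mean absolute deviation is
    # (1 + 0 + 1 + 2) / 4 = 1, so the result is roughly [-1, 0, 1, 2]
    # (up to the 1e-6 epsilon):
    #
    #   disp = torch.tensor([[[1.0, 2.0], [3.0, 4.0]]])  # (B=1, H=2, W=2)
    #   self.disp_rescale(disp)  # ~tensor([[-1., 0., 1., 2.]])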

    def smooth_l1_loss(self, pred, target, beta=1.0, reduction='none'):
        # Elementwise smooth-L1 (Huber) loss: quadratic for |diff| < beta,
        # linear beyond, matching F.smooth_l1_loss but defaulting to 'none'.
        diff = pred - target
        abs_diff = torch.abs(diff)
        loss = torch.where(abs_diff < beta, 0.5 * diff ** 2 / beta, abs_diff - 0.5 * beta)
        if reduction == 'mean':
            return loss.mean()
        elif reduction == 'sum':
            return loss.sum()
        elif reduction == 'none':
            return loss
        else:
            raise ValueError("Invalid reduction type. Choose from 'mean', 'sum', or 'none'.")
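
    # Quick check for smooth_l1_loss (an illustration added here, not from
    # the original file): with beta=1.0 a residual of 0.5 falls in the
    # quadratic region, 0.5 * 0.5**2 = 0.125, while a residual of 2.0 is in
    # the linear region, 2.0 - 0.5 = 1.5; both agree with
    # F.smooth_l1_loss(pred, target, beta=1.0, reduction='none'):
    #
    #   pred, target = torch.tensor([0.5, 2.0]), torch.zeros(2)
    #   self.smooth_l1_loss(pred, target)  # tensor([0.1250, 1.5000])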

    def ctx_depth_loss(self,
                       depth_map: Float[Tensor, "B V H W 1"],
                       depth_conf: Float[Tensor, "B V H W"],
                       batch: BatchedExample,
                       cxt_depth_weight: float = 0.01,
                       alpha: float = 0.2):
        B, V, _, H, W = batch["context"]["image"].shape
        ctx_imgs = batch["context"]["image"].view(B * V, 3, H, W).float()

        # Teacher disparity from DepthAnything, normalized per image.
        da_output = self.depth_anything(ctx_imgs)
        da_output = self.disp_rescale(da_output)

        # Predicted disparity (inverse depth), normalized the same way.
        disp_context = 1.0 / depth_map.flatten(0, 1).squeeze(-1).clamp(1e-3)  # (B*V, H, W)
        context_output = self.disp_rescale(disp_context)

        depth_conf = depth_conf.flatten(0, 1).flatten(1, 2)  # (B*V, H*W)

        # Confidence-weighted smooth-L1 with a log-confidence regularizer.
        loss = self.smooth_l1_loss(
            context_output * depth_conf, da_output * depth_conf, reduction='none'
        ) - alpha * torch.log(depth_conf)
        return cxt_depth_weight * loss.mean()
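
    # Note on the objective above (added here, not from the original file):
    # scaling both normalized disparities by depth_conf scales the residual
    # by the per-pixel confidence before the smooth-L1, while the
    # -alpha * log(depth_conf) term penalizes driving confidences toward
    # zero, so the network can only down-weight pixels it genuinely cannot
    # fit, in the spirit of confidence-weighted losses such as DUSt3R's.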

    def forward(
        self,
        prediction: DecoderOutput,
        batch: BatchedExample,
        gaussians: Gaussians,
        global_step: int,
    ) -> Float[Tensor, ""]:
        # Supervise the rendered depth with DepthAnything disparities on the
        # target views, comparing both after per-image normalization.
        target_imgs = batch["target"]["image"]
        B, V, _, H, W = target_imgs.shape
        target_imgs = target_imgs.view(B * V, 3, H, W)

        da_output = self.depth_anything(target_imgs.float())
        da_output = self.disp_rescale(da_output)

        # Rendered disparity (inverse depth), clamped to avoid division by ~0.
        disp_gs = 1.0 / prediction.depth.flatten(0, 1).clamp(1e-3).float()
        gs_output = self.disp_rescale(disp_gs)

        # nan_to_num guards against degenerate frames whose deviation is ~0.
        return self.cfg.weight * torch.nan_to_num(F.smooth_l1_loss(da_output, gs_output), nan=0.0)
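

if __name__ == "__main__":
    # Minimal smoke test (a sketch added here, not part of the original
    # file): it exercises only the stateless helpers, since constructing
    # LossDepth requires the DepthAnything checkpoint on disk; the inputs
    # are made-up toy tensors.
    disp = torch.rand(2, 8, 8) + 0.1  # (B, H, W) toy disparity
    normalized = LossDepth.disp_rescale(None, disp)  # self is unused by the helper
    print(normalized.shape)  # torch.Size([2, 64])

    pred, target = torch.tensor([0.5, 2.0]), torch.zeros(2)
    print(LossDepth.smooth_l1_loss(None, pred, target))  # tensor([0.1250, 1.5000])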