Spaces:

jev-aleks
/

SceneDINO

Running on Zero

App Files Files Community

SceneDINO / scenedino /common /ray_sampler.py

jev-aleks

scenedino init

9e15541 16 days ago

raw

history blame contribute delete

28 kB

	from math import isqrt
	import torch
	import torch.nn.functional as F
	from torch.cuda.amp import autocast
	from omegaconf import ListConfig

	from scenedino.common import util
	from scenedino.common.cameras.pinhole import outside_frustum, project_to_image, pts_into_camera


	class RaySampler:
	def __init__(self, z_near: float, z_far: float) -> None:
	self.z_near = z_near
	self.z_far = z_far

	def sample(self, images, poses, projs):
	raise NotImplementedError

	def reconstruct(self, render_dict):
	raise NotImplementedError


	class RandomRaySampler(RaySampler):
	def __init__(
	self, z_near: float, z_far: float, ray_batch_size: int, channels: int = 3
	) -> None:
	super().__init__(z_near, z_far)
	self.ray_batch_size = ray_batch_size
	self.channels = channels

	def sample(self, images, poses, projs, image_ids=None):
	n, v, c, h, w = images.shape

	all_rgb_gt = []
	all_rays = []

	for n_ in range(n):
	focals = projs[n_, :, [0, 1], [0, 1]]
	centers = projs[n_, :, [0, 1], [2, 2]]

	rays, xy = util.gen_rays(
	poses[n_].view(-1, 4, 4),
	w,
	h,
	focal=focals,
	c=centers,
	z_near=self.z_near,
	z_far=self.z_far,
	)

	# Append frame id to the ray
	if image_ids is None:
	ids = torch.arange(v, device=images.device, dtype=images.dtype)
	else:
	ids = torch.tensor(image_ids, device=images.device, dtype=images.dtype)
	ids = ids.view(v, 1, 1, 1).expand(v, h, w, 1)
	rays = torch.cat((rays, ids), dim=-1)
	rays = torch.cat((rays, xy), dim=-1)
	r_dim = rays.shape[-1]
	rays = rays.view(-1, r_dim)

	rgb_gt = images[n_].view(-1, c, h, w)
	rgb_gt = rgb_gt.permute(0, 2, 3, 1).contiguous().reshape(-1, c)

	pix_inds = torch.randint(0, v * h * w, (self.ray_batch_size,))

	rgb_gt = rgb_gt[pix_inds]
	rays = rays[pix_inds]

	all_rgb_gt.append(rgb_gt)
	all_rays.append(rays)

	all_rgb_gt = torch.stack(all_rgb_gt)
	all_rays = torch.stack(all_rays)

	return all_rays, all_rgb_gt

	def reconstruct(self, render_dict, channels=None):
	for name, render_dict_part in render_dict.items():
	if not type(render_dict_part) == dict or not "rgb" in render_dict_part:
	continue

	if channels is None:
	channels = self.channels

	rgb = render_dict_part["rgb"] # n, n_pts, v * 3
	depth = render_dict_part["depth"]
	invalid = render_dict_part["invalid"]

	rgb_gt = render_dict["rgb_gt"]

	n, n_pts, v_c = rgb.shape
	v = v_c // channels
	n_smps = invalid.shape[-2]

	render_dict_part["rgb"] = rgb.view(n, n_pts, v, channels)
	render_dict_part["depth"] = depth.view(n, n_pts)
	render_dict_part["invalid"] = invalid.view(n, n_pts, n_smps, v)

	if "invalid_features" in render_dict_part:
	invalid_features = render_dict_part["invalid_features"]
	render_dict_part["invalid_features"] = invalid_features.view(n, n_pts, n_smps, v)

	if "weights" in render_dict_part:
	weights = render_dict_part["weights"]
	render_dict_part["weights"] = weights.view(n, n_pts, n_smps)

	if "alphas" in render_dict_part:
	alphas = render_dict_part["alphas"]
	render_dict_part["alphas"] = alphas.view(n, n_pts, n_smps)

	if "z_samps" in render_dict_part:
	z_samps = render_dict_part["z_samps"]
	render_dict_part["z_samps"] = z_samps.view(n, n_pts, n_smps)

	if "rgb_samps" in render_dict_part:
	rgb_samps = render_dict_part["rgb_samps"]
	render_dict_part["rgb_samps"] = rgb_samps.view(n, n_pts, n_smps, v, channels)

	if "ray_info" in render_dict_part:
	ri_shape = render_dict_part["ray_info"].shape[-1]
	ray_info = render_dict_part["ray_info"]
	render_dict_part["ray_info"] = ray_info.view(n, n_pts, ri_shape)

	if "extras" in render_dict_part:
	extras_shape = render_dict_part["extras"].shape[-1]
	extras = render_dict_part["extras"]
	render_dict_part["extras"] = extras.view(n, n_pts, extras_shape)

	render_dict[name] = render_dict_part
	render_dict["rgb_gt"] = rgb_gt.view(n, n_pts, channels)

	return render_dict


	class PatchRaySampler(RaySampler):
	def __init__(
	self,
	z_near: float,
	z_far: float,
	ray_batch_size: int,
	patch_size: int,
	channels: int = 3,
	snap_to_grid: bool = False,
	dino_upscaled: bool = False,
	) -> None:
	super().__init__(z_near, z_far)
	self.ray_batch_size = ray_batch_size
	self.channels = channels
	self.snap_to_grid = snap_to_grid
	self.dino_upscaled = dino_upscaled

	if isinstance(patch_size, int):
	self.patch_size_x, self.patch_size_y = patch_size, patch_size
	elif (
	isinstance(patch_size, tuple)
	or isinstance(patch_size, list)
	or isinstance(patch_size, ListConfig)
	):
	self.patch_size_y = patch_size[0]
	self.patch_size_x = patch_size[1]
	else:
	raise ValueError(f"Invalid format for patch size")
	assert (ray_batch_size % (self.patch_size_x * self.patch_size_y)) == 0
	self._patch_count = self.ray_batch_size // (
	self.patch_size_x * self.patch_size_y
	)

	def sample(
	self, images, poses, projs, image_ids=None, dino_features=None, loss_feature_grid_shift=None,
	): ### dim(images) == nv (ids_loss nv randomly sampled)
	n, v, c, h, w = images.shape

	self.channels = c

	if dino_features is not None:
	_, _, dino_channels, _, _ = dino_features.shape
	dino_features = dino_features.permute(0, 1, 3, 4, 2)

	device = images.device

	images = images.permute(0, 1, 3, 4, 2)

	all_rgb_gt, all_rays, all_dino_gt = [], [], []

	for n_ in range(n):
	focals = projs[n_, :, [0, 1], [0, 1]]
	centers = projs[n_, :, [0, 1], [2, 2]]

	rays, xy = util.gen_rays(
	poses[n_].view(-1, 4, 4),
	w,
	h,
	focal=focals,
	c=centers,
	z_near=self.z_near,
	z_far=self.z_far,
	)

	# Append frame id to the ray
	if image_ids is None:
	ids = torch.arange(v, device=images.device, dtype=images.dtype)
	else:
	ids = torch.tensor(image_ids, device=images.device, dtype=images.dtype)
	ids = ids.view(v, 1, 1, 1).expand(v, h, w, 1)
	rays = torch.cat((rays, ids), dim=-1)
	rays = torch.cat((rays, xy), dim=-1)
	r_dim = rays.shape[-1]

	patch_coords_v = torch.randint(0, v, (self._patch_count,))

	if self.snap_to_grid:
	if loss_feature_grid_shift is not None:
	patch_coords_y = torch.randint(0, h // self.patch_size_y - 1, (self._patch_count,))
	patch_coords_x = torch.randint(0, w // self.patch_size_x - 1, (self._patch_count,))
	else:
	patch_coords_y = torch.randint(0, h // self.patch_size_y, (self._patch_count,))
	patch_coords_x = torch.randint(0, w // self.patch_size_x, (self._patch_count,))
	else:
	patch_coords_y = torch.randint(0, h - self.patch_size_y, (self._patch_count,))
	patch_coords_x = torch.randint(0, w - self.patch_size_x, (self._patch_count,))

	sample_rgb_gt = []
	sample_rays = []
	sample_dino_gt = []

	for v_, coord_y, coord_x in zip(patch_coords_v, patch_coords_y, patch_coords_x):
	if self.snap_to_grid:
	patch_y, patch_x = coord_y, coord_x
	if loss_feature_grid_shift is not None:
	y = (loss_feature_grid_shift[0] % self.patch_size_y) + self.patch_size_y * coord_y
	x = (loss_feature_grid_shift[1] % self.patch_size_x) + self.patch_size_x * coord_x
	if loss_feature_grid_shift[0] < 0:
	patch_y += 1
	if loss_feature_grid_shift[1] < 0:
	patch_x += 1
	else:
	y = self.patch_size_y * coord_y
	x = self.patch_size_x * coord_x
	else:
	raise NotImplementedError

	rgb_gt_patch = images[n_][
	v_, y : y + self.patch_size_y, x : x + self.patch_size_x, :
	].reshape(-1, self.channels)
	rays_patch = rays[
	v_, y : y + self.patch_size_y, x : x + self.patch_size_x, :
	].reshape(-1, r_dim)

	sample_rgb_gt.append(rgb_gt_patch)
	sample_rays.append(rays_patch)

	if dino_features is not None:
	if self.dino_upscaled:
	dino_gt_patch = dino_features[n_][
	v_, y: y + self.patch_size_y, x: x + self.patch_size_x, :
	].reshape(-1, dino_channels)
	else:
	dino_gt_patch = dino_features[n_][
	v_, patch_y, patch_x, :
	].reshape(-1, dino_channels)
	sample_dino_gt.append(dino_gt_patch)

	sample_rgb_gt = torch.cat(sample_rgb_gt, dim=0)
	sample_rays = torch.cat(sample_rays, dim=0)
	all_rgb_gt.append(sample_rgb_gt)
	all_rays.append(sample_rays)

	if dino_features is not None:
	sample_dino_gt = torch.cat(sample_dino_gt, dim=0)
	all_dino_gt.append(sample_dino_gt)

	all_rgb_gt = torch.stack(all_rgb_gt)
	all_rays = torch.stack(all_rays)

	if dino_features is not None:
	all_dino_gt = torch.stack(all_dino_gt)
	return all_rays, all_rgb_gt, all_dino_gt
	else:
	return all_rays, all_rgb_gt

	def reconstruct(self, render_dict, channels=None, dino_channels=None):

	for name, render_dict_part in render_dict.items():
	if not type(render_dict_part) == dict or not "rgb" in render_dict_part:
	continue

	if channels is None:
	channels = self.channels

	rgb_gt = render_dict["rgb_gt"]
	dino_gt = render_dict["dino_gt"]

	n, n_pts, v_c = render_dict_part["rgb"].shape
	v = v_c // channels
	n_smps = render_dict_part["weights"].shape[-1]
	# (This can be a different v from the sample method)

	render_dict_part["rgb"] = render_dict_part["rgb"].view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, v, channels
	)
	render_dict_part["weights"] = render_dict_part["weights"].view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, n_smps
	)
	render_dict_part["depth"] = render_dict_part["depth"].view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x
	)
	render_dict_part["invalid"] = render_dict_part["invalid"].view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, n_smps, v
	)

	# TODO: Figure out DINO invalid policy
	# if "invalid_features" in render_dict_part:
	# render_dict_part["invalid_features"] = render_dict_part["invalid_features"].view(
	# n, self._patch_count, self.patch_size_y, self.patch_size_x, n_smps, v
	# )

	if "alphas" in render_dict_part:
	render_dict_part["alphas"] = render_dict_part["alphas"].view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, n_smps
	)

	if "z_samps" in render_dict_part:
	render_dict_part["z_samps"] = render_dict_part["z_samps"].view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, n_smps
	)

	if "rgb_samps" in render_dict_part:
	render_dict_part["rgb_samps"] = render_dict_part["rgb_samps"].view(
	n,
	self._patch_count,
	self.patch_size_y,
	self.patch_size_x,
	n_smps,
	v,
	channels,
	)

	if "ray_info" in render_dict_part:
	ri_shape = render_dict_part["ray_info"].shape[-1]
	render_dict_part["ray_info"] = render_dict_part["ray_info"].view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, ri_shape
	)

	if "extras" in render_dict_part:
	extras_shape = render_dict_part["extras"].shape[-1]
	render_dict_part["extras"] = render_dict_part["extras"].view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, extras_shape
	)

	if "dino_features" in render_dict_part:
	dino_shape = render_dict_part["dino_features"].shape[-1]
	render_dict_part["dino_features"] = render_dict_part["dino_features"].view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, 1, dino_shape
	)

	render_dict[name] = render_dict_part

	render_dict["rgb_gt"] = rgb_gt.view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, channels
	)
	dino_gt_shape = dino_gt.shape[-1]
	if self.dino_upscaled:
	render_dict["dino_gt"] = dino_gt.view(
	n, self._patch_count, self.patch_size_y, self.patch_size_x, dino_gt_shape
	)
	else:
	render_dict["dino_gt"] = dino_gt.view(
	n, self._patch_count, dino_gt_shape
	)

	if "dino_artifacts" in render_dict:
	render_dict["dino_artifacts"] = render_dict["dino_artifacts"].view(
	n, self._patch_count, dino_gt_shape
	)

	return render_dict


	class PointBasedRaySampler(RandomRaySampler):
	def sample(
	self, images, poses, projs, xyz, image_ids=None
	): ### dim(images) == nv (ids_loss nv randomly sampled)
	n, v, c, h, w = images.shape

	assert v == 1

	with autocast(enabled=False):
	poses_w2c = torch.inverse(poses)
	inv_K = torch.inverse(projs[:, :, :3, :3])

	B, n_pts, _ = xyz.shape

	xyz_projected = pts_into_camera(xyz, poses_w2c)
	distance = torch.norm(xyz_projected, dim=-2, keepdim=True)
	xy, z = project_to_image(xyz_projected, projs)

	xy = xy[:, 0]
	z = z[:, 0]
	distance = distance[:, 0, 0, :, None]

	# For numerical stability with AMP. Should not affect training outcome
	xy = xy.clamp(-2, 2)

	# Build rays
	cam_centers = poses[:, 0, None, :3, 3].expand(-1, n_pts, -1)
	ray_dir = ((poses[:, 0, :3, :3] @ inv_K[:, 0]) @ torch.cat((xy, torch.ones_like(xy[..., :1])), dim=-1).permute(0, 2, 1)).permute(0, 2, 1)

	cam_nears = torch.ones_like(cam_centers[..., :1]) * self.z_near
	cam_fars = torch.ones_like(cam_centers[..., :1]) * self.z_far

	ids = torch.zeros_like(cam_nears)

	rays = torch.cat((cam_centers, ray_dir, cam_nears, cam_fars, ids, xy, distance), dim=-1)

	rgb_gt = F.grid_sample(images[:, 0], xy.reshape(n, -1, 1, 2).to(images.dtype), padding_mode="border", align_corners=False)[..., 0].permute(0, 2, 1)

	return rays, rgb_gt


	class ImageRaySampler(RaySampler):
	def __init__(
	self,
	z_near: float,
	z_far: float,
	height: int \| None = None,
	width: int \| None = None,
	channels: int = 3,
	norm_dir: bool = True,
	dino_upscaled: bool = False,
	) -> None:
	super().__init__(z_near, z_far)
	self.height = height
	self.width = width
	self.channels = channels
	self.norm_dir = norm_dir
	self.dino_upscaled = dino_upscaled

	def sample(self, images, poses, projs, image_ids=None, dino_features=None, dino_artifacts=None):
	n, v, _, _ = poses.shape
	device = poses.device
	dtype = poses.dtype

	if images is not None:
	self.channels = images.shape[2]

	if self.height is None:
	self.height, self.width = images.shape[-2:]

	if dino_features is not None:
	_, _, dino_channels, _, _ = dino_features.shape

	h = self.height
	w = self.width

	all_rgb_gt = []
	all_dino_gt = []
	all_rays = []

	for n_ in range(n):
	focals = projs[n_, :, [0, 1], [0, 1]]
	centers = projs[n_, :, [0, 1], [2, 2]]

	rays, xy = util.gen_rays(
	poses[n_].view(-1, 4, 4),
	self.width,
	self.height,
	focal=focals,
	c=centers,
	z_near=self.z_near,
	z_far=self.z_far,
	norm_dir=self.norm_dir,
	)

	# Append frame id to the ray
	if image_ids is None:
	ids = torch.arange(v, device=device, dtype=dtype)
	else:
	ids = torch.tensor(image_ids, device=device, dtype=dtype)
	ids = ids.view(v, 1, 1, 1).expand(v, h, w, 1)
	rays = torch.cat((rays, ids), dim=-1)
	rays = torch.cat((rays, xy), dim=-1)
	r_dim = rays.shape[-1]
	rays = rays.view(-1, r_dim)

	all_rays.append(rays)

	if images is not None:
	rgb_gt = images[n_].view(-1, self.channels, self.height, self.width)
	rgb_gt = (
	rgb_gt.permute(0, 2, 3, 1).contiguous().reshape(-1, self.channels)
	)
	all_rgb_gt.append(rgb_gt)

	if dino_features is not None:
	patch_h, patch_w = dino_features[n_].shape[-2], dino_features[n_].shape[-1]
	dino_gt = dino_features[n_].view(-1, dino_channels, patch_h, patch_w)
	dino_gt = (
	dino_gt.permute(0, 2, 3, 1).contiguous().reshape(-1, dino_channels)
	)
	all_dino_gt.append(dino_gt)

	all_rays = torch.stack(all_rays)
	if images is not None:
	all_rgb_gt = torch.stack(all_rgb_gt)
	else:
	all_rgb_gt = None

	if dino_features is not None:
	all_dino_gt = torch.stack(all_dino_gt)
	return all_rays, all_rgb_gt, all_dino_gt
	else:
	return all_rays, all_rgb_gt

	def reconstruct(self, render_dict, channels=None, dino_channels=None):
	for name, render_dict_part in render_dict.items():
	if not type(render_dict_part) == dict or not "rgb" in render_dict_part:
	continue

	if channels is None:
	channels = self.channels

	rgb = render_dict_part["rgb"] # n, n_pts, v * 3
	weights = render_dict_part["weights"]
	depth = render_dict_part["depth"]
	invalid = render_dict_part["invalid"]

	n, n_pts, v_c = rgb.shape
	v_in = n_pts // (self.height * self.width)
	v_render = v_c // channels
	n_smps = weights.shape[-1]
	# (This can be a different v from the sample method)

	render_dict_part["rgb"] = rgb.view(n, v_in, self.height, self.width, v_render, channels)
	render_dict_part["weights"] = weights.view(n, v_in, self.height, self.width, n_smps)
	render_dict_part["depth"] = depth.view(n, v_in, self.height, self.width)
	render_dict_part["invalid"] = invalid.view(
	n, v_in, self.height, self.width, n_smps, v_render
	)

	if "invalid_features" in render_dict_part:
	invalid_features = render_dict_part["invalid_features"]
	render_dict_part["invalid_features"] = invalid_features.view(
	n, v_in, self.height, self.width, n_smps, v_render
	)

	if "alphas" in render_dict_part:
	alphas = render_dict_part["alphas"]
	render_dict_part["alphas"] = alphas.view(n, v_in, self.height, self.width, n_smps)

	if "z_samps" in render_dict_part:
	z_samps = render_dict_part["z_samps"]
	render_dict_part["z_samps"] = z_samps.view(
	n, v_in, self.height, self.width, n_smps
	)

	if "rgb_samps" in render_dict_part:
	rgb_samps = render_dict_part["rgb_samps"]
	render_dict_part["rgb_samps"] = rgb_samps.view(
	n, v_in, self.height, self.width, n_smps, v_render, channels
	)

	if "ray_info" in render_dict_part:
	ri_shape = render_dict_part["ray_info"].shape[-1]
	ray_info = render_dict_part["ray_info"]
	render_dict_part["ray_info"] = ray_info.view(n, v_in, self.height, self.width, ri_shape)

	if "extras" in render_dict_part:
	ex_shape = render_dict_part["extras"].shape[-1]
	extras = render_dict_part["extras"]
	render_dict_part["extras"] = extras.view(n, v_in, self.height, self.width, ex_shape)

	if "dino_features" in render_dict_part:
	dino_shape = render_dict_part["dino_features"].shape[-1]
	dino = render_dict_part["dino_features"]
	render_dict_part["dino_features"] = dino.view(n, v_in, self.height, self.width, 1, dino_shape)

	render_dict[name] = render_dict_part

	if "rgb_gt" in render_dict:
	rgb_gt = render_dict["rgb_gt"]
	render_dict["rgb_gt"] = rgb_gt.view(
	n, v_in, self.height, self.width, channels
	)

	if "dino_gt" in render_dict:
	dino_gt = render_dict["dino_gt"]
	dino_gt_shape = dino_gt.shape[-1]
	if self.dino_upscaled:
	render_dict["dino_gt"] = dino_gt.view(
	n, v_in, self.height, self.width, dino_gt_shape
	)
	else:
	# TODO: patch size should not be inferred like this, but parameter
	patch_size = isqrt((n * v_in * self.height * self.width * dino_gt_shape) // dino_gt.numel())
	render_dict["dino_gt"] = dino_gt.view(
	n, v_in, self.height // patch_size, self.width // patch_size, dino_gt_shape
	)
	if "dino_artifacts" in render_dict:
	dino_artifacts = render_dict["dino_artifacts"]
	# TODO: patch size should not be inferred like this, but parameter
	patch_size = isqrt((n * v_in * self.height * self.width * dino_gt_shape) // dino_artifacts.numel())
	render_dict["dino_artifacts"] = dino_artifacts.view(
	n, v_in, self.height // patch_size, self.width // patch_size, dino_gt_shape
	)

	return render_dict


	class JitteredPatchRaySampler(PatchRaySampler):
	def __init__(
	self,
	z_near: float,
	z_far: float,
	ray_batch_size: int,
	patch_size: int,
	jitter_strength: float, # In pixels, max [0, 1)
	channels: int = 3,
	) -> None:
	super().__init__(z_near, z_far, ray_batch_size, patch_size, channels)

	assert 0 <= jitter_strength < 1, "Jitter strength is invalid."

	self.jitter_strength = jitter_strength

	x = torch.arange(0, self.patch_size_x).view(1, 1, -1, 1).expand(-1, self.patch_size_y, -1, -1)
	y = torch.arange(0, self.patch_size_y).view(1, -1, 1, 1).expand(-1, -1, self.patch_size_x, -1)
	self._grid = torch.cat((x, y), dim=-1)

	def sample(
	self, images, poses, projs, image_ids=None
	): ### dim(images) == nv (ids_loss nv randomly sampled)
	n, v, c, h, w = images.shape
	device = images.device

	all_rgb_gt, all_rays = [], []

	xy_offset = ((torch.rand(2) - .5) * self.jitter_strength)

	for n_ in range(n):
	focals = projs[n_, :, [0, 1], [0, 1]]
	centers = projs[n_, :, [0, 1], [2, 2]]

	rays, xy = util.gen_rays(
	poses[n_].view(-1, 4, 4),
	w,
	h,
	focal=focals,
	c=centers,
	z_near=self.z_near,
	z_far=self.z_far,
	xy_offset=xy_offset,
	)

	# Append frame id to the ray
	if image_ids is None:
	ids = torch.arange(v, device=images.device, dtype=images.dtype)
	else:
	ids = torch.tensor(image_ids, device=images.device, dtype=images.dtype)
	ids = ids.view(v, 1, 1, 1).expand(v, h, w, 1)
	rays = torch.cat((rays, ids), dim=-1)
	r_dim = rays.shape[-1]

	patch_coords_v = torch.randint(0, v, (self._patch_count,))
	patch_coords_y = torch.randint(
	0, h - self.patch_size_y, (self._patch_count,)
	)
	patch_coords_x = torch.randint(
	0, w - self.patch_size_x, (self._patch_count,)
	)

	sample_rgb_gt = []
	sample_rays = []

	for v_, y, x in zip(patch_coords_v, patch_coords_y, patch_coords_x):
	xy_patch = torch.tensor((x, y)).view(1, 1, 1, 2)
	patch_grid = self._grid + xy_patch + xy_offset.view(1, 1, 1, 2) + .5
	patch_grid = patch_grid.to(images.device)
	patch_grid[..., 0] = (patch_grid[..., 0] / w) * 2 - 1
	patch_grid[..., 1] = (patch_grid[..., 1] / h) * 2 - 1

	rgb_gt_patch = F.grid_sample(images[n_:n_+1, v_], patch_grid, padding_mode="border", align_corners=False)
	rgb_gt_patch = rgb_gt_patch.permute(0, 2, 3, 1).reshape(-1, self.channels)

	rays_patch = rays[
	v_, y : y + self.patch_size_y, x : x + self.patch_size_x, :
	].reshape(-1, r_dim)

	sample_rgb_gt.append(rgb_gt_patch)
	sample_rays.append(rays_patch)

	sample_rgb_gt = torch.cat(sample_rgb_gt, dim=0)
	sample_rays = torch.cat(sample_rays, dim=0)
	all_rgb_gt.append(sample_rgb_gt)
	all_rays.append(sample_rays)

	all_rgb_gt = torch.stack(all_rgb_gt)
	all_rays = torch.stack(all_rays)

	return all_rays, all_rgb_gt


	def get_ray_sampler(config) -> RaySampler:
	z_near = config["z_near"]
	z_far = config["z_far"]
	sample_mode = config.get("sample_mode", "random")

	# TODO: check channel size
	match sample_mode:
	case "random":
	return RandomRaySampler(z_near, z_far, **config["args"])
	case "patch":
	return PatchRaySampler(z_near, z_far, **config["args"])
	case "jitteredpatch":
	return JitteredPatchRaySampler(z_near, z_far, **config["args"])
	case "image":
	return ImageRaySampler(z_near, z_far)
	case _:
	raise NotImplementedError