# scenedino/datasets/old_kitti_360.py
import os
import time
import xml.etree.ElementTree as ET
from collections import Counter, defaultdict
from pathlib import Path
from typing import Optional
from dotdict import dotdict
import yaml
import cv2
import numpy as np
import omegaconf
import torch
import torch.nn.functional as F
from scipy.spatial.transform import Rotation
from torch.utils.data import Dataset
from torchvision.transforms import ColorJitter
from scenedino.common.point_sampling import regular_grid
from datasets.kitti_360.annotation import KITTI360Bbox3D
from scenedino.common.augmentation import get_color_aug_fn
class FisheyeToPinholeSampler:
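    """Precomputes a grid_sample lookup table that maps pixels of a virtual pinhole
    camera (intrinsics K_target) to locations in a KITTI-360 fisheye image, so the
    fisheye views can be resampled to pinhole images."""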
def __init__(self, K_target, target_image_size, calibs, rotation=None):
self._compute_transform(K_target, target_image_size, calibs, rotation)
def _compute_transform(self, K_target, target_image_size, calibs, rotation=None):
x = (
torch.linspace(-1, 1, target_image_size[1])
.view(1, -1)
.expand(target_image_size)
)
y = (
torch.linspace(-1, 1, target_image_size[0])
.view(-1, 1)
.expand(target_image_size)
)
z = torch.ones_like(x)
xyz = torch.stack((x, y, z), dim=-1).view(-1, 3)
# Unproject
xyz = (torch.inverse(torch.tensor(K_target)) @ xyz.T).T
if rotation is not None:
xyz = (torch.tensor(rotation) @ xyz.T).T
# Backproject into fisheye
xyz = xyz / torch.norm(xyz, dim=-1, keepdim=True)
x = xyz[:, 0]
y = xyz[:, 1]
z = xyz[:, 2]
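        # Mirror (unified) camera model: divide by (z + xi), then apply the radial distortion polynomial in k1, k2.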
xi_src = calibs["mirror_parameters"]["xi"]
x = x / (z + xi_src)
y = y / (z + xi_src)
k1 = calibs["distortion_parameters"]["k1"]
k2 = calibs["distortion_parameters"]["k2"]
r = x * x + y * y
factor = 1 + k1 * r + k2 * r * r
x = x * factor
y = y * factor
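        # gamma1/gamma2 are the generalized focal lengths and (u0, v0) the principal point, already normalized to [-1, 1] by _load_calibs.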
gamma0 = calibs["projection_parameters"]["gamma1"]
gamma1 = calibs["projection_parameters"]["gamma2"]
u0 = calibs["projection_parameters"]["u0"]
v0 = calibs["projection_parameters"]["v0"]
x = x * gamma0 + u0
y = y * gamma1 + v0
xy = torch.stack((x, y), dim=-1).view(1, *target_image_size, 2)
self.sample_pts = xy
def resample(self, img):
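        """Resample a (C, H, W) image tensor at the precomputed fisheye sample locations."""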
img = img.unsqueeze(0)
resampled_img = F.grid_sample(img, self.sample_pts, align_corners=True).squeeze(
0
)
return resampled_img
class OldKITTI360Dataset(Dataset):
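    """Legacy KITTI-360 dataset loader.
    Returns windows of stereo perspective frames, optionally fisheye frames resampled
    to a pinhole model, and optionally LiDAR depth maps, 2D semantic masks, and 3D
    bounding boxes, together with normalized intrinsics and camera-to-world poses.
    """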
def __init__(
self,
data_path: str,
pose_path: str,
split_path: Optional[str],
target_image_size=(192, 640),
return_stereo=False,
return_depth=False,
return_fisheye=True, ## default: True
return_3d_bboxes=False,
return_segmentation=False,
frame_count=2,
keyframe_offset=0,
dilation=1,
fisheye_rotation=0,
fisheye_offset=0,
stereo_offset=0,
eigen_depth=True,
color_aug=False,
is_preprocessed=False,
**kwargs,
):
self.data_path = data_path
self.pose_path = pose_path
self.split_path = split_path
self.target_image_size = target_image_size
self.return_stereo = return_stereo
self.return_fisheye = return_fisheye
self.return_depth = return_depth
self.return_3d_bboxes = return_3d_bboxes
self.return_segmentation = return_segmentation
self.frame_count = frame_count
self.dilation = dilation
self.fisheye_rotation = fisheye_rotation
self.fisheye_offset = fisheye_offset
self.stereo_offset = stereo_offset
self.keyframe_offset = keyframe_offset
self.eigen_depth = eigen_depth
self.color_aug = color_aug
self.is_preprocessed = is_preprocessed
if isinstance(self.fisheye_rotation, float) or isinstance(
self.fisheye_rotation, int
):
self.fisheye_rotation = (0, self.fisheye_rotation)
self.fisheye_rotation = tuple(self.fisheye_rotation)
# if additional_random_front_offset and not self.random_fisheye_offset:
# raise ValueError("Random Fisheye Offset needs to be active for additional random front offset!")
# else:
# self.additional_random_front_offset = additional_random_front_offset
# Support random fisheye offset
if type(self.fisheye_offset) == int:
self.random_fisheye_offset = False
self.fisheye_offset = (self.fisheye_offset,)
elif type(self.fisheye_offset) in [
tuple,
list,
omegaconf.listconfig.ListConfig,
]:
self.random_fisheye_offset = True
self.fisheye_offset = tuple(sorted(self.fisheye_offset))
else:
raise ValueError(
f"Invalid datatype for fisheye offset: {type(self.fisheye_offset)}"
)
if type(self.stereo_offset) == int:
self.random_stereo_offset = False
self.stereo_offset = (self.stereo_offset,)
elif type(self.stereo_offset) in [tuple, list, omegaconf.listconfig.ListConfig]:
self.random_stereo_offset = True
self.stereo_offset = tuple(sorted(self.stereo_offset))
else:
raise ValueError(
f"Invalid datatype for fisheye offset: {type(self.stereo_offset)}"
)
self._sequences = self._get_sequences(self.data_path)
self._calibs = self._load_calibs(self.data_path, self.fisheye_rotation)
self._resampler_02, self._resampler_03 = self._get_resamplers(
self._calibs, self._calibs["K_fisheye"], self.target_image_size
)
self._img_ids, self._poses = self._load_poses(self.pose_path, self._sequences)
self._left_offset = (
(self.frame_count - 1) // 2 + self.keyframe_offset
) * self.dilation
self._perspective_folder = (
"data_rect"
if not self.is_preprocessed
else f"data_{self.target_image_size[0]}x{self.target_image_size[1]}"
)
self._fisheye_folder = (
"data_rgb"
if not self.is_preprocessed
else f"data_{self.target_image_size[0]}x{self.target_image_size[1]}_{self.fisheye_rotation[0]}x{self.fisheye_rotation[1]}"
)
if self.split_path is not None:
self._datapoints = self._load_split(self.split_path, self._img_ids)
elif self.return_segmentation:
self._datapoints = self._semantics_split(
self._sequences, self.data_path, self._img_ids
)
else:
self._datapoints = self._full_split(
self._sequences, self._img_ids, self.check_file_integrity
)
if self.return_3d_bboxes:
self._3d_bboxes = self._load_3d_bboxes(
Path(data_path) / "data_3d_bboxes" / "train_full", self._sequences
)
if self.return_segmentation:
# Segmentations are only provided for the left camera
self._datapoints = [dp for dp in self._datapoints if not dp[2]]
# make sure we can load all segmentation masks
self._datapoints = [dp for dp in self._datapoints if self.check_segmentation(dp)]
seq_max_id = {seq: max([0] + [d[1] for d in self._datapoints if d[0] == seq]) for seq in self._sequences}
for seq in self._sequences:
self._poses[seq] = self._poses[seq][:seq_max_id[seq]+1]
self._img_ids[seq] = self._img_ids[seq][:seq_max_id[seq]+1]
self._skip = 0
self.length = len(self._datapoints)
def check_segmentation(self, dp):
"""Checks for a datapoint dp if we can load all the segmentation masks for all image_ids."""
sequence, id, is_right = dp
seq_len = self._img_ids[sequence].shape[0]
ids = [id] + [max(min(i, seq_len - 1), 0) for i in
range(id - self._left_offset, id - self._left_offset + self.frame_count * self.dilation,
self.dilation) if i != id]
img_ids = [self.get_img_id_from_id(sequence, id) for id in ids]
for img_id in img_ids:
_p = os.path.join(self.data_path, "data_2d_semantics", "train", sequence, "image_00", "semantic",
f"{img_id:010d}.png")
if not os.path.isfile(_p):
return False
return True
def get_points(self, pose: torch.Tensor) -> torch.Tensor:
"""Get points from a pose.
Args:
pose (torch.Tensor): Pose of shape (4, 4)
Returns:
torch.Tensor: Points of shape (N, 3). NOTE: the points are in the world coordinate system.
"""
OUT_RES = dotdict(
X_RANGE=(-9, 9),
Y_RANGE=(0.0, 0.75),
Z_RANGE=(21, 3),
X_RES=256,
Y_RES=64,
Z_RES=256,
)
cam_incl_adjust = torch.tensor(
[
[1.0000000, 0.0000000, 0.0000000, 0],
[0.0000000, 0.9961947, -0.0871557, 0],
[0.0000000, 0.0871557, 0.9961947, 0],
                [0.0000000, 0.0000000, 0.0000000, 1],
],
dtype=torch.float32,
).view(4, 4)
points = regular_grid(
OUT_RES.X_RANGE,
OUT_RES.Y_RANGE,
OUT_RES.Z_RANGE,
OUT_RES.X_RES,
OUT_RES.Y_RES,
OUT_RES.Z_RES,
cam_incl_adjust=cam_incl_adjust,
)
return points
def check_file_integrity(self, seq, id):
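        """Return True if all perspective and (if requested) fisheye frames required for
        datapoint (seq, id) exist on disk."""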
dp = Path(self.data_path)
image_00 = dp / "data_2d_raw" / seq / "image_00" / self._perspective_folder
image_01 = dp / "data_2d_raw" / seq / "image_01" / self._perspective_folder
image_02 = dp / "data_2d_raw" / seq / "image_02" / self._fisheye_folder
image_03 = dp / "data_2d_raw" / seq / "image_03" / self._fisheye_folder
seq_len = self._img_ids[seq].shape[0]
ids = [id] + [
max(min(i, seq_len - 1), 0)
for i in range(
id - self._left_offset,
id - self._left_offset + self.frame_count * self.dilation,
self.dilation,
)
if i != id
]
        # self.fisheye_offset is a tuple by this point; use a single offset
        # (the last, i.e. largest, one, matching the non-random branch of __getitem__).
        fisheye_offset = self.fisheye_offset[-1]
        ids_fish = [max(min(id + fisheye_offset, seq_len - 1), 0)] + [
            max(min(i, seq_len - 1), 0)
            for i in range(
                id + fisheye_offset - self._left_offset,
                id
                + fisheye_offset
                - self._left_offset
                + self.frame_count * self.dilation,
                self.dilation,
            )
            if i != id + fisheye_offset
        ]
img_ids = [self.get_img_id_from_id(seq, id) for id in ids]
img_ids_fish = [self.get_img_id_from_id(seq, id) for id in ids_fish]
for img_id in img_ids:
if not (
(image_00 / f"{img_id:010d}.png").exists()
and (image_01 / f"{img_id:010d}.png").exists()
):
return False
if self.return_fisheye:
for img_id in img_ids_fish:
if not (
(image_02 / f"{img_id:010d}.png").exists()
and (image_03 / f"{img_id:010d}.png").exists()
):
return False
return True
@staticmethod
def _get_sequences(data_path):
all_sequences = []
seqs_path = Path(data_path) / "data_2d_raw"
for seq in seqs_path.iterdir():
if not seq.is_dir():
continue
all_sequences.append(seq.name)
return all_sequences
@staticmethod
def _full_split(sequences, img_ids, check_integrity):
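        """Build (sequence, id, is_right) datapoints for every frame that passes the
        integrity check, once for the left and once for the right keyframe."""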
datapoints = []
for seq in sorted(sequences):
ids = [id for id in range(len(img_ids[seq])) if check_integrity(seq, id)]
datapoints_seq = [(seq, id, False) for id in ids] + [
(seq, id, True) for id in ids
]
datapoints.extend(datapoints_seq)
return datapoints
@staticmethod
def _semantics_split(sequences, data_path, img_ids):
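        """Build left-camera datapoints only for frames that have a semantic_rgb
        ground-truth image."""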
datapoints = []
for seq in sorted(sequences):
datapoints_seq = [(seq, id, False) for id in range(len(img_ids[seq]))]
datapoints_seq = [
dp
for dp in datapoints_seq
if os.path.exists(
os.path.join(
data_path,
"data_2d_semantics",
"train",
seq,
"image_00",
"semantic_rgb",
f"{img_ids[seq][dp[1]]:010d}.png",
)
)
]
datapoints.extend(datapoints_seq)
return datapoints
@staticmethod
def _load_split(split_path, img_ids):
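        """Parse a split file with lines of the form "<sequence> <image_id> <l|r>"."""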
img_id2id = {
seq: {id: i for i, id in enumerate(ids)} for seq, ids in img_ids.items()
}
with open(split_path, "r") as f:
lines = f.readlines()
def split_line(l):
segments = l.split(" ")
seq = segments[0]
id = img_id2id[seq][int(segments[1])]
return seq, id, segments[2][0] == "r"
return list(map(split_line, lines))
@staticmethod
def _load_calibs(data_path, fisheye_rotation=0):
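        """Load the KITTI-360 calibration files (cam-to-pose, cam-to-velo, perspective
        intrinsics, fisheye YAMLs) and return intrinsics normalized to [-1, 1] image
        coordinates together with the extrinsic transforms between cameras, velodyne,
        and the vehicle pose frame."""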
data_path = Path(data_path)
calib_folder = data_path / "calibration"
cam_to_pose_file = calib_folder / "calib_cam_to_pose.txt"
cam_to_velo_file = calib_folder / "calib_cam_to_velo.txt"
intrinsics_file = calib_folder / "perspective.txt"
fisheye_02_file = calib_folder / "image_02.yaml"
fisheye_03_file = calib_folder / "image_03.yaml"
cam_to_pose_data = {}
with open(cam_to_pose_file, "r") as f:
for line in f.readlines():
key, value = line.split(":", 1)
try:
cam_to_pose_data[key] = np.array(
[float(x) for x in value.split()], dtype=np.float32
)
except ValueError:
pass
cam_to_velo_data = None
with open(cam_to_velo_file, "r") as f:
line = f.readline()
try:
cam_to_velo_data = np.array(
[float(x) for x in line.split()], dtype=np.float32
)
except ValueError:
pass
intrinsics_data = {}
with open(intrinsics_file, "r") as f:
for line in f.readlines():
key, value = line.split(":", 1)
try:
intrinsics_data[key] = np.array(
[float(x) for x in value.split()], dtype=np.float32
)
except ValueError:
pass
with open(fisheye_02_file, "r") as f:
f.readline() # Skips first line that defines the YAML version
fisheye_02_data = yaml.safe_load(f)
with open(fisheye_03_file, "r") as f:
f.readline() # Skips first line that defines the YAML version
fisheye_03_data = yaml.safe_load(f)
im_size_rect = (
int(intrinsics_data["S_rect_00"][1]),
int(intrinsics_data["S_rect_00"][0]),
)
im_size_fish = (fisheye_02_data["image_height"], fisheye_02_data["image_width"])
# Projection matrices
# We use these projection matrices also when resampling the fisheye cameras.
# This makes downstream processing easier, but it could be done differently.
P_rect_00 = np.reshape(intrinsics_data["P_rect_00"], (3, 4))
P_rect_01 = np.reshape(intrinsics_data["P_rect_01"], (3, 4))
# Rotation matrices from raw to rectified -> Needs to be inverted later
R_rect_00 = np.eye(4, dtype=np.float32)
R_rect_01 = np.eye(4, dtype=np.float32)
R_rect_00[:3, :3] = np.reshape(intrinsics_data["R_rect_00"], (3, 3))
R_rect_01[:3, :3] = np.reshape(intrinsics_data["R_rect_01"], (3, 3))
# Rotation matrices from resampled fisheye to raw fisheye
fisheye_rotation = np.array(fisheye_rotation).reshape((1, 2))
R_02 = np.eye(4, dtype=np.float32)
R_03 = np.eye(4, dtype=np.float32)
R_02[:3, :3] = (
Rotation.from_euler("xy", fisheye_rotation[:, [1, 0]], degrees=True)
.as_matrix()
.astype(np.float32)
)
R_03[:3, :3] = (
Rotation.from_euler(
"xy", fisheye_rotation[:, [1, 0]] * np.array([[1, -1]]), degrees=True
)
.as_matrix()
.astype(np.float32)
)
# Load cam to pose transforms
T_00_to_pose = np.eye(4, dtype=np.float32)
T_01_to_pose = np.eye(4, dtype=np.float32)
T_02_to_pose = np.eye(4, dtype=np.float32)
T_03_to_pose = np.eye(4, dtype=np.float32)
T_00_to_velo = np.eye(4, dtype=np.float32)
T_00_to_pose[:3, :] = np.reshape(cam_to_pose_data["image_00"], (3, 4))
T_01_to_pose[:3, :] = np.reshape(cam_to_pose_data["image_01"], (3, 4))
T_02_to_pose[:3, :] = np.reshape(cam_to_pose_data["image_02"], (3, 4))
T_03_to_pose[:3, :] = np.reshape(cam_to_pose_data["image_03"], (3, 4))
T_00_to_velo[:3, :] = np.reshape(cam_to_velo_data, (3, 4))
# Compute cam to pose transforms for rectified perspective cameras
T_rect_00_to_pose = T_00_to_pose @ np.linalg.inv(R_rect_00)
T_rect_01_to_pose = T_01_to_pose @ np.linalg.inv(R_rect_01)
# Compute cam to pose transform for fisheye cameras
T_02_to_pose = T_02_to_pose @ R_02
T_03_to_pose = T_03_to_pose @ R_03
# Compute velo to cameras and velo to pose transforms
T_velo_to_rect_00 = R_rect_00 @ np.linalg.inv(T_00_to_velo)
T_velo_to_pose = T_rect_00_to_pose @ T_velo_to_rect_00
T_velo_to_rect_01 = np.linalg.inv(T_rect_01_to_pose) @ T_velo_to_pose
# Calibration matrix is the same for both perspective cameras
K = P_rect_00[:3, :3]
# Normalize calibration
f_x = K[0, 0] / im_size_rect[1]
f_y = K[1, 1] / im_size_rect[0]
c_x = K[0, 2] / im_size_rect[1]
c_y = K[1, 2] / im_size_rect[0]
# Change to image coordinates [-1, 1]
K[0, 0] = f_x * 2.0
K[1, 1] = f_y * 2.0
K[0, 2] = c_x * 2.0 - 1
K[1, 2] = c_y * 2.0 - 1
# Convert fisheye calibration to [-1, 1] image dimensions
fisheye_02_data["projection_parameters"]["gamma1"] = (
fisheye_02_data["projection_parameters"]["gamma1"] / im_size_fish[1]
) * 2.0
fisheye_02_data["projection_parameters"]["gamma2"] = (
fisheye_02_data["projection_parameters"]["gamma2"] / im_size_fish[0]
) * 2.0
fisheye_02_data["projection_parameters"]["u0"] = (
fisheye_02_data["projection_parameters"]["u0"] / im_size_fish[1]
) * 2.0 - 1.0
fisheye_02_data["projection_parameters"]["v0"] = (
fisheye_02_data["projection_parameters"]["v0"] / im_size_fish[0]
) * 2.0 - 1.0
fisheye_03_data["projection_parameters"]["gamma1"] = (
fisheye_03_data["projection_parameters"]["gamma1"] / im_size_fish[1]
) * 2.0
fisheye_03_data["projection_parameters"]["gamma2"] = (
fisheye_03_data["projection_parameters"]["gamma2"] / im_size_fish[0]
) * 2.0
fisheye_03_data["projection_parameters"]["u0"] = (
fisheye_03_data["projection_parameters"]["u0"] / im_size_fish[1]
) * 2.0 - 1.0
fisheye_03_data["projection_parameters"]["v0"] = (
fisheye_03_data["projection_parameters"]["v0"] / im_size_fish[0]
) * 2.0 - 1.0
# Use same camera calibration as perspective cameras for resampling
# K_fisheye = np.eye(3, dtype=np.float32)
# K_fisheye[0, 0] = 2
# K_fisheye[1, 1] = 2
K_fisheye = K
calibs = {
"K_perspective": K,
"K_fisheye": K_fisheye,
"T_cam_to_pose": {
"00": T_rect_00_to_pose,
"01": T_rect_01_to_pose,
"02": T_02_to_pose,
"03": T_03_to_pose,
},
"T_velo_to_cam": {
"00": T_velo_to_rect_00,
"01": T_velo_to_rect_01,
},
"T_velo_to_pose": T_velo_to_pose,
"fisheye": {
"calib_02": fisheye_02_data,
"calib_03": fisheye_03_data,
"R_02": R_02[:3, :3],
"R_03": R_03[:3, :3],
},
"im_size": im_size_rect,
}
return calibs
@staticmethod
def _get_resamplers(calibs, K_target, target_image_size):
resampler_02 = FisheyeToPinholeSampler(
K_target,
target_image_size,
calibs["fisheye"]["calib_02"],
calibs["fisheye"]["R_02"],
)
resampler_03 = FisheyeToPinholeSampler(
K_target,
target_image_size,
calibs["fisheye"]["calib_03"],
calibs["fisheye"]["R_03"],
)
return resampler_02, resampler_03
@staticmethod
def _load_poses(pose_path, sequences):
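        """Load per-sequence ground-truth poses (one "frame_id  3x4 pose" row per line)
        and pad them to homogeneous 4x4 matrices."""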
ids = {}
poses = {}
for seq in sequences:
pose_file = Path(pose_path) / seq / f"poses.txt"
try:
pose_data = np.loadtxt(pose_file)
except FileNotFoundError:
print(f"Ground truth poses are not avaialble for sequence {seq}.")
ids_seq = pose_data[:, 0].astype(int)
poses_seq = pose_data[:, 1:].astype(np.float32).reshape((-1, 3, 4))
poses_seq = np.concatenate(
(poses_seq, np.zeros_like(poses_seq[:, :1, :])), axis=1
)
poses_seq[:, 3, 3] = 1
ids[seq] = ids_seq
poses[seq] = poses_seq
return ids, poses
@staticmethod
def _load_3d_bboxes(bbox_path, sequences):
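        """Parse the KITTI-360 3D bounding box XML files and group the boxes by
        timestamp (static objects are stored under timestamp -1)."""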
bboxes = {}
for seq in sequences:
with open(Path(bbox_path) / f"{seq}.xml", "rb") as f:
tree = ET.parse(f)
root = tree.getroot()
objects = defaultdict(list)
num_bbox = 0
for child in root:
if child.find("transform") is None:
continue
obj = KITTI360Bbox3D()
if child.find("semanticId") is not None:
obj.parseBbox(child)
else:
obj.parseStuff(child)
# globalId = local2global(obj.semanticId, obj.instanceId)
# objects[globalId][obj.timestamp] = obj
objects[obj.timestamp].append(obj)
num_bbox += 1
# globalIds = np.asarray(list(objects.keys()))
# semanticIds, instanceIds = global2local(globalIds)
# for label in labels:
# if label.hasInstances:
# print(f'{label.name:<30}:\t {(semanticIds==label.id).sum()}')
# print(f'Loaded {len(globalIds)} instances')
# print(f'Loaded {num_bbox} boxes')
bboxes[seq] = objects
return bboxes
def get_img_id_from_id(self, sequence, id):
return self._img_ids[sequence][id]
def load_images(self, seq, img_ids, load_left, load_right, img_ids_fish=None):
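        """Load perspective (image_00/image_01) and fisheye (image_02/image_03) frames
        as RGB float images in [0, 1]."""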
imgs_p_left = []
imgs_f_left = []
imgs_p_right = []
imgs_f_right = []
if img_ids_fish is None:
img_ids_fish = img_ids
for id in img_ids:
if load_left:
img_perspective = (
cv2.cvtColor(
cv2.imread(
os.path.join(
self.data_path,
"data_2d_raw",
seq,
"image_00",
self._perspective_folder,
f"{id:010d}.png",
)
),
cv2.COLOR_BGR2RGB,
).astype(np.float32)
/ 255
)
imgs_p_left += [img_perspective]
if load_right:
img_perspective = (
cv2.cvtColor(
cv2.imread(
os.path.join(
self.data_path,
"data_2d_raw",
seq,
"image_01",
self._perspective_folder,
f"{id:010d}.png",
)
),
cv2.COLOR_BGR2RGB,
).astype(np.float32)
/ 255
)
imgs_p_right += [img_perspective]
for id in img_ids_fish:
if load_left:
img_fisheye = (
cv2.cvtColor(
cv2.imread(
os.path.join(
self.data_path,
"data_2d_raw",
seq,
"image_02",
self._fisheye_folder,
f"{id:010d}.png",
)
),
cv2.COLOR_BGR2RGB,
).astype(np.float32)
/ 255
)
imgs_f_left += [img_fisheye]
if load_right:
img_fisheye = (
cv2.cvtColor(
cv2.imread(
os.path.join(
self.data_path,
"data_2d_raw",
seq,
"image_03",
self._fisheye_folder,
f"{id:010d}.png",
)
),
cv2.COLOR_BGR2RGB,
).astype(np.float32)
/ 255
)
imgs_f_right += [img_fisheye]
return imgs_p_left, imgs_f_left, imgs_p_right, imgs_f_right
    def process_img(
        self,
        img: np.ndarray,
        color_aug_fn=None,
        resampler: Optional[FisheyeToPinholeSampler] = None,
    ):
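        """Resample a fisheye image to pinhole (or resize a perspective image), apply
        optional color augmentation, and scale to [-1, 1] as a (C, H, W) tensor."""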
if resampler is not None and not self.is_preprocessed:
img = torch.tensor(img).permute(2, 0, 1)
img = resampler.resample(img)
else:
if self.target_image_size:
img = cv2.resize(
img,
(self.target_image_size[1], self.target_image_size[0]),
interpolation=cv2.INTER_LINEAR,
)
img = np.transpose(img, (2, 0, 1))
img = torch.tensor(img)
if color_aug_fn is not None:
img = color_aug_fn(img)
img = img * 2 - 1
return img
def get_3d_bboxes(self, seq, img_id, pose, projs):
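        """Return static and per-frame 3D boxes that have at least one vertex projecting
        inside the image with depth in (0, 80] m."""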
seq_3d_bboxes = self._3d_bboxes[seq]
pose_w2c = np.linalg.inv(pose)
def filter_bbox(bbox):
verts = bbox.vertices
verts = (projs @ (pose_w2c[:3, :3] @ verts.T + pose_w2c[:3, 3, None])).T
verts[:, :2] /= verts[:, 2:3]
valid = (
((verts[:, 0] >= -1) & (verts[:, 0] <= 1))
& ((verts[:, 1] >= -1) & (verts[:, 1] <= 1))
& ((verts[:, 2] > 0) & (verts[:, 2] <= 80))
)
valid = np.any(valid, axis=-1)
return valid
bboxes = seq_3d_bboxes[-1] + seq_3d_bboxes[img_id]
bboxes = list(filter(filter_bbox, bboxes))
bboxes = [
{
"vertices": bbox.vertices,
"faces": bbox.faces,
"semanticId": bbox.semanticId,
"instanceId": bbox.instanceId,
}
for i, bbox in enumerate(bboxes)
] # if valid[i]
return bboxes
def load_segmentation(self, seq, img_id):
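        """Load the left-camera semantic ID mask and resize it to the target image size
        with nearest-neighbor interpolation."""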
seg = cv2.imread(
os.path.join(
self.data_path,
"data_2d_semantics",
"train",
seq,
"image_00",
"semantic",
f"{img_id:010d}.png",
),
cv2.IMREAD_UNCHANGED,
)
seg = cv2.resize(
seg,
(self.target_image_size[1], self.target_image_size[0]),
interpolation=cv2.INTER_NEAREST,
)
return seg
def load_depth(self, seq, img_id, is_right):
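        """Project the raw Velodyne scan into the selected rectified camera and rasterize
        a sparse depth map at the target resolution, keeping the closest point when
        several hit the same pixel."""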
points = np.fromfile(
os.path.join(
self.data_path,
"data_3d_raw",
seq,
"velodyne_points",
"data",
f"{img_id:010d}.bin",
),
dtype=np.float32,
).reshape(-1, 4)
points[:, 3] = 1.0
T_velo_to_cam = self._calibs["T_velo_to_cam"]["00" if not is_right else "01"]
K = self._calibs["K_perspective"]
# project the points to the camera
velo_pts_im = np.dot(K @ T_velo_to_cam[:3, :], points.T).T
velo_pts_im[:, :2] = velo_pts_im[:, :2] / velo_pts_im[:, 2][..., None]
# the projection is normalized to [-1, 1] -> transform to [0, height-1] x [0, width-1]
velo_pts_im[:, 0] = np.round(
(velo_pts_im[:, 0] * 0.5 + 0.5) * self.target_image_size[1]
)
velo_pts_im[:, 1] = np.round(
(velo_pts_im[:, 1] * 0.5 + 0.5) * self.target_image_size[0]
)
# check if in bounds
val_inds = (velo_pts_im[:, 0] >= 0) & (velo_pts_im[:, 1] >= 0)
val_inds = (
val_inds
& (velo_pts_im[:, 0] < self.target_image_size[1])
& (velo_pts_im[:, 1] < self.target_image_size[0])
)
velo_pts_im = velo_pts_im[val_inds, :]
# project to image
depth = np.zeros(self.target_image_size)
depth[
velo_pts_im[:, 1].astype(np.int32), velo_pts_im[:, 0].astype(np.int32)
] = velo_pts_im[:, 2]
# find the duplicate points and choose the closest depth
inds = (
velo_pts_im[:, 1] * (self.target_image_size[1] - 1) + velo_pts_im[:, 0] - 1
)
dupe_inds = [item for item, count in Counter(inds).items() if count > 1]
for dd in dupe_inds:
pts = np.where(inds == dd)[0]
x_loc = int(velo_pts_im[pts[0], 0])
y_loc = int(velo_pts_im[pts[0], 1])
depth[y_loc, x_loc] = velo_pts_im[pts, 2].min()
depth[depth < 0] = 0
return depth[None, :, :]
def __getitem__(self, index: int):
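        """Assemble one sample: a window of images around the keyframe, their
        normalized intrinsics and camera-to-world poses, plus optional depth,
        3D boxes, and segmentation for the keyframe."""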
_start_time = time.time()
if index >= self.length:
raise IndexError()
if self._skip != 0:
index += self._skip
sequence, id, is_right = self._datapoints[index]
seq_len = self._img_ids[sequence].shape[0]
load_left, load_right = (
not is_right
) or self.return_stereo, is_right or self.return_stereo
        ## Randomly sample the fisheye time offset so the fisheye views can cover regions occluded in the stereo views.
        if self.random_fisheye_offset:
            fisheye_offset = self.fisheye_offset[
                torch.randint(0, len(self.fisheye_offset), (1,)).item()
            ]  ## randomly select one of the fisheye offsets given in the config
else:
fisheye_offset = self.fisheye_offset[-1]
if self.random_stereo_offset:
stereo_offset = self.stereo_offset[
torch.randint(0, len(self.stereo_offset), (1,)).item()
]
else:
stereo_offset = self.stereo_offset[-1]
# ids = [id] + [max(min(i, seq_len-1), 0) for i in range(id - self._left_offset, id - self._left_offset + self.frame_count * self.dilation, self.dilation) if i != id]
# ids_fish = [max(min(id + self.fisheye_offset, seq_len-1), 0)] + [max(min(i, seq_len-1), 0) for i in range(id + self.fisheye_offset - self._left_offset, id + self.fisheye_offset - self._left_offset + self.frame_count * self.dilation, self.dilation) if i != id + self.fisheye_offset]
# img_ids = [self.get_img_id_from_id(sequence, id) for id in ids]
# img_ids_fish = [self.get_img_id_from_id(sequence, id) for id in ids_fish]
id_st = (
id + stereo_offset
) ## TODO: find out how to deal with 3 steps ahead without -1 => as we sample scenes with the amount of stereo_offset
ids = [id] + [
max(min(i, seq_len - 1), 0)
for i in range(
id_st - self._left_offset,
id_st - self._left_offset + self.frame_count * self.dilation,
self.dilation,
)
if i != id_st
]
ids_fish = [max(min(id + fisheye_offset, seq_len - 1), 0)] + [
max(min(i, seq_len - 1), 0)
for i in range(
id + fisheye_offset - self._left_offset,
id
+ fisheye_offset
- self._left_offset
+ self.frame_count * self.dilation,
self.dilation,
)
if i != id + fisheye_offset
]
        ## ids_fish is now shifted fisheye_offset steps ahead of ids (e.g., 5 steps ahead with 2 fisheye frames)
img_ids = [self.get_img_id_from_id(sequence, id) for id in ids]
img_ids_fish = [self.get_img_id_from_id(sequence, id) for id in ids_fish]
if not self.return_fisheye:
ids_fish, img_ids_fish = [], []
if self.color_aug:
color_aug_fn = get_color_aug_fn(
ColorJitter.get_params(
brightness=(0.8, 1.2),
contrast=(0.8, 1.2),
saturation=(0.8, 1.2),
hue=(-0.1, 0.1),
)
)
else:
color_aug_fn = None
_start_time_loading = time.time()
imgs_p_left, imgs_f_left, imgs_p_right, imgs_f_right = self.load_images(
sequence, img_ids, load_left, load_right, img_ids_fish=img_ids_fish
)
_loading_time = np.array(time.time() - _start_time_loading)
_start_time_processing = time.time()
imgs_p_left = [
self.process_img(img, color_aug_fn=color_aug_fn) for img in imgs_p_left
]
imgs_f_left = [
self.process_img(
img, color_aug_fn=color_aug_fn, resampler=self._resampler_02
)
for img in imgs_f_left
]
imgs_p_right = [
self.process_img(img, color_aug_fn=color_aug_fn) for img in imgs_p_right
]
imgs_f_right = [
self.process_img(
img, color_aug_fn=color_aug_fn, resampler=self._resampler_03
)
for img in imgs_f_right
]
_processing_time = np.array(time.time() - _start_time_processing)
# These poses are camera to world !!
poses_p_left = (
[
self._poses[sequence][i, :, :] @ self._calibs["T_cam_to_pose"]["00"]
for i in ids
]
if load_left
else []
)
poses_f_left = (
[
self._poses[sequence][i, :, :] @ self._calibs["T_cam_to_pose"]["02"]
for i in ids_fish
]
if load_left
else []
)
poses_p_right = (
[
self._poses[sequence][i, :, :] @ self._calibs["T_cam_to_pose"]["01"]
for i in ids
]
if load_right
else []
)
poses_f_right = (
[
self._poses[sequence][i, :, :] @ self._calibs["T_cam_to_pose"]["03"]
for i in ids_fish
]
if load_right
else []
)
projs_p_left = [self._calibs["K_perspective"] for _ in ids] if load_left else []
projs_f_left = (
[self._calibs["K_fisheye"] for _ in ids_fish] if load_left else []
)
projs_p_right = (
[self._calibs["K_perspective"] for _ in ids] if load_right else []
)
projs_f_right = (
[self._calibs["K_fisheye"] for _ in ids_fish] if load_right else []
)
imgs = (
imgs_p_left + imgs_p_right + imgs_f_left + imgs_f_right
if not is_right
else imgs_p_right + imgs_p_left + imgs_f_right + imgs_f_left
)
projs = (
projs_p_left + projs_p_right + projs_f_left + projs_f_right
if not is_right
else projs_p_right + projs_p_left + projs_f_right + projs_f_left
)
poses = (
poses_p_left + poses_p_right + poses_f_left + poses_f_right
if not is_right
else poses_p_right + poses_p_left + poses_f_right + poses_f_left
)
ids = np.array(ids + ids + ids_fish + ids_fish, dtype=np.int32)
if self.return_depth:
depths = [self.load_depth(sequence, img_ids[0], is_right)]
else:
depths = []
if self.return_3d_bboxes:
bboxes_3d = [self.get_3d_bboxes(sequence, img_ids[0], poses[0], projs[0])]
else:
bboxes_3d = []
if self.return_segmentation:
segs = [self.load_segmentation(sequence, img_ids[0])]
else:
segs = []
_proc_time = np.array(time.time() - _start_time)
# print(_loading_time, _processing_time, _proc_time)
data = {
"imgs": imgs,
"projs": projs,
"poses": poses,
"depths": depths,
"ts": ids,
"3d_bboxes": bboxes_3d,
"segs": segs,
"t__get_item__": np.array([_proc_time]),
"index": np.array([index]),
}
return data
def __len__(self) -> int:
return self.length
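

if __name__ == "__main__":
    # Minimal usage sketch (paths are placeholders, not part of the original file):
    # instantiate the dataset on a local KITTI-360 copy and fetch one sample.
    dataset = OldKITTI360Dataset(
        data_path="/path/to/KITTI-360",              # hypothetical data root
        pose_path="/path/to/KITTI-360/data_poses",   # hypothetical pose folder
        split_path=None,
        target_image_size=(192, 640),
        return_fisheye=False,
        frame_count=2,
    )
    print(f"{len(dataset)} datapoints")
    sample = dataset[0]
    print([img.shape for img in sample["imgs"]])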