# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Run this command to interactively debug:
PYTHONPATH=. python cosmos_predict1/diffusion/posttrain/datasets/dataset_3D.py
Adapted from:
https://github.com/bytedance/IRASim/blob/main/dataset/dataset_3D.py
"""
import json
import pickle
import random
import traceback
import warnings
import numpy as np
import torch
from cosmos_predict1.diffusion.training.datasets.dataset_3D import Dataset_3D
from cosmos_predict1.utils import log
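
# Assumed annotation layout, inferred from the field accesses in this file
# (the authoritative schema ships with the dataset and may hold more fields):
#
#   {
#     "action": [[x, y, z, roll, pitch, yaw, gripper], ...],  # one entry per frame
#     "episode_id": "...",  # or "original_path" as a fallback key
#     ...
#   }
#
# When load_t5_embeddings=True, a ".pickle" file with the same stem as the
# ".json" annotation is expected to hold the precomputed T5 text embedding.
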
class Dataset_3DBinary(Dataset_3D):
def __init__(
self,
train_annotation_path,
val_annotation_path,
test_annotation_path,
video_path,
sequence_interval,
num_frames,
cam_ids,
accumulate_action,
video_size,
val_start_frame_interval,
debug=False,
normalize=False,
pre_encode=False,
do_evaluate=False,
load_t5_embeddings=False,
load_action=True,
mode="train",
):
"""Dataset class for loading 3D robot action-conditional data.
This dataset loads robot trajectories consisting of RGB video frames, robot states
(arm positions and binary gripper states), and computes relative actions between
consecutive frames.
"""
super().__init__(
train_annotation_path=train_annotation_path,
val_annotation_path=val_annotation_path,
test_annotation_path=test_annotation_path,
video_path=video_path,
sequence_interval=sequence_interval,
num_frames=num_frames,
cam_ids=cam_ids,
accumulate_action=accumulate_action,
video_size=video_size,
val_start_frame_interval=val_start_frame_interval,
debug=debug,
normalize=normalize,
pre_encode=pre_encode,
do_evaluate=do_evaluate,
load_t5_embeddings=load_t5_embeddings,
load_action=load_action,
mode=mode,
)
log.info("Dataset_3DBinary: in this dataset, we binarize the gripper state to 0 or 1.")
    def _get_json_action(self, label, frame_ids):
        # Read the raw per-frame actions stored in the annotation file and keep
        # one action per frame transition (the final frame has no successor).
        all_action = np.array(label["action"])
        actions = all_action[frame_ids[:-1]]
        return torch.from_numpy(actions)
def __getitem__(self, index, cam_id=None, return_video=False):
        if self.mode != "train":
            # Seed with the sample index so validation/test batches are deterministic.
            np.random.seed(index)
            random.seed(index)
try:
sample = self.samples[index]
ann_file = sample["ann_file"]
frame_ids = sample["frame_ids"]
with open(ann_file, "r") as f:
label = json.load(f)
            arm_states, gripper_states = self._get_robot_states(label, frame_ids)
            actions = self._get_actions(arm_states, gripper_states, self.accumulate_action)
            actions *= self.c_act_scaler  # per-dimension scaling from the base class
            data = dict()
            if self.load_action:
                # Start from the raw annotation actions (which carry the binary
                # gripper value in the trailing dimension) and overwrite the
                # first six dimensions with the computed relative arm actions.
                json_action = self._get_json_action(label, frame_ids).float()
                json_action[:, :6] = actions.float()[:, :6]
                data["action"] = json_action
if self.pre_encode:
raise NotImplementedError("Pre-encoded videos are not supported for this dataset.")
else:
video, cam_id = self._get_obs(label, frame_ids, cam_id, pre_encode=False)
video = video.permute(1, 0, 2, 3) # Rearrange from [T, C, H, W] to [C, T, H, W]
data["video"] = video.to(dtype=torch.uint8)
data["annotation_file"] = ann_file
if "episode_id" in label:
data["__key__"] = label["episode_id"]
else:
data["__key__"] = label["original_path"]
            # The fields below are provided to match the expected data interface.
if self.load_t5_embeddings:
t5_embedding_path = ann_file.replace(".json", ".pickle")
with open(t5_embedding_path, "rb") as f:
data["t5_text_embeddings"] = torch.from_numpy(pickle.load(f)[0])
else:
data["t5_text_embeddings"] = torch.zeros(512, 1024, dtype=torch.bfloat16)
data["t5_text_mask"] = torch.ones(512, dtype=torch.int64)
data["fps"] = 4
data["image_size"] = 256 * torch.ones(4) # TODO: Does this matter?
data["num_frames"] = self.sequence_length
data["padding_mask"] = torch.zeros(1, 256, 256)
return data
        except Exception:
            warnings.warn(
                f"Invalid data encountered: {self.samples[index]['ann_file']}. Skipped "
                f"(retrying with a randomly sampled item from the same dataset).\n"
                f"Full traceback:\n{traceback.format_exc()}"
            )
            self.wrong_number += 1
            log.warning(f"Total invalid samples skipped so far: {self.wrong_number}")
            return self[np.random.randint(len(self.samples))]
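
# A minimal usage sketch, not part of the original training code: wrap the
# dataset in a standard PyTorch DataLoader. Batch size and worker count are
# illustrative, and with load_t5_embeddings=True a custom collate_fn may be
# needed if embedding lengths vary across samples (assumption).
def _example_dataloader(dataset: Dataset_3DBinary, batch_size: int = 2):
    from torch.utils.data import DataLoader

    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=0,  # keep 0 while debugging interactively
    )
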
if __name__ == "__main__":
dataset = Dataset_3DBinary(
train_annotation_path="datasets/bridge/annotation/train",
val_annotation_path="datasets/bridge/annotation/val",
test_annotation_path="datasets/bridge/annotation/test",
video_path="datasets/bridge/",
sequence_interval=1,
num_frames=2,
cam_ids=[0],
accumulate_action=False,
video_size=[256, 320],
val_start_frame_interval=1,
mode="train",
load_t5_embeddings=True,
)
    indices = [0, 13, 200, -1]
    for idx in indices:
        data = dataset[idx]  # fetch once; every key below reads the same sample
        print(
            (
                f"{idx=} "
                f"{data['video'].sum()=}\n"
                f"{data['video'].shape=}\n"
                f"{data['annotation_file']=}\n"
                f"{data['action'].sum()=}\n"
                "---"
            )
        )
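
    # Quick sanity check (illustrative): the trailing action dimension should
    # hold the binary gripper state, assuming the layout sketched above.
    gripper = dataset[0]["action"][:, -1]
    print(f"unique gripper values (expected subset of {{0, 1}}): {gripper.unique()}")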
from IPython import embed
embed()