// Copyright (C) 2021 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// This work is made available under the Nvidia Source Code License-NC.
// To view a copy of this license, check out LICENSE.md
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THC.h>

#include <cuda.h>
#include <cuda_runtime.h>

#include <assert.h>
#include <stdio.h>
#include <vector>

#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

struct PE_Params {
    int ndegrees;
    int pre_size;
    int post_size;
    bool incl_orig;
};

// Expected launch configuration:
//const int TILE_DIM_X = 16; // channel dim
//const int TILE_DIM_Y = 64; // entry dim
//dim3 dimGrid((p.post_size+TILE_DIM_X-1)/TILE_DIM_X, (p.pre_size+TILE_DIM_Y-1)/TILE_DIM_Y, 1);
//dim3 dimBlock(TILE_DIM_X, TILE_DIM_Y, 1);
template <int TILE_DIM_X, int TILE_DIM_Y, int DUP_Y>
__global__ void positional_encoding_kernel(
        float* __restrict__ out_feature,
        const float* __restrict__ in_feature,
        const PE_Params p) {
    const int idx_feat = blockIdx.x * TILE_DIM_X + threadIdx.x;
    const int idx_entry_base = blockIdx.y * TILE_DIM_Y * DUP_Y + threadIdx.y * DUP_Y;
    if (idx_feat >= p.post_size) {
        return;
    }

    int stride = p.ndegrees*2;
    if (p.incl_orig) {
        stride += 1;
    }

    for (int j=0; j<DUP_Y; j++) {
        const int idx_entry = idx_entry_base + j;
        if (idx_entry >= p.pre_size) {
            return;
        }

        float data = in_feature[idx_entry*p.post_size + idx_feat];
        // Slot layout along the expanded dimension (standard NeRF-style schedule, assumed here):
        // [sin(x), cos(x), sin(2x), cos(2x), ..., sin(2^(D-1)x), cos(2^(D-1)x), x].
        for (int i=0; i<p.ndegrees; i++) {
            out_feature[idx_entry*p.post_size*stride + (i*2  )*p.post_size + idx_feat] = sinf(data);
            out_feature[idx_entry*p.post_size*stride + (i*2+1)*p.post_size + idx_feat] = cosf(data);
            data = data * 2.0f; // double the frequency for the next degree
        }
        if (p.incl_orig) {
            out_feature[idx_entry*p.post_size*stride + (p.ndegrees*2)*p.post_size + idx_feat] =
                in_feature[idx_entry*p.post_size + idx_feat];
        }
    }
}

template <int TILE_DIM_X, int TILE_DIM_Y, int DUP_Y>
__global__ void positional_encoding_backward_kernel(
        float* __restrict__ in_feature_grad,
        const float* __restrict__ out_feature_grad,
        const float* __restrict__ out_feature,
        const PE_Params p) {
    int idx_feat = blockIdx.x * TILE_DIM_X + threadIdx.x;
    const int idx_entry_base = blockIdx.y * TILE_DIM_Y * DUP_Y + threadIdx.y * DUP_Y;
    if (idx_feat >= p.post_size) {
        return;
    }

    int stride = p.ndegrees*2;
    if (p.incl_orig) {
        stride += 1;
    }

    for (int j=0; j<DUP_Y; j++) {
        const int idx_entry = idx_entry_base + j;
        if (idx_entry >= p.pre_size) {
            return;
        }

        float grad = 0.0f;
        // d sin(s*x)/dx = s*cos(s*x) and d cos(s*x)/dx = -s*sin(s*x), so the saved
        // forward outputs are reused instead of recomputing sin/cos of the input.
        float scale = 1.0f; // matches the 2^i schedule assumed in the forward kernel
        for (int i=0; i<p.ndegrees; i++) {
            float g_sin = out_feature_grad[idx_entry*p.post_size*stride + (i*2  )*p.post_size + idx_feat];
            float g_cos = out_feature_grad[idx_entry*p.post_size*stride + (i*2+1)*p.post_size + idx_feat];
            float o_sin = out_feature[idx_entry*p.post_size*stride + (i*2  )*p.post_size + idx_feat];
            float o_cos = out_feature[idx_entry*p.post_size*stride + (i*2+1)*p.post_size + idx_feat];
            grad += scale * (g_sin * o_cos - g_cos * o_sin);
            scale = scale * 2.0f;
        }
        if (p.incl_orig) {
            grad += out_feature_grad[idx_entry*p.post_size*stride + (p.ndegrees*2)*p.post_size + idx_feat];
        }
        in_feature_grad[idx_entry*p.post_size + idx_feat] = grad;
    }
}

// Input:
//     in_feature: float32 [..., N, ...]
//     ndegrees:   int32   Degrees of PE encoding
//     dim:        int32   Dimension to concatenate
//     incl_orig:  bool    Whether to include the original feature vector or not
// Output:
//     out_feature: float32 [..., N*ndegrees*2+incl_orig, ...]
torch::Tensor positional_encoding_cuda(const torch::Tensor& in_feature, int ndegrees, int dim, bool incl_orig) {
    CHECK_CUDA(in_feature);

    int curDevice = -1;
    cudaGetDevice(&curDevice);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
    torch::Device device = in_feature.device();

    assert(in_feature.dtype() == torch::kFloat32);
    // Handle negative index
    if (dim < 0) {
        dim = in_feature.dim() + dim;
    }
    assert(dim >= 0 && dim < in_feature.dim());
    // Input and output have the same memory layout.
    CHECK_CONTIGUOUS(in_feature);

    PE_Params p;
    p.ndegrees = ndegrees;
    p.incl_orig = incl_orig;

    // This only works for contiguous tensors...
    // pre_size: number of entries before `dim`; post_size: number of elements from
    // `dim` to the end (channels times trailing size).
    int pre_size = 1;
    int post_size = 1;
    for (int i=0; i<dim; i++) {
        pre_size *= in_feature.size(i);
    }
    for (int i=dim; i<in_feature.dim(); i++) {
        post_size *= in_feature.size(i);
    }
    p.pre_size = pre_size;
    p.post_size = post_size;

    std::vector<int64_t> out_feature_shape;
    for (int i=0; i<in_feature.dim(); i++) {
        if (i == dim) {
            out_feature_shape.push_back(in_feature.size(i) * (ndegrees*2 + (incl_orig ? 1 : 0)));
        } else {
            out_feature_shape.push_back(in_feature.size(i));
        }
    }
    torch::Tensor out_feature = torch::empty(out_feature_shape,
        torch::TensorOptions().dtype(torch::kFloat32).device(device));

    // Case 1: Concat at the end --> Each thread handles a single post_size element.
    // Case 2: Concat at the middle (post_size > pre_size) --> Each thread handles multiple post_size elements.
    const int TILE_DIM_X = 16; // channel dim
    const int TILE_DIM_Y = 64; // entry dim
    //const int DUP_Y = 4; // Each thread handles multiple entries to save threads
    const int DUP_Y = 8;   // DGXA 64 samples per ray @ 256x256
    dim3 dimGrid((p.post_size+TILE_DIM_X-1)/TILE_DIM_X, (p.pre_size+(TILE_DIM_Y*DUP_Y)-1)/(TILE_DIM_Y*DUP_Y), 1);
    dim3 dimBlock(TILE_DIM_X, TILE_DIM_Y, 1);

    positional_encoding_kernel<TILE_DIM_X, TILE_DIM_Y, DUP_Y><<<dimGrid, dimBlock, 0, stream>>>(
        out_feature.data_ptr<float>(),
        in_feature.data_ptr<float>(),
        p
    );

    THCudaCheck(cudaGetLastError());

    return out_feature;
}
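// Reference for the layout produced above: given the 2^i schedule assumed in
// positional_encoding_kernel, the expansion is equivalent to concatenating
// [sin(x), cos(x), sin(2x), cos(2x), ..., x] along `dim`. The helper below is a
// minimal ATen-only sketch of that ordering, useful for checking the kernel on
// small inputs; the name positional_encoding_reference is illustrative and not
// part of the original interface.
static torch::Tensor positional_encoding_reference(
        const torch::Tensor& in_feature, int ndegrees, int dim, bool incl_orig) {
    std::vector<torch::Tensor> parts;
    torch::Tensor x = in_feature;
    for (int i = 0; i < ndegrees; i++) {
        parts.push_back(torch::sin(x)); // degree i: sin(2^i * x)
        parts.push_back(torch::cos(x)); // degree i: cos(2^i * x)
        x = x * 2.0f;                   // double the frequency for the next degree
    }
    if (incl_orig) {
        parts.push_back(in_feature);    // optionally append the original features
    }
    // Same channel ordering as the CUDA kernel: part k occupies output channels
    // [k*N, (k+1)*N) along `dim` after concatenation.
    return torch::cat(parts, dim);
}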
//in_feature_grad = voxrender_op.positional_encoding_backward(out_feature_grad, out_feature, ctx.pe_degrees, ctx.dim, ctx.incl_orig);
// Input:
//     out_feature_grad: float32 [..., N*ndegrees*2+incl_orig, ...]
//     out_feature:      float32 [..., N*ndegrees*2+incl_orig, ...]
//     ndegrees:   int32   Degrees of PE encoding
//     dim:        int32   Dimension to concatenate
//     incl_orig:  bool    Whether to include the original feature vector or not
// Output:
//     in_feature_grad: float32 [..., N, ...]
torch::Tensor positional_encoding_backward_cuda(const torch::Tensor& out_feature_grad_, const torch::Tensor& out_feature, int ndegrees, int dim, bool incl_orig) {
    CHECK_CUDA(out_feature_grad_);
    CHECK_CUDA(out_feature);

    const torch::Tensor out_feature_grad = out_feature_grad_.contiguous();

    int curDevice = -1;
    cudaGetDevice(&curDevice);
    cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
    torch::Device device = out_feature_grad.device();

    assert(out_feature_grad.dtype() == torch::kFloat32);
    assert(out_feature.dtype() == torch::kFloat32);
    assert(out_feature_grad.sizes() == out_feature.sizes());
    // Handle negative index
    if (dim < 0) {
        dim = out_feature.dim() + dim;
    }
    assert(dim >= 0 && dim < out_feature.dim());
    CHECK_CONTIGUOUS(out_feature_grad);
    CHECK_CONTIGUOUS(out_feature);

    PE_Params p;
    p.ndegrees = ndegrees;
    p.incl_orig = incl_orig;

    int expansion_factor = ndegrees*2;
    if (incl_orig) {
        expansion_factor += 1;
    }
    assert(out_feature.size(dim) % expansion_factor == 0);

    // This only works for contiguous tensors...
    // pre_size / post_size describe the *input* layout, so the expanded dimension
    // of out_feature is divided back by expansion_factor.
    int pre_size = 1;
    int post_size = 1;
    for (int i=0; i<dim; i++) {
        pre_size *= out_feature.size(i);
    }
    for (int i=dim; i<out_feature.dim(); i++) {
        post_size *= out_feature.size(i);
    }
    post_size = post_size / expansion_factor;
    p.pre_size = pre_size;
    p.post_size = post_size;

    // Shape of the gradient w.r.t. the input: out_feature with `dim` shrunk back.
    std::vector<int64_t> out_feature_shape;
    for (int i=0; i<out_feature.dim(); i++) {
        if (i == dim) {
            out_feature_shape.push_back(out_feature.size(i) / expansion_factor);
        } else {
            out_feature_shape.push_back(out_feature.size(i));
        }
    }
    torch::Tensor in_feature_grad = torch::empty(out_feature_shape,
        torch::TensorOptions().dtype(torch::kFloat32).device(device));

    // Case 1: Concat at the end --> Each thread handles a single post_size element.
    // Case 2: Concat at the middle (post_size > pre_size) --> Each thread handles multiple post_size elements.
    const int TILE_DIM_X = 16; // channel dim
    const int TILE_DIM_Y = 64; // entry dim
    //const int DUP_Y = 4; // Nothing to amortize
    const int DUP_Y = 8;   // DGXA
    dim3 dimGrid((p.post_size+TILE_DIM_X-1)/TILE_DIM_X, (p.pre_size+(TILE_DIM_Y*DUP_Y)-1)/(TILE_DIM_Y*DUP_Y), 1);
    dim3 dimBlock(TILE_DIM_X, TILE_DIM_Y, 1);

    positional_encoding_backward_kernel<TILE_DIM_X, TILE_DIM_Y, DUP_Y><<<dimGrid, dimBlock, 0, stream>>>(
        in_feature_grad.data_ptr<float>(),
        out_feature_grad.data_ptr<float>(),
        out_feature.data_ptr<float>(),
        p
    );

    THCudaCheck(cudaGetLastError());

    return in_feature_grad;
}
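// The wrapper comment above calls these entry points through a Python module named
// `voxrender_op`; the pybind11 registration is not part of this file. The guarded
// block below is a minimal sketch of such a binding, e.g. for building this kernel
// standalone with torch.utils.cpp_extension.load. The exported names and the
// VOXLIB_STANDALONE_BINDINGS guard are assumptions for illustration only.
#ifdef VOXLIB_STANDALONE_BINDINGS
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("positional_encoding", &positional_encoding_cuda,
          "Sinusoidal positional encoding along a dimension (CUDA forward)");
    m.def("positional_encoding_backward", &positional_encoding_backward_cuda,
          "Gradient of the positional encoding w.r.t. its input (CUDA backward)");
}
#endif
// Python side (sketch): out = voxrender_op.positional_encoding(feat, pe_degrees, dim, incl_orig)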