#include <ATen/ATen.h>
#include <ATen/Context.h>
#include <ATen/cuda/CUDAContext.h>
#include "channelnorm_kernel.cuh"
#define CUDA_NUM_THREADS 512
#define DIM0(TENSOR) ((TENSOR).x)
#define DIM1(TENSOR) ((TENSOR).y)
#define DIM2(TENSOR) ((TENSOR).z)
#define DIM3(TENSOR) ((TENSOR).w)
#define DIM3_INDEX(TENSOR, xx, yy, zz, ww) ((TENSOR)[((xx) * (TENSOR##_stride.x)) + ((yy) * (TENSOR##_stride.y)) + ((zz) * (TENSOR##_stride.z)) + ((ww) * (TENSOR##_stride.w))])
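// Illustrative note: DIM3_INDEX pastes the tensor name onto "_stride", so a call like
//   DIM3_INDEX(input1, b, c, y, x)
// expands to a stride-aware read
//   input1[b * input1_stride.x + c * input1_stride.y + y * input1_stride.z + x * input1_stride.w]
// and requires a long4 named input1_stride in scope. The kernels below compute flat
// contiguous indices directly and do not use this macro.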
using at::Half;
template <typename scalar_t>
__global__ void kernel_channelnorm_update_output(
    const int n,
    const scalar_t* __restrict__ input1,
    const long4 input1_size,
    const long4 input1_stride,
    scalar_t* __restrict__ output,
    const long4 output_size,
    const long4 output_stride,
    int norm_deg) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= n) {
    return;
  }

  int dim_b = DIM0(output_size);
  int dim_c = DIM1(output_size);
  int dim_h = DIM2(output_size);
  int dim_w = DIM3(output_size);
  int dim_chw = dim_c * dim_h * dim_w;

  int b = ( index / dim_chw ) % dim_b;
  int y = ( index / dim_w )   % dim_h;
  int x = ( index )           % dim_w;

  int i1dim_c = DIM1(input1_size);
  int i1dim_h = DIM2(input1_size);
  int i1dim_w = DIM3(input1_size);
  int i1dim_chw = i1dim_c * i1dim_h * i1dim_w;
  int i1dim_hw  = i1dim_h * i1dim_w;

  float result = 0.0;

  // Accumulate squared values across the channel dimension. The indexing assumes
  // contiguous NCHW tensors, and norm_deg is accepted but not used: the kernel
  // always computes the L2 norm over channels.
  for (int c = 0; c < i1dim_c; ++c) {
    int i1Index = b * i1dim_chw + c * i1dim_hw + y * i1dim_w + x;
    scalar_t val = input1[i1Index];
    result += static_cast<float>(val * val);
  }
  result = sqrt(result);
  output[index] = static_cast<scalar_t>(result);
}
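// Reference sketch (illustrative helper, not used by the launchers below): for
// contiguous NCHW inputs the kernel above matches this plain ATen expression,
// given the hard-coded L2 behaviour. Handy as a test-time cross-check.
static inline at::Tensor channelnorm_forward_reference(const at::Tensor& input1) {
  // Sum of squares over the channel dim; keepdim so the result is (N, 1, H, W).
  return input1.pow(2).sum(/*dim=*/1, /*keepdim=*/true).sqrt();
}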
template <typename scalar_t>
__global__ void kernel_channelnorm_backward_input1(
    const int n,
    const scalar_t* __restrict__ input1,     const long4 input1_size,     const long4 input1_stride,
    const scalar_t* __restrict__ output,     const long4 output_size,     const long4 output_stride,
    const scalar_t* __restrict__ gradOutput, const long4 gradOutput_size, const long4 gradOutput_stride,
    scalar_t* __restrict__ gradInput,        const long4 gradInput_size,  const long4 gradInput_stride,
    int norm_deg) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;

  if (index >= n) {
    return;
  }

  float val = 0.0;

  int dim_b = DIM0(gradInput_size);
  int dim_c = DIM1(gradInput_size);
  int dim_h = DIM2(gradInput_size);
  int dim_w = DIM3(gradInput_size);
  int dim_chw = dim_c * dim_h * dim_w;
  int dim_hw  = dim_h * dim_w;

  int b = ( index / dim_chw ) % dim_b;
  int y = ( index / dim_w )   % dim_h;
  int x = ( index )           % dim_w;

  // For the L2 channel norm, d||x|| / dx_c = x_c / ||x||, so each input gradient is
  // the single-channel output gradient scaled by input1 / output. The 1e-9 term
  // guards against division by zero where the norm vanishes.
  int outIndex = b * dim_hw + y * dim_w + x;
  val = static_cast<float>(gradOutput[outIndex]) * static_cast<float>(input1[index]) / (static_cast<float>(output[outIndex]) + 1e-9);
  gradInput[index] = static_cast<scalar_t>(val);
}
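// Reference sketch (illustrative helper, not used by the launchers below): the
// backward kernel corresponds to this broadcasted ATen expression, where output
// and gradOutput have shape (N, 1, H, W) and input1 has shape (N, C, H, W).
static inline at::Tensor channelnorm_backward_reference(
    const at::Tensor& input1, const at::Tensor& output, const at::Tensor& gradOutput) {
  return gradOutput * input1 / (output + 1e-9);
}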
void channelnorm_kernel_forward(
    at::Tensor& input1,
    at::Tensor& output,
    int norm_deg) {
  const long4 input1_size = make_long4(input1.size(0), input1.size(1), input1.size(2), input1.size(3));
  const long4 input1_stride = make_long4(input1.stride(0), input1.stride(1), input1.stride(2), input1.stride(3));

  const long4 output_size = make_long4(output.size(0), output.size(1), output.size(2), output.size(3));
  const long4 output_stride = make_long4(output.stride(0), output.stride(1), output.stride(2), output.stride(3));

  int n = output.numel();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channelnorm_forward", ([&] {

    kernel_channelnorm_update_output<scalar_t><<< (n + CUDA_NUM_THREADS - 1)/CUDA_NUM_THREADS, CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream() >>>(
        //at::globalContext().getCurrentCUDAStream() >>>(
        n,
        input1.data<scalar_t>(),
        input1_size,
        input1_stride,
        output.data<scalar_t>(),
        output_size,
        output_stride,
        norm_deg);

  }));

  // TODO: ATen-equivalent check
  // THCudaCheck(cudaGetLastError());
}
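// Usage sketch (assumed calling convention, from the binding layer): the launcher
// expects a pre-allocated single-channel output on the same device, e.g.
//
//   auto output = at::zeros({input1.size(0), 1, input1.size(2), input1.size(3)}, input1.options());
//   channelnorm_kernel_forward(input1, output, /*norm_deg=*/2);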
void channelnorm_kernel_backward(
    at::Tensor& input1,
    at::Tensor& output,
    at::Tensor& gradOutput,
    at::Tensor& gradInput1,
    int norm_deg) {
  const long4 input1_size = make_long4(input1.size(0), input1.size(1), input1.size(2), input1.size(3));
  const long4 input1_stride = make_long4(input1.stride(0), input1.stride(1), input1.stride(2), input1.stride(3));

  const long4 output_size = make_long4(output.size(0), output.size(1), output.size(2), output.size(3));
  const long4 output_stride = make_long4(output.stride(0), output.stride(1), output.stride(2), output.stride(3));

  const long4 gradOutput_size = make_long4(gradOutput.size(0), gradOutput.size(1), gradOutput.size(2), gradOutput.size(3));
  const long4 gradOutput_stride = make_long4(gradOutput.stride(0), gradOutput.stride(1), gradOutput.stride(2), gradOutput.stride(3));

  const long4 gradInput1_size = make_long4(gradInput1.size(0), gradInput1.size(1), gradInput1.size(2), gradInput1.size(3));
  const long4 gradInput1_stride = make_long4(gradInput1.stride(0), gradInput1.stride(1), gradInput1.stride(2), gradInput1.stride(3));

  int n = gradInput1.numel();

  AT_DISPATCH_FLOATING_TYPES_AND_HALF(input1.type(), "channelnorm_backward_input1", ([&] {

    kernel_channelnorm_backward_input1<scalar_t><<< (n + CUDA_NUM_THREADS - 1)/CUDA_NUM_THREADS, CUDA_NUM_THREADS, 0, at::cuda::getCurrentCUDAStream() >>>(
        //at::globalContext().getCurrentCUDAStream() >>>(
        n,
        input1.data<scalar_t>(),
        input1_size,
        input1_stride,
        output.data<scalar_t>(),
        output_size,
        output_stride,
        gradOutput.data<scalar_t>(),
        gradOutput_size,
        gradOutput_stride,
        gradInput1.data<scalar_t>(),
        gradInput1_size,
        gradInput1_stride,
        norm_deg
    );

  }));

  // TODO: Add ATen-equivalent check
  // THCudaCheck(cudaGetLastError());
}
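// Sketch for the two TODOs above (a possible replacement, assuming a PyTorch build
// that ships the c10 CUDA helpers): the old THCudaCheck call can be expressed as
//
//   #include <c10/cuda/CUDAException.h>
//   ...
//   C10_CUDA_CHECK(cudaGetLastError());
//
// placed right after each kernel launch.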