#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include <torch/extension.h>
#include <vector>
#include <iostream>
// Launch configuration: 1024 threads per block and enough blocks to cover
// N elements (ceiling division).
int const CUDA_NUM_THREADS = 1024;
inline int CUDA_GET_BLOCKS(const int N) {
  return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}
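// Grid-stride loop: each thread starts at its global index and strides by
// the total number of launched threads, so any `n` is covered even when
// the grid is smaller than the problem size.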
#define CUDA_1D_KERNEL_LOOP(i, n) \
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
i += blockDim.x * gridDim.x)
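
// encode_kernel: for each pixel (h, w) of the (height, width) output grid,
// scan all `num` line segments (given in input-image coordinates and
// rescaled to the output grid) and, for the closest one, write into the
// (6, H, W) `map` tensor: channels 0-1 the vector from the pixel to its
// projection on the segment, channels 2-3 the vector to the nearer
// endpoint, channels 4-5 the vector to the farther endpoint. `tmap`
// receives the clamped projection parameter t in [0, 1].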
__global__ void encode_kernel(const int nthreads, const float* lines,
const int input_height, const int input_width, const int num,
const int height, const int width, float* map,
bool* label, float* tmap)
{
  CUDA_1D_KERNEL_LOOP(index, nthreads) {
    int w = index % width;
    int h = (index / width) % height;
    // Flattened offsets into the six channels of the (6, H, W) `map` tensor.
    int x_index  = h*width + w;
    int y_index  = height*width + h*width + w;
    int ux_index = 2*height*width + h*width + w;
    int uy_index = 3*height*width + h*width + w;
    int vx_index = 4*height*width + h*width + w;
    int vy_index = 5*height*width + h*width + w;
    int label_index = h*width + w;
    // Pixel position in output-grid coordinates.
    float px = (float) w;
    float py = (float) h;
    float min_dis = 1e30f;  // squared distance to the closest segment so far
    int minp = -1;          // index of the closest segment
    bool flagp = true;      // true if the pixel projects inside that segment
    // Scale factors from input-image to output-grid coordinates
    // (loop-invariant, so hoisted out of the scan over segments).
    float xs = (float)width / (float)input_width;
    float ys = (float)height / (float)input_height;
    for (int i = 0; i < num; ++i) {
      // Segment endpoints, rescaled to the output grid.
      float x1 = lines[4*i  ]*xs;
      float y1 = lines[4*i+1]*ys;
      float x2 = lines[4*i+2]*xs;
      float y2 = lines[4*i+3]*ys;
      float dx = x2 - x1;
      float dy = y2 - y1;
      // Vectors from the pixel to the two endpoints.
      float ux = x1 - px;
      float uy = y1 - py;
      float vx = x2 - px;
      float vy = y2 - py;
      float norm2 = dx*dx + dy*dy;
      // Projection parameter of the pixel onto the (infinite) line;
      // `flag` records whether the projection lands inside the segment.
      float t = ((px - x1)*dx + (py - y1)*dy) / (norm2 + 1e-6f);
      bool flag = (t >= 0.0f && t <= 1.0f);
      t = t < 0.0f ? 0.0f : t;
      t = t > 1.0f ? 1.0f : t;
      // Vector from the pixel to the clamped projection, and its squared length.
      float ax = x1 + t*(x2 - x1) - px;
      float ay = y1 + t*(y2 - y1) - py;
      float dis = ax*ax + ay*ay;
      if (dis < min_dis) {
        min_dis = dis;
        // Channels 0-1: offset from the pixel to its projection.
        map[x_index] = ax;
        map[y_index] = ay;
        // Channels 2-3 get the nearer endpoint vector, channels 4-5 the
        // farther one.
        float norm_u2 = ux*ux + uy*uy;
        float norm_v2 = vx*vx + vy*vy;
        if (norm_u2 < norm_v2) {
          map[ux_index] = ux;
          map[uy_index] = uy;
          map[vx_index] = vx;
          map[vy_index] = vy;
        } else {
          map[ux_index] = vx;
          map[uy_index] = vy;
          map[vx_index] = ux;
          map[vy_index] = uy;
        }
        minp = i;
        flagp = flag;
        tmap[index] = t;
      }
}
// label[label_index+minp*height*width] = flagp;
}
}
std::tuple<at::Tensor, at::Tensor, at::Tensor> lsencode_cuda(
    const at::Tensor& lines,
    const int input_height,
    const int input_width,
    const int height,
    const int width,
    const int num_lines)
{
  // Outputs: 6-channel attraction map, clamped projection-parameter map,
  // and a per-pixel label mask (allocated but not written by the kernel;
  // the corresponding store is commented out above).
  auto map   = at::zeros({6, height, width}, lines.options());
  auto tmap  = at::zeros({1, height, width}, lines.options());
  auto label = at::zeros({1, height, width}, lines.options().dtype(at::kBool));
  auto nthreads = height*width;
  cudaStream_t stream = at::cuda::getCurrentCUDAStream();
  // Keep the contiguous copy alive for the duration of the (async) launch.
  auto lines_c = lines.contiguous();
  float* map_data   = map.data_ptr<float>();
  float* tmap_data  = tmap.data_ptr<float>();
  bool*  label_data = label.data_ptr<bool>();
  // One thread per output pixel, launched on the current CUDA stream.
  encode_kernel<<<CUDA_GET_BLOCKS(nthreads), CUDA_NUM_THREADS, 0, stream>>>(
      nthreads,
      lines_c.data_ptr<float>(),
      input_height, input_width,
      num_lines,
      height, width,
      map_data,
      label_data,
      tmap_data);
  AT_CUDA_CHECK(cudaGetLastError());
  return std::make_tuple(map, label, tmap);
}
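
// A minimal binding sketch, assuming this file is built as a standalone
// torch extension; the exported name "lsencode" is hypothetical and the
// original project may register the function in a separate binding file.
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("lsencode", &lsencode_cuda,
        "Encode line segments into an attraction field map (CUDA)");
}
// Illustrative Python usage (via torch.utils.cpp_extension.load):
//   map, label, tmap = ext.lsencode(lines, in_h, in_w, out_h, out_w,
//                                   lines.size(0))
// with `lines` an (N, 4) float32 CUDA tensor of (x1, y1, x2, y2) endpoints.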