#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>  // AT_CUDA_CHECK

#include <torch/extension.h>

#include <tuple>
#include <vector>
#include <iostream>

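// Launch configuration: a fixed number of threads per block; CUDA_GET_BLOCKS
// returns how many blocks are needed to give each of N elements one thread.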
int const CUDA_NUM_THREADS = 1024;

inline int CUDA_GET_BLOCKS(const int N) {
    return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS;
}

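// Grid-stride loop: each thread handles index i, then i + blockDim.x * gridDim.x, and so on.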
#define CUDA_1D_KERNEL_LOOP(i, n)                            \
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
       i += blockDim.x * gridDim.x)


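// For each output pixel (w, h), scan all `num` line segments (endpoints are
// rescaled from the input resolution to the output resolution) and keep the
// one whose closest point lies nearest to the pixel. Per pixel, the outputs are:
//   map channels 0-1: offset from the pixel to the closest point on that segment,
//   map channels 2-3: offset from the pixel to the nearer endpoint,
//   map channels 4-5: offset from the pixel to the farther endpoint,
//   tmap:             the clamped projection parameter t in [0, 1].
// `label` is received but never written: the line that would mark whether the
// perpendicular foot falls inside the segment is commented out below.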
__global__ void encode_kernel(const int nthreads, const float* lines,
                           const int input_height, const int input_width, const int num,
                           const int height, const int width, float* map,
                           bool* label, float* tmap)
{
    CUDA_1D_KERNEL_LOOP(index, nthreads){
        int w = index % width;
        int h = (index / width) % height;
        // Flattened indices into the six channels of the [6, height, width] map tensor.
        int x_index = h*width + w;
        int y_index = height*width + h*width + w;
        int ux_index = 2*height*width + h*width + w;
        int uy_index = 3*height*width + h*width + w;
        int vx_index = 4*height*width + h*width + w;
        int vy_index = 5*height*width + h*width + w;
        int label_index = h*width + w;

        float px = (float) w;
        float py = (float) h;
        float min_dis = 1e30;
        int minp = -1;
        bool flagp = true;
        // Find the segment whose closest point to (px, py) is nearest.
        for(int i = 0; i < num; ++i) {
            float xs = (float)width  /(float)input_width;
            float ys = (float)height /(float)input_height;
            float x1 = lines[4*i  ]*xs;
            float y1 = lines[4*i+1]*ys;
            float x2 = lines[4*i+2]*xs;
            float y2 = lines[4*i+3]*ys;

            float dx = x2 - x1;
            float dy = y2 - y1;
            float ux = x1 - px;
            float uy = y1 - py;
            float vx = x2 - px;
            float vy = y2 - py;
            float norm2 = dx*dx + dy*dy;
            bool flag = false;
            // Projection parameter of the pixel onto the segment's supporting line.
            float t = ((px-x1)*dx + (py-y1)*dy)/(norm2+1e-6);
            if (t<=1 && t>=0.0)
                flag = true;

            t = t<0.0? 0.0:t;
            t = t>1.0? 1.0:t;

            // Offset from the pixel to the closest point on the clamped segment.
            float ax = x1   + t*(x2-x1) - px;
            float ay = y1   + t*(y2-y1) - py;

            float dis = ax*ax + ay*ay;
            if (dis < min_dis) {
                min_dis = dis;
                map[x_index] = ax;
                map[y_index] = ay;
                float norm_u2 = ux*ux+uy*uy;
                float norm_v2 = vx*vx+vy*vy;

                // Channels 2-3 get the nearer endpoint offset, channels 4-5 the farther one.
                if (norm_u2 < norm_v2){
                    map[ux_index] = ux;
                    map[uy_index] = uy;
                    map[vx_index] = vx;
                    map[vy_index] = vy;
                }
                else{
                    map[ux_index] = vx;
                    map[uy_index] = vy;
                    map[vx_index] = ux;
                    map[vy_index] = uy;
                }

                minp = i;
                if (flag)
                    flagp = true;
                else
                    flagp = false;

                tmap[index] = t;
            }
        }
        // label[label_index+minp*height*width] = flagp;

    }
}


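// Host wrapper. `lines` is expected to be a float32 CUDA tensor holding four
// values (x1, y1, x2, y2) per segment in input-image coordinates. It allocates
// the outputs on the same device as `lines`, launches one thread per output
// pixel, and returns (map, label, tmap); `label` comes back all false because
// the kernel never writes it.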
std::tuple<at::Tensor, at::Tensor, at::Tensor> lsencode_cuda(
    const at::Tensor& lines,
    const int input_height,
    const int input_width,
    const int height,
    const int width,
    const int num_lines)

{
    TORCH_CHECK(lines.is_cuda(), "lines must be a CUDA tensor");
    TORCH_CHECK(lines.scalar_type() == at::kFloat, "lines must be a float32 tensor");

    auto map = at::zeros({6,height,width}, lines.options());
    auto tmap = at::zeros({1,height,width}, lines.options());
    auto label = at::zeros({1,height,width}, lines.options().dtype(at::kBool));
    auto nthreads = height*width;

    cudaStream_t stream = at::cuda::getCurrentCUDAStream();

    // Keep the contiguous copy alive for the duration of the asynchronous launch.
    auto lines_c = lines.contiguous();

    float* map_data = map.data_ptr<float>();
    float* tmap_data = tmap.data_ptr<float>();
    bool*  label_data = label.data_ptr<bool>();

    // Launch on the current PyTorch CUDA stream.
    encode_kernel<<<CUDA_GET_BLOCKS(nthreads), CUDA_NUM_THREADS, 0, stream>>>(
            nthreads,
            lines_c.data_ptr<float>(),
            input_height, input_width,
            num_lines,
            height, width,
            map_data,
            label_data,
            tmap_data);

    AT_CUDA_CHECK(cudaGetLastError());

    return std::make_tuple(map, label, tmap);
}
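
// Example binding sketch (assumption: this file is built as a PyTorch C++/CUDA
// extension and no other translation unit defines the module; the guard macro
// LSENCODE_BUILD_MODULE and the exposed name "lsencode" are illustrative, not
// the project's actual API).
#ifdef LSENCODE_BUILD_MODULE
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("lsencode", &lsencode_cuda,
          "Encode line segments into per-pixel attraction maps (CUDA)");
}
#endif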