File size: 5,633 Bytes
b050f40
 
 
 
e374166
b050f40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e374166
 
 
b050f40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
Given the following task description:
[input.txt]
Write a complete CUDA program (.cu file) that solves this task using a basic and correct algorithm. The implementation should include a kernel function and the main function that tests it.
There are 5 sets of binary input data. The main function should test all five datasets. If all of them pass, the program should print "T", otherwise print "F". The output must strictly be either "T" or "F". Do not write any extra output.
When comparing the output of the CUDA program with the reference, you may increase the error tolerance in `compare_scalar()` for numerically unstable or ill-conditioned tasks. For example, tasks such as **matrix inversion** may involve large numerical errors due to floating-point instability, and thus require a larger tolerance threshold.
Here is a reference style for the structure of the .cu file:
```cu
#include <iostream>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <cmath>
#include <fstream>
#include <vector>

#define C 10    

// Mean cross-entropy loss over N samples, C classes each (C is a file-level macro).
// Launch with one thread per sample (1-D grid). Each thread performs a
// numerically stable log-sum-exp over its row of logits and atomically
// accumulates its 1/N share into *loss_sum, which the host must zero first.
__global__ void cross_entropy_kernel(const float* logits, const int* labels, float* loss_sum, int N) {
    int sample = blockIdx.x * blockDim.x + threadIdx.x;
    if (sample >= N) return;

    const float* row = logits + sample * C;

    // Row maximum keeps the exponentials from overflowing.
    float row_max = -1e20f;
    for (int c = 0; c < C; ++c) {
        if (row[c] > row_max) row_max = row[c];
    }

    // Denominator of the softmax, computed on shifted logits.
    float denom = 0.0f;
    for (int c = 0; c < C; ++c) {
        denom += expf(row[c] - row_max);
    }

    // loss_i = log(sum_k exp(z_k - max)) + max - z_label
    float sample_loss = logf(denom) + row_max - row[labels[sample]];
    atomicAdd(loss_sum, sample_loss / N);
}

// Reads exactly `size` floats from a binary file into `data`.
// Exits the process on open failure or short read, so on return the
// caller may assume the buffer is fully populated.
void read_binary_float(const std::string& filename, float* data, size_t size) {
    std::ifstream in(filename, std::ios::binary);
    if (!in) {
        std::cerr << "Cannot open: " << filename << std::endl;
        exit(1);
    }
    in.read(reinterpret_cast<char*>(data), size * sizeof(float));
    // A short read would silently leave the tail of `data` uninitialized.
    if (static_cast<size_t>(in.gcount()) != size * sizeof(float)) {
        std::cerr << "Short read: " << filename << std::endl;
        exit(1);
    }
    in.close();
}

// Reads exactly `size` ints from a binary file into `data`.
// Exits the process on open failure or short read, so on return the
// caller may assume the buffer is fully populated.
void read_binary_int(const std::string& filename, int* data, size_t size) {
    std::ifstream in(filename, std::ios::binary);
    if (!in) {
        std::cerr << "Cannot open: " << filename << std::endl;
        exit(1);
    }
    in.read(reinterpret_cast<char*>(data), size * sizeof(int));
    // A short read would silently leave the tail of `data` uninitialized.
    if (static_cast<size_t>(in.gcount()) != size * sizeof(int)) {
        std::cerr << "Short read: " << filename << std::endl;
        exit(1);
    }
    in.close();
}

// True when |a - b| is strictly below the tolerance `tol`.
// Widen `tol` at the call site for numerically unstable tasks.
bool compare_scalar(float a, float b, float tol = 1e-2f) {
    float diff = a - b;
    if (diff < 0.0f) diff = -diff;
    return diff < tol;
}

// Tests the kernel against five reference datasets under data/.
// Prints exactly "T" when every dataset matches within tolerance,
// otherwise exactly "F" (required contract: no other output on stdout).
int main() {
    std::vector<size_t> Ns = {1<<14, 1<<16, 1<<18, 1<<20, 1<<22};
    bool all_match = true;

    // size_t index avoids the signed/unsigned comparison with Ns.size().
    for (size_t idx = 0; idx < Ns.size(); ++idx) {
        size_t N = Ns[idx];
        size_t logits_size = N * C;
        size_t logits_bytes = logits_size * sizeof(float);
        size_t labels_bytes = N * sizeof(int);

        std::string logits_file = "data/ce_logits_" + std::to_string(idx + 1) + ".bin";
        std::string labels_file = "data/ce_labels_" + std::to_string(idx + 1) + ".bin";
        std::string ref_file    = "data/ce_ref_"    + std::to_string(idx + 1) + ".bin";

        float* h_logits = (float*)malloc(logits_bytes);
        int* h_labels   = (int*)malloc(labels_bytes);
        float h_ref;
        if (!h_logits || !h_labels) {           // allocation failure counts as a failed run
            free(h_logits); free(h_labels);
            all_match = false;
            break;
        }

        read_binary_float(logits_file, h_logits, logits_size);
        read_binary_int(labels_file, h_labels, N);
        read_binary_float(ref_file, &h_ref, 1);

        float *d_logits, *d_loss;
        int* d_labels;
        cudaMalloc(&d_logits, logits_bytes);
        cudaMalloc(&d_labels, labels_bytes);
        cudaMalloc(&d_loss, sizeof(float));
        cudaMemcpy(d_logits, h_logits, logits_bytes, cudaMemcpyHostToDevice);
        cudaMemcpy(d_labels, h_labels, labels_bytes, cudaMemcpyHostToDevice);
        cudaMemset(d_loss, 0, sizeof(float));

        int threads = 256;
        int blocks = (int)((N + threads - 1) / threads);
        cross_entropy_kernel<<<blocks, threads>>>(d_logits, d_labels, d_loss, (int)N);

        // Kernel launches fail silently; surface launch-config and
        // asynchronous execution errors and treat them as a mismatch.
        if (cudaGetLastError() != cudaSuccess ||
            cudaDeviceSynchronize() != cudaSuccess) {
            all_match = false;
        }

        float h_loss = 0.0f;
        cudaMemcpy(&h_loss, d_loss, sizeof(float), cudaMemcpyDeviceToHost);

        if (all_match && !compare_scalar(h_loss, h_ref)) {
            all_match = false;
        }

        // Single cleanup path for both the pass and fail cases.
        cudaFree(d_logits); cudaFree(d_labels); cudaFree(d_loss);
        free(h_logits); free(h_labels);

        if (!all_match) break;
    }

    std::cout << (all_match ? "T" : "F") << std::endl;
    return 0;
}
```
You also need to write the Python script gen_test_data.py to generate the test binary files. 
When generating test data, make sure it is feasible for the target task. For example, in a matrix inversion task, the input matrix must be invertible (i.e., non-singular) to ensure correct reference generation and program behavior.
Use the following as a style reference. Make sure all binary files are stored under a folder named data/. Avoid generating extremely large data. Suggested sizes are shown below:
```py
import numpy as np
import os

np.random.seed(30)

C = 10  # number of classes per sample
sizes = [2**14, 2**16, 2**18, 2**20, 2**22]

# The CUDA program reads its inputs from data/ (e.g. "data/ce_logits_1.bin"),
# so the files must be written there; create the folder if missing.
os.makedirs("data", exist_ok=True)

for idx, N in enumerate(sizes):
    logits = (np.random.randn(N, C) * 3).astype(np.float32)
    labels = np.random.randint(0, C, size=N, dtype=np.int32)

    # Numerically stable reference: mean cross-entropy via log-sum-exp.
    logits_max = logits.max(axis=1, keepdims=True)
    exp_logits = np.exp(logits - logits_max)
    log_sum_exp = np.log(exp_logits.sum(axis=1))
    losses = log_sum_exp + logits_max[:, 0] - logits[np.arange(N), labels]
    avg_loss = losses.mean().astype(np.float32)

    logits.tofile(f"data/ce_logits_{idx+1}.bin")
    labels.tofile(f"data/ce_labels_{idx+1}.bin")
    np.array([avg_loss], dtype=np.float32).tofile(f"data/ce_ref_{idx+1}.bin")
```
Generate both code blocks — the complete .cu program and gen_test_data.py — together in a single response.