Upload llama.cpp/ggml/src/ggml-cuda/sum.cu with huggingface_hub
llama.cpp/ggml/src/ggml-cuda/sum.cu
ADDED
@@ -0,0 +1,47 @@
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
+#define USE_CUB
+#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
+
+#ifdef USE_CUB
+// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
+// For this reason CUB must be included BEFORE anything else.
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // USE_CUB
+
+#include "sumrows.cuh"
+#include "sum.cuh"
+
+#include <cstdint>
+
+void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
+#ifdef USE_CUB
+    size_t tmp_size = 0;
+    DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
+    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+    DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream);
+#else
+    // Use (inefficient) sum_rows implementation as a fallback.
+    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
+    sum_rows_f32_cuda(x, dst, ne, 1, stream);
+    GGML_UNUSED(pool);
+#endif // USE_CUB
+}
+
+void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    const float * src0_d = (const float *) src0->data;
+    float * dst_d = (float *) dst->data;
+
+    const int64_t ne = ggml_nelements(src0);
+
+    ggml_cuda_pool & pool = ctx.pool();
+    cudaStream_t stream = ctx.stream();
+
+    sum_f32_cuda(pool, src0_d, dst_d, ne, stream);
+}
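
The two back-to-back DeviceReduce::Sum calls in sum_f32_cuda follow CUB's standard two-phase convention: the first call passes a null workspace pointer, so CUB only writes the required temporary-storage size into tmp_size and launches nothing; the second call, with real storage, performs the reduction. Below is a minimal standalone sketch of that pattern, independent of ggml; all names in it are chosen for illustration, with plain cudaMalloc standing in for the pool allocator used above.

#include <cub/cub.cuh>

#include <cstdio>
#include <vector>

int main() {
    const int n = 1024;

    // Device input filled with ones, so the reduction should yield n.
    std::vector<float> h_in(n, 1.0f);
    float * d_in  = nullptr;
    float * d_out = nullptr;
    cudaMalloc(&d_in, n * sizeof(float));
    cudaMalloc(&d_out, sizeof(float));
    cudaMemcpy(d_in, h_in.data(), n * sizeof(float), cudaMemcpyHostToDevice);

    // Phase 1: null workspace pointer. CUB writes the required scratch
    // size into tmp_size and returns without doing any work.
    size_t tmp_size = 0;
    cub::DeviceReduce::Sum(nullptr, tmp_size, d_in, d_out, n);

    // Phase 2: the same call with real scratch storage runs the reduction.
    void * d_tmp = nullptr;
    cudaMalloc(&d_tmp, tmp_size);
    cub::DeviceReduce::Sum(d_tmp, tmp_size, d_in, d_out, n);

    float h_out = 0.0f;
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("sum = %.1f\n", h_out); // expected: 1024.0

    cudaFree(d_tmp);
    cudaFree(d_out);
    cudaFree(d_in);
    return 0;
}

In sum.cu the ggml_cuda_pool_alloc RAII helper takes the place of the raw cudaMalloc/cudaFree pair, so the scratch buffer is returned to the backend's memory pool automatically when tmp_alloc goes out of scope.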