Spaces:
Runtime error
Runtime error
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 | |
#define USE_CUB | |
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700 | |
#ifdef USE_CUB | |
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
// For this reason CUB must be included BEFORE anything else.
#include <cub/cub.cuh> | |
using namespace cub; | |
#endif // USE_CUB | |
#include "sumrows.cuh" | |
#include "sum.cuh" | |
#include <cstdint> | |
// Sums the `ne` floats starting at `x` and stores the result at `dst`.
//
//   pool   - pool supplying the temporary buffer needed by the CUB reduction
//            (not used by the fallback path).
//   x      - input values.
//   dst    - location that receives the sum.
//   ne     - number of input elements.
//   stream - CUDA stream handed to the reduction.
//
// With USE_CUB the reduction is done by cub::DeviceReduce::Sum; otherwise it is
// delegated to sum_rows_f32_cuda, treating the input as a single row of `ne` columns.
void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
#ifdef USE_CUB
    // First call: a null temporary buffer makes CUB only report the required size in tmp_size.
    // The returned status is checked so a failed query cannot silently leave tmp_size at 0.
    // NOTE(review): CUDA_CHECK is assumed to come from common.cuh via the includes of this file - confirm.
    size_t tmp_size = 0;
    CUDA_CHECK(DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream));

    // Temporary storage is taken from the pool rather than allocated directly.
    ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);

    // Second call: performs the actual reduction; a failure here would otherwise leave dst unwritten.
    CUDA_CHECK(DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, x, dst, ne, stream));
#else
    // Use (inefficient) sum_rows implementation as a fallback.
    // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
    sum_rows_f32_cuda(x, dst, ne, 1, stream);
    GGML_UNUSED(pool);
#endif // USE_CUB
}
// Backend entry point for the sum op: reduces every element of dst->src[0]
// and writes the result to dst->data via sum_f32_cuda.
//
//   ctx - backend context; provides the memory pool and the stream used for the reduction.
//   dst - output tensor; its first source (dst->src[0]) is the input.
//
// Both tensors must be F32 and the input must be contiguous (asserted below).
void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];

    // Preconditions: only contiguous F32 input and F32 output are handled.
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    GGML_ASSERT(ggml_is_contiguous(src0));

    // The whole tensor is reduced as one flat range of elements.
    const int64_t n_elements = ggml_nelements(src0);

    const float * input  = static_cast<const float *>(src0->data);
    float       * output = static_cast<float *>(dst->data);

    ggml_cuda_pool & scratch_pool = ctx.pool();
    cudaStream_t     queue        = ctx.stream();

    sum_f32_cuda(scratch_pool, input, output, n_elements, queue);
}