Spaces:
Runtime error
Runtime error
| /****************************************************************************** | |
| * Copyright (c) 2011, Duane Merrill. All rights reserved. | |
| * Copyright (c) 2011-2018, NVIDIA CORPORATION. All rights reserved. | |
| * | |
| * Redistribution and use in source and binary forms, with or without | |
| * modification, are permitted provided that the following conditions are met: | |
| * * Redistributions of source code must retain the above copyright | |
| * notice, this list of conditions and the following disclaimer. | |
| * * Redistributions in binary form must reproduce the above copyright | |
| * notice, this list of conditions and the following disclaimer in the | |
| * documentation and/or other materials provided with the distribution. | |
| * * Neither the name of the NVIDIA CORPORATION nor the | |
| * names of its contributors may be used to endorse or promote products | |
| * derived from this software without specific prior written permission. | |
| * | |
| * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND | |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED | |
| * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE | |
| * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY | |
| * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES | |
| * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; | |
| * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND | |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
| * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS | |
| * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
| * | |
| ******************************************************************************/ | |
| /** | |
| * \file | |
| * Static architectural properties by SM version. | |
| */ | |
| #pragma once | |
| #include "util_cpp_dialect.cuh" | |
| #include "util_namespace.cuh" | |
| #include "util_macro.cuh" | |
| /// Optional outer namespace(s) | |
| CUB_NS_PREFIX | |
| /// CUB namespace | |
| namespace cub { | |
| #ifndef DOXYGEN_SHOULD_SKIP_THIS // Do not document | |
| #if ((__CUDACC_VER_MAJOR__ >= 9) || defined(__NVCOMPILER_CUDA__)) && \ | |
| !defined(CUB_USE_COOPERATIVE_GROUPS) | |
| #define CUB_USE_COOPERATIVE_GROUPS | |
| #endif | |
| /// In device code, CUB_PTX_ARCH expands to the PTX version for which we are | |
| /// compiling. In host code, CUB_PTX_ARCH's value is implementation defined. | |
| #ifndef CUB_PTX_ARCH | |
| #if defined(__NVCOMPILER_CUDA__) | |
| // __NVCOMPILER_CUDA_ARCH__ is the target PTX version, and is defined | |
| // when compiling both host code and device code. Currently, only one | |
| // PTX version can be targeted. | |
| #define CUB_PTX_ARCH __NVCOMPILER_CUDA_ARCH__ | |
| #elif !defined(__CUDA_ARCH__) | |
| #define CUB_PTX_ARCH 0 | |
| #else | |
| #define CUB_PTX_ARCH __CUDA_ARCH__ | |
| #endif | |
| #endif | |
| #ifndef CUB_IS_DEVICE_CODE | |
| #if defined(__NVCOMPILER_CUDA__) | |
| #define CUB_IS_DEVICE_CODE __builtin_is_device_code() | |
| #define CUB_IS_HOST_CODE (!__builtin_is_device_code()) | |
| #define CUB_INCLUDE_DEVICE_CODE 1 | |
| #define CUB_INCLUDE_HOST_CODE 1 | |
| #elif CUB_PTX_ARCH > 0 | |
| #define CUB_IS_DEVICE_CODE 1 | |
| #define CUB_IS_HOST_CODE 0 | |
| #define CUB_INCLUDE_DEVICE_CODE 1 | |
| #define CUB_INCLUDE_HOST_CODE 0 | |
| #else | |
| #define CUB_IS_DEVICE_CODE 0 | |
| #define CUB_IS_HOST_CODE 1 | |
| #define CUB_INCLUDE_DEVICE_CODE 0 | |
| #define CUB_INCLUDE_HOST_CODE 1 | |
| #endif | |
| #endif | |
| /// Maximum number of devices supported. | |
| #ifndef CUB_MAX_DEVICES | |
| #define CUB_MAX_DEVICES 128 | |
| #endif | |
| #if CUB_CPP_DIALECT >= 2011 | |
| static_assert(CUB_MAX_DEVICES > 0, "CUB_MAX_DEVICES must be greater than 0."); | |
| #endif | |
| /// Whether or not the source targeted by the active compiler pass is allowed to invoke device kernels or methods from the CUDA runtime API. | |
| #ifndef CUB_RUNTIME_FUNCTION | |
| #if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__>= 350 && defined(__CUDACC_RDC__)) | |
| #define CUB_RUNTIME_ENABLED | |
| #define CUB_RUNTIME_FUNCTION __host__ __device__ | |
| #else | |
| #define CUB_RUNTIME_FUNCTION __host__ | |
| #endif | |
| #endif | |
| /// Number of threads per warp | |
| #ifndef CUB_LOG_WARP_THREADS | |
| #define CUB_LOG_WARP_THREADS(arch) \ | |
| (5) | |
| #define CUB_WARP_THREADS(arch) \ | |
| (1 << CUB_LOG_WARP_THREADS(arch)) | |
| #define CUB_PTX_WARP_THREADS CUB_WARP_THREADS(CUB_PTX_ARCH) | |
| #define CUB_PTX_LOG_WARP_THREADS CUB_LOG_WARP_THREADS(CUB_PTX_ARCH) | |
| #endif | |
| /// Number of smem banks | |
| #ifndef CUB_LOG_SMEM_BANKS | |
| #define CUB_LOG_SMEM_BANKS(arch) \ | |
| ((arch >= 200) ? \ | |
| (5) : \ | |
| (4)) | |
| #define CUB_SMEM_BANKS(arch) \ | |
| (1 << CUB_LOG_SMEM_BANKS(arch)) | |
| #define CUB_PTX_LOG_SMEM_BANKS CUB_LOG_SMEM_BANKS(CUB_PTX_ARCH) | |
| #define CUB_PTX_SMEM_BANKS CUB_SMEM_BANKS(CUB_PTX_ARCH) | |
| #endif | |
| /// Oversubscription factor | |
| #ifndef CUB_SUBSCRIPTION_FACTOR | |
| #define CUB_SUBSCRIPTION_FACTOR(arch) \ | |
| ((arch >= 300) ? \ | |
| (5) : \ | |
| ((arch >= 200) ? \ | |
| (3) : \ | |
| (10))) | |
| #define CUB_PTX_SUBSCRIPTION_FACTOR CUB_SUBSCRIPTION_FACTOR(CUB_PTX_ARCH) | |
| #endif | |
| /// Prefer padding overhead vs X-way conflicts greater than this threshold | |
| #ifndef CUB_PREFER_CONFLICT_OVER_PADDING | |
| #define CUB_PREFER_CONFLICT_OVER_PADDING(arch) \ | |
| ((arch >= 300) ? \ | |
| (1) : \ | |
| (4)) | |
| #define CUB_PTX_PREFER_CONFLICT_OVER_PADDING CUB_PREFER_CONFLICT_OVER_PADDING(CUB_PTX_ARCH) | |
| #endif | |
| template < | |
| int NOMINAL_4B_BLOCK_THREADS, | |
| int NOMINAL_4B_ITEMS_PER_THREAD, | |
| typename T> | |
| struct RegBoundScaling | |
| { | |
| enum { | |
| ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), | |
| BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), | |
| }; | |
| }; | |
| template < | |
| int NOMINAL_4B_BLOCK_THREADS, | |
| int NOMINAL_4B_ITEMS_PER_THREAD, | |
| typename T> | |
| struct MemBoundScaling | |
| { | |
| enum { | |
| ITEMS_PER_THREAD = CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), | |
| BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, (((1024 * 48) / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), | |
| }; | |
| }; | |
| #endif // Do not document | |
| } // CUB namespace | |
| CUB_NS_POSTFIX // Optional outer namespace(s) | |