/******************************************************************************
 * Copyright (c) 2011, Duane Merrill. All rights reserved.
 * Copyright (c) 2011-2020, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * Properties of a given CUDA device and the corresponding PTX bundle
 */

#pragma once

#include "util_type.cuh"
#include "util_arch.cuh"
#include "util_debug.cuh"
#include "util_cpp_dialect.cuh"
#include "util_namespace.cuh"
#include "util_macro.cuh"

#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
    #include <atomic>
    #include <array>
    #include <cassert>
#endif

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {

/**
 * \addtogroup UtilMgmt
 * @{
 */

#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document

/**
 * \brief Alias temporaries to externally-allocated device storage (or simply return the amount of storage needed).
 */
template <int ALLOCATIONS>
__host__ __device__ __forceinline__
cudaError_t AliasTemporaries(
    void *d_temp_storage,                       ///< [in] %Device-accessible allocation of temporary storage. When NULL, the required allocation size is written to \p temp_storage_bytes and no work is done.
    size_t& temp_storage_bytes,                 ///< [in,out] Size in bytes of \p d_temp_storage allocation
    void* (&allocations)[ALLOCATIONS],          ///< [in,out] Pointers to device allocations needed
    size_t (&allocation_sizes)[ALLOCATIONS])    ///< [in] Sizes in bytes of device allocations needed
{
    const int ALIGN_BYTES = 256;
    const int ALIGN_MASK  = ~(ALIGN_BYTES - 1);

    // Compute exclusive prefix sum over allocation requests
    size_t allocation_offsets[ALLOCATIONS];
    size_t bytes_needed = 0;
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
        allocation_offsets[i] = bytes_needed;
        bytes_needed += allocation_bytes;
    }
    bytes_needed += ALIGN_BYTES - 1;

    // Check if the caller is simply requesting the size of the storage allocation
    if (!d_temp_storage)
    {
        temp_storage_bytes = bytes_needed;
        return cudaSuccess;
    }

    // Check if enough storage provided
    if (temp_storage_bytes < bytes_needed)
    {
        return CubDebug(cudaErrorInvalidValue);
    }

    // Alias
    d_temp_storage = (void *) ((size_t(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
    for (int i = 0; i < ALLOCATIONS; ++i)
    {
        allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
    }

    return cudaSuccess;
}

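// Usage sketch (editorial addition, not part of the original documentation):
// the two-pass pattern most CUB dispatch routines follow. Names such as
// `d_keys_bytes` and `d_values_bytes` are illustrative assumptions, not
// symbols defined in CUB.
//
//   void*  allocations[2]      = {NULL, NULL};
//   size_t allocation_sizes[2] = {d_keys_bytes, d_values_bytes};
//
//   // Pass 1: d_temp_storage == NULL, so only the total size is computed.
//   size_t temp_storage_bytes = 0;
//   AliasTemporaries<2>(NULL, temp_storage_bytes, allocations, allocation_sizes);
//
//   // Caller allocates temp_storage_bytes of device memory...
//   void* d_temp_storage = NULL;
//   cudaMalloc(&d_temp_storage, temp_storage_bytes);
//
//   // Pass 2: the same call carves the single allocation into 256-byte-aligned slices.
//   AliasTemporaries<2>(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
//   // allocations[0] and allocations[1] now point into d_temp_storage.
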
/**
 * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device
 */
template <typename T>
__global__ void EmptyKernel(void) { }

#endif  // DOXYGEN_SHOULD_SKIP_THIS

/**
 * \brief Returns the current device or -1 if an error occurred.
 */
CUB_RUNTIME_FUNCTION __forceinline__ int CurrentDevice()
{
#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
    int device = -1;
    if (CubDebug(cudaGetDevice(&device))) return -1;
    return device;
#else // Device code without the CUDA runtime.
    return -1;
#endif
}

/**
 * \brief RAII helper which saves the current device and switches to the
 *        specified device on construction and switches to the saved device on
 *        destruction.
 */
struct SwitchDevice
{
private:
    int const  old_device;
    bool const needs_reset;

public:
    __host__ __forceinline__ SwitchDevice(int new_device)
      : old_device(CurrentDevice()), needs_reset(old_device != new_device)
    {
        if (needs_reset)
            CubDebug(cudaSetDevice(new_device));
    }

    __host__ __forceinline__ ~SwitchDevice()
    {
        if (needs_reset)
            CubDebug(cudaSetDevice(old_device));
    }
};

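// Usage sketch (editorial addition): scoping a temporary device switch so the
// caller's current device is restored even on early return. `target_device`,
// `d_ptr`, and `bytes` are assumed names, not symbols from this file.
//
//   {
//       SwitchDevice sd(target_device);           // switches to target_device
//       cudaError_t error = cudaMemset(d_ptr, 0, bytes);
//       if (error) return error;                  // destructor still runs here
//   }                                             // original device restored
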
/**
 * \brief Returns the number of CUDA devices available or -1 if an error
 *        occurred.
 */
CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCountUncached()
{
#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.
    int count = -1;
    if (CubDebug(cudaGetDeviceCount(&count)))
        // CUDA makes no guarantees about the state of the output parameter if
        // `cudaGetDeviceCount` fails; in practice it is left untouched, but out
        // of paranoia we reset `count` to `-1`.
        count = -1;
    return count;
#else // Device code without the CUDA runtime.
    return -1;
#endif
}

#if CUB_CPP_DIALECT >= 2011 // C++11 and later.

/**
 * \brief Cache for an arbitrary value produced by a nullary function.
 */
template <typename T, T(*Function)()>
struct ValueCache
{
    T const value;

    /**
     * \brief Call the nullary function to produce the value and construct the
     *        cache.
     */
    __host__ __forceinline__ ValueCache() : value(Function()) {}
};

#endif

#if CUB_CPP_DIALECT >= 2011
// Host code, only safely usable in C++11 or newer, where thread-safe
// initialization of static locals is guaranteed. This is a separate function
// to avoid defining a local static in a host/device function.
__host__ __forceinline__ int DeviceCountCachedValue()
{
    static ValueCache<int, DeviceCountUncached> cache;
    return cache.value;
}
#endif

/**
 * \brief Returns the number of CUDA devices available.
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
CUB_RUNTIME_FUNCTION __forceinline__ int DeviceCount()
{
    int result = -1;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            #if CUB_CPP_DIALECT >= 2011
                // Host code and C++11.
                result = DeviceCountCachedValue();
            #else
                // Host code and C++98.
                result = DeviceCountUncached();
            #endif
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            // Device code.
            result = DeviceCountUncached();
        #endif
    }
    return result;
}

#if CUB_CPP_DIALECT >= 2011 // C++11 and later.

/**
 * \brief Per-device cache for a CUDA attribute value; the attribute is queried
 *        and stored for each device upon construction.
 */
struct PerDeviceAttributeCache
{
    struct DevicePayload
    {
        int         attribute;
        cudaError_t error;
    };
    // Each entry starts in the `DeviceEntryEmpty` state, then proceeds to the
    // `DeviceEntryInitializing` state, and then proceeds to the
    // `DeviceEntryReady` state. These are the only state transitions allowed,
    // i.e. each entry moves through a linear sequence of transitions.
    enum DeviceEntryStatus
    {
        DeviceEntryEmpty = 0,
        DeviceEntryInitializing,
        DeviceEntryReady
    };

    struct DeviceEntry
    {
        std::atomic<DeviceEntryStatus> flag;
        DevicePayload                  payload;
    };

private:
    std::array<DeviceEntry, CUB_MAX_DEVICES> entries_;

public:
    /**
     * \brief Construct the cache.
     */
    __host__ __forceinline__ PerDeviceAttributeCache() : entries_()
    {
        assert(DeviceCount() <= CUB_MAX_DEVICES);
    }
    /**
     * \brief Retrieves the payload of the cached function \p f for \p device.
     *
     * \note You must pass a morally equivalent function in to every call or
     *       this function has undefined behavior.
     */
    template <typename Invocable>
    __host__ DevicePayload operator()(Invocable&& f, int device)
    {
        if (device >= DeviceCount())
            return DevicePayload{0, cudaErrorInvalidDevice};

        auto& entry   = entries_[device];
        auto& flag    = entry.flag;
        auto& payload = entry.payload;
        DeviceEntryStatus old_status = DeviceEntryEmpty;

        // First, check for the common case of the entry being ready.
        if (flag.load(std::memory_order_acquire) != DeviceEntryReady)
        {
            // Assume the entry is empty and attempt to lock it so we can fill
            // it by trying to set the state from `DeviceEntryEmpty` to
            // `DeviceEntryInitializing`.
            if (flag.compare_exchange_strong(old_status, DeviceEntryInitializing,
                                             std::memory_order_acq_rel,
                                             std::memory_order_acquire))
            {
                // We successfully set the state to `DeviceEntryInitializing`;
                // we have the lock and it's our job to initialize this entry
                // and then release it.

                // We don't use `CubDebug` here because we let the user code
                // decide whether or not errors are hard errors.
                if (payload.error = std::forward<Invocable>(f)(payload.attribute))
                    // Clear the global CUDA error state which may have been
                    // set by the last call. Otherwise, errors may "leak" to
                    // unrelated kernel launches.
                    cudaGetLastError();

                // Release the lock by setting the state to `DeviceEntryReady`.
                flag.store(DeviceEntryReady, std::memory_order_release);
            }
            // If the `compare_exchange_strong` failed, then `old_status` has
            // been updated with the value of `flag` that it observed.
            else if (old_status == DeviceEntryInitializing)
            {
                // Another execution agent is initializing this entry; we need
                // to wait for them to finish; we'll know they're done when we
                // observe the entry status as `DeviceEntryReady`.
                do { old_status = flag.load(std::memory_order_acquire); }
                while (old_status != DeviceEntryReady);
                // FIXME: Use `atomic::wait` instead when we have access to
                // host-side C++20 atomics. We could use libcu++, but it only
                // supports atomics for SM60 and up, even if you're only using
                // them in host code.
            }
        }

        // We now know that the state of our entry is `DeviceEntryReady`, so
        // just return the entry's payload.
        return entry.payload;
    }
};

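// Usage sketch (editorial addition): callers hand the cache an invocable that
// fills an `int` attribute out-parameter and returns a `cudaError_t`;
// `my_device` below is an assumed variable name. The same pattern appears in
// `PtxVersion` and `SmVersion` further down in this file.
//
//   auto payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
//       [=] (int& attr) { return PtxVersionUncached(attr, my_device); },
//       my_device);
//   if (payload.error == cudaSuccess)
//       printf("cached ptx version: %d\n", payload.attribute);
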
#endif

/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version)
{
    // Instantiate `EmptyKernel<void>` in both host and device code to ensure
    // it can be called.
    typedef void (*EmptyKernelPtr)();
    EmptyKernelPtr empty_kernel = EmptyKernel<void>;

    // This is necessary to suppress unused-variable warnings from host
    // compilers. The usual syntax of `(void)empty_kernel;` was not sufficient
    // on MSVC2015.
    (void)reinterpret_cast<void*>(empty_kernel);
    cudaError_t result = cudaSuccess;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            cudaFuncAttributes empty_kernel_attrs;
            do {
                if (CubDebug(result = cudaFuncGetAttributes(&empty_kernel_attrs, empty_kernel)))
                    break;
            }
            while(0);

            ptx_version = empty_kernel_attrs.ptxVersion * 10;
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            // This is necessary to ensure instantiation of EmptyKernel in device code.
            // The `reinterpret_cast` is necessary to suppress a set-but-unused warning.
            // This is a meme now: https://twitter.com/blelbach/status/1222391615576100864
            (void)reinterpret_cast<EmptyKernelPtr>(empty_kernel);
            ptx_version = CUB_PTX_ARCH;
        #endif
    }
    return result;
}

/**
 * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
 */
__host__ __forceinline__ cudaError_t PtxVersionUncached(int& ptx_version, int device)
{
    SwitchDevice sd(device);
    return PtxVersionUncached(ptx_version);
}

#if CUB_CPP_DIALECT >= 2011 // C++11 and later.
template <typename Tag>
__host__ __forceinline__ PerDeviceAttributeCache& GetPerDeviceAttributeCache()
{
    // C++11 guarantees that initialization of static locals is thread safe.
    static PerDeviceAttributeCache cache;
    return cache;
}

struct PtxVersionCacheTag {};
struct SmVersionCacheTag {};
#endif

/**
 * \brief Retrieves the PTX version that will be used on \p device (major * 100 + minor * 10).
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
__host__ __forceinline__ cudaError_t PtxVersion(int& ptx_version, int device)
{
#if CUB_CPP_DIALECT >= 2011 // C++11 and later.

    auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
        // If this call fails, then we get the error code back in the payload,
        // which we check with `CubDebug` below.
        [=] (int& pv) { return PtxVersionUncached(pv, device); },
        device);

    if (!CubDebug(payload.error))
        ptx_version = payload.attribute;

    return payload.error;

#else // Pre C++11.

    return PtxVersionUncached(ptx_version, device);

#endif
}

/**
 * \brief Retrieves the PTX version that will be used on the current device (major * 100 + minor * 10).
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t PtxVersion(int& ptx_version)
{
    cudaError_t result = cudaErrorUnknown;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            #if CUB_CPP_DIALECT >= 2011
                // Host code and C++11.
                auto const device = CurrentDevice();

                auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
                    // If this call fails, then we get the error code back in the payload,
                    // which we check with `CubDebug` below.
                    [=] (int& pv) { return PtxVersionUncached(pv, device); },
                    device);

                if (!CubDebug(payload.error))
                    ptx_version = payload.attribute;

                result = payload.error;
            #else
                // Host code and C++98.
                result = PtxVersionUncached(ptx_version);
            #endif
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            // Device code.
            result = PtxVersionUncached(ptx_version);
        #endif
    }
    return result;
}

/**
 * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersionUncached(int& sm_version, int device = CurrentDevice())
{
#if defined(CUB_RUNTIME_ENABLED) // Host code or device code with the CUDA runtime.

    cudaError_t error = cudaSuccess;
    do
    {
        int major = 0, minor = 0;
        if (CubDebug(error = cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device))) break;
        if (CubDebug(error = cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device))) break;
        sm_version = major * 100 + minor * 10;
    }
    while (0);

    return error;

#else // Device code without the CUDA runtime.

    (void)sm_version;
    (void)device;

    // CUDA API calls are not supported from this device.
    return CubDebug(cudaErrorInvalidConfiguration);

#endif
}

/**
 * \brief Retrieves the SM version of \p device (major * 100 + minor * 10)
 *
 * \note This function may cache the result internally.
 *
 * \note This function is thread safe.
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SmVersion(int& sm_version, int device = CurrentDevice())
{
    cudaError_t result = cudaErrorUnknown;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            #if CUB_CPP_DIALECT >= 2011
                // Host code and C++11
                auto const payload = GetPerDeviceAttributeCache<SmVersionCacheTag>()(
                    // If this call fails, then we get the error code back in the payload,
                    // which we check with `CubDebug` below.
                    [=] (int& pv) { return SmVersionUncached(pv, device); },
                    device);

                if (!CubDebug(payload.error))
                    sm_version = payload.attribute;

                result = payload.error;
            #else
                // Host code and C++98
                result = SmVersionUncached(sm_version, device);
            #endif
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            result = SmVersionUncached(sm_version, device);
        #endif
    }
    return result;
}

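// Usage sketch (editorial addition, not from the original documentation): the
// typical host-side pattern for branching dispatch on the compiled PTX version
// and the device's SM version. No names beyond the functions above are assumed.
//
//   int ptx_version = 0;
//   int sm_version  = 0;
//   cudaError_t error;
//   if ((error = PtxVersion(ptx_version))) return error;  // e.g. 700 for PTX compiled for sm_70
//   if ((error = SmVersion(sm_version)))   return error;  // e.g. 750 on a Turing GPU
//   // Tuning policies are then selected for ptx_version, e.g. via ChainedPolicy below.
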
/**
 * Synchronize the specified \p stream.
 */
CUB_RUNTIME_FUNCTION __forceinline__ cudaError_t SyncStream(cudaStream_t stream)
{
    cudaError_t result = cudaErrorUnknown;
    if (CUB_IS_HOST_CODE) {
        #if CUB_INCLUDE_HOST_CODE
            result = CubDebug(cudaStreamSynchronize(stream));
        #endif
    } else {
        #if CUB_INCLUDE_DEVICE_CODE
            #if defined(CUB_RUNTIME_ENABLED) // Device code with the CUDA runtime.
                (void)stream;
                // Device can't yet sync on a specific stream
                result = CubDebug(cudaDeviceSynchronize());
            #else // Device code without the CUDA runtime.
                (void)stream;
                // CUDA API calls are not supported from this device.
                result = CubDebug(cudaErrorInvalidConfiguration);
            #endif
        #endif
    }
    return result;
}

/**
 * \brief Computes maximum SM occupancy in thread blocks for executing the given kernel function pointer \p kernel_ptr on the current device with \p block_threads per thread block.
 *
 * \par Snippet
 * The code snippet below illustrates the use of the MaxSmOccupancy function.
 * \par
 * \code
 * #include <cub/cub.cuh>   // or equivalently <cub/util_device.cuh>
 *
 * template <typename T>
 * __global__ void ExampleKernel()
 * {
 *     // Allocate shared memory for BlockScan
 *     __shared__ volatile T buffer[4096];
 *
 *     ...
 * }
 *
 * ...
 *
 * // Determine SM occupancy for ExampleKernel specialized for unsigned char
 * int max_sm_occupancy;
 * MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
 *
 * // max_sm_occupancy <-- 4 on SM10
 * // max_sm_occupancy <-- 8 on SM20
 * // max_sm_occupancy <-- 12 on SM35
 *
 * \endcode
 *
 */
template <typename KernelPtr>
CUB_RUNTIME_FUNCTION __forceinline__
cudaError_t MaxSmOccupancy(
    int&      max_sm_occupancy,       ///< [out] maximum number of thread blocks that can reside on a single SM
    KernelPtr kernel_ptr,             ///< [in] Kernel pointer for which to compute SM occupancy
    int       block_threads,          ///< [in] Number of threads per thread block
    int       dynamic_smem_bytes = 0) ///< [in] Dynamically-allocated shared memory per thread block, in bytes (default 0)
{
#ifndef CUB_RUNTIME_ENABLED

    (void)dynamic_smem_bytes;
    (void)block_threads;
    (void)kernel_ptr;
    (void)max_sm_occupancy;

    // CUDA API calls not supported from this device
    return CubDebug(cudaErrorInvalidConfiguration);

#else

    return CubDebug(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
        &max_sm_occupancy,
        kernel_ptr,
        block_threads,
        dynamic_smem_bytes));

#endif  // CUB_RUNTIME_ENABLED
}

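// Usage sketch (editorial addition; `ExampleKernel` refers to the kernel in
// the documentation snippet above): the occupancy result is typically
// multiplied by the device's SM count to choose a resident-grid size.
//
//   int max_sm_occupancy, sm_count, device;
//   cudaGetDevice(&device);
//   cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device);
//   MaxSmOccupancy(max_sm_occupancy, ExampleKernel<unsigned char>, 64);
//
//   // Launch just enough blocks to fill the device at this occupancy.
//   int grid_size = max_sm_occupancy * sm_count;
//   ExampleKernel<unsigned char><<<grid_size, 64>>>();
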
/******************************************************************************
 * Policy management
 ******************************************************************************/

/**
 * Kernel dispatch configuration
 */
struct KernelConfig
{
    int block_threads;
    int items_per_thread;
    int tile_size;
    int sm_occupancy;

    CUB_RUNTIME_FUNCTION __forceinline__
    KernelConfig() : block_threads(0), items_per_thread(0), tile_size(0), sm_occupancy(0) {}

    template <typename AgentPolicyT, typename KernelPtrT>
    CUB_RUNTIME_FUNCTION __forceinline__
    cudaError_t Init(KernelPtrT kernel_ptr)
    {
        block_threads    = AgentPolicyT::BLOCK_THREADS;
        items_per_thread = AgentPolicyT::ITEMS_PER_THREAD;
        tile_size        = block_threads * items_per_thread;
        cudaError_t retval = MaxSmOccupancy(sm_occupancy, kernel_ptr, block_threads);
        return retval;
    }
};

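// Usage sketch (editorial addition; `MyAgentPolicy` and `my_kernel` are
// assumed names, not part of CUB): an agent policy exposing BLOCK_THREADS and
// ITEMS_PER_THREAD is paired with the kernel it will configure.
//
//   struct MyAgentPolicy
//   {
//       enum { BLOCK_THREADS = 128, ITEMS_PER_THREAD = 4 };
//   };
//
//   KernelConfig config;
//   cudaError_t error = config.Init<MyAgentPolicy>(my_kernel);
//   // On success: config.tile_size == 512 and config.sm_occupancy holds the
//   // occupancy reported by MaxSmOccupancy for 128-thread blocks.
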
/// Helper for dispatching into a policy chain
template <int PTX_VERSION, typename PolicyT, typename PrevPolicyT>
struct ChainedPolicy
{
    /// The policy for the active compiler pass
    typedef typename If<(CUB_PTX_ARCH < PTX_VERSION), typename PrevPolicyT::ActivePolicy, PolicyT>::Type ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int ptx_version, FunctorT& op)
    {
        if (ptx_version < PTX_VERSION) {
            return PrevPolicyT::Invoke(ptx_version, op);
        }
        return op.template Invoke<PolicyT>();
    }
};

/// Helper for dispatching into a policy chain (end-of-chain specialization)
template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
    /// The policy for the active compiler pass
    typedef PolicyT ActivePolicy;

    /// Specializes and dispatches op in accordance to the first policy in the chain of adequate PTX version
    template <typename FunctorT>
    CUB_RUNTIME_FUNCTION __forceinline__
    static cudaError_t Invoke(int /*ptx_version*/, FunctorT& op) {
        return op.template Invoke<PolicyT>();
    }
};

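// Usage sketch (editorial addition; `Policy350`, `Policy700`, and `DispatchFoo`
// are assumed names following the pattern of CUB's dispatch layers, not
// definitions from this file): policies are chained from newest to oldest,
// with the oldest policy terminating the chain by naming itself as its own
// predecessor. Invoke() walks the chain until it reaches the first policy
// whose PTX version does not exceed the runtime-detected one.
//
//   struct DispatchFoo
//   {
//       struct Policy350 : ChainedPolicy<350, Policy350, Policy350>
//       { /* 128 threads, 4 items per thread, ... */ };
//
//       struct Policy700 : ChainedPolicy<700, Policy700, Policy350>
//       { /* 256 threads, 8 items per thread, ... */ };
//
//       typedef Policy700 MaxPolicy;  // newest policy heads the chain
//
//       template <typename ActivePolicyT>
//       CUB_RUNTIME_FUNCTION cudaError_t Invoke();  // specialized kernel launch
//   };
//
//   // Host-side dispatch:
//   //   int ptx_version; PtxVersion(ptx_version);
//   //   DispatchFoo op = ...;
//   //   DispatchFoo::MaxPolicy::Invoke(ptx_version, op);
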
/** @} */  // end group UtilMgmt

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)