/******************************************************************************
 * Copyright (c) 2011, Duane Merrill.  All rights reserved.
 * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of the NVIDIA CORPORATION nor the
 *       names of its contributors may be used to endorse or promote products
 *       derived from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
 * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 ******************************************************************************/

/**
 * \file
 * cub::GridBarrier implements a software global barrier among thread blocks within a CUDA grid
 */

#pragma once

#include "../util_debug.cuh"
#include "../config.cuh"
#include "../thread/thread_load.cuh"

/// Optional outer namespace(s)
CUB_NS_PREFIX

/// CUB namespace
namespace cub {


/**
 * \addtogroup GridModule
 * @{
 */


/**
 * \brief GridBarrier implements a software global barrier among thread blocks within a CUDA grid
 */
class GridBarrier
{
protected:

    typedef unsigned int SyncFlag;

    // Counters in global device memory
    SyncFlag* d_sync;

public:

    /**
     * Constructor
     */
    GridBarrier() : d_sync(NULL) {}


    /**
     * Synchronize
     */
    __device__ __forceinline__ void Sync() const
    {
        volatile SyncFlag *d_vol_sync = d_sync;

        // Threadfence and syncthreads to make sure global writes are visible before
        // thread-0 reports in with its sync counter
        __threadfence();
        CTA_SYNC();

        if (blockIdx.x == 0)
        {
            // Report in ourselves
            if (threadIdx.x == 0)
            {
                d_vol_sync[blockIdx.x] = 1;
            }

            CTA_SYNC();

            // Wait for everyone else to report in
            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
            {
                while (ThreadLoad<LOAD_CG>(d_sync + peer_block) == 0)
                {
                    __threadfence_block();
                }
            }

            CTA_SYNC();

            // Let everyone know it's safe to proceed
            for (int peer_block = threadIdx.x; peer_block < gridDim.x; peer_block += blockDim.x)
            {
                d_vol_sync[peer_block] = 0;
            }
        }
        else
        {
            if (threadIdx.x == 0)
            {
                // Report in
                d_vol_sync[blockIdx.x] = 1;

                // Wait for acknowledgment
                while (ThreadLoad<LOAD_CG>(d_sync + blockIdx.x) == 1)
                {
                    __threadfence_block();
                }
            }

            CTA_SYNC();
        }
    }
};
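
/**
 * Usage sketch (illustrative; not part of the original file): a kernel that
 * separates two grid-wide phases with GridBarrier::Sync().  Note that this
 * spin-wait protocol only terminates if every thread block in the grid is
 * co-resident on the device at once, so the grid must be sized accordingly.
 * The names ExampleKernel, d_in, and d_out below are hypothetical.
 *
 * \code
 * __global__ void ExampleKernel(int *d_in, int *d_out, cub::GridBarrier barrier)
 * {
 *     int tid = (blockIdx.x * blockDim.x) + threadIdx.x;
 *
 *     d_in[tid] = tid;                                  // Phase 1: every thread writes its slot
 *
 *     barrier.Sync();                                   // All phase-1 writes are now globally visible
 *
 *     int peer = (tid + 1) % (gridDim.x * blockDim.x);
 *     d_out[tid] = d_in[peer];                          // Phase 2: safely read another block's write
 * }
 * \endcode
 */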

/**
 * \brief GridBarrierLifetime extends GridBarrier to provide lifetime management of the temporary device storage needed for cooperation.
 *
 * Uses RAII for lifetime, i.e., device resources are reclaimed when
 * the destructor is called.
 */
class GridBarrierLifetime : public GridBarrier
{
protected:

    // Number of bytes backed by d_sync
    size_t sync_bytes;

public:

    /**
     * Constructor
     */
    GridBarrierLifetime() : GridBarrier(), sync_bytes(0) {}

    /**
     * Frees the device-side progress counters and resets internal state
     */
    cudaError_t HostReset()
    {
        cudaError_t retval = cudaSuccess;
        if (d_sync)
        {
            CubDebug(retval = cudaFree(d_sync));
            d_sync = NULL;
        }
        sync_bytes = 0;
        return retval;
    }


    /**
     * Destructor
     */
    virtual ~GridBarrierLifetime()
    {
        HostReset();
    }

    /**
     * Sets up the progress counters for the next kernel launch (lazily
     * allocating and initializing them if necessary)
     */
    cudaError_t Setup(int sweep_grid_size)
    {
        cudaError_t retval = cudaSuccess;
        do {
            size_t new_sync_bytes = sweep_grid_size * sizeof(SyncFlag);
            if (new_sync_bytes > sync_bytes)
            {
                if (d_sync)
                {
                    if (CubDebug(retval = cudaFree(d_sync))) break;
                }

                sync_bytes = new_sync_bytes;

                // Allocate and initialize to zero
                if (CubDebug(retval = cudaMalloc((void**) &d_sync, sync_bytes))) break;
                if (CubDebug(retval = cudaMemset(d_sync, 0, new_sync_bytes))) break;
            }
        } while (0);

        return retval;
    }
};
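
/**
 * Usage sketch (illustrative; not part of the original file): host-side setup
 * with GridBarrierLifetime.  Setup() lazily (re)allocates one zeroed SyncFlag
 * per thread block, and the destructor frees the allocation via HostReset().
 * Because Sync() spin-waits, grid_size must not exceed the number of thread
 * blocks that can be co-resident on the device (e.g., as bounded by
 * cudaOccupancyMaxActiveBlocksPerMultiprocessor).  ExampleKernel, d_in, and
 * d_out refer to the hypothetical kernel sketched above.
 *
 * \code
 * int block_size = 256;
 * int grid_size  = 64;                          // must not exceed co-resident block capacity
 *
 * cub::GridBarrierLifetime barrier;
 * cudaError_t error = barrier.Setup(grid_size); // lazy cudaMalloc + cudaMemset(0)
 *
 * // Passing by value slices to the GridBarrier base, copying only d_sync
 * ExampleKernel<<<grid_size, block_size>>>(d_in, d_out, barrier);
 * \endcode
 */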

/** @} */       // end group GridModule

}               // CUB namespace
CUB_NS_POSTFIX  // Optional outer namespace(s)