/*
* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
*
* NOTICE TO LICENSEE:
*
* This source code and/or documentation ("Licensed Deliverables") are
* subject to NVIDIA intellectual property rights under U.S. and
* international Copyright laws.
*
* These Licensed Deliverables contained herein is PROPRIETARY and
* CONFIDENTIAL to NVIDIA and is being provided under the terms and
* conditions of a form of NVIDIA software license agreement by and
* between NVIDIA and Licensee ("License Agreement") or electronically
* accepted by Licensee. Notwithstanding any terms or conditions to
* the contrary in the License Agreement, reproduction or disclosure
* of the Licensed Deliverables to any third party without the express
* written consent of NVIDIA is prohibited.
*
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THESE LICENSED DELIVERABLES.
*
* U.S. Government End Users. These Licensed Deliverables are a
* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
* 1995), consisting of "commercial computer software" and "commercial
* computer software documentation" as such terms are used in 48
* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
* only as a commercial end item. Consistent with 48 C.F.R.12.212 and
* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
* U.S. Government End Users acquire the Licensed Deliverables with
* only those rights set forth herein.
*
* Any use of the Licensed Deliverables in individual and commercial
* software must include, in the user documentation and internal
* comments to the code, the above Disclaimer and U.S. Government End
* Users Notice.
*/
#ifndef _COOPERATIVE_GROUPS_H_
#define _COOPERATIVE_GROUPS_H_
#if defined(__cplusplus) && defined(__CUDACC__)
#include "cooperative_groups/details/info.h"
#include "cooperative_groups/details/driver_abi.h"
#include "cooperative_groups/details/helpers.h"
#include "cooperative_groups/details/memory.h"
#if defined(_CG_HAS_STL_ATOMICS)
#include <cuda/atomic>
#define _CG_THREAD_SCOPE(scope) _CG_STATIC_CONST_DECL cuda::thread_scope thread_scope = scope;
#else
#define _CG_THREAD_SCOPE(scope)
#endif
_CG_BEGIN_NAMESPACE
namespace details {
_CG_CONST_DECL unsigned int coalesced_group_id = 1;
_CG_CONST_DECL unsigned int multi_grid_group_id = 2;
_CG_CONST_DECL unsigned int grid_group_id = 3;
_CG_CONST_DECL unsigned int thread_block_id = 4;
_CG_CONST_DECL unsigned int multi_tile_group_id = 5;
_CG_CONST_DECL unsigned int cluster_group_id = 6;
}
/**
* class thread_group;
*
* Generic thread group type, into which all groups are convertible.
* It acts as a container for all storage necessary for the derived groups,
* and will dispatch the API calls to the correct derived group. This means
* that all derived groups must implement the same interface as thread_group.
*/
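/*
* Example (a minimal illustrative sketch, not part of this header; the
* function name is hypothetical):
*
*   __device__ void wait_and_rank(const cooperative_groups::thread_group& g) {
*       g.sync();                               // dispatched to the concrete group type
*       unsigned long long r = g.thread_rank(); // rank within whichever group was passed
*   }
*
* Any of the concrete groups below (thread_block, coalesced_group, ...) can
* be passed where a thread_group is expected.
*/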
class thread_group
{
protected:
struct group_data {
unsigned int _unused : 1;
unsigned int type : 7, : 0;
};
struct gg_data {
details::grid_workspace *gridWs;
};
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
struct mg_data {
unsigned long long _unused : 1;
unsigned long long type : 7;
unsigned long long handle : 56;
const details::multi_grid::multi_grid_functions *functions;
};
#endif
struct tg_data {
unsigned int is_tiled : 1;
unsigned int type : 7;
unsigned int size : 24;
// packed to 4 bytes
unsigned int metaGroupSize : 16;
unsigned int metaGroupRank : 16;
// packed to 8 bytes
unsigned int mask;
// packed to 12 bytes
unsigned int _res;
};
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
friend class thread_block;
union __align__(8) {
group_data group;
tg_data coalesced;
gg_data grid;
#if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
mg_data multi_grid;
#endif
} _data;
_CG_QUALIFIER thread_group operator=(const thread_group& src);
_CG_QUALIFIER thread_group(unsigned int type) {
_data.group.type = type;
_data.group._unused = false;
}
#ifdef _CG_CPP11_FEATURES
static_assert(sizeof(tg_data) <= 16, "Failed size check");
static_assert(sizeof(gg_data) <= 16, "Failed size check");
# ifdef _CG_ABI_EXPERIMENTAL
static_assert(sizeof(mg_data) <= 16, "Failed size check");
# endif
#endif
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
_CG_QUALIFIER unsigned long long size() const;
_CG_QUALIFIER unsigned long long num_threads() const;
_CG_QUALIFIER unsigned long long thread_rank() const;
_CG_QUALIFIER void sync() const;
_CG_QUALIFIER unsigned int get_type() const {
return _data.group.type;
}
};
template <unsigned int TyId>
struct thread_group_base : public thread_group {
_CG_QUALIFIER thread_group_base() : thread_group(TyId) {}
_CG_STATIC_CONST_DECL unsigned int id = TyId;
};
#if defined(_CG_HAS_MULTI_GRID_GROUP)
/**
* class multi_grid_group;
*
* Threads within this group are guaranteed to be co-resident on the
* same system, across multiple devices participating in the same kernel launch.
* To use this group, the kernel must have been launched with
* cuLaunchCooperativeKernelMultiDevice (or the CUDA Runtime equivalent),
* and the device must support it (queryable device attribute).
*
* Constructed via this_multi_grid();
*/
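/*
* Example (illustrative sketch of the deprecated multi-device API; the
* kernel name is hypothetical):
*
*   __global__ void multi_device_kernel() {
*       cooperative_groups::multi_grid_group mg = cooperative_groups::this_multi_grid();
*       if (mg.is_valid()) {
*           unsigned long long r = mg.thread_rank(); // rank across all participating devices
*           mg.sync();                               // barrier across every grid in the launch
*       }
*   }
*/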
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
class multi_grid_group;
// Multi grid group requires these functions to be templated to prevent ptxas from trying to use CG syscalls
template <typename = void>
__device__ _CG_DEPRECATED multi_grid_group this_multi_grid();
class multi_grid_group : public thread_group_base<details::multi_grid_group_id>
{
private:
template <typename = void>
_CG_QUALIFIER multi_grid_group() {
_data.multi_grid.functions = details::multi_grid::load_grid_intrinsics();
_data.multi_grid.handle = _data.multi_grid.functions->get_intrinsic_handle();
}
friend multi_grid_group this_multi_grid<void>();
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
_CG_QUALIFIER bool is_valid() const {
return (_data.multi_grid.handle != 0);
}
_CG_QUALIFIER void sync() const {
if (!is_valid()) {
_CG_ABORT();
}
_data.multi_grid.functions->sync(_data.multi_grid.handle);
}
_CG_QUALIFIER unsigned long long num_threads() const {
_CG_ASSERT(is_valid());
return _data.multi_grid.functions->size(_data.multi_grid.handle);
}
_CG_QUALIFIER unsigned long long size() const {
return num_threads();
}
_CG_QUALIFIER unsigned long long thread_rank() const {
_CG_ASSERT(is_valid());
return _data.multi_grid.functions->thread_rank(_data.multi_grid.handle);
}
_CG_QUALIFIER unsigned int grid_rank() const {
_CG_ASSERT(is_valid());
return (_data.multi_grid.functions->grid_rank(_data.multi_grid.handle));
}
_CG_QUALIFIER unsigned int num_grids() const {
_CG_ASSERT(is_valid());
return (_data.multi_grid.functions->num_grids(_data.multi_grid.handle));
}
};
# else
class multi_grid_group
{
private:
unsigned long long _handle;
unsigned int _size;
unsigned int _rank;
friend _CG_QUALIFIER multi_grid_group this_multi_grid();
_CG_QUALIFIER multi_grid_group() {
_handle = details::multi_grid::get_intrinsic_handle();
_size = details::multi_grid::size(_handle);
_rank = details::multi_grid::thread_rank(_handle);
}
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_system)
_CG_QUALIFIER _CG_DEPRECATED bool is_valid() const {
return (_handle != 0);
}
_CG_QUALIFIER _CG_DEPRECATED void sync() const {
if (!is_valid()) {
_CG_ABORT();
}
details::multi_grid::sync(_handle);
}
_CG_QUALIFIER _CG_DEPRECATED unsigned long long num_threads() const {
_CG_ASSERT(is_valid());
return _size;
}
_CG_QUALIFIER _CG_DEPRECATED unsigned long long size() const {
return num_threads();
}
_CG_QUALIFIER _CG_DEPRECATED unsigned long long thread_rank() const {
_CG_ASSERT(is_valid());
return _rank;
}
_CG_QUALIFIER _CG_DEPRECATED unsigned int grid_rank() const {
_CG_ASSERT(is_valid());
return (details::multi_grid::grid_rank(_handle));
}
_CG_QUALIFIER _CG_DEPRECATED unsigned int num_grids() const {
_CG_ASSERT(is_valid());
return (details::multi_grid::num_grids(_handle));
}
};
# endif
/**
* multi_grid_group this_multi_grid()
*
* Constructs a multi_grid_group
*/
# if defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
template <typename>
__device__
# else
_CG_QUALIFIER
# endif
_CG_DEPRECATED
multi_grid_group this_multi_grid()
{
return multi_grid_group();
}
#endif
/**
* class grid_group;
*
* Threads within this group are guaranteed to be co-resident on the
* same device within the same launched kernel. To use this group, the kernel
* must have been launched with cuLaunchCooperativeKernel (or the CUDA Runtime equivalent),
* and the device must support it (queryable device attribute).
*
* Constructed via this_grid();
*/
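/*
* Example (illustrative sketch; assumes the kernel was launched with
* cudaLaunchCooperativeKernel, and the names below are hypothetical):
*
*   __global__ void two_phase(float* data, unsigned long long n) {
*       cooperative_groups::grid_group g = cooperative_groups::this_grid();
*       // phase 1: grid-strided loop over the input
*       for (unsigned long long i = g.thread_rank(); i < n; i += g.num_threads()) {
*           data[i] *= 2.0f;
*       }
*       g.sync(); // grid-wide barrier: every block sees the phase 1 results
*       // phase 2 can now safely read data written by other blocks ...
*   }
*/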
class grid_group : public thread_group_base<details::grid_group_id>
{
_CG_STATIC_CONST_DECL unsigned int _group_id = details::grid_group_id;
friend _CG_QUALIFIER grid_group this_grid();
private:
_CG_QUALIFIER grid_group(details::grid_workspace *gridWs) {
_data.grid.gridWs = gridWs;
}
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_device)
_CG_QUALIFIER bool is_valid() const {
return (_data.grid.gridWs != NULL);
}
_CG_QUALIFIER void sync() const {
if (!is_valid()) {
_CG_ABORT();
}
details::grid::sync(&_data.grid.gridWs->barrier);
}
_CG_STATIC_QUALIFIER unsigned long long size() {
return details::grid::size();
}
_CG_STATIC_QUALIFIER unsigned long long thread_rank() {
return details::grid::thread_rank();
}
_CG_STATIC_QUALIFIER dim3 group_dim() {
return details::grid::grid_dim();
}
_CG_STATIC_QUALIFIER unsigned long long num_threads() {
return details::grid::num_threads();
}
_CG_STATIC_QUALIFIER dim3 dim_blocks() {
return details::grid::dim_blocks();
}
_CG_STATIC_QUALIFIER unsigned long long num_blocks() {
return details::grid::num_blocks();
}
_CG_STATIC_QUALIFIER dim3 block_index() {
return details::grid::block_index();
}
_CG_STATIC_QUALIFIER unsigned long long block_rank() {
return details::grid::block_rank();
}
# if defined(_CG_HAS_CLUSTER_GROUP)
_CG_STATIC_QUALIFIER dim3 dim_clusters() {
return details::grid::dim_clusters();
}
_CG_STATIC_QUALIFIER unsigned long long num_clusters() {
return details::grid::num_clusters();
}
_CG_STATIC_QUALIFIER dim3 cluster_index() {
return details::grid::cluster_index();
}
_CG_STATIC_QUALIFIER unsigned long long cluster_rank() {
return details::grid::cluster_rank();
}
# endif
};
_CG_QUALIFIER grid_group this_grid() {
// Load a workspace from the driver
grid_group gg(details::get_grid_workspace());
#ifdef _CG_DEBUG
// *all* threads must be available to synchronize
gg.sync();
#endif // _CG_DEBUG
return gg;
}
#if defined(_CG_HAS_CLUSTER_GROUP)
/**
* class cluster_group
*
* Every GPU kernel is executed by a grid of thread blocks. A grid can be evenly
* divided along all dimensions to form groups of blocks, each group of which is
* a block cluster. Clustered grids are subject to various restrictions and
* limitations. Primarily, a cluster consists of at most 8 blocks by default
* (although the user is allowed to opt in to non-standard sizes), and clustered
* grids are subject to additional occupancy limitations due to per-cluster
* hardware resource consumption. In exchange, a block cluster is guaranteed to
* be a cooperative group, with access to all cooperative group capabilities, as
* well as cluster-specific capabilities and accelerations. A cluster_group
* represents a block cluster.
*
* Constructed via this_cluster_group();
*/
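/*
* Example (illustrative sketch; requires a clustered launch on hardware with
* cluster support, and the names below are hypothetical):
*
*   __global__ void cluster_kernel() {
*       __shared__ int token;
*       namespace cg = cooperative_groups;
*       cg::cluster_group cluster = cg::this_cluster();
*       if (threadIdx.x == 0) token = (int)cluster.block_rank();
*       cluster.sync();                                   // cluster-wide barrier
*       int* remote = cluster.map_shared_rank(&token, 0); // shared memory of block 0
*       int value_from_block0 = remote[0];
*   }
*/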
class cluster_group : public thread_group_base<details::cluster_group_id>
{
// Friends
friend _CG_QUALIFIER cluster_group this_cluster();
// Private constructor; construct via this_cluster()
_CG_QUALIFIER cluster_group()
{
}
public:
//_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_cluster)
using arrival_token = struct {};
// Functionality exposed by the group
_CG_STATIC_QUALIFIER void sync()
{
return details::cluster::sync();
}
_CG_STATIC_QUALIFIER arrival_token barrier_arrive()
{
details::cluster::barrier_arrive();
return arrival_token();
}
_CG_STATIC_QUALIFIER void barrier_wait()
{
return details::cluster::barrier_wait();
}
_CG_STATIC_QUALIFIER void barrier_wait(arrival_token&&)
{
return details::cluster::barrier_wait();
}
_CG_STATIC_QUALIFIER unsigned int query_shared_rank(const void *addr)
{
return details::cluster::query_shared_rank(addr);
}
template <typename T>
_CG_STATIC_QUALIFIER T* map_shared_rank(T *addr, int rank)
{
return details::cluster::map_shared_rank(addr, rank);
}
_CG_STATIC_QUALIFIER dim3 block_index()
{
return details::cluster::block_index();
}
_CG_STATIC_QUALIFIER unsigned int block_rank()
{
return details::cluster::block_rank();
}
_CG_STATIC_QUALIFIER unsigned int thread_rank()
{
return details::cluster::thread_rank();
}
_CG_STATIC_QUALIFIER dim3 dim_blocks()
{
return details::cluster::dim_blocks();
}
_CG_STATIC_QUALIFIER unsigned int num_blocks()
{
return details::cluster::num_blocks();
}
_CG_STATIC_QUALIFIER dim3 dim_threads()
{
return details::cluster::dim_threads();
}
_CG_STATIC_QUALIFIER unsigned int num_threads()
{
return details::cluster::num_threads();
}
// Legacy aliases
_CG_STATIC_QUALIFIER unsigned int size()
{
return num_threads();
}
};
/*
* cluster_group this_cluster()
*
* Constructs a cluster_group
*/
_CG_QUALIFIER cluster_group this_cluster()
{
cluster_group cg;
#ifdef _CG_DEBUG
cg.sync();
#endif
return cg;
}
#endif
#if defined(_CG_CPP11_FEATURES)
class thread_block;
template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
#endif
/**
* class thread_block
*
* Every GPU kernel is executed by a grid of thread blocks, and threads within
* each block are guaranteed to reside on the same streaming multiprocessor.
* A thread_block represents a thread block whose dimensions are not known until runtime.
*
* Constructed via this_thread_block();
*/
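/*
* Example (illustrative sketch; the kernel name is hypothetical):
*
*   __global__ void block_kernel() {
*       cooperative_groups::thread_block block = cooperative_groups::this_thread_block();
*       unsigned int tid = block.thread_rank(); // linearized rank in [0, num_threads())
*       block.sync();                           // equivalent to __syncthreads()
*   }
*/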
class thread_block : public thread_group_base<details::thread_block_id>
{
// Friends
friend _CG_QUALIFIER thread_block this_thread_block();
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
friend _CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz);
#if defined(_CG_CPP11_FEATURES)
template <unsigned int MaxBlockSize>
friend _CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch);
template <unsigned int Size>
friend class __static_size_multi_warp_tile_base;
details::multi_warp_scratch* const tile_memory;
template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block(block_tile_memory<MaxBlockSize>& scratch) :
tile_memory(details::get_scratch_ptr(&scratch)) {
#ifdef _CG_DEBUG
if (num_threads() > MaxBlockSize) {
details::abort();
}
#endif
#if !defined(_CG_HAS_RESERVED_SHARED)
tile_memory->init_barriers(thread_rank());
sync();
#endif
}
#endif
// Private constructor; construct via this_thread_block()
_CG_QUALIFIER thread_block()
#if defined(_CG_CPP11_FEATURES)
: tile_memory(details::get_scratch_ptr(NULL))
#endif
{ }
// Internal Use
_CG_QUALIFIER thread_group _get_tiled_threads(unsigned int tilesz) const {
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
// Invalid, immediately fail
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
details::abort();
return (thread_block());
}
unsigned int mask;
unsigned int base_offset = thread_rank() & (~(tilesz - 1));
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
mask = (unsigned int)(-1) >> (32 - masklength);
mask <<= (details::laneid() & ~(tilesz - 1));
thread_group tile = thread_group(details::coalesced_group_id);
tile._data.coalesced.mask = mask;
tile._data.coalesced.size = __popc(mask);
tile._data.coalesced.metaGroupSize = (details::cta::size() + tilesz - 1) / tilesz;
tile._data.coalesced.metaGroupRank = details::cta::thread_rank() / tilesz;
tile._data.coalesced.is_tiled = true;
return (tile);
}
public:
_CG_STATIC_CONST_DECL unsigned int _group_id = details::thread_block_id;
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
_CG_STATIC_QUALIFIER void sync() {
details::cta::sync();
}
_CG_STATIC_QUALIFIER unsigned int size() {
return details::cta::size();
}
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
return details::cta::thread_rank();
}
// Additional functionality exposed by the group
_CG_STATIC_QUALIFIER dim3 group_index() {
return details::cta::group_index();
}
_CG_STATIC_QUALIFIER dim3 thread_index() {
return details::cta::thread_index();
}
_CG_STATIC_QUALIFIER dim3 group_dim() {
return details::cta::block_dim();
}
_CG_STATIC_QUALIFIER dim3 dim_threads() {
return details::cta::dim_threads();
}
_CG_STATIC_QUALIFIER unsigned int num_threads() {
return details::cta::num_threads();
}
};
/**
* thread_block this_thread_block()
*
* Constructs a thread_block group
*/
_CG_QUALIFIER thread_block this_thread_block()
{
return (thread_block());
}
#if defined(_CG_CPP11_FEATURES)
template <unsigned int MaxBlockSize>
_CG_QUALIFIER thread_block this_thread_block(block_tile_memory<MaxBlockSize>& scratch) {
return (thread_block(scratch));
}
#endif
/**
* class coalesced_group
*
* A group representing the current set of converged threads in a warp.
* The size of the group is not guaranteed; it may be as small as
* a single thread (the caller).
*
* This group exposes warp-synchronous builtins.
* Constructed via coalesced_threads();
*/
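/*
* Example (illustrative sketch of aggregated atomics; the function name is
* hypothetical):
*
*   __device__ int atomic_agg_inc(int* counter) {
*       cooperative_groups::coalesced_group active = cooperative_groups::coalesced_threads();
*       int prev = 0;
*       if (active.thread_rank() == 0) {
*           // one atomic for the whole converged group instead of one per thread
*           prev = atomicAdd(counter, (int)active.num_threads());
*       }
*       // broadcast the base offset and add each thread's rank within the group
*       prev = active.shfl(prev, 0) + (int)active.thread_rank();
*       return prev;
*   }
*/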
class coalesced_group : public thread_group_base<details::coalesced_group_id>
{
private:
friend _CG_QUALIFIER coalesced_group coalesced_threads();
friend _CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz);
friend _CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz);
friend class details::_coalesced_group_data_access;
_CG_QUALIFIER unsigned int _packLanes(unsigned laneMask) const {
unsigned int member_pack = 0;
unsigned int member_rank = 0;
for (int bit_idx = 0; bit_idx < 32; bit_idx++) {
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
if (lane_bit) {
if (laneMask & lane_bit)
member_pack |= 1 << member_rank;
member_rank++;
}
}
return (member_pack);
}
// Internal Use
_CG_QUALIFIER coalesced_group _get_tiled_threads(unsigned int tilesz) const {
const bool pow2_tilesz = ((tilesz & (tilesz - 1)) == 0);
// Invalid, immediately fail
if (tilesz == 0 || (tilesz > 32) || !pow2_tilesz) {
details::abort();
return (coalesced_group(0));
}
if (size() <= tilesz) {
return (*this);
}
if ((_data.coalesced.is_tiled == true) && pow2_tilesz) {
unsigned int base_offset = (thread_rank() & (~(tilesz - 1)));
unsigned int masklength = min((unsigned int)size() - base_offset, tilesz);
unsigned int mask = (unsigned int)(-1) >> (32 - masklength);
mask <<= (details::laneid() & ~(tilesz - 1));
coalesced_group coalesced_tile = coalesced_group(mask);
coalesced_tile._data.coalesced.metaGroupSize = size() / tilesz;
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
coalesced_tile._data.coalesced.is_tiled = true;
return (coalesced_tile);
}
else if ((_data.coalesced.is_tiled == false) && pow2_tilesz) {
unsigned int mask = 0;
unsigned int member_rank = 0;
int seen_lanes = (thread_rank() / tilesz) * tilesz;
for (unsigned int bit_idx = 0; bit_idx < 32; bit_idx++) {
unsigned int lane_bit = _data.coalesced.mask & (1 << bit_idx);
if (lane_bit) {
if (seen_lanes <= 0 && member_rank < tilesz) {
mask |= lane_bit;
member_rank++;
}
seen_lanes--;
}
}
coalesced_group coalesced_tile = coalesced_group(mask);
// Override parent with the size of this group
coalesced_tile._data.coalesced.metaGroupSize = (size() + tilesz - 1) / tilesz;
coalesced_tile._data.coalesced.metaGroupRank = thread_rank() / tilesz;
return coalesced_tile;
}
else {
// No other tiling configurations are supported in _CG_VERSION 1000
details::abort();
}
return (coalesced_group(0));
}
protected:
_CG_QUALIFIER coalesced_group(unsigned int mask) {
_data.coalesced.mask = mask;
_data.coalesced.size = __popc(mask);
_data.coalesced.metaGroupRank = 0;
_data.coalesced.metaGroupSize = 1;
_data.coalesced.is_tiled = false;
}
_CG_QUALIFIER unsigned int get_mask() const {
return (_data.coalesced.mask);
}
public:
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
_CG_QUALIFIER unsigned int num_threads() const {
return _data.coalesced.size;
}
_CG_QUALIFIER unsigned int size() const {
return num_threads();
}
_CG_QUALIFIER unsigned int thread_rank() const {
return (__popc(_data.coalesced.mask & details::lanemask32_lt()));
}
// Rank of this group in the upper level of the hierarchy
_CG_QUALIFIER unsigned int meta_group_rank() const {
return _data.coalesced.metaGroupRank;
}
// Total number of partitions created when the parent group was partitioned
_CG_QUALIFIER unsigned int meta_group_size() const {
return _data.coalesced.metaGroupSize;
}
_CG_QUALIFIER void sync() const {
__syncwarp(_data.coalesced.mask);
}
#ifdef _CG_CPP11_FEATURES
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
unsigned int lane = (srcRank == 0) ? __ffs(_data.coalesced.mask) - 1 :
(size() == 32) ? srcRank : __fns(_data.coalesced.mask, 0, (srcRank + 1));
return details::tile::shuffle_dispatch<TyElem>::shfl(
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
if (size() == 32) {
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
}
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
if (lane >= 32)
lane = details::laneid();
return details::tile::shuffle_dispatch<TyElem>::shfl(
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, int delta) const {
if (size() == 32) {
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
_CG_STL_NAMESPACE::forward<TyElem>(elem), 0xFFFFFFFF, delta, 32);
}
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
if (lane >= 32)
lane = details::laneid();
return details::tile::shuffle_dispatch<TyElem>::shfl(
_CG_STL_NAMESPACE::forward<TyElem>(elem), _data.coalesced.mask, lane, 32);
}
#else
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, unsigned int src_rank) const {
details::assert_if_not_arithmetic<TyIntegral>();
unsigned int lane = (src_rank == 0) ? __ffs(_data.coalesced.mask) - 1 :
(size() == 32) ? src_rank : __fns(_data.coalesced.mask, 0, (src_rank + 1));
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, int delta) const {
details::assert_if_not_arithmetic<TyIntegral>();
if (size() == 32) {
return (__shfl_up_sync(0xFFFFFFFF, var, delta, 32));
}
unsigned lane = __fns(_data.coalesced.mask, details::laneid(), -(delta + 1));
if (lane >= 32) lane = details::laneid();
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, int delta) const {
details::assert_if_not_arithmetic<TyIntegral>();
if (size() == 32) {
return (__shfl_down_sync(0xFFFFFFFF, var, delta, 32));
}
unsigned int lane = __fns(_data.coalesced.mask, details::laneid(), delta + 1);
if (lane >= 32) lane = details::laneid();
return (__shfl_sync(_data.coalesced.mask, var, lane, 32));
}
#endif
_CG_QUALIFIER int any(int predicate) const {
return (__ballot_sync(_data.coalesced.mask, predicate) != 0);
}
_CG_QUALIFIER int all(int predicate) const {
return (__ballot_sync(_data.coalesced.mask, predicate) == _data.coalesced.mask);
}
_CG_QUALIFIER unsigned int ballot(int predicate) const {
if (size() == 32) {
return (__ballot_sync(0xFFFFFFFF, predicate));
}
unsigned int lane_ballot = __ballot_sync(_data.coalesced.mask, predicate);
return (_packLanes(lane_ballot));
}
#ifdef _CG_HAS_MATCH_COLLECTIVE
template <typename TyIntegral>
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
details::assert_if_not_arithmetic<TyIntegral>();
if (size() == 32) {
return (__match_any_sync(0xFFFFFFFF, val));
}
unsigned int lane_match = __match_any_sync(_data.coalesced.mask, val);
return (_packLanes(lane_match));
}
template <typename TyIntegral>
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
details::assert_if_not_arithmetic<TyIntegral>();
if (size() == 32) {
return (__match_all_sync(0xFFFFFFFF, val, &pred));
}
unsigned int lane_match = __match_all_sync(_data.coalesced.mask, val, &pred);
return (_packLanes(lane_match));
}
#endif /* _CG_HAS_MATCH_COLLECTIVE */
};
_CG_QUALIFIER coalesced_group coalesced_threads()
{
return (coalesced_group(__activemask()));
}
namespace details {
template <unsigned int Size> struct verify_thread_block_tile_size;
template <> struct verify_thread_block_tile_size<32> { typedef void OK; };
template <> struct verify_thread_block_tile_size<16> { typedef void OK; };
template <> struct verify_thread_block_tile_size<8> { typedef void OK; };
template <> struct verify_thread_block_tile_size<4> { typedef void OK; };
template <> struct verify_thread_block_tile_size<2> { typedef void OK; };
template <> struct verify_thread_block_tile_size<1> { typedef void OK; };
#ifdef _CG_CPP11_FEATURES
template <unsigned int Size>
using _is_power_of_2 = _CG_STL_NAMESPACE::integral_constant<bool, (Size & (Size - 1)) == 0>;
template <unsigned int Size>
using _is_single_warp = _CG_STL_NAMESPACE::integral_constant<bool, Size <= 32>;
template <unsigned int Size>
using _is_multi_warp =
_CG_STL_NAMESPACE::integral_constant<bool, (Size > 32) && (Size <= 1024)>;
template <unsigned int Size>
using _is_valid_single_warp_tile =
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_single_warp<Size>::value>;
template <unsigned int Size>
using _is_valid_multi_warp_tile =
_CG_STL_NAMESPACE::integral_constant<bool, _is_power_of_2<Size>::value && _is_multi_warp<Size>::value>;
#else
template <unsigned int Size>
struct _is_multi_warp {
static const bool value = false;
};
#endif
}
template <unsigned int Size>
class __static_size_tile_base
{
protected:
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
public:
_CG_THREAD_SCOPE(cuda::thread_scope::thread_scope_block)
// Rank of thread within tile
_CG_STATIC_QUALIFIER unsigned int thread_rank() {
return (details::cta::thread_rank() & (numThreads - 1));
}
// Number of threads within tile
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int num_threads() {
return numThreads;
}
_CG_STATIC_CONSTEXPR_QUALIFIER unsigned int size() {
return num_threads();
}
};
template <unsigned int Size>
class __static_size_thread_block_tile_base : public __static_size_tile_base<Size>
{
friend class details::_coalesced_group_data_access;
typedef details::tile::tile_helpers<Size> th;
#ifdef _CG_CPP11_FEATURES
static_assert(details::_is_valid_single_warp_tile<Size>::value, "Size must be one of 1/2/4/8/16/32");
#else
typedef typename details::verify_thread_block_tile_size<Size>::OK valid;
#endif
using __static_size_tile_base<Size>::numThreads;
_CG_STATIC_CONST_DECL unsigned int fullMask = 0xFFFFFFFF;
protected:
_CG_STATIC_QUALIFIER unsigned int build_mask() {
unsigned int mask = fullMask;
if (numThreads != 32) {
// laneId in [0, 31] identifies the calling thread within its warp
unsigned int laneId = details::laneid();
// shift the tile mask into the partition this lane belongs to
mask = th::tileMask << (laneId & ~(th::laneMask));
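// Worked example (assuming th::tileMask == (1u << Size) - 1 and
// th::laneMask == Size - 1): for Size == 8 and laneId == 13,
// laneId & ~laneMask == 8, so mask == 0xFF << 8 == 0x0000FF00.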
}
return (mask);
}
public:
_CG_STATIC_CONST_DECL unsigned int _group_id = details::coalesced_group_id;
_CG_STATIC_QUALIFIER void sync() {
__syncwarp(build_mask());
}
#ifdef _CG_CPP11_FEATURES
// PTX-supported collectives
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl(TyElem&& elem, int srcRank) const {
return details::tile::shuffle_dispatch<TyElem>::shfl(
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), srcRank, numThreads);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_down(TyElem&& elem, unsigned int delta) const {
return details::tile::shuffle_dispatch<TyElem>::shfl_down(
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_up(TyElem&& elem, unsigned int delta) const {
return details::tile::shuffle_dispatch<TyElem>::shfl_up(
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), delta, numThreads);
}
template <typename TyElem, typename TyRet = details::remove_qual<TyElem>>
_CG_QUALIFIER TyRet shfl_xor(TyElem&& elem, unsigned int laneMask) const {
return details::tile::shuffle_dispatch<TyElem>::shfl_xor(
_CG_STL_NAMESPACE::forward<TyElem>(elem), build_mask(), laneMask, numThreads);
}
#else
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl(TyIntegral var, int srcRank) const {
details::assert_if_not_arithmetic<TyIntegral>();
return (__shfl_sync(build_mask(), var, srcRank, numThreads));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_down(TyIntegral var, unsigned int delta) const {
details::assert_if_not_arithmetic<TyIntegral>();
return (__shfl_down_sync(build_mask(), var, delta, numThreads));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_up(TyIntegral var, unsigned int delta) const {
details::assert_if_not_arithmetic<TyIntegral>();
return (__shfl_up_sync(build_mask(), var, delta, numThreads));
}
template <typename TyIntegral>
_CG_QUALIFIER TyIntegral shfl_xor(TyIntegral var, unsigned int laneMask) const {
details::assert_if_not_arithmetic<TyIntegral>();
return (__shfl_xor_sync(build_mask(), var, laneMask, numThreads));
}
#endif //_CG_CPP11_FEATURES
_CG_QUALIFIER int any(int predicate) const {
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
return (lane_ballot != 0);
}
_CG_QUALIFIER int all(int predicate) const {
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
return (lane_ballot == build_mask());
}
_CG_QUALIFIER unsigned int ballot(int predicate) const {
unsigned int lane_ballot = __ballot_sync(build_mask(), predicate);
return (lane_ballot >> (details::laneid() & (~(th::laneMask))));
}
#ifdef _CG_HAS_MATCH_COLLECTIVE
template <typename TyIntegral>
_CG_QUALIFIER unsigned int match_any(TyIntegral val) const {
details::assert_if_not_arithmetic<TyIntegral>();
unsigned int lane_match = __match_any_sync(build_mask(), val);
return (lane_match >> (details::laneid() & (~(th::laneMask))));
}
template <typename TyIntegral>
_CG_QUALIFIER unsigned int match_all(TyIntegral val, int &pred) const {
details::assert_if_not_arithmetic<TyIntegral>();
unsigned int lane_match = __match_all_sync(build_mask(), val, &pred);
return (lane_match >> (details::laneid() & (~(th::laneMask))));
}
#endif
};
template <unsigned int Size, typename ParentT>
class __static_parent_thread_block_tile_base
{
public:
// Rank of this group in the upper level of the hierarchy
_CG_STATIC_QUALIFIER unsigned int meta_group_rank() {
return ParentT::thread_rank() / Size;
}
// Total number of partitions created when the parent group was partitioned
_CG_STATIC_QUALIFIER unsigned int meta_group_size() {
return (ParentT::size() + Size - 1) / Size;
}
};
/**
* class thread_block_tile<unsigned int Size, ParentT = void>
*
* Statically-sized group type, representing one tile of a thread block.
* The only specializations currently supported are those with native
* hardware support (1/2/4/8/16/32)
*
* This group exposes warp-synchronous builtins.
* Can only be constructed via tiled_partition<Size>(ParentT&)
*/
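/*
* Example (illustrative sketch; the kernel name is hypothetical):
*
*   __global__ void tile_kernel() {
*       namespace cg = cooperative_groups;
*       cg::thread_block block = cg::this_thread_block();
*       cg::thread_block_tile<32> warp = cg::tiled_partition<32>(block);
*       int lane = warp.thread_rank();        // rank within the 32-thread tile
*       int from_leader = warp.shfl(lane, 0); // broadcast from tile rank 0
*   }
*/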
template <unsigned int Size, typename ParentT = void>
class __single_warp_thread_block_tile :
public __static_size_thread_block_tile_base<Size>,
public __static_parent_thread_block_tile_base<Size, ParentT>
{
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
friend class details::_coalesced_group_data_access;
protected:
_CG_QUALIFIER __single_warp_thread_block_tile() { };
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int, unsigned int) { };
_CG_STATIC_QUALIFIER unsigned int get_mask() {
return __static_size_thread_block_tile_base<Size>::build_mask();
}
};
template <unsigned int Size>
class __single_warp_thread_block_tile<Size, void> :
public __static_size_thread_block_tile_base<Size>,
public thread_group_base<details::coalesced_group_id>
{
_CG_STATIC_CONST_DECL unsigned int numThreads = Size;
template <unsigned int, typename ParentT> friend class __single_warp_thread_block_tile;
friend class details::_coalesced_group_data_access;
typedef __static_size_thread_block_tile_base<numThreads> staticSizeBaseT;
protected:
_CG_QUALIFIER __single_warp_thread_block_tile(unsigned int meta_group_rank, unsigned int meta_group_size) {
_data.coalesced.mask = staticSizeBaseT::build_mask();
_data.coalesced.size = numThreads;
_data.coalesced.metaGroupRank = meta_group_rank;
_data.coalesced.metaGroupSize = meta_group_size;
_data.coalesced.is_tiled = true;
}
_CG_QUALIFIER unsigned int get_mask() const {
return (_data.coalesced.mask);
}
public:
using staticSizeBaseT::sync;
using staticSizeBaseT::size;
using staticSizeBaseT::num_threads;
using staticSizeBaseT::thread_rank;
_CG_QUALIFIER unsigned int meta_group_rank() const {
return _data.coalesced.metaGroupRank;
}
_CG_QUALIFIER unsigned int meta_group_size() const {
return _data.coalesced.metaGroupSize;
}
};
/**
* Outer level API calls
* void sync(GroupT) - see <group_type>.sync()
* thread_rank(GroupT) - see <group_type>.thread_rank()
* group_size(GroupT) - see <group_type>.size()
*/
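/*
* Example (illustrative sketch; the function name is hypothetical):
*
*   __device__ void free_function_forms(const cooperative_groups::thread_block& block) {
*       cooperative_groups::sync(block);                               // same as block.sync()
*       unsigned long long r = cooperative_groups::thread_rank(block); // same as block.thread_rank()
*   }
*/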
template <class GroupT>
_CG_QUALIFIER void sync(GroupT const &g)
{
g.sync();
}
// TODO: Use a static dispatch to determine appropriate return type
// C++03 is stuck with unsigned long long for now
#ifdef _CG_CPP11_FEATURES
template <class GroupT>
_CG_QUALIFIER auto thread_rank(GroupT const& g) -> decltype(g.thread_rank()) {
return g.thread_rank();
}
template <class GroupT>
_CG_QUALIFIER auto group_size(GroupT const &g) -> decltype(g.num_threads()) {
return g.num_threads();
}
#else
template <class GroupT>
_CG_QUALIFIER unsigned long long thread_rank(GroupT const& g) {
return static_cast<unsigned long long>(g.thread_rank());
}
template <class GroupT>
_CG_QUALIFIER unsigned long long group_size(GroupT const &g) {
return static_cast<unsigned long long>(g.num_threads());
}
#endif
/**
* tiled_partition
*
* The tiled_partition(parent, tilesz) method is a collective operation that
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
*
* A total of ((size(parent)+tilesz-1)/tilesz) subgroups will
* be created where threads having identical k = (thread_rank(parent)/tilesz)
* will be members of the same subgroup.
*
* The implementation may cause the calling thread to wait until all the members
* of the parent group have invoked the operation before resuming execution.
*
* Functionality is limited to power-of-two sized subgroup instances of at most
* 32 threads. Only thread_block, thread_block_tile<>, and their subgroups can
* be passed to tiled_partition() in _CG_VERSION 1000.
*/
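/*
* Example (illustrative sketch; the kernel name is hypothetical):
*
*   __global__ void dynamic_tiles() {
*       namespace cg = cooperative_groups;
*       cg::thread_block block = cg::this_thread_block();
*       cg::thread_group tile8 = cg::tiled_partition(block, 8); // tilesz: power of 2, <= 32
*       tile8.sync();
*       unsigned long long r = tile8.thread_rank(); // rank within the 8-thread tile
*   }
*/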
_CG_QUALIFIER thread_group tiled_partition(const thread_group& parent, unsigned int tilesz)
{
if (parent.get_type() == details::coalesced_group_id) {
const coalesced_group *_cg = static_cast<const coalesced_group*>(&parent);
return _cg->_get_tiled_threads(tilesz);
}
else {
const thread_block *_tb = static_cast<const thread_block*>(&parent);
return _tb->_get_tiled_threads(tilesz);
}
}
// Thread block type overload: returns a basic thread_group for now (may be specialized later)
_CG_QUALIFIER thread_group tiled_partition(const thread_block& parent, unsigned int tilesz)
{
return (parent._get_tiled_threads(tilesz));
}
// Coalesced group type overload: retains its ability to stay coalesced
_CG_QUALIFIER coalesced_group tiled_partition(const coalesced_group& parent, unsigned int tilesz)
{
return (parent._get_tiled_threads(tilesz));
}
namespace details {
template <unsigned int Size, typename ParentT>
class internal_thread_block_tile : public __single_warp_thread_block_tile<Size, ParentT> {};
template <unsigned int Size, typename ParentT>
_CG_QUALIFIER internal_thread_block_tile<Size, ParentT> tiled_partition_internal() {
return internal_thread_block_tile<Size, ParentT>();
}
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
_CG_QUALIFIER TyVal multi_warp_collectives_helper(
const GroupT& group,
WarpLambda warp_lambda,
InterWarpLambda inter_warp_lambda) {
return group.template collectives_scheme<TyVal>(warp_lambda, inter_warp_lambda);
}
template <typename T, typename GroupT>
_CG_QUALIFIER T* multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id) {
return group.template get_scratch_location<T>(warp_id);
}
template <typename GroupT>
_CG_QUALIFIER details::barrier_t* multi_warp_sync_location_getter(const GroupT& group) {
return group.get_sync_location();
}
}
/**
* tiled_partition<tilesz>
*
* The tiled_partition<tilesz>(parent) method is a collective operation that
* partitions the parent group into a one-dimensional, row-major, tiling of subgroups.
*
* A total of (size(parent)/tilesz) subgroups will be created;
* therefore, the parent group size must be evenly divisible by tilesz.
* The allowed parent groups are thread_block and thread_block_tile<Size>.
*
* The implementation may cause the calling thread to wait until all the members
* of the parent group have invoked the operation before resuming execution.
*
* Functionality is limited to native hardware sizes, 1/2/4/8/16/32.
* The size(parent) must be greater than the template Size parameter;
* otherwise the results are undefined.
*/
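/*
* Example (illustrative sketch; the kernel name is hypothetical):
*
*   __global__ void static_tiles() {
*       namespace cg = cooperative_groups;
*       cg::thread_block block = cg::this_thread_block();
*       cg::thread_block_tile<16> tile = cg::tiled_partition<16>(block);
*       unsigned int which = tile.meta_group_rank(); // which 16-thread tile this is
*       unsigned int count = tile.meta_group_size(); // how many tiles were created
*   }
*/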
#if defined(_CG_CPP11_FEATURES)
template <unsigned int Size>
class __static_size_multi_warp_tile_base : public __static_size_tile_base<Size>
{
static_assert(details::_is_valid_multi_warp_tile<Size>::value, "Size must be a power of 2 between 64 and 1024");
template <typename TyVal, typename GroupT, typename WarpLambda, typename InterWarpLambda>
friend __device__ TyVal details::multi_warp_collectives_helper(
const GroupT& group,
WarpLambda warp_lambda,
InterWarpLambda inter_warp_lambda);
template <typename T, typename GroupT>
friend __device__ T* details::multi_warp_scratch_location_getter(const GroupT& group, unsigned int warp_id);
template <typename GroupT>
friend __device__ details::barrier_t* details::multi_warp_sync_location_getter(const GroupT& group);
template <unsigned int OtherSize>
friend class __static_size_multi_warp_tile_base;
using WarpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
using ThisType = __static_size_multi_warp_tile_base<Size>;
_CG_STATIC_CONST_DECL int numWarps = Size / 32;
protected:
details::multi_warp_scratch* const tile_memory;
template <typename GroupT>
_CG_QUALIFIER __static_size_multi_warp_tile_base(const GroupT& g) : tile_memory(g.tile_memory) {
#if defined(_CG_HAS_RESERVED_SHARED)
details::sync_warps_reset(get_sync_location(), details::cta::thread_rank());
g.sync();
#endif
}
private:
_CG_QUALIFIER details::barrier_t* get_sync_location() const {
// Different group sizes use different barriers; all groups of a given size share one barrier.
unsigned int sync_id = details::log2(Size / 64);
return &tile_memory->barriers[sync_id];
}
template <typename T>
_CG_QUALIFIER T* get_scratch_location(unsigned int warp_id) const {
unsigned int scratch_id = (details::cta::thread_rank() - thread_rank()) / 32 + warp_id;
return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
}
template <typename T>
_CG_QUALIFIER T* get_scratch_location() const {
unsigned int scratch_id = details::cta::thread_rank() / 32;
return reinterpret_cast<T*>(&tile_memory->communication_memory[scratch_id]);
}
template <typename TyVal>
_CG_QUALIFIER TyVal shfl_impl(TyVal val, unsigned int src) const {
unsigned int src_warp = src / 32;
auto warp = details::tiled_partition_internal<32, ThisType>();
details::barrier_t* sync_location = get_sync_location();
// Get the scratch slot of the source thread's warp.
TyVal* warp_scratch_location = get_scratch_location<TyVal>(src_warp);
if (warp.meta_group_rank() == src_warp) {
warp.sync();
// Put shuffled value into my warp slot and let my warp arrive at the barrier.
if (thread_rank() == src) {
*warp_scratch_location = val;
}
details::sync_warps_arrive(sync_location, details::cta::thread_rank(), numWarps);
TyVal result = *warp_scratch_location;
details::sync_warps_wait(sync_location, details::cta::thread_rank());
return result;
}
else {
// Wait for the source warp to arrive on the barrier.
details::sync_warps_wait_for_specific_warp(sync_location,
(details::cta::thread_rank() / 32 - warp.meta_group_rank() + src_warp));
TyVal result = *warp_scratch_location;
details::sync_warps(sync_location, details::cta::thread_rank(), numWarps);
return result;
}
}
template <typename TyVal, typename WarpLambda, typename InterWarpLambda>
_CG_QUALIFIER TyVal collectives_scheme(const WarpLambda& warp_lambda, const InterWarpLambda& inter_warp_lambda) const {
static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
"Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
auto warp = details::tiled_partition_internal<32, ThisType>();
details::barrier_t* sync_location = get_sync_location();
TyVal* warp_scratch_location = get_scratch_location<TyVal>();
warp_lambda(warp, warp_scratch_location);
if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), numWarps)) {
auto subwarp = details::tiled_partition_internal<numWarps, decltype(warp)>();
if (subwarp.meta_group_rank() == 0) {
TyVal* thread_scratch_location = get_scratch_location<TyVal>(subwarp.thread_rank());
inter_warp_lambda(subwarp, thread_scratch_location);
}
warp.sync();
details::sync_warps_release(sync_location, warp.thread_rank() == 0, details::cta::thread_rank(), numWarps);
}
TyVal result = *warp_scratch_location;
return result;
}
public:
_CG_STATIC_CONST_DECL unsigned int _group_id = details::multi_tile_group_id;
using __static_size_tile_base<Size>::thread_rank;
template <typename TyVal>
_CG_QUALIFIER TyVal shfl(TyVal val, unsigned int src) const {
static_assert(sizeof(TyVal) <= details::multi_warp_scratch::communication_size,
"Collectives with tiles larger than 32 threads are limited to types smaller then 8 bytes");
return shfl_impl(val, src);
}
_CG_QUALIFIER void sync() const {
details::sync_warps(get_sync_location(), details::cta::thread_rank(), numWarps);
}
_CG_QUALIFIER int any(int predicate) const {
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
*warp_scratch_location = __any_sync(0xFFFFFFFF, predicate);
};
auto inter_warp_lambda =
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
*thread_scratch_location = __any_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
};
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
}
_CG_QUALIFIER int all(int predicate) const {
auto warp_lambda = [=] (WarpType& warp, int* warp_scratch_location) {
*warp_scratch_location = __all_sync(0xFFFFFFFF, predicate);
};
auto inter_warp_lambda =
[] (details::internal_thread_block_tile<numWarps, WarpType>& subwarp, int* thread_scratch_location) {
*thread_scratch_location = __all_sync(0xFFFFFFFFU >> (32 - numWarps), *thread_scratch_location);
};
return collectives_scheme<int>(warp_lambda, inter_warp_lambda);
}
};
template <unsigned int Size, typename ParentT = void>
class __multi_warp_thread_block_tile :
public __static_size_multi_warp_tile_base<Size>,
public __static_parent_thread_block_tile_base<Size, ParentT>
{
typedef __static_parent_thread_block_tile_base<Size, ParentT> staticParentBaseT;
typedef __static_size_multi_warp_tile_base<Size> staticTileBaseT;
protected:
_CG_QUALIFIER __multi_warp_thread_block_tile(const ParentT& g) :
__static_size_multi_warp_tile_base<Size>(g) {}
};
template <unsigned int Size>
class __multi_warp_thread_block_tile<Size, void> : public __static_size_multi_warp_tile_base<Size>
{
const unsigned int metaGroupRank;
const unsigned int metaGroupSize;
protected:
template <unsigned int OtherSize, typename ParentT>
_CG_QUALIFIER __multi_warp_thread_block_tile(const __multi_warp_thread_block_tile<OtherSize, ParentT>& g) :
__static_size_multi_warp_tile_base<Size>(g), metaGroupRank(g.meta_group_rank()), metaGroupSize(g.meta_group_size()) {}
public:
_CG_QUALIFIER unsigned int meta_group_rank() const {
return metaGroupRank;
}
_CG_QUALIFIER unsigned int meta_group_size() const {
return metaGroupSize;
}
};
#endif
template <unsigned int Size, typename ParentT = void>
class thread_block_tile;
namespace details {
template <unsigned int Size, typename ParentT, bool IsMultiWarp>
class thread_block_tile_impl;
template <unsigned int Size, typename ParentT>
class thread_block_tile_impl<Size, ParentT, false>: public __single_warp_thread_block_tile<Size, ParentT>
{
protected:
template <unsigned int OtherSize, typename OtherParentT, bool OtherIsMultiWarp>
_CG_QUALIFIER thread_block_tile_impl(const thread_block_tile_impl<OtherSize, OtherParentT, OtherIsMultiWarp>& g) :
__single_warp_thread_block_tile<Size, ParentT>(g.meta_group_rank(), g.meta_group_size()) {}
_CG_QUALIFIER thread_block_tile_impl(const thread_block& g) :
__single_warp_thread_block_tile<Size, ParentT>() {}
};
#if defined(_CG_CPP11_FEATURES)
template <unsigned int Size, typename ParentT>
class thread_block_tile_impl<Size, ParentT, true> : public __multi_warp_thread_block_tile<Size, ParentT>
{
protected:
template <typename GroupT>
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) :
__multi_warp_thread_block_tile<Size, ParentT>(g) {}
};
#else
template <unsigned int Size, typename ParentT>
class thread_block_tile_impl<Size, ParentT, true>
{
protected:
template <typename GroupT>
_CG_QUALIFIER thread_block_tile_impl(const GroupT& g) {}
};
#endif
}
template <unsigned int Size, typename ParentT>
class thread_block_tile : public details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>
{
friend _CG_QUALIFIER thread_block_tile<1, void> this_thread();
protected:
_CG_QUALIFIER thread_block_tile(const ParentT& g) :
details::thread_block_tile_impl<Size, ParentT, details::_is_multi_warp<Size>::value>(g) {}
public:
_CG_QUALIFIER operator thread_block_tile<Size, void>() const {
return thread_block_tile<Size, void>(*this);
}
};
template <unsigned int Size>
class thread_block_tile<Size, void> : public details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>
{
template <unsigned int, typename ParentT>
friend class thread_block_tile;
protected:
template <unsigned int OtherSize, typename OtherParentT>
_CG_QUALIFIER thread_block_tile(const thread_block_tile<OtherSize, OtherParentT>& g) :
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
public:
template <typename ParentT>
_CG_QUALIFIER thread_block_tile(const thread_block_tile<Size, ParentT>& g) :
details::thread_block_tile_impl<Size, void, details::_is_multi_warp<Size>::value>(g) {}
};
namespace details {
template <unsigned int Size, typename ParentT>
struct tiled_partition_impl;
template <unsigned int Size>
struct tiled_partition_impl<Size, thread_block> : public thread_block_tile<Size, thread_block> {
_CG_QUALIFIER tiled_partition_impl(const thread_block& g) :
thread_block_tile<Size, thread_block>(g) {}
};
// ParentT = static thread_block_tile<ParentSize, GrandParent> specialization
template <unsigned int Size, unsigned int ParentSize, typename GrandParent>
struct tiled_partition_impl<Size, thread_block_tile<ParentSize, GrandParent> > :
public thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> > {
#ifdef _CG_CPP11_FEATURES
static_assert(Size < ParentSize, "Tile size must be smaller than the parent group size");
#endif
_CG_QUALIFIER tiled_partition_impl(const thread_block_tile<ParentSize, GrandParent>& g) :
thread_block_tile<Size, thread_block_tile<ParentSize, GrandParent> >(g) {}
};
}
template <unsigned int Size, typename ParentT>
_CG_QUALIFIER thread_block_tile<Size, ParentT> tiled_partition(const ParentT& g)
{
return details::tiled_partition_impl<Size, ParentT>(g);
}
/**
* thread_block_tile<1, void> this_thread()
*
* Constructs a single-thread group containing only the calling thread
*/
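/*
* Example (illustrative sketch):
*
*   cooperative_groups::thread_block_tile<1> self = cooperative_groups::this_thread();
*   unsigned int rank = self.thread_rank(); // always 0 in a single-thread group
*/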
_CG_QUALIFIER thread_block_tile<1, void> this_thread()
{
// Make thread_block_tile<1, thread_block> parent of the returned group, so it will have its
// meta group rank and size set to 0 and 1 respectively.
return thread_block_tile<1, thread_block_tile<1, thread_block> >(this_thread_block());
}
/**
* <group_type>.sync()
*
* Executes a barrier across the group
*
* Implements both a compiler fence and an architectural fence to prevent
* memory reordering around the barrier.
*/
_CG_QUALIFIER void thread_group::sync() const
{
switch (_data.group.type) {
case details::coalesced_group_id:
cooperative_groups::sync(*static_cast<const coalesced_group*>(this));
break;
case details::thread_block_id:
cooperative_groups::sync(*static_cast<const thread_block*>(this));
break;
case details::grid_group_id:
cooperative_groups::sync(*static_cast<const grid_group*>(this));
break;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
case details::multi_grid_group_id:
cooperative_groups::sync(*static_cast<const multi_grid_group*>(this));
break;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
case details::cluster_group_id:
cooperative_groups::sync(*static_cast<const cluster_group*>(this));
break;
#endif
default:
break;
}
}
/**
* <group_type>.size()
*
* Returns the total number of threads in the group.
*/
_CG_QUALIFIER unsigned long long thread_group::size() const
{
unsigned long long size = 0;
switch (_data.group.type) {
case details::coalesced_group_id:
size = cooperative_groups::group_size(*static_cast<const coalesced_group*>(this));
break;
case details::thread_block_id:
size = cooperative_groups::group_size(*static_cast<const thread_block*>(this));
break;
case details::grid_group_id:
size = cooperative_groups::group_size(*static_cast<const grid_group*>(this));
break;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
case details::multi_grid_group_id:
size = cooperative_groups::group_size(*static_cast<const multi_grid_group*>(this));
break;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
case details::cluster_group_id:
size = cooperative_groups::group_size(*static_cast<const cluster_group*>(this));
break;
#endif
default:
break;
}
return size;
}
/**
* <group_type>.thread_rank()
*
* Returns the linearized rank of the calling thread in the interval [0, size()).
*/
_CG_QUALIFIER unsigned long long thread_group::thread_rank() const
{
unsigned long long rank = 0;
switch (_data.group.type) {
case details::coalesced_group_id:
rank = cooperative_groups::thread_rank(*static_cast<const coalesced_group*>(this));
break;
case details::thread_block_id:
rank = cooperative_groups::thread_rank(*static_cast<const thread_block*>(this));
break;
case details::grid_group_id:
rank = cooperative_groups::thread_rank(*static_cast<const grid_group*>(this));
break;
#if defined(_CG_HAS_MULTI_GRID_GROUP) && defined(_CG_CPP11_FEATURES) && defined(_CG_ABI_EXPERIMENTAL)
case details::multi_grid_group_id:
rank = cooperative_groups::thread_rank(*static_cast<const multi_grid_group*>(this));
break;
#endif
#if defined(_CG_HAS_CLUSTER_GROUP)
case details::cluster_group_id:
rank = cooperative_groups::thread_rank(*static_cast<const cluster_group*>(this));
break;
#endif
default:
break;
}
return rank;
}
_CG_END_NAMESPACE
#include <cooperative_groups/details/partitioning.h>
#if (!defined(_MSC_VER) || defined(_WIN64))
# include <cooperative_groups/details/invoke.h>
#endif
# endif /* ! (__cplusplus, __CUDACC__) */
#endif /* !_COOPERATIVE_GROUPS_H_ */